su2hmc
Loading...
Searching...
No Matches
force.c
Go to the documentation of this file.
1
5#include <matrices.h>
6int Gauge_force(double *dSdpi, Complex_f *ut[2],unsigned int *iu,unsigned int *id, float beta){
7 /*
8 * Calculates dSdpi due to the Wilson Action at each intermediate time
9 *
10 * Calls:
11 * =====
12 * C_Halo_swap_all, C_gather, C_Halo_swap_dir
13 *
14 * Parameters:
15 * =======
16 * double *dSdpi
17 * Complex_f *ut[0]
18 * Complex_f *ut[1]
19 * unsigned int *iu
20 * unsigned int *id
21 * float beta
22 *
23 * Returns:
24 * =======
25 * Zero on success, integer error code otherwise
26 */
27 const char *funcname = "Gauge_force";
28
29 //We define zero halos for debugging
30 // #ifdef _DEBUG
31 // memset(ut[0][kvol], 0, ndim*halo*sizeof(Complex_f));
32 // memset(ut[1][kvol], 0, ndim*halo*sizeof(Complex_f));
33 // #endif
34 //Was a trial field halo exchange here at one point.
35#ifdef __NVCC__
36 cuGauge_force(ut,dSdpi,beta,iu,id,dimGrid,dimBlock);
37#else
38 Complex_f *Sigma[2], *ush[2];
39 Sigma[0] = (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
40 Sigma[1]= (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
41 ush[0] = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
42 ush[1] = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
43 //Holders for directions
44 for(int mu=0; mu<ndim; mu++){
45 memset(Sigma[0],0, kvol*sizeof(Complex_f));
46 memset(Sigma[1],0, kvol*sizeof(Complex_f));
47 for(int nu=0; nu<ndim; nu++)
48 if(nu!=mu){
49 //The +ν Staple
50#pragma omp parallel for simd //aligned(ut[0],ut[1],Sigma[0],Sigma[1],iu:AVX)
51 for(int i=0;i<kvol;i++){
52 int uidm = iu[mu+ndim*i];
53 int uidn = iu[nu+ndim*i];
54 Complex_f a11=ut[0][uidm*ndim+nu]*conj(ut[0][uidn*ndim+mu])+\
55 ut[1][uidm*ndim+nu]*conj(ut[1][uidn*ndim+mu]);
56 Complex_f a12=-ut[0][uidm*ndim+nu]*ut[1][uidn*ndim+mu]+\
57 ut[1][uidm*ndim+nu]*ut[0][uidn*ndim+mu];
58 Sigma[0][i]+=a11*conj(ut[0][i*ndim+nu])+a12*conj(ut[1][i*ndim+nu]);
59 Sigma[1][i]+=-a11*ut[1][i*ndim+nu]+a12*ut[0][i*ndim+nu];
60 }
61 C_gather(ush[0], ut[0], kvol, id, nu);
62 C_gather(ush[1], ut[1], kvol, id, nu);
63#if(nproc>1)
64 CHalo_swap_dir(ush[0], 1, mu, DOWN); CHalo_swap_dir(ush[1], 1, mu, DOWN);
65#endif
66 //Next up, the -ν staple
67#pragma omp parallel for simd //aligned(ut[0],ut[1],ush[0],ush[1],Sigma[0],Sigma[1],iu,id:AVX)
68 for(int i=0;i<kvol;i++){
69 int uidm = iu[mu+ndim*i];
70 int didn = id[nu+ndim*i];
71 //uidm is correct here
72 Complex_f a11=conj(ush[0][uidm])*conj(ut[0][didn*ndim+mu])-\
73 ush[1][uidm]*conj(ut[1][didn*ndim+mu]);
74 Complex_f a12=-conj(ush[0][uidm])*ut[1][didn*ndim+mu]-\
75 ush[1][uidm]*ut[0][didn*ndim+mu];
76 Sigma[0][i]+=a11*ut[0][didn*ndim+nu]-a12*conj(ut[1][didn*ndim+nu]);
77 Sigma[1][i]+=a11*ut[1][didn*ndim+nu]+a12*conj(ut[0][didn*ndim+nu]);
78 }
79 }
80#pragma omp parallel for simd //aligned(ut[0],ut[1],Sigma[0],Sigma[1],dSdpi:AVX)
81 for(int i=0;i<kvol;i++){
82 Complex_f a11 = ut[0][i*ndim+mu]*Sigma[1][i]+ut[1][i*ndim+mu]*conj(Sigma[0][i]);
83 Complex_f a12 = ut[0][i*ndim+mu]*Sigma[0][i]+conj(ut[1][i*ndim+mu])*Sigma[1][i];
84
85 dSdpi[(i*nadj)*ndim+mu]=(double)(beta*cimag(a11));
86 dSdpi[(i*nadj+1)*ndim+mu]=(double)(beta*creal(a11));
87 dSdpi[(i*nadj+2)*ndim+mu]=(double)(beta*cimag(a12));
88 }
89 }
90 free(ush[0]); free(ush[1]); free(Sigma[0]); free(Sigma[1]);
91#endif
92 return 0;
93}
94int Force(double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi,Complex *ut[2],\
95 Complex_f *ut_f[2],unsigned int *iu,unsigned int *id,Complex *gamval,Complex_f *gamval_f,\
96 int *gamin,double *dk[2], float *dk_f[2],Complex_f jqq,float akappa,float beta,double *ancg){
97 /*
98 * @brief Calculates the force @f$\frac{dS}{d\pi}@f$ at each intermediate time
99 *
100 * @param dSdpi: The force
101 * @param iflag: Invert before evaluating the force?
102 * @param res1: Conjugate gradient residule
103 * @param X0: Up/down partitioned pseudofermion field
104 * @param X1: Holder for the partitioned fermion field, then the conjugate gradient output
105 * @param Phi: Pseudofermion field
106 * @param ut[0],ut[1] Double precision colour fields
107 * @param ut_f[0],ut_f[1]: Single precision colour fields
108 * @param iu,id: Lattice indices
109 * @param gamin: Gamma indices
110 * @param gamval: Double precision gamma matrices
111 * @param gamval_f: Single precision gamma matrices
112 * @param dk[0]: @f$\left(1+\gamma_0\right)e^{-\mu}@f$
113 * @param dk[1]: @f$\left(1-\gamma_0\right)e^\mu@f$
114 * @param dk_f[0]: @f$\left(1+\gamma_0\right)e^{-\mu}@f$ float
115 * @param dk_f[1]: @f$\left(1-\gamma_0\right)e^\mu@f$ float
116 * @param jqq: Diquark source
117 * @param akappa: Hopping parameter
118 * @param beta: Inverse gauge coupling
119 * @param ancg: Counter for conjugate gradient iterations
120 *
121 * @return Zero on success, integer error code otherwise
122 */
123 const char *funcname = "Force";
124#ifdef __NVCC__
125 int device=-1;
126 cudaGetDevice(&device);
127#endif
128#ifndef NO_GAUGE
129 Gauge_force(dSdpi,ut_f,iu,id,beta);
130#endif
131 if(!akappa)
132 return 0;
133 //X1=(M†M)^{1} Phi
134 int itercg=1;
135#ifdef __NVCC__
136 Complex_f *X1_f, *X2_f;
137 cudaMallocAsync((void **)&X2_f,kferm2*sizeof(Complex_f),streams[0]);
138 cudaMallocAsync((void **)&X1_f,kferm2*sizeof(Complex_f),NULL);
139 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
140#else
141 Complex *X2= (Complex *)aligned_alloc(AVX,kferm2Halo*sizeof(Complex));
142#endif
143 for(int na = 0; na<nf; na++){
144#ifdef __NVCC__
145 cudaMemcpyAsync(X1,X0+na*kferm2,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,NULL);
146#else
147 memcpy(X1,X0+na*kferm2,kferm2*sizeof(Complex));
148#endif
149 if(!iflag){
150#ifdef __NVCC__
151 Complex *smallPhi;
152 cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),streams[0]);
153#else
154 Complex *smallPhi = (Complex *)aligned_alloc(AVX,kferm2*sizeof(Complex));
155#endif
156 Fill_Small_Phi(na, smallPhi, Phi);
157 // Congradq(na, res1,smallPhi, &itercg );
158 Congradq(na,res1,X1,smallPhi,ut_f,iu,id,gamval_f,gamin,dk_f,jqq,akappa,&itercg);
159#ifdef __NVCC__
160 cudaFreeAsync(smallPhi,streams[0]);
161#else
162 free(smallPhi);
163#endif
164 *ancg+=itercg;
165#ifdef __NVCC__
166 Complex blasa=2.0; double blasb=-1.0;
167 cublasZdscal(cublas_handle,kferm2,&blasb,(cuDoubleComplex *)(X0+na*kferm2),1);
168 cublasZaxpy(cublas_handle,kferm2,(cuDoubleComplex *)&blasa,(cuDoubleComplex *)X1,1,(cuDoubleComplex *)(X0+na*kferm2),1);
169 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
170 //HDslash launches a different stream so we need a barrieer
171 cudaDeviceSynchronise();
172#elif (defined __INTEL_MKL__)
173 Complex blasa=2.0; Complex blasb=-1.0;
174 //This is not a general BLAS Routine. BLIS and MKl support it
175 //CUDA and GSL does not support it
176 cblas_zaxpby(kferm2, &blasa, X1, 1, &blasb, X0+na*kferm2, 1);
177#elif defined USE_BLAS
178 Complex blasa=2.0; double blasb=-1.0;
179 cblas_zdscal(kferm2,blasb,X0+na*kferm2,1);
180 cblas_zaxpy(kferm2,&blasa,X1,1,X0+na*kferm2,1);
181#else
182#pragma omp parallel for simd collapse(2)
183 for(int i=0;i<kvol;i++)
184 for(int idirac=0;idirac<ndirac;idirac++){
185 X0[((na*kvol+i)*ndirac+idirac)*nc]=
186 2*X1[(i*ndirac+idirac)*nc]-X0[((na*kvol+i)*ndirac+idirac)*nc];
187 X0[((na*kvol+i)*ndirac+idirac)*nc+1]=
188 2*X1[(i*ndirac+idirac)*nc+1]-X0[((na*kvol+i)*ndirac+idirac)*nc+1];
189 }
190#endif
191 }
192 #ifdef __NVCC__
193 Hdslash_f(X2_f,X1_f,ut_f,iu,id,gamval_f,gamin,dk_f,akappa);
194 #else
195 Hdslash(X2,X1,ut,iu,id,gamval,gamin,dk,akappa);
196 #endif
197#ifdef __NVCC__
198 float blasd=2.0;
199 cudaDeviceSynchronise();
200 cublasCsscal(cublas_handle,kferm2, &blasd, (cuComplex *)X2_f, 1);
201#elif defined USE_BLAS
202 double blasd=2.0;
203 cblas_zdscal(kferm2, blasd, X2, 1);
204#else
205#pragma unroll
206 for(int i=0;i<kferm2;i++)
207 X2[i]*=2;
208#endif
209#if(npx>1)
210 ZHalo_swap_dir(X1,8,0,DOWN);
211 ZHalo_swap_dir(X2,8,0,DOWN);
212#endif
213#if(npy>1)
214 ZHalo_swap_dir(X1,8,1,DOWN);
215 ZHalo_swap_dir(X2,8,1,DOWN);
216#endif
217#if(npz>1)
218 ZHalo_swap_dir(X1,8,2,DOWN);
219 ZHalo_swap_dir(X2,8,2,DOWN);
220#endif
221#if(npt>1)
222 ZHalo_swap_dir(X1,8,3,DOWN);
223 ZHalo_swap_dir(X2,8,3,DOWN);
224#endif
225
226 // The original FORTRAN Comment:
227 // dSdpi=dSdpi-Re(X1*(d(Mdagger)dp)*X2) -- Yikes!
228 // we're gonna need drugs for this one......
229 //
230 // Makes references to X1(.,.,iu(i,mu)) AND X2(.,.,iu(i,mu))
231 // as a result, need to swap the DOWN halos in all dirs for
232 // both these arrays, each of which has 8 cpts
233 //
234#ifdef __NVCC__
235 cuForce(dSdpi,ut_f,X1_f,X2_f,gamval_f,dk_f,iu,gamin,akappa,dimGrid,dimBlock);
236 cudaDeviceSynchronise();
237#else
238#pragma omp parallel for
239 for(int i=0;i<kvol;i++)
240 for(int idirac=0;idirac<ndirac;idirac++){
241 int mu, uid, igork1;
242#ifndef NO_SPACE
243#pragma omp simd //aligned(dSdpi,X1,X2,ut[0],ut[1],iu:AVX)
244 for(mu=0; mu<3; mu++){
245 //Long term ambition. I used the diff command on the different
246 //spacial components of dSdpi and saw a lot of the values required
247 //for them are duplicates (u11(i,mu)*X2(1,idirac,i) is used again with
248 //a minus in front for example. Why not evaluate them first /and then plug
249 //them into the equation? Reduce the number of evaluations needed and look
250 //a bit neater (although harder to follow as a consequence).
251
252 //Up indices
253 uid = iu[mu+ndim*i];
254 igork1 = gamin[mu*ndirac+idirac];
255
256 //REMINDER. Gamma is already scaled by kappa when we defined them. So if yer trying to rederive this from
257 //Montvay and Munster and notice a missing kappa in the code, that is why.
258 dSdpi[(i*nadj)*ndim+mu]+=akappa*creal(I*
259 (conj(X1[(i*ndirac+idirac)*nc])*
260 (-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
261 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
262 +conj(X1[(uid*ndirac+idirac)*nc])*
263 ( ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
264 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
265 +conj(X1[(i*ndirac+idirac)*nc+1])*
266 (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
267 +ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
268 +conj(X1[(uid*ndirac+idirac)*nc+1])*
269 (-ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
270 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
271 dSdpi[(i*nadj)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
272 (conj(X1[(i*ndirac+idirac)*nc])*
273 (-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
274 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
275 +conj(X1[(uid*ndirac+idirac)*nc])*
276 (-ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
277 +conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
278 +conj(X1[(i*ndirac+idirac)*nc+1])*
279 (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
280 +ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
281 +conj(X1[(uid*ndirac+idirac)*nc+1])*
282 (ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
283 +conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));
284
285 dSdpi[(i*nadj+1)*ndim+mu]+=akappa*creal(
286 (conj(X1[(i*ndirac+idirac)*nc])*
287 (-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
288 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
289 +conj(X1[(uid*ndirac+idirac)*nc])*
290 (-ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
291 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
292 +conj(X1[(i*ndirac+idirac)*nc+1])*
293 (-ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
294 -ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
295 +conj(X1[(uid*ndirac+idirac)*nc+1])*
296 (ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
297 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
298 dSdpi[(i*nadj+1)*ndim+mu]+=creal(gamval[mu*ndirac+idirac]*
299 (conj(X1[(i*ndirac+idirac)*nc])*
300 (-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
301 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
302 +conj(X1[(uid*ndirac+idirac)*nc])*
303 (ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
304 +conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
305 +conj(X1[(i*ndirac+idirac)*nc+1])*
306 (-ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
307 -ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
308 +conj(X1[(uid*ndirac+idirac)*nc+1])*
309 (-ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
310 +conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));
311
312 dSdpi[(i*nadj+2)*ndim+mu]+=akappa*creal(I*
313 (conj(X1[(i*ndirac+idirac)*nc])*
314 (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
315 +ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
316 +conj(X1[(uid*ndirac+idirac)*nc])*
317 (-conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
318 -ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])
319 +conj(X1[(i*ndirac+idirac)*nc+1])*
320 (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
321 -conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
322 +conj(X1[(uid*ndirac+idirac)*nc+1])*
323 (-conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
324 +ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])));
325 dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
326 (conj(X1[(i*ndirac+idirac)*nc])*
327 (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
328 +ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
329 +conj(X1[(uid*ndirac+idirac)*nc])*
330 (conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
331 +ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])
332 +conj(X1[(i*ndirac+idirac)*nc+1])*
333 (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
334 -conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
335 +conj(X1[(uid*ndirac+idirac)*nc+1])*
336 (conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
337 -ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])));
338
339 }
340#endif
341 //We're not done tripping yet!! Time like term is different. dk4? shows up
342 //For consistency we'll leave mu in instead of hard coding.
343 mu=3;
344 uid = iu[mu+ndim*i];
345 igork1 = gamin[mu*ndirac+idirac];
346#ifndef NO_TIME
347 dSdpi[(i*nadj)*ndim+mu]+=creal(I*
348 (conj(X1[(i*ndirac+idirac)*nc])*
349 (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
350 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
351 +conj(X1[(uid*ndirac+idirac)*nc])*
352 (dk[1][i]* (+ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
353 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
354 +conj(X1[(i*ndirac+idirac)*nc+1])*
355 (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
356 +ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
357 +conj(X1[(uid*ndirac+idirac)*nc+1])*
358 (dk[1][i]* (-ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
359 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))))
360 +creal(I*
361 (conj(X1[(i*ndirac+idirac)*nc])*
362 (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
363 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
364 +conj(X1[(uid*ndirac+idirac)*nc])*
365 (-dk[1][i]* (ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
366 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
367 +conj(X1[(i*ndirac+idirac)*nc+1])*
368 (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
369 +ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
370 +conj(X1[(uid*ndirac+idirac)*nc+1])*
371 (-dk[1][i]* (-ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
372 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));
373
374 dSdpi[(i*nadj+1)*ndim+mu]+=creal(
375 conj(X1[(i*ndirac+idirac)*nc])*
376 (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
377 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
378 +conj(X1[(uid*ndirac+idirac)*nc])*
379 (dk[1][i]* (-ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
380 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
381 +conj(X1[(i*ndirac+idirac)*nc+1])*
382 (dk[0][i]* (-ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
383 -ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
384 +conj(X1[(uid*ndirac+idirac)*nc+1])*
385 (dk[1][i]* ( ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
386 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])))
387 +creal(
388 (conj(X1[(i*ndirac+idirac)*nc])*
389 (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
390 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
391 +conj(X1[(uid*ndirac+idirac)*nc])*
392 (-dk[1][i]* (-ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
393 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
394 +conj(X1[(i*ndirac+idirac)*nc+1])*
395 (dk[0][i]* (-ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
396 -ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
397 +conj(X1[(uid*ndirac+idirac)*nc+1])*
398 (-dk[1][i]* (ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
399 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));
400
401 dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*
402 (conj(X1[(i*ndirac+idirac)*nc])*
403 (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
404 +ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
405 +conj(X1[(uid*ndirac+idirac)*nc])*
406 (dk[1][i]*(-conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
407 -ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))
408 +conj(X1[(i*ndirac+idirac)*nc+1])*
409 (dk[0][i]* (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
410 -conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
411 +conj(X1[(uid*ndirac+idirac)*nc+1])*
412 (dk[1][i]*(-conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
413 +ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))))
414 +creal(I*
415 (conj(X1[(i*ndirac+idirac)*nc])*
416 (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
417 +ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
418 +conj(X1[(uid*ndirac+idirac)*nc])*
419 (-dk[1][i]*(-conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
420 -ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))
421 +conj(X1[(i*ndirac+idirac)*nc+1])*
422 (dk[0][i]* (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
423 -conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
424 +conj(X1[(uid*ndirac+idirac)*nc+1])*
425 (-dk[1][i]*(-conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
426 +ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))));
427
428#endif
429 }
430#endif
431 }
432#ifdef __NVCC__
433 cudaFreeAsync(X1_f,streams[0]); cudaFreeAsync(X2_f,streams[1]);
434#else
435 free(X2);
436#endif
437 return 0;
438}
int Force(double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi, Complex *ut[2], Complex_f *ut_f[2], unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2], float *dk_f[2], Complex_f jqq, float akappa, float beta, double *ancg)
Calculates the force at each intermediate time.
Definition force.c:94
int Gauge_force(double *dSdpi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
Definition force.c:6
Matrix multiplication and related declarations.
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates in single precision.
Definition matrices.c:712
int Hdslash(Complex *phi, Complex *r, Complex *u11t[2], unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk[2], float akappa)
Evaluates in double precision.
Definition matrices.c:268
#define DOWN
Flag for send down.
Definition par_mpi.h:35
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int CHalo_swap_dir(Complex_f *c, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define nadj
adjacent spatial indices
Definition sizes.h:175
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define nf
Fermion flavours (double it)
Definition sizes.h:151
#define ndirac
Dirac indices.
Definition sizes.h:177
#define halo
Total Halo size.
Definition sizes.h:222
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves Implements up/down pa...
Definition congrad.c:7
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
Definition su2hmc.c:385
int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
Extracts all the single precision gauge links in the direction only.
Definition su2hmc.c:357