su2hmc
Loading...
Searching...
No Matches
force.c
Go to the documentation of this file.
1
5#include <matrices.h>
int Gauge_force(double *dSdpi, Complex_f *u11t, Complex_f *u12t,unsigned int *iu,unsigned int *id, float beta){
	/*
	 * @brief Calculates the gauge contribution to dSdpi (the momentum force)
	 * due to the Wilson action at each intermediate molecular-dynamics time.
	 *
	 * For each direction mu the staples for every transverse direction nu!=mu
	 * are accumulated into (Sigma11, Sigma12) — the two independent complex
	 * components of an SU(2) matrix — first the +nu staple, then the -nu
	 * staple (which needs the down-halo of the gathered links). The staple sum
	 * is then contracted with the local link and the three adjoint components
	 * are scaled by beta and added to dSdpi.
	 *
	 * Calls:
	 * =====
	 * C_gather, CHalo_swap_dir (MPI builds)
	 * cuPlus_staple, cuMinus_staple, cuGauge_force (CUDA builds)
	 *
	 * Parameters:
	 * =======
	 * double *dSdpi:        Force accumulator, (kvol x nadj x ndim), updated in place
	 * Complex_f *u11t:      First SU(2) component of the trial gauge field
	 * Complex_f *u12t:      Second SU(2) component of the trial gauge field
	 * unsigned int *iu:     Upper (forward) lattice neighbour lookup table
	 * unsigned int *id:     Lower (backward) lattice neighbour lookup table
	 * float beta:           Inverse gauge coupling
	 *
	 * Returns:
	 * =======
	 * Zero on success, integer error code otherwise
	 */
	const char *funcname = "Gauge_force";

	//We define zero halos for debugging
	//	#ifdef _DEBUG
	//		memset(u11t[kvol], 0, ndim*halo*sizeof(Complex_f));
	//		memset(u12t[kvol], 0, ndim*halo*sizeof(Complex_f));
	//	#endif
	//Was a trial field halo exchange here at one point.
#ifdef __NVCC__
	int device=-1;
	cudaGetDevice(&device);
	// Sigma buffers are device-only (stream-ordered allocs); the shifted-link
	// buffers u11sh/u12sh are managed since the MPI halo swap runs on the host.
	Complex_f *Sigma11, *Sigma12, *u11sh, *u12sh;
	cudaMallocAsync((void **)&Sigma11,kvol*sizeof(Complex_f),streams[0]);
	cudaMallocAsync((void **)&Sigma12,kvol*sizeof(Complex_f),streams[1]);
	cudaMallocManaged((void **)&u11sh,(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
	cudaMallocManaged((void **)&u12sh,(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
#else
	Complex_f *Sigma11 = (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
	Complex_f *Sigma12= (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
	Complex_f *u11sh = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
	Complex_f *u12sh = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
#endif
	//Holders for directions
	for(int mu=0; mu<ndim; mu++){
		// Reset the staple accumulators for this direction
#ifdef __NVCC__
		cudaMemset(Sigma11,0, kvol*sizeof(Complex_f));
		cudaMemset(Sigma12,0, kvol*sizeof(Complex_f));
#else
		memset(Sigma11,0, kvol*sizeof(Complex_f));
		memset(Sigma12,0, kvol*sizeof(Complex_f));
#endif
		for(int nu=0; nu<ndim; nu++)
			if(nu!=mu){
				//The +ν Staple
#ifdef __NVCC__
				cuPlus_staple(mu,nu,iu,Sigma11,Sigma12,u11t,u12t,dimGrid,dimBlock);
#else
#pragma omp parallel for simd aligned(u11t,u12t,Sigma11,Sigma12,iu:AVX)
				for(int i=0;i<kvol;i++){
					int uidm = iu[mu+ndim*i];
					int uidn = iu[nu+ndim*i];
					// a = U_nu(x+mu) U_mu^dagger(x+nu), then Sigma += a U_nu^dagger(x),
					// all written out in the two independent SU(2) components
					Complex_f a11=u11t[uidm*ndim+nu]*conj(u11t[uidn*ndim+mu])+\
								  u12t[uidm*ndim+nu]*conj(u12t[uidn*ndim+mu]);
					Complex_f a12=-u11t[uidm*ndim+nu]*u12t[uidn*ndim+mu]+\
								  u12t[uidm*ndim+nu]*u11t[uidn*ndim+mu];
					Sigma11[i]+=a11*conj(u11t[i*ndim+nu])+a12*conj(u12t[i*ndim+nu]);
					Sigma12[i]+=-a11*u12t[i*ndim+nu]+a12*u11t[i*ndim+nu];
				}
#endif
				// Gather the links shifted one site down in nu for the -nu staple
				C_gather(u11sh, u11t, kvol, id, nu);
				C_gather(u12sh, u12t, kvol, id, nu);
#if(nproc>1)
#ifdef __NVCC__
				//Prefetch to the CPU until we get NCCL working (halo swap is host MPI)
				cudaMemPrefetchAsync(u11sh, kvol*sizeof(Complex_f),cudaCpuDeviceId,streams[0]);
				cudaMemPrefetchAsync(u12sh, kvol*sizeof(Complex_f),cudaCpuDeviceId,streams[1]);
#endif
				CHalo_swap_dir(u11sh, 1, mu, DOWN);	CHalo_swap_dir(u12sh, 1, mu, DOWN);
#ifdef __NVCC__
				// Send the freshly received halo (the tail beyond kvol) back to the device
				cudaMemPrefetchAsync(u11sh+kvol, halo*sizeof(Complex_f),device,streams[0]);
				cudaMemPrefetchAsync(u12sh+kvol, halo*sizeof(Complex_f),device,streams[1]);
#endif
#endif
				//Next up, the -ν staple
#ifdef __NVCC__
				// NOTE(review): cudaDeviceSynchronise appears to be a project alias of
				// cudaDeviceSynchronize — confirm against the project headers.
				cudaDeviceSynchronise();
				cuMinus_staple(mu,nu,iu,id,Sigma11,Sigma12,u11sh,u12sh,u11t,u12t,dimGrid,dimBlock);
#else
#pragma omp parallel for simd aligned(u11t,u12t,u11sh,u12sh,Sigma11,Sigma12,iu,id:AVX)
				for(int i=0;i<kvol;i++){
					int uidm = iu[mu+ndim*i];
					int didn = id[nu+ndim*i];
					//uidm is correct here
					Complex_f a11=conj(u11sh[uidm])*conj(u11t[didn*ndim+mu])-\
								  u12sh[uidm]*conj(u12t[didn*ndim+mu]);
					Complex_f a12=-conj(u11sh[uidm])*u12t[didn*ndim+mu]-\
								  u12sh[uidm]*u11t[didn*ndim+mu];
					Sigma11[i]+=a11*u11t[didn*ndim+nu]-a12*conj(u12t[didn*ndim+nu]);
					Sigma12[i]+=a11*u12t[didn*ndim+nu]+a12*conj(u11t[didn*ndim+nu]);
				}
#endif
			}
		// Contract link with staple sum and take the three adjoint projections
#ifdef __NVCC__
		cuGauge_force(mu,Sigma11,Sigma12,u11t,u12t,dSdpi,beta,dimGrid,dimBlock);
#else
#pragma omp parallel for simd aligned(u11t,u12t,Sigma11,Sigma12,dSdpi:AVX)
		for(int i=0;i<kvol;i++){
			Complex_f a11 = u11t[i*ndim+mu]*Sigma12[i]+u12t[i*ndim+mu]*conj(Sigma11[i]);
			Complex_f a12 = u11t[i*ndim+mu]*Sigma11[i]+conj(u12t[i*ndim+mu])*Sigma12[i];

			dSdpi[(i*nadj)*ndim+mu]=(double)(beta*cimag(a11));
			dSdpi[(i*nadj+1)*ndim+mu]=(double)(beta*creal(a11));
			dSdpi[(i*nadj+2)*ndim+mu]=(double)(beta*cimag(a12));
		}
#endif
	}
#ifdef __NVCC__
	cudaDeviceSynchronise();
	cudaFreeAsync(Sigma11,streams[0]); cudaFreeAsync(Sigma12,streams[1]); cudaFree(u11sh); cudaFree(u12sh);
#else
	free(u11sh); free(u12sh); free(Sigma11); free(Sigma12);
#endif
	return 0;
}
int Force(double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi,Complex *u11t, Complex *u12t,\
		Complex_f *u11t_f,Complex_f *u12t_f,unsigned int *iu,unsigned int *id,Complex *gamval,Complex_f *gamval_f,\
		int *gamin,double *dk4m, double *dk4p, float *dk4m_f,float *dk4p_f,Complex_f jqq,\
		float akappa,float beta,double *ancg){
	/*
	 * @brief Calculates the force @f$\frac{dS}{d\pi}@f$ at each intermediate time
	 *
	 * Adds the gauge force (via Gauge_force, unless NO_GAUGE is defined) and the
	 * fermionic force for each flavour: optionally re-invert with the conjugate
	 * gradient, form X2 = 2 Hdslash X1, swap the down halos of X1 and X2, then
	 * accumulate -Re(X1^dagger dM/dpi X2) into dSdpi, spatial directions first
	 * and the (dk4m/dk4p-weighted) time direction separately.
	 *
	 * @param dSdpi: The force
	 * @param iflag: Invert before evaluating the force?
	 * @param res1: Conjugate gradient residual
	 * @param X0: Up/down partitioned pseudofermion field
	 * @param X1: Holder for the partitioned fermion field, then the conjugate gradient output
	 * @param Phi: Pseudofermion field
	 * @param u11t,u12t Double precision colour fields
	 * @param u11t_f,u12t_f: Single precision colour fields
	 * @param iu,id: Lattice indices
	 * @param gamin: Gamma indices
	 * @param gamval: Double precision gamma matrices
	 * @param gamval_f: Single precision gamma matrices
	 * @param dk4m: @f$\left(1+\gamma_0\right)e^{-\mu}@f$
	 * @param dk4p: @f$\left(1-\gamma_0\right)e^\mu@f$
	 * @param dk4m_f: @f$\left(1+\gamma_0\right)e^{-\mu}@f$ float
	 * @param dk4p_f: @f$\left(1-\gamma_0\right)e^\mu@f$ float
	 * @param jqq: Diquark source
	 * @param akappa: Hopping parameter
	 * @param beta: Inverse gauge coupling
	 * @param ancg: Counter for conjugate gradient iterations
	 *
	 * @return Zero on success, integer error code otherwise
	 */
	const char *funcname = "Force";
#ifdef __NVCC__
	int device=-1;
	cudaGetDevice(&device);
#endif
#ifndef NO_GAUGE
	Gauge_force(dSdpi,u11t_f,u12t_f,iu,id,beta);
#endif
	//X1=(M†M)^{-1} Phi
	int itercg=1;
#ifdef __NVCC__
	Complex *X2;
	cudaMallocManaged((void **)&X2,kferm2Halo*sizeof(Complex),cudaMemAttachGlobal);
#else
	Complex *X2= (Complex *)aligned_alloc(AVX,kferm2Halo*sizeof(Complex));
#endif
	for(int na = 0; na<nf; na++){
		// Load this flavour's field into the work vector X1
#ifdef __NVCC__
		cudaMemcpyAsync(X1,X0+na*kferm2,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,NULL);
#else
		memcpy(X1,X0+na*kferm2,kferm2*sizeof(Complex));
#endif
		if(!iflag){
			// Re-invert: X1 = (M†M)^{-1} Phi via CG, then refresh X0 = 2 X1 - X0
#ifdef __NVCC__
			Complex *smallPhi;
			cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),streams[0]);
#else
			Complex *smallPhi = (Complex *)aligned_alloc(AVX,kferm2*sizeof(Complex));
#endif
			Fill_Small_Phi(na, smallPhi, Phi);
			// Congradq(na, res1,smallPhi, &itercg );
			Congradq(na,res1,X1,smallPhi,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,&itercg);
#ifdef __NVCC__
			cudaFreeAsync(smallPhi,streams[0]);
#else
			free(smallPhi);
#endif
			*ancg+=itercg;
			// X0 = 2 X1 - X0, done with whatever BLAS flavour is available
#ifdef __NVCC__
			Complex blasa=2.0; double blasb=-1.0;
			cublasZdscal(cublas_handle,kferm2,&blasb,(cuDoubleComplex *)(X0+na*kferm2),1);
			cublasZaxpy(cublas_handle,kferm2,(cuDoubleComplex *)&blasa,(cuDoubleComplex *)X1,1,(cuDoubleComplex *)(X0+na*kferm2),1);
			//HDslash launches a different stream so we need a barrier
			cudaDeviceSynchronise();
#elif (defined __INTEL_MKL__)
			Complex blasa=2.0; Complex blasb=-1.0;
			//This is not a general BLAS Routine. BLIS and MKL support it
			//CUDA and GSL do not support it
			cblas_zaxpby(kferm2, &blasa, X1, 1, &blasb, X0+na*kferm2, 1);
#elif defined USE_BLAS
			Complex blasa=2.0; double blasb=-1.0;
			cblas_zdscal(kferm2,blasb,X0+na*kferm2,1);
			cblas_zaxpy(kferm2,&blasa,X1,1,X0+na*kferm2,1);
#else
#pragma omp parallel for simd collapse(2)
			for(int i=0;i<kvol;i++)
				for(int idirac=0;idirac<ndirac;idirac++){
					X0[((na*kvol+i)*ndirac+idirac)*nc]=
						2*X1[(i*ndirac+idirac)*nc]-X0[((na*kvol+i)*ndirac+idirac)*nc];
					X0[((na*kvol+i)*ndirac+idirac)*nc+1]=
						2*X1[(i*ndirac+idirac)*nc+1]-X0[((na*kvol+i)*ndirac+idirac)*nc+1];
				}
#endif
		}
		// X2 = 2 Hdslash X1
		Hdslash(X2,X1,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,akappa);
#ifdef __NVCC__
		double blasd=2.0;
		cudaDeviceSynchronise();
		cublasZdscal(cublas_handle,kferm2, &blasd, (cuDoubleComplex *)X2, 1);
#elif defined USE_BLAS
		double blasd=2.0;
		cblas_zdscal(kferm2, blasd, X2, 1);
#else
#pragma unroll
		for(int i=0;i<kferm2;i++)
			X2[i]*=2;
#endif
		// Swap the DOWN halos of X1 and X2 in every partitioned direction;
		// each site carries 8 components (ndirac*nc)
#if(npx>1)
		ZHalo_swap_dir(X1,8,0,DOWN);
		ZHalo_swap_dir(X2,8,0,DOWN);
#endif
#if(npy>1)
		ZHalo_swap_dir(X1,8,1,DOWN);
		ZHalo_swap_dir(X2,8,1,DOWN);
#endif
#if(npz>1)
		ZHalo_swap_dir(X1,8,2,DOWN);
		ZHalo_swap_dir(X2,8,2,DOWN);
#endif
#if(npt>1)
		ZHalo_swap_dir(X1,8,3,DOWN);
		ZHalo_swap_dir(X2,8,3,DOWN);
#endif

		// The original FORTRAN Comment:
		//    dSdpi=dSdpi-Re(X1*(d(Mdagger)dp)*X2) -- Yikes!
		//    we're gonna need drugs for this one......
		//
		//    Makes references to X1(.,.,iu(i,mu)) AND X2(.,.,iu(i,mu))
		//    as a result, need to swap the DOWN halos in all dirs for
		//    both these arrays, each of which has 8 cpts
		//
#ifdef __NVCC__
		// Single-precision copies of X1/X2, transposed so the site index runs
		// fastest for coalesced access inside cuForce
		Complex_f *X1_f, *X2_f;
		cudaMallocAsync((void **)&X1_f,kferm2*sizeof(Complex_f),NULL);
		cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
		Transpose_c(X1_f,ndirac*nc,kvol,dimGrid,dimBlock);

		cudaMallocAsync((void **)&X2_f,kferm2*sizeof(Complex_f),NULL);
		cuComplex_convert(X2_f,X2,kferm2,true,dimBlock,dimGrid);
		Transpose_c(X2_f,ndirac*nc,kvol,dimGrid,dimBlock);
		// Transpose_z(X1,kvol,ndirac*nc,dimGrid,dimBlock); Transpose_z(X2,kvol,ndirac*nc,dimGrid,dimBlock);
		cuForce(dSdpi,u11t_f,u12t_f,X1_f,X2_f,gamval_f,dk4m_f,dk4p_f,iu,gamin,akappa,dimGrid,dimBlock);
		cudaDeviceSynchronise();
		cudaFreeAsync(X1_f,NULL); cudaFreeAsync(X2_f,NULL);
#else
#pragma omp parallel for
		for(int i=0;i<kvol;i++)
			for(int idirac=0;idirac<ndirac;idirac++){
				int mu, uid, igork1;
#ifndef NO_SPACE
#pragma omp simd aligned(dSdpi,X1,X2,u11t,u12t,iu:AVX)
				for(mu=0; mu<3; mu++){
					//Long term ambition. I used the diff command on the different
					//spacial components of dSdpi and saw a lot of the values required
					//for them are duplicates (u11(i,mu)*X2(1,idirac,i) is used again with
					//a minus in front for example. Why not evaluate them first and then plug
					//them into the equation? Reduce the number of evaluations needed and look
					//a bit neater (although harder to follow as a consequence).

					//Up indices
					uid = iu[mu+ndim*i];
					igork1 = gamin[mu*ndirac+idirac];

					//REMINDER. Gamma is already scaled by kappa when we defined them. So if yer trying to rederive this from
					//Montvay and Munster and notice a missing kappa in the code, that is why.

					// Adjoint component 0: akappa (identity-in-Dirac) term then gamma term
					dSdpi[(i*nadj)*ndim+mu]+=akappa*creal(I*
							(conj(X1[(i*ndirac+idirac)*nc])*
							 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
							  +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
							 +conj(X1[(uid*ndirac+idirac)*nc])*
							 ( u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
							   -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
							 +conj(X1[(i*ndirac+idirac)*nc+1])*
							 (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
							  +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
							 +conj(X1[(uid*ndirac+idirac)*nc+1])*
							 (-u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
							  -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
					dSdpi[(i*nadj)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
							(conj(X1[(i*ndirac+idirac)*nc])*
							 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
							  +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
							 +conj(X1[(uid*ndirac+idirac)*nc])*
							 (-u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
							  +conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
							 +conj(X1[(i*ndirac+idirac)*nc+1])*
							 (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
							  +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
							 +conj(X1[(uid*ndirac+idirac)*nc+1])*
							 (u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
							  +conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));

					// Adjoint component 1
					dSdpi[(i*nadj+1)*ndim+mu]+=akappa*creal(
							(conj(X1[(i*ndirac+idirac)*nc])*
							 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
							  +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
							 +conj(X1[(uid*ndirac+idirac)*nc])*
							 (-u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
							  -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
							 +conj(X1[(i*ndirac+idirac)*nc+1])*
							 (-u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
							  -u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
							 +conj(X1[(uid*ndirac+idirac)*nc+1])*
							 (u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
							  -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
					dSdpi[(i*nadj+1)*ndim+mu]+=creal(gamval[mu*ndirac+idirac]*
							(conj(X1[(i*ndirac+idirac)*nc])*
							 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
							  +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
							 +conj(X1[(uid*ndirac+idirac)*nc])*
							 (u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
							  +conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
							 +conj(X1[(i*ndirac+idirac)*nc+1])*
							 (-u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
							  -u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
							 +conj(X1[(uid*ndirac+idirac)*nc+1])*
							 (-u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
							  +conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));

					// Adjoint component 2
					dSdpi[(i*nadj+2)*ndim+mu]+=akappa*creal(I*
							(conj(X1[(i*ndirac+idirac)*nc])*
							 (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
							  +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
							 +conj(X1[(uid*ndirac+idirac)*nc])*
							 (-conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
							  -u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])
							 +conj(X1[(i*ndirac+idirac)*nc+1])*
							 (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
							  -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
							 +conj(X1[(uid*ndirac+idirac)*nc+1])*
							 (-conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
							  +u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])));
					dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
							(conj(X1[(i*ndirac+idirac)*nc])*
							 (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
							  +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
							 +conj(X1[(uid*ndirac+idirac)*nc])*
							 (conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
							  +u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])
							 +conj(X1[(i*ndirac+idirac)*nc+1])*
							 (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
							  -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
							 +conj(X1[(uid*ndirac+idirac)*nc+1])*
							 (conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
							  -u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])));

				}
#endif
				//We're not done tripping yet!! Time like term is different. dk4? shows up
				//For consistency we'll leave mu in instead of hard coding.
				mu=3;
				uid = iu[mu+ndim*i];
				igork1 = gamin[mu*ndirac+idirac];
#ifndef NO_TIME
				// Time direction: forward hops weighted by dk4m, backward by dk4p
				dSdpi[(i*nadj)*ndim+mu]+=creal(I*
						(conj(X1[(i*ndirac+idirac)*nc])*
						 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
							 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
						 +conj(X1[(uid*ndirac+idirac)*nc])*
						 (dk4p[i]* (+u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
								 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
						 +conj(X1[(i*ndirac+idirac)*nc+1])*
						 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
								 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
						 +conj(X1[(uid*ndirac+idirac)*nc+1])*
						 (dk4p[i]* (-u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
								 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))))
					+creal(I*
							(conj(X1[(i*ndirac+idirac)*nc])*
							 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
								 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
							 +conj(X1[(uid*ndirac+idirac)*nc])*
							 (-dk4p[i]* (u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
									 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
							 +conj(X1[(i*ndirac+idirac)*nc+1])*
							 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
									 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
							 +conj(X1[(uid*ndirac+idirac)*nc+1])*
							 (-dk4p[i]* (-u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
									 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));

				dSdpi[(i*nadj+1)*ndim+mu]+=creal(
						conj(X1[(i*ndirac+idirac)*nc])*
						(dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
							+conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
						+conj(X1[(uid*ndirac+idirac)*nc])*
						(dk4p[i]* (-u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
								-conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
						+conj(X1[(i*ndirac+idirac)*nc+1])*
						(dk4m[i]* (-u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
								-u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
						+conj(X1[(uid*ndirac+idirac)*nc+1])*
						(dk4p[i]* ( u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
								-conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])))
					+creal(
							(conj(X1[(i*ndirac+idirac)*nc])*
							 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
								 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
							 +conj(X1[(uid*ndirac+idirac)*nc])*
							 (-dk4p[i]* (-u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
									 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
							 +conj(X1[(i*ndirac+idirac)*nc+1])*
							 (dk4m[i]* (-u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
									 -u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
							 +conj(X1[(uid*ndirac+idirac)*nc+1])*
							 (-dk4p[i]* (u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
									 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));

				dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*
						(conj(X1[(i*ndirac+idirac)*nc])*
						 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
								 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
						 +conj(X1[(uid*ndirac+idirac)*nc])*
						 (dk4p[i]*(-conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
							 -u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))
						 +conj(X1[(i*ndirac+idirac)*nc+1])*
						 (dk4m[i]* (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
								 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
						 +conj(X1[(uid*ndirac+idirac)*nc+1])*
						 (dk4p[i]*(-conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
							 +u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))))
					+creal(I*
							(conj(X1[(i*ndirac+idirac)*nc])*
							 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
									 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
							 +conj(X1[(uid*ndirac+idirac)*nc])*
							 (-dk4p[i]*(-conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
								 -u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))
							 +conj(X1[(i*ndirac+idirac)*nc+1])*
							 (dk4m[i]* (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
									 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
							 +conj(X1[(uid*ndirac+idirac)*nc+1])*
							 (-dk4p[i]*(-conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
								 +u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))));

#endif
			}
#endif
	}
#ifdef __NVCC__
	cudaFree(X2);
#else
	free(X2);
#endif
	return 0;
}
int Force(double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancg)
Calculates the force at each intermediate time.
Definition force.c:131
int Gauge_force(double *dSdpi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
Definition force.c:6
Matrix multiplication and related declarations.
int Hdslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
Evaluates Φ = M r (the half-Dslash operator applied to r) in double precision.
Definition matrices.c:267
#define DOWN
Flag for send down.
Definition par_mpi.h:35
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int CHalo_swap_dir(Complex_f *c, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define nadj
adjacent spatial indices
Definition sizes.h:175
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define nf
Fermion flavours (double it)
Definition sizes.h:151
#define ndirac
Dirac indices.
Definition sizes.h:177
#define halo
Total Halo size.
Definition sizes.h:222
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, int *itercg)
Matrix inversion via Conjugate Gradient with up/down flavour partitioning. Solves (M†M)x = Φ.
Definition congrad.c:7
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
Definition su2hmc.c:349
int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
Extracts all the single precision gauge links in the direction only.
Definition su2hmc.c:321