Calculates the force \(\frac{dS}{d\pi}\) at each intermediate time.
96 {
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123 const char *funcname = "Force";
124#ifdef __NVCC__
125 int device=-1;
126 cudaGetDevice(&device);
127#endif
128#ifndef NO_GAUGE
130#endif
131 if(!akappa)
132 return 0;
133
134 int itercg=1;
135#ifdef __NVCC__
139 cuComplex_convert(X1_f,X1,
kferm2,
true,dimBlock,dimGrid);
140#else
142#endif
143 for(
int na = 0; na<
nf; na++){
144#ifdef __NVCC__
146#else
148#endif
149 if(!iflag){
150#ifdef __NVCC__
152 cudaMallocAsync((
void **)&smallPhi,
kferm2*
sizeof(
Complex),streams[0]);
153#else
155#endif
157
158 Congradq(na,res1,X1,smallPhi,ut_f,iu,
id,gamval_f,gamin,dk_f,jqq,akappa,&itercg);
159#ifdef __NVCC__
160 cudaFreeAsync(smallPhi,streams[0]);
161#else
162 free(smallPhi);
163#endif
164 *ancg+=itercg;
165#ifdef __NVCC__
166 Complex blasa=2.0;
double blasb=-1.0;
167 cublasZdscal(cublas_handle,
kferm2,&blasb,(cuDoubleComplex *)(X0+na*
kferm2),1);
168 cublasZaxpy(cublas_handle,
kferm2,(cuDoubleComplex *)&blasa,(cuDoubleComplex *)X1,1,(cuDoubleComplex *)(X0+na*
kferm2),1);
169 cuComplex_convert(X1_f,X1,
kferm2,
true,dimBlock,dimGrid);
170
171 cudaDeviceSynchronise();
172#elif (defined __INTEL_MKL__)
174
175
176 cblas_zaxpby(
kferm2, &blasa, X1, 1, &blasb, X0+na*
kferm2, 1);
177#elif defined USE_BLAS
178 Complex blasa=2.0;
double blasb=-1.0;
181#else
182#pragma omp parallel for simd collapse(2)
183 for(
int i=0;i<
kvol;i++)
184 for(
int idirac=0;idirac<
ndirac;idirac++){
189 }
190#endif
191 }
192 #ifdef __NVCC__
193 Hdslash_f(X2_f,X1_f,ut_f,iu,
id,gamval_f,gamin,dk_f,akappa);
194 #else
195 Hdslash(X2,X1,ut,iu,
id,gamval,gamin,dk,akappa);
196 #endif
197#ifdef __NVCC__
198 float blasd=2.0;
199 cudaDeviceSynchronise();
200 cublasCsscal(cublas_handle,
kferm2, &blasd, (cuComplex *)X2_f, 1);
201#elif defined USE_BLAS
202 double blasd=2.0;
203 cblas_zdscal(
kferm2, blasd, X2, 1);
204#else
205#pragma unroll
207 X2[i]*=2;
208#endif
209#if(npx>1)
212#endif
213#if(npy>1)
216#endif
217#if(npz>1)
220#endif
221#if(npt>1)
224#endif
225
226
227
228
229
230
231
232
233
234#ifdef __NVCC__
235 cuForce(dSdpi,ut_f,X1_f,X2_f,gamval_f,dk_f,iu,gamin,akappa,dimGrid,dimBlock);
236 cudaDeviceSynchronise();
237#else
238#pragma omp parallel for
239 for(
int i=0;i<
kvol;i++)
240 for(
int idirac=0;idirac<
ndirac;idirac++){
241 int mu, uid, igork1;
242#ifndef NO_SPACE
243#pragma omp simd
244 for(mu=0; mu<3; mu++){
245
246
247
248
249
250
251
252
254 igork1 = gamin[mu*
ndirac+idirac];
255
256
257
258 dSdpi[(i*
nadj)*
ndim+mu]+=akappa*creal(I*
268 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
281 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
284
285 dSdpi[(i*
nadj+1)*
ndim+mu]+=akappa*creal(
295 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
308 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
311
312 dSdpi[(i*
nadj+2)*
ndim+mu]+=akappa*creal(I*
322 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
335 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
338
339 }
340#endif
341
342
343 mu=3;
345 igork1 = gamin[mu*
ndirac+idirac];
346#ifndef NO_TIME
349 (dk[0][i]*(-conj(ut[1][i*
ndim+mu])*X2[(uid*
ndirac+idirac)*
nc]
355 (dk[0][i]* (ut[0][i*
ndim+mu] *X2[(uid*
ndirac+idirac)*
nc]
357 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
360 +creal(I*
362 (dk[0][i]*(-conj(ut[1][i*
ndim+mu])*X2[(uid*
ndirac+igork1)*
nc]
368 (dk[0][i]* (ut[0][i*
ndim+mu] *X2[(uid*
ndirac+igork1)*
nc]
370 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
371 (-dk[1][i]* (-ut[0][i*
ndim+mu] *X2[(i*
ndirac+igork1)*
nc]
373
376 (dk[0][i]*(-conj(ut[1][i*
ndim+mu])*X2[(uid*
ndirac+idirac)*
nc]
382 (dk[0][i]* (-ut[0][i*
ndim+mu] *X2[(uid*
ndirac+idirac)*
nc]
384 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
387 +creal(
389 (dk[0][i]*(-conj(ut[1][i*
ndim+mu])*X2[(uid*
ndirac+igork1)*
nc]
392 (-dk[1][i]* (-ut[1][i*
ndim+mu] *X2[(i*
ndirac+igork1)*
nc]
395 (dk[0][i]* (-ut[0][i*
ndim+mu] *X2[(uid*
ndirac+igork1)*
nc]
397 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
400
403 (dk[0][i]* (ut[0][i*
ndim+mu] *X2[(uid*
ndirac+idirac)*
nc]
406 (dk[1][i]*(-conj(ut[0][i*
ndim+mu])*X2[(i*
ndirac+idirac)*
nc]
409 (dk[0][i]* (conj(ut[1][i*
ndim+mu])*X2[(uid*
ndirac+idirac)*
nc]
411 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
412 (dk[1][i]*(-conj(ut[1][i*
ndim+mu])*X2[(i*
ndirac+idirac)*
nc]
414 +creal(I*
416 (dk[0][i]* (ut[0][i*
ndim+mu] *X2[(uid*
ndirac+igork1)*
nc]
419 (-dk[1][i]*(-conj(ut[0][i*
ndim+mu])*X2[(i*
ndirac+igork1)*
nc]
422 (dk[0][i]* (conj(ut[1][i*
ndim+mu])*X2[(uid*
ndirac+igork1)*
nc]
424 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
425 (-dk[1][i]*(-conj(ut[1][i*
ndim+mu])*X2[(i*
ndirac+igork1)*
nc]
427
428#endif
429 }
430#endif
431 }
432#ifdef __NVCC__
433 cudaFreeAsync(X1_f,streams[0]); cudaFreeAsync(X2_f,streams[1]);
434#else
435 free(X2);
436#endif
437 return 0;
438}
int Gauge_force(double *dSdpi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates in single precision.
int Hdslash(Complex *phi, Complex *r, Complex *u11t[2], unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk[2], float akappa)
Evaluates in double precision.
#define DOWN
Flag for send down.
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
#define kferm2Halo
Dirac lattice and halo.
#define nadj
adjacent spatial indices
#define kvol
Sublattice volume.
#define Complex
Double precision complex number.
#define nf
Fermion flavours (double it)
#define ndirac
Dirac indices.
#define Complex_f
Single precision complex number.
#define kferm2
sublattice size including Dirac indices
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves Implements up/down pa...
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)