Calculates the force \(\frac{dS}{d\pi}\) at each intermediate time.
134 {
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161 const char *funcname = "Force";
162#ifdef __NVCC__
163 int device=-1;
164 cudaGetDevice(&device);
165#endif
166#ifndef NO_GAUGE
168#endif
169
170 int itercg=1;
171#ifdef __NVCC__
174#else
176#endif
177 for(
int na = 0; na<
nf; na++){
178#ifdef __NVCC__
180#else
182#endif
183 if(!iflag){
184#ifdef __NVCC__
186 cudaMallocAsync((
void **)&smallPhi,
kferm2*
sizeof(
Complex),streams[0]);
187#else
189#endif
191
192 Congradq(na,res1,X1,smallPhi,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,&itercg);
193#ifdef __NVCC__
194 cudaFreeAsync(smallPhi,streams[0]);
195#else
196 free(smallPhi);
197#endif
198 *ancg+=itercg;
199#ifdef __NVCC__
200 Complex blasa=2.0;
double blasb=-1.0;
201 cublasZdscal(cublas_handle,
kferm2,&blasb,(cuDoubleComplex *)(X0+na*
kferm2),1);
202 cublasZaxpy(cublas_handle,
kferm2,(cuDoubleComplex *)&blasa,(cuDoubleComplex *)X1,1,(cuDoubleComplex *)(X0+na*
kferm2),1);
203
204 cudaDeviceSynchronise();
205#elif (defined __INTEL_MKL__)
207
208
209 cblas_zaxpby(
kferm2, &blasa, X1, 1, &blasb, X0+na*
kferm2, 1);
210#elif defined USE_BLAS
211 Complex blasa=2.0;
double blasb=-1.0;
214#else
215#pragma omp parallel for simd collapse(2)
216 for(
int i=0;i<
kvol;i++)
217 for(
int idirac=0;idirac<
ndirac;idirac++){
222 }
223#endif
224 }
225 Hdslash(X2,X1,u11t,u12t,iu,
id,gamval,gamin,dk4m,dk4p,akappa);
226#ifdef __NVCC__
227 double blasd=2.0;
228 cudaDeviceSynchronise();
229 cublasZdscal(cublas_handle,
kferm2, &blasd, (cuDoubleComplex *)X2, 1);
230#elif defined USE_BLAS
231 double blasd=2.0;
232 cblas_zdscal(
kferm2, blasd, X2, 1);
233#else
234#pragma unroll
236 X2[i]*=2;
237#endif
238#if(npx>1)
241#endif
242#if(npy>1)
245#endif
246#if(npz>1)
249#endif
250#if(npt>1)
253#endif
254
255
256
257
258
259
260
261
262
263#ifdef __NVCC__
266 cuComplex_convert(X1_f,X1,
kferm2,
true,dimBlock,dimGrid);
268
270 cuComplex_convert(X2_f,X2,
kferm2,
true,dimBlock,dimGrid);
272
273 cuForce(dSdpi,u11t_f,u12t_f,X1_f,X2_f,gamval_f,dk4m_f,dk4p_f,iu,gamin,akappa,dimGrid,dimBlock);
274 cudaDeviceSynchronise();
275 cudaFreeAsync(X1_f,NULL); cudaFreeAsync(X2_f,NULL);
276#else
277#pragma omp parallel for
278 for(
int i=0;i<
kvol;i++)
279 for(
int idirac=0;idirac<
ndirac;idirac++){
280 int mu, uid, igork1;
281#ifndef NO_SPACE
282#pragma omp simd aligned(dSdpi,X1,X2,u11t,u12t,iu:AVX)
283 for(mu=0; mu<3; mu++){
284
285
286
287
288
289
290
291
293 igork1 = gamin[mu*
ndirac+idirac];
294
295
296
297 dSdpi[(i*
nadj)*
ndim+mu]+=akappa*creal(I*
307 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
320 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
323
324 dSdpi[(i*
nadj+1)*
ndim+mu]+=akappa*creal(
334 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
347 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
350
351 dSdpi[(i*
nadj+2)*
ndim+mu]+=akappa*creal(I*
361 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
374 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
377
378 }
379#endif
380
381
382 mu=3;
384 igork1 = gamin[mu*
ndirac+idirac];
385#ifndef NO_TIME
388 (dk4m[i]*(-conj(u12t[i*
ndim+mu])*X2[(uid*
ndirac+idirac)*
nc]
396 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
399 +creal(I*
401 (dk4m[i]*(-conj(u12t[i*
ndim+mu])*X2[(uid*
ndirac+igork1)*
nc]
409 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
412
415 (dk4m[i]*(-conj(u12t[i*
ndim+mu])*X2[(uid*
ndirac+idirac)*
nc]
423 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
426 +creal(
428 (dk4m[i]*(-conj(u12t[i*
ndim+mu])*X2[(uid*
ndirac+igork1)*
nc]
436 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
439
445 (dk4p[i]*(-conj(u11t[i*
ndim+mu])*X2[(i*
ndirac+idirac)*
nc]
448 (dk4m[i]* (conj(u12t[i*
ndim+mu])*X2[(uid*
ndirac+idirac)*
nc]
450 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
451 (dk4p[i]*(-conj(u12t[i*
ndim+mu])*X2[(i*
ndirac+idirac)*
nc]
453 +creal(I*
458 (-dk4p[i]*(-conj(u11t[i*
ndim+mu])*X2[(i*
ndirac+igork1)*
nc]
461 (dk4m[i]* (conj(u12t[i*
ndim+mu])*X2[(uid*
ndirac+igork1)*
nc]
463 +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
464 (-dk4p[i]*(-conj(u12t[i*
ndim+mu])*X2[(i*
ndirac+igork1)*
nc]
466
467#endif
468 }
469#endif
470 }
471#ifdef __NVCC__
472 cudaFree(X2);
473#else
474 free(X2);
475#endif
476 return 0;
477}
int Gauge_force(double *dSdpi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
int Hdslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
Evaluates in double precision.
#define DOWN
Flag for send down.
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
#define kferm2Halo
Dirac lattice and halo.
#define nadj
adjacent spatial indices
#define kvol
Sublattice volume.
#define Complex
Double precision complex number.
#define nf
Fermion flavours (double it)
#define ndirac
Dirac indices.
#define Complex_f
Single precision complex number.
#define kferm2
sublattice size including Dirac indices
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves Implements up/down pa...
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)