int Gauge_force(double *dSdpi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, float beta){
	const char *funcname = "Gauge_force";
	cudaGetDevice(&device);
	Complex_f *Sigma11, *Sigma12, *u11sh, *u12sh;
	cudaMallocAsync((void **)&Sigma11, kvol*sizeof(Complex_f), streams[0]);
	cudaMallocAsync((void **)&Sigma12, kvol*sizeof(Complex_f), streams[1]);
	cudaMallocManaged((void **)&u11sh, (kvol+halo)*sizeof(Complex_f), cudaMemAttachGlobal);
	cudaMallocManaged((void **)&u12sh, (kvol+halo)*sizeof(Complex_f), cudaMemAttachGlobal);
	for(int mu=0; mu<ndim; mu++){
		/* ... Sigma11 and Sigma12 are reset here ... */
		for(int nu=0; nu<ndim; nu++)
			if(nu!=mu){
				//The +nu staple
#ifdef __NVCC__
				cuPlus_staple(mu,nu,iu,Sigma11,Sigma12,u11t,u12t,dimGrid,dimBlock);
#else
#pragma omp parallel for simd aligned(u11t,u12t,Sigma11,Sigma12,iu:AVX)
				for(int i=0;i<kvol;i++){
					int uidm = iu[mu+ndim*i];
					int uidn = iu[nu+ndim*i];
					//a11, a12: top row of U_nu(x+mu) U_mu^dagger(x+nu)
					Complex_f a11 = u11t[uidm*ndim+nu]*conj(u11t[uidn*ndim+mu])
						+u12t[uidm*ndim+nu]*conj(u12t[uidn*ndim+mu]);
					Complex_f a12 = -u11t[uidm*ndim+nu]*u12t[uidn*ndim+mu]
						+u12t[uidm*ndim+nu]*u11t[uidn*ndim+mu];
					//Multiply by U_nu^dagger(x) and accumulate the staple
					Sigma11[i]+=a11*conj(u11t[i*ndim+nu])+a12*conj(u12t[i*ndim+nu]);
					Sigma12[i]+=-a11*u12t[i*ndim+nu]+a12*u11t[i*ndim+nu];
				}
#endif
				//Prefetch the shifted-link buffers to the host
				cudaMemPrefetchAsync(u11sh, kvol*sizeof(Complex_f), cudaCpuDeviceId, streams[0]);
				cudaMemPrefetchAsync(u12sh, kvol*sizeof(Complex_f), cudaCpuDeviceId, streams[1]);
				/* ... u11sh and u12sh are filled (C_gather) and their halos exchanged (CHalo_swap_dir) ... */
				//The -nu staple
#ifdef __NVCC__
				cudaDeviceSynchronise();
				cuMinus_staple(mu,nu,iu,id,Sigma11,Sigma12,u11sh,u12sh,u11t,u12t,dimGrid,dimBlock);
#else
#pragma omp parallel for simd aligned(u11t,u12t,u11sh,u12sh,Sigma11,Sigma12,iu,id:AVX)
				for(int i=0;i<kvol;i++){
					int uidm = iu[mu+ndim*i];
					int didn = id[nu+ndim*i];
					//First two factors of the -nu staple
					Complex_f a11 = conj(u11sh[uidm])*conj(u11t[didn*ndim+mu])
						-u12sh[uidm]*conj(u12t[didn*ndim+mu]);
					Complex_f a12 = -conj(u11sh[uidm])*u12t[didn*ndim+mu]
						-u12sh[uidm]*u11t[didn*ndim+mu];
					//Multiply by U_nu(x-nu) and accumulate
					Sigma11[i]+=a11*u11t[didn*ndim+nu]-a12*conj(u12t[didn*ndim+nu]);
					Sigma12[i]+=a11*u12t[didn*ndim+nu]+a12*conj(u11t[didn*ndim+nu]);
				}
#endif
			}
		//Multiply by the mu link and take the force components
#ifdef __NVCC__
		cuGauge_force(mu,Sigma11,Sigma12,u11t,u12t,dSdpi,beta,dimGrid,dimBlock);
#else
#pragma omp parallel for simd aligned(u11t,u12t,Sigma11,Sigma12,dSdpi:AVX)
		for(int i=0;i<kvol;i++){
			//a11 and a12 are built from u11t, u12t and the accumulated staples
			/* ... */
			dSdpi[(i*nadj)*ndim+mu]=(double)(beta*cimag(a11));
			dSdpi[(i*nadj+1)*ndim+mu]=(double)(beta*creal(a11));
			dSdpi[(i*nadj+2)*ndim+mu]=(double)(beta*cimag(a12));
		}
#endif
	}
#ifdef __NVCC__
	cudaDeviceSynchronise();
	cudaFreeAsync(Sigma11,streams[0]); cudaFreeAsync(Sigma12,streams[1]); cudaFree(u11sh); cudaFree(u12sh);
#else
	free(u11sh); free(u12sh); free(Sigma11); free(Sigma12);
#endif
	/* ... */
}
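In both staple loops above, a11 and a12 are simply the top row of a product of SU(2) matrices stored in the (u11, u12) convention, U = [[u11, u12], [-conj(u12), conj(u11)]]. The following standalone sketch (the struct, helper names and test values are illustrative and not part of the library) spells out the +nu staple product U_nu(x+mu) U_mu^dagger(x+nu) U_nu^dagger(x):

/* Standalone sketch: SU(2) matrices stored by their top row (e11, e12),
 * with the full matrix being [[e11, e12], [-conj(e12), conj(e11)]].
 * su2_mul_dag() reproduces the a11/a12 pattern in the staple loops above. */
#include <complex.h>
#include <stdio.h>

typedef struct { float complex e11, e12; } su2;

/* C = A * B^dagger: same index pattern as the a11/a12 lines above */
static su2 su2_mul_dag(su2 a, su2 b){
	su2 c;
	c.e11 =  a.e11*conjf(b.e11) + a.e12*conjf(b.e12);
	c.e12 = -a.e11*b.e12        + a.e12*b.e11;
	return c;
}

int main(void){
	/* Illustrative unitary test links: U_nu(x+mu), U_mu(x+nu), U_nu(x) */
	su2 Unu_xpmu = {0.6f + 0.8f*I, 0.0f};
	su2 Umu_xpnu = {0.0f + 1.0f*I, 0.0f};
	su2 Unu_x    = {0.8f, 0.6f*I};
	/* The +nu staple: U_nu(x+mu) * U_mu^dagger(x+nu) * U_nu^dagger(x) */
	su2 staple = su2_mul_dag(su2_mul_dag(Unu_xpmu, Umu_xpnu), Unu_x);
	printf("Sigma11 contribution: %f%+fi\n", crealf(staple.e11), cimagf(staple.e11));
	printf("Sigma12 contribution: %f%+fi\n", crealf(staple.e12), cimagf(staple.e12));
	return 0;
}

The -nu staple follows the same pattern with the first two factors daggered, which is where the conj(u11sh[uidm]) terms above come from.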
int Force(double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi,
		Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f,
		unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f,
		int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, Complex_f jqq,
		float akappa, float beta, double *ancg){
	const char *funcname = "Force";
	cudaGetDevice(&device);
	/* ... */
	for(int na = 0; na<nf; na++){
		cudaMallocAsync((void **)&smallPhi, kferm2*sizeof(Complex), streams[0]);
		/* ... smallPhi is filled from Phi via Fill_Small_Phi ... */
		Congradq(na,res1,X1,smallPhi,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,&itercg);
		/* ... */
		cudaFreeAsync(smallPhi,streams[0]);
		//Update X0 for this flavour: X0 <- 2*X1 - X0
#ifdef __NVCC__
		Complex blasa=2.0; double blasb=-1.0;
		cublasZdscal(cublas_handle, kferm2, &blasb, (cuDoubleComplex *)(X0+na*kferm2), 1);
		cublasZaxpy(cublas_handle, kferm2, (cuDoubleComplex *)&blasa, (cuDoubleComplex *)X1, 1,
				(cuDoubleComplex *)(X0+na*kferm2), 1);
		cudaDeviceSynchronise();
#elif (defined __INTEL_MKL__)
		/* ... */
		cblas_zaxpby(kferm2, &blasa, X1, 1, &blasb, X0+na*kferm2, 1);
#elif defined USE_BLAS
		Complex blasa=2.0; double blasb=-1.0;
		/* ... */
#else
#pragma omp parallel for simd collapse(2)
		for(int i=0;i<kvol;i++)
			for(int idirac=0;idirac<ndirac;idirac++){
				/* ... */
			}
#endif
		Hdslash(X2,X1,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,akappa);
		//Rescale X2 (blasd is set in lines omitted here)
#ifdef __NVCC__
		cudaDeviceSynchronise();
		cublasZdscal(cublas_handle, kferm2, &blasd, (cuDoubleComplex *)X2, 1);
#elif defined USE_BLAS
		cblas_zdscal(kferm2, blasd, X2, 1);
#endif
		/* ... */
#ifdef __NVCC__
		//Convert X1 and X2 to single precision and let the CUDA kernel accumulate the force
		cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
		/* ... */
		cuComplex_convert(X2_f,X2,kferm2,true,dimBlock,dimGrid);
		/* ... */
		cuForce(dSdpi,u11t_f,u12t_f,X1_f,X2_f,gamval_f,dk4m_f,dk4p_f,iu,gamin,akappa,dimGrid,dimBlock);
		cudaDeviceSynchronise();
		cudaFreeAsync(X1_f,NULL); cudaFreeAsync(X2_f,NULL);
#else
#pragma omp parallel for
		for(int i=0;i<kvol;i++)
			for(int idirac=0;idirac<ndirac;idirac++){
				int mu, uid, igork1;
				/* ... */
				//Spatial directions
#pragma omp simd aligned(dSdpi,X1,X2,u11t,u12t,iu:AVX)
				for(mu=0; mu<3; mu++){
					igork1 = gamin[mu*ndirac+idirac];
					//The full expressions are elided below: each term contracts conjugated
					//components of X1 (e.g. conj(X1[(uid*ndirac+idirac)*nc+1])) with u11t,
					//u12t and components of X2 at site i and its +mu neighbour uid
					dSdpi[(i*nadj)*ndim+mu]+=akappa*creal(I*( /* ... */ ));
					dSdpi[(i*nadj+1)*ndim+mu]+=akappa*creal( /* ... */ );
					dSdpi[(i*nadj+2)*ndim+mu]+=akappa*creal(I*( /* ... */ ));
					/* ... */
				}
				//Temporal direction (mu=3): the terms carry the hopping factors dk4m[i]
				//and dk4p[i], e.g. dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]+...)
				//and dk4p[i]*(-conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]+...)
				igork1 = gamin[mu*ndirac+idirac];
				/* ... */
			}
#endif
	}
	/* ... */
}
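For reference, the flattened index conventions used throughout the two loops above (dSdpi, the fermion fields X1/X2 and the links u11t/u12t) can be summarised in a small standalone sketch. The numeric values of ndim, nadj, ndirac and nc below are illustrative assumptions, not read from the library's headers:

#include <stdio.h>

/* Illustrative values only; the real values come from the library's size macros */
enum { ndim = 4, nadj = 3, ndirac = 4, nc = 2 };

/* dSdpi[(i*nadj+a)*ndim+mu]: force component a (0..nadj-1) of site i, direction mu */
static int idx_dSdpi(int i, int a, int mu){ return (i*nadj + a)*ndim + mu; }
/* X1[(i*ndirac+idirac)*nc+c], X2[...]: site i, Dirac index idirac, colour c */
static int idx_ferm(int i, int idirac, int c){ return (i*ndirac + idirac)*nc + c; }
/* u11t[i*ndim+mu], u12t[i*ndim+mu]: link leaving site i in direction mu */
static int idx_link(int i, int mu){ return i*ndim + mu; }

int main(void){
	printf("dSdpi   (site 7, component 2, direction 3): %d\n", idx_dSdpi(7, 2, 3));
	printf("fermion (site 7, idirac 1, colour 1):       %d\n", idx_ferm(7, 1, 1));
	printf("link    (site 7, direction 0):              %d\n", idx_link(7, 0));
	return 0;
}

The neighbour tables are addressed the other way around, iu[mu+ndim*i] and id[mu+ndim*i], giving the up and down neighbours of site i in direction mu.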
int Force(double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancg)
Calculates the force at each intermediate time.
int Gauge_force(double *dSdpi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
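Schematically, with the normalisation left to the code, the Wilson gauge action and the force computed by Gauge_force are

S_G = -\frac{\beta}{2}\sum_{x,\,\mu<\nu}\operatorname{Re}\operatorname{Tr}\,U_{\mu\nu}(x),
\qquad
\frac{\partial S_G}{\partial \pi^a_\mu(x)} \;\propto\; \beta\,
\operatorname{Im}\operatorname{Tr}\!\left[\sigma_a\,U_\mu(x)\,\Sigma_\mu(x)\right],

where U_{\mu\nu} is the plaquette, \Sigma_\mu(x) is the staple sum accumulated in Sigma11/Sigma12 above, and \sigma_a are the Pauli matrices.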
Matrix multiplication and related declarations.
int Hdslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
Evaluates Φ = M r in double precision.
#define DOWN
Flag for send down.
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int CHalo_swap_dir(Complex_f *c, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
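The aligned(...:AVX) clauses in the force loops are only valid if the arrays really are allocated with that alignment. A minimal standalone illustration (the AVX value, kvol and the use of aligned_alloc here are illustrative; the library's own allocation path may differ):

#include <complex.h>
#include <stdlib.h>

#define AVX 64            /* 64 for AVX-512, 32 for AVX/AVX2, 16 for SSE */

int main(void){
	const int kvol = 1024;                                    /* illustrative volume */
	float complex *Sigma11 = aligned_alloc(AVX, kvol*sizeof(float complex));
	if(!Sigma11) return 1;
#pragma omp parallel for simd aligned(Sigma11:AVX)
	for(int i=0; i<kvol; i++)
		Sigma11[i] = 0;                                       /* alignment promise holds */
	free(Sigma11);
	return 0;
}

Note that aligned_alloc requires the requested size to be a multiple of the alignment, which holds for the illustrative value above.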
#define kferm2Halo
Dirac lattice and halo.
#define nadj
Adjacent spatial indices.
#define kvol
Sublattice volume.
#define Complex
Double precision complex number.
#define nf
Fermion flavours (double it)
#define ndirac
Dirac indices.
#define halo
Total Halo size.
#define Complex_f
Single precision complex number.
#define kferm2
Sublattice size including Dirac indices.
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves (M†M)x = Φ. Implements up/down pa...
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
Extracts all the single precision gauge links in the μ direction only.
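In Gauge_force above, the shifted link buffers u11sh and u12sh are prepared by a gather followed by a halo exchange (the calls fall in lines omitted from the excerpt). A usage sketch with the prototypes copied from the declarations above; the wrapper, its argument choices and ncpt=1 are illustrative assumptions rather than the library's own calling sequence:

#include <complex.h>

typedef float complex Complex_f;   /* stands in for the library's Complex_f */

/* Prototypes as declared above */
int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu);
int CHalo_swap_dir(Complex_f *c, int ncpt, int idir, int layer);

/* Gather the mu-direction links of ut into ush, then refresh ush's halo along idir.
 * Which neighbour table and which layer (e.g. DOWN) to pass depends on the staple
 * being built, so both are left as parameters here. */
void gather_and_swap(Complex_f *ush, Complex_f *ut, int kvol,
		unsigned int *table, unsigned int mu, int idir, int layer){
	C_gather(ush, ut, kvol, table, mu);
	CHalo_swap_dir(ush, 1, idir, layer);   /* ncpt=1: one gathered link per site (assumed) */
}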