27   const char *funcname = "Gauge_force";
36   cuGauge_force(ut,dSdpi,beta,iu,id,dimGrid,dimBlock);
44   for(int mu=0; mu<ndim; mu++){
47      for(int nu=0; nu<ndim; nu++)
50   #pragma omp parallel for simd
51         for(int i=0;i<kvol;i++){
52            int uidm = iu[mu+ndim*i];
53            int uidn = iu[nu+ndim*i];
55            ut[1][uidm*ndim+nu]*conj(ut[1][uidn*ndim+mu]);
57            ut[1][uidm*ndim+nu]*ut[0][uidn*ndim+mu];
58            Sigma[0][i]+=a11*conj(ut[0][i*ndim+nu])+a12*conj(ut[1][i*ndim+nu]);
59            Sigma[1][i]+=-a11*ut[1][i*ndim+nu]+a12*ut[0][i*ndim+nu];
67   #pragma omp parallel for simd
68         for(int i=0;i<kvol;i++){
69            int uidm = iu[mu+ndim*i];
70            int didn = id[nu+ndim*i];
73            ush[1][uidm]*conj(ut[1][didn*ndim+mu]);
75            ush[1][uidm]*ut[0][didn*ndim+mu];
76            Sigma[0][i]+=a11*ut[0][didn*ndim+nu]-a12*conj(ut[1][didn*ndim+nu]);
77            Sigma[1][i]+=a11*ut[1][didn*ndim+nu]+a12*conj(ut[0][didn*ndim+nu]);
80   #pragma omp parallel for simd
81      for(int i=0;i<kvol;i++){
85         dSdpi[(i*nadj)*ndim+mu]=(double)(beta*cimag(a11));
86         dSdpi[(i*nadj+1)*ndim+mu]=(double)(beta*creal(a11));
87         dSdpi[(i*nadj+2)*ndim+mu]=(double)(beta*cimag(a12));
90   free(ush[0]); free(ush[1]); free(Sigma[0]); free(Sigma[1]);
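The three components written at source lines 85-87 reflect the SU(2) storage convention used throughout: each link is held as two complex numbers (the ut[0] and ut[1] arrays), with the second row of the matrix fixed by unitarity, and the force is read off from the accumulated link-staple product (a11, a12). A minimal sketch of that read-off (the function name is illustrative, not part of the code):

#include <complex.h>

/* Sketch: given the link*staple pair (a11,a12) for one link, read off the
 * three real adjoint force components exactly as at source lines 85-87
 * above, scaled by the gauge coupling beta.                              */
static void force_components(double complex a11, double complex a12,
		double beta, double f[3]){
	f[0] = beta*cimag(a11);
	f[1] = beta*creal(a11);
	f[2] = beta*cimag(a12);
}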
96      int *gamin, double *dk[2], float *dk_f[2], Complex_f jqq, float akappa, float beta, double *ancg){
123  const char *funcname = "Force";
126  cudaGetDevice(&device);
139  cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
143  for(int na = 0; na<nf; na++){
152     cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),streams[0]);
158     Congradq(na,res1,X1,smallPhi,ut_f,iu,id,gamval_f,gamin,dk_f,jqq,akappa,&itercg);
160     cudaFreeAsync(smallPhi,streams[0]);
166     Complex blasa=2.0; double blasb=-1.0;
167     cublasZdscal(cublas_handle,kferm2,&blasb,(cuDoubleComplex *)(X0+na*kferm2),1);
168     cublasZaxpy(cublas_handle,kferm2,(cuDoubleComplex *)&blasa,(cuDoubleComplex *)X1,1,(cuDoubleComplex *)(X0+na*kferm2),1);
169     cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
171     cudaDeviceSynchronise();
172  #elif (defined __INTEL_MKL__)
176     cblas_zaxpby(kferm2, &blasa, X1, 1, &blasb, X0+na*kferm2, 1);
177  #elif defined USE_BLAS
178     Complex blasa=2.0; double blasb=-1.0;
182  #pragma omp parallel for simd collapse(2)
183     for(int i=0;i<kvol;i++)
184        for(int idirac=0;idirac<ndirac;idirac++){
193  Hdslash_f(X2_f,X1_f,ut_f,iu,id,gamval_f,gamin,dk_f,akappa);
195  Hdslash(X2,X1,ut,iu,id,gamval,gamin,dk,akappa);
199  cudaDeviceSynchronise();
200  cublasCsscal(cublas_handle,kferm2, &blasd, (cuComplex *)X2_f, 1);
201  #elif defined USE_BLAS
203  cblas_zdscal(kferm2, blasd, X2, 1);
235  cuForce(dSdpi,ut_f,X1_f,X2_f,gamval_f,dk_f,iu,gamin,akappa,dimGrid,dimBlock);
236  cudaDeviceSynchronise();
238  #pragma omp parallel for
239  for(int i=0;i<kvol;i++)
240     for(int idirac=0;idirac<ndirac;idirac++){
244        for(mu=0; mu<3; mu++){
254           igork1 = gamin[mu*ndirac+idirac];
258           dSdpi[(i*nadj)*ndim+mu]+=akappa*creal(I*
268              +conj(X1[(uid*ndirac+idirac)*nc+1])*
281              +conj(X1[(uid*ndirac+idirac)*nc+1])*
285           dSdpi[(i*nadj+1)*ndim+mu]+=akappa*creal(
295              +conj(X1[(uid*ndirac+idirac)*nc+1])*
308              +conj(X1[(uid*ndirac+idirac)*nc+1])*
312           dSdpi[(i*nadj+2)*ndim+mu]+=akappa*creal(I*
322              +conj(X1[(uid*ndirac+idirac)*nc+1])*
335              +conj(X1[(uid*ndirac+idirac)*nc+1])*
345           igork1 = gamin[mu*ndirac+idirac];
349              (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
355              (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
357              +conj(X1[(uid*ndirac+idirac)*nc+1])*
362              (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
368              (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
370              +conj(X1[(uid*ndirac+idirac)*nc+1])*
371              (-dk[1][i]* (-ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
376              (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
382              (dk[0][i]* (-ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
384              +conj(X1[(uid*ndirac+idirac)*nc+1])*
389              (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
392              (-dk[1][i]* (-ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
395              (dk[0][i]* (-ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
397              +conj(X1[(uid*ndirac+idirac)*nc+1])*
403              (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
406              (dk[1][i]*(-conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
409              (dk[0][i]* (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
411              +conj(X1[(uid*ndirac+idirac)*nc+1])*
412              (dk[1][i]*(-conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
416              (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
419              (-dk[1][i]*(-conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
422              (dk[0][i]* (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
424              +conj(X1[(uid*ndirac+idirac)*nc+1])*
425              (-dk[1][i]*(-conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
433  cudaFreeAsync(X1_f,streams[0]); cudaFreeAsync(X2_f,streams[1]);
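Source lines 166-178 above carry out the same update, X0 <- 2*X1 - X0, through the available BLAS backends (cuBLAS scal+axpy, MKL's cblas_zaxpby, plain BLAS). A plain-C equivalent, written out here only to make the arithmetic explicit rather than taken from the file:

#include <complex.h>

/* x0 <- 2*x1 - x0 over n elements (n = kferm2 per flavour in Force()). */
static void x0_update(double complex *x0, const double complex *x1, int n){
	for(int i=0;i<n;i++)
		x0[i] = 2.0*x1[i] - x0[i];
}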
int Force(double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi, Complex *ut[2], Complex_f *ut_f[2], unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2], float *dk_f[2], Complex_f jqq, float akappa, float beta, double *ancg)
Calculates the force at each intermediate time.
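For context, the dSdpi array filled by Force() is normally consumed by a momentum update in the HMC integrator. A minimal sketch of such a call site (the momentum array pi and step size dt are assumptions for illustration, not taken from this file):

/* Hypothetical leapfrog momentum step: pi <- pi - dt*dS/dpi.
 * dSdpi follows the kvol*nadj*ndim layout seen in the listings above. */
Force(dSdpi, iflag, res1, X0, X1, Phi, ut, ut_f, iu, id,
      gamval, gamval_f, gamin, dk, dk_f, jqq, akappa, beta, &ancg);
for(int i=0; i<kvol*nadj*ndim; i++)
	pi[i] -= dt*dSdpi[i];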
int Gauge_force(double *dSdpi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
Matrix multiplication and related declarations.
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates phi = M r in single precision.
int Hdslash(Complex *phi, Complex *r, Complex *u11t[2], unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk[2], float akappa)
Evaluates phi = M r in double precision.
#define DOWN
Flag for send down.
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int CHalo_swap_dir(Complex_f *c, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
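Both halo-swap routines follow the usual MPI exchange pattern: pack the boundary layer along axis idir, trade it with the neighbouring rank, and unpack it into the halo region. A bare-bones sketch of that pattern (the buffers and the rank_up/rank_down neighbours are placeholders, not this code's internals):

#include <mpi.h>
#include <complex.h>

/* Illustrative halo exchange along one axis: send our boundary layer to
 * one neighbour while receiving the opposite neighbour's layer.        */
static void halo_swap_sketch(double complex *sendbuf, double complex *recvbuf,
		int count, int rank_up, int rank_down){
	MPI_Sendrecv(sendbuf, count, MPI_C_DOUBLE_COMPLEX, rank_up,   0,
	             recvbuf, count, MPI_C_DOUBLE_COMPLEX, rank_down, 0,
	             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}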
#define AVX
Alignment of arrays: 64 for AVX-512, 32 for AVX/AVX2, 16 for SSE. Since AVX is standard on modern x86...
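The macro's value is what gets handed to the aligned allocator; a quick sketch using C11 aligned_alloc (assuming AVX expands to the alignment in bytes, and rounding the size up to a multiple of it as C11 requires):

#include <stdlib.h>

/* Sketch: allocate one direction-major gauge array on an AVX boundary. */
static Complex *alloc_gauge(void){
	size_t bytes = kvol*ndim*sizeof(Complex);
	bytes += (AVX - bytes%AVX)%AVX;
	return aligned_alloc(AVX, bytes);
}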
#define kferm2Halo
Dirac lattice and halo.
#define nadj
Adjoint (SU(2) generator) indices.
#define kvol
Sublattice volume.
#define Complex
Double precision complex number.
#define nf
Fermion flavours (double it)
#define ndirac
Dirac indices.
#define halo
Total Halo size.
#define Complex_f
Single precision complex number.
#define kferm2
Sublattice size including Dirac indices.
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves (M†M)x = Φ. Implements up/down pa...
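Algorithmically this is a standard conjugate-gradient solve of the normal equation for the partitioned fermion matrix. A generic, library-free CG sketch for a Hermitian positive-definite operator (the apply callback, tolerance handling and interface are placeholders, not Congradq's real signature):

#include <complex.h>
#include <stdlib.h>

/* Generic CG sketch for A = M^dagger M supplied as a callback. */
typedef void (*mdm_op)(double complex *out, const double complex *in, int n);

static int cg_sketch(mdm_op apply, double complex *x, const double complex *b,
		int n, double res, int maxit){
	double complex *r = malloc(n*sizeof *r), *p = malloc(n*sizeof *p),
	               *Ap = malloc(n*sizeof *Ap);
	double rr = 0.0;
	for(int i=0;i<n;i++){ x[i]=0.0; r[i]=p[i]=b[i]; rr += creal(conj(r[i])*r[i]); }
	const double target = res*res*rr;   /* relative tolerance, as a guess */
	int k;
	for(k=0; k<maxit && rr>target; k++){
		apply(Ap, p, n);
		double pAp = 0.0;
		for(int i=0;i<n;i++) pAp += creal(conj(p[i])*Ap[i]);
		double alpha = rr/pAp, rr_new = 0.0;
		for(int i=0;i<n;i++){
			x[i] += alpha*p[i];
			r[i] -= alpha*Ap[i];
			rr_new += creal(conj(r[i])*r[i]);
		}
		for(int i=0;i<n;i++) p[i] = r[i] + (rr_new/rr)*p[i];
		rr = rr_new;
	}
	free(r); free(p); free(Ap);
	return k;   /* iteration count, cf. itercg */
}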
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
Extracts all the single precision gauge links in the mu direction only.
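A gather of this kind is just an indexed copy through the lookup table; sketched below (the exact indexing is an assumption about the ndim-major layout, not copied from the real C_gather):

/* Sketch only: pull the mu-direction link of each site's neighbour,
 * as given by table, into the contiguous buffer x of length n.      */
static int C_gather_sketch(Complex_f *x, Complex_f *y, int n,
		unsigned int *table, unsigned int mu){
	for(int i=0;i<n;i++)
		x[i] = y[table[i*ndim+mu]*ndim+mu];
	return 0;
}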