Calculates the force \(\frac{dS}{d\pi}\) at each intermediate time. 
   96                                                                                                  {
   97   
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123   const char *funcname = "Force";
  124#ifdef __NVCC__
  125   int device=-1;
  126   cudaGetDevice(&device);
  127#endif
  128#ifndef NO_GAUGE
  130#endif
  131   if(!akappa)
  132      return 0;
  133   
  134   int itercg=1;
  135#ifdef __NVCC__
  139   cuComplex_convert(X1_f,X1,
kferm2,
true,dimBlock,dimGrid);
 
  140#else
  142#endif
  143   for(
int na = 0; na<
nf; na++){
 
  144#ifdef __NVCC__
  146#else
  148#endif
  149      if(!iflag){
  150#ifdef __NVCC__
  152         cudaMallocAsync((
void **)&smallPhi,
kferm2*
sizeof(
Complex),streams[0]);
 
  153#else
  155#endif
  157         
  158         Congradq(na,res1,X1,smallPhi,ut_f,iu,
id,gamval_f,gamin,dk_f,jqq,akappa,&itercg);
 
  159#ifdef __NVCC__
  160         cudaFreeAsync(smallPhi,streams[0]);
  161#else
  162         free(smallPhi);
  163#endif
  164         *ancg+=itercg;
  165#ifdef __NVCC__
  166         Complex blasa=2.0; 
double blasb=-1.0;
 
  167         cublasZdscal(cublas_handle,
kferm2,&blasb,(cuDoubleComplex *)(X0+na*
kferm2),1);
 
  168         cublasZaxpy(cublas_handle,
kferm2,(cuDoubleComplex *)&blasa,(cuDoubleComplex *)X1,1,(cuDoubleComplex *)(X0+na*
kferm2),1);
 
  169         cuComplex_convert(X1_f,X1,
kferm2,
true,dimBlock,dimGrid);
 
  170         
  171         cudaDeviceSynchronise();
  172#elif (defined __INTEL_MKL__)
  174         
  175         
  176         cblas_zaxpby(
kferm2, &blasa, X1, 1, &blasb, X0+na*
kferm2, 1); 
 
  177#elif defined USE_BLAS
  178         Complex blasa=2.0; 
double blasb=-1.0;
 
  181#else
  182#pragma omp parallel for simd collapse(2)
  183         for(
int i=0;i<
kvol;i++)
 
  184            for(
int idirac=0;idirac<
ndirac;idirac++){
 
  189            }
  190#endif
  191      }
  192      #ifdef __NVCC__
  193      Hdslash_f(X2_f,X1_f,ut_f,iu,
id,gamval_f,gamin,dk_f,akappa);
 
  194      #else
  195      Hdslash(X2,X1,ut,iu,
id,gamval,gamin,dk,akappa);
 
  196      #endif
  197#ifdef __NVCC__
  198      float blasd=2.0;
  199      cudaDeviceSynchronise();
  200      cublasCsscal(cublas_handle,
kferm2, &blasd, (cuComplex *)X2_f, 1);
 
  201#elif defined USE_BLAS
  202      double blasd=2.0;
  203      cblas_zdscal(
kferm2, blasd, X2, 1);
 
  204#else
  205#pragma unroll
  207         X2[i]*=2;
  208#endif
  209#if(npx>1)
  212#endif
  213#if(npy>1)
  216#endif
  217#if(npz>1)
  220#endif
  221#if(npt>1)
  224#endif
  225 
  226      
  227      
  228      
  229      
  230      
  231      
  232      
  233      
  234#ifdef __NVCC__
  235      cuForce(dSdpi,ut_f,X1_f,X2_f,gamval_f,dk_f,iu,gamin,akappa,dimGrid,dimBlock);
  236      cudaDeviceSynchronise();
  237#else
  238#pragma omp parallel for
  239      for(
int i=0;i<
kvol;i++)
 
  240         for(
int idirac=0;idirac<
ndirac;idirac++){
 
  241            int mu, uid, igork1;
  242#ifndef NO_SPACE
  243#pragma omp simd 
  244            for(mu=0; mu<3; mu++){
  245               
  246               
  247               
  248               
  249               
  250               
  251 
  252               
  254               igork1 = gamin[mu*
ndirac+idirac];   
 
  255 
  256               
  257               
  258               dSdpi[(i*
nadj)*
ndim+mu]+=akappa*creal(I*
 
  268                      +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
 
  281                      +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
 
  284 
  285               dSdpi[(i*
nadj+1)*
ndim+mu]+=akappa*creal(
 
  295                      +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
 
  308                      +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
 
  311 
  312               dSdpi[(i*
nadj+2)*
ndim+mu]+=akappa*creal(I*
 
  322                      +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
 
  335                      +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
 
  338 
  339            }
  340#endif
  341            
  342            
  343            mu=3;
  345            igork1 = gamin[mu*
ndirac+idirac];   
 
  346#ifndef NO_TIME
  349                   (dk[0][i]*(-conj(ut[1][i*
ndim+mu])*X2[(uid*
ndirac+idirac)*
nc]
 
  355                   (dk[0][i]*       (ut[0][i*
ndim+mu] *X2[(uid*
ndirac+idirac)*
nc]
 
  357                   +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
 
  360               +creal(I*
  362                      (dk[0][i]*(-conj(ut[1][i*
ndim+mu])*X2[(uid*
ndirac+igork1)*
nc]
 
  368                      (dk[0][i]*       (ut[0][i*
ndim+mu] *X2[(uid*
ndirac+igork1)*
nc]
 
  370                      +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
 
  371                      (-dk[1][i]*      (-ut[0][i*
ndim+mu] *X2[(i*
ndirac+igork1)*
nc]
 
  373 
  376                  (dk[0][i]*(-conj(ut[1][i*
ndim+mu])*X2[(uid*
ndirac+idirac)*
nc]
 
  382                  (dk[0][i]*      (-ut[0][i*
ndim+mu] *X2[(uid*
ndirac+idirac)*
nc]
 
  384                  +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
 
  387               +creal(
  389                      (dk[0][i]*(-conj(ut[1][i*
ndim+mu])*X2[(uid*
ndirac+igork1)*
nc]
 
  392                      (-dk[1][i]*      (-ut[1][i*
ndim+mu] *X2[(i*
ndirac+igork1)*
nc]
 
  395                      (dk[0][i]*      (-ut[0][i*
ndim+mu] *X2[(uid*
ndirac+igork1)*
nc]
 
  397                      +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
 
  400 
  403                   (dk[0][i]*       (ut[0][i*
ndim+mu] *X2[(uid*
ndirac+idirac)*
nc]
 
  406                   (dk[1][i]*(-conj(ut[0][i*
ndim+mu])*X2[(i*
ndirac+idirac)*
nc]
 
  409                   (dk[0][i]* (conj(ut[1][i*
ndim+mu])*X2[(uid*
ndirac+idirac)*
nc]
 
  411                   +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
 
  412                   (dk[1][i]*(-conj(ut[1][i*
ndim+mu])*X2[(i*
ndirac+idirac)*
nc]
 
  414               +creal(I*
  416                      (dk[0][i]*       (ut[0][i*
ndim+mu] *X2[(uid*
ndirac+igork1)*
nc]
 
  419                      (-dk[1][i]*(-conj(ut[0][i*
ndim+mu])*X2[(i*
ndirac+igork1)*
nc]
 
  422                      (dk[0][i]* (conj(ut[1][i*
ndim+mu])*X2[(uid*
ndirac+igork1)*
nc]
 
  424                      +conj(X1[(uid*
ndirac+idirac)*
nc+1])*
 
  425                      (-dk[1][i]*(-conj(ut[1][i*
ndim+mu])*X2[(i*
ndirac+igork1)*
nc]
 
  427 
  428#endif
  429         }
  430#endif
  431   }
  432#ifdef __NVCC__
  433   cudaFreeAsync(X1_f,streams[0]); cudaFreeAsync(X2_f,streams[1]);
  434#else
  435   free(X2); 
  436#endif
  437   return 0;
  438}
int Gauge_force(double *dSdpi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates  in single precision.
int Hdslash(Complex *phi, Complex *r, Complex *u11t[2], unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk[2], float akappa)
Evaluates  in double precision.
#define DOWN
Flag for send down.
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
#define kferm2Halo
Dirac lattice and halo.
#define nadj
adjacent spatial indices
#define kvol
Sublattice volume.
#define Complex
Double precision complex number.
#define nf
Fermion flavours (double it)
#define ndirac
Dirac indices.
#define Complex_f
Single precision complex number.
#define kferm2
sublattice size including Dirac indices
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves  Implements up/down pa...
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)