10#include <cuda_runtime.h>
19int Init(
int istart,
int ibound,
int iread,
float beta,
float fmu,
float akappa,
Complex_f ajq,\
21 Complex *gamval,
Complex_f *gamval_f,
int *gamin,
double *dk4m,
double *dk4p,
float *dk4m_f,
float *dk4p_f,\
22 unsigned int *iu,
unsigned int *
id){
61 const char *funcname =
"Init";
75 printf(
"Checked addresses\n");
77 double chem1=exp(fmu);
double chem2 = 1/chem1;
79#pragma omp parallel for simd aligned(dk4m,dk4p:AVX)
80 for(
int i = 0; i<
kvol; i++){
88 printf(
"Implementing antiperiodic boundary conditions on rank %i\n",
rank);
90#pragma omp parallel for simd aligned(dk4m,dk4p:AVX)
105 cuReal_convert(dk4p_f,dk4p,
kvol+
halo,
true,dimBlock,dimGrid);
106 cuReal_convert(dk4m_f,dk4m,
kvol+
halo,
true,dimBlock,dimGrid);
108#pragma omp parallel for simd aligned(dk4m,dk4p,dk4m_f,dk4p_f:AVX)
110 dk4p_f[i]=(float)dk4p[i];
111 dk4m_f[i]=(float)dk4m[i];
114 int __attribute__((aligned(
AVX))) gamin_t[4][4] = {{3,2,1,0},{3,2,1,0},{2,3,0,1},{2,3,0,1}};
120 cudaMemcpy(gamin,gamin_t,4*4*
sizeof(
int),cudaMemcpyDefault);
122 memcpy(gamin,gamin_t,4*4*
sizeof(
int));
124 Complex __attribute__((aligned(
AVX))) gamval_t[5][4] = {{-I,-I,I,I},{-1,1,1,-1},{-I,I,I,-I},{1,1,1,1},{1,1,-1,-1}};
129 cblas_zdscal(5*4, akappa, gamval_t, 1);
131#pragma omp parallel for simd collapse(2) aligned(gamval,gamval_f:AVX)
134 gamval_t[i][j]*=akappa;
138 cudaMemcpy(gamval,gamval_t,5*4*
sizeof(
Complex),cudaMemcpyDefault);
139 cuComplex_convert(gamval_f,gamval,20,
true,dimBlockOne,dimGridOne);
141 memcpy(gamval,gamval_t,5*4*
sizeof(
Complex));
142 for(
int i=0;i<5*4;i++)
146 if(!
rank) printf(
"Calling Par_sread() for configuration: %i\n", iread);
147 Par_sread(iread, beta, fmu, akappa, ajq,u11,u12,u11t,u12t);
155#pragma omp parallel for simd aligned(u11t:AVX)
158 u11t[i]=1; u12t[i]=0;
165 u11t[i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
166 u12t[i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
169#elif (defined __INTEL_MKL__&&!defined USE_RAN2)
171 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*
ndim*
kvol, u11t, -1, 1);
172 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*
ndim*
kvol, u12t, -1, 1);
182 fprintf(stderr,
"Warning %i in %s: Gauge fields are not initialised.\n", NOINIT, funcname);
186 cudaGetDevice(&device);
187 cudaMemPrefetchAsync(u11t,
ndim*
kvol*
sizeof(
Complex),device,streams[0]);
188 cudaMemPrefetchAsync(u12t,
ndim*
kvol*
sizeof(
Complex),device,streams[1]);
194 cudaMemcpyAsync(u11,u11t,
ndim*
kvol*
sizeof(
Complex),cudaMemcpyDefault,streams[0]);
195 cudaMemPrefetchAsync(u11,
ndim*
kvol*
sizeof(
Complex),device,streams[0]);
196 cudaMemcpyAsync(u12,u12t,
ndim*
kvol*
sizeof(
Complex),cudaMemcpyDefault,streams[1]);
197 cudaMemPrefetchAsync(u12,
ndim*
kvol*
sizeof(
Complex),device,streams[1]);
204 printf(
"Initialisation Complete\n");
211 float akappa,
float beta,
double *ancgh,
int traj){
236 const char *funcname =
"Hamilton";
241 cudaGetDevice(&device);
242 cudaMemPrefetchAsync(pp,
kmom*
sizeof(
double),device,NULL);
243 cublasDnrm2(cublas_handle,
kmom, pp, 1,&hp);
245#elif defined USE_BLAS
246 double hp = cblas_dnrm2(
kmom, pp, 1);
250 for(
int i = 0; i<
kmom; i++)
254 double avplaqs, avplaqt;
259 double hf = 0;
int itercg = 0;
262 cudaMallocAsync((
void **)&smallPhi,
kferm2*
sizeof(
Complex),NULL);
267 for(
int na=0;na<
nf;na++){
270 cudaDeviceSynchronise();
272 cudaMemcpyAsync(X1,X0+na*
kferm2,
kferm2*
sizeof(
Complex),cudaMemcpyDeviceToDevice,streams[0]);
274 cudaDeviceSynchronise();
280 if(
Congradq(na,res2,X1,smallPhi,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,&itercg))
281 fprintf(stderr,
"Trajectory %d\n", traj);
285 cudaMemcpyAsync(X0+na*
kferm2,X1,
kferm2*
sizeof(
Complex),cudaMemcpyDeviceToDevice,streams[0]);
292 cublasZdotc(cublas_handle,
kferm2,(cuDoubleComplex *)smallPhi,1,(cuDoubleComplex *) X1,1,(cuDoubleComplex *) &dot);
294#elif defined USE_BLAS
296 cblas_zdotc_sub(
kferm2, smallPhi, 1, X1, 1, &dot);
302 hf+=creal(conj(smallPhi[j])*X1[j]);
306 cudaFreeAsync(smallPhi,NULL);
314 *s=hg+hf; *h=(*s)+hp;
317 printf(
"hg=%.5e; hf=%.5e; hp=%.5e; h=%.5e\n", hg, hf, hp, *h);
323 const char *funcname =
"C_gather";
327 cuC_gather(x,y,n,table,mu,dimBlock,dimGrid);
329#pragma omp parallel for simd aligned (x,y,table:AVX)
330 for(
int i=0; i<n; i++)
337 const char *funcname =
"Z_gather";
341 cuZ_gather(x,y,n,table,mu,dimBlock,dimGrid);
343#pragma omp parallel for simd aligned (x,y,table:AVX)
344 for(
int i=0; i<n; i++)
366 const char *funcname =
"Fill_Small_Phi";
369 cuFill_Small_Phi(na,smallPhi,Phi,dimBlock,dimGrid);
371#pragma omp parallel for simd aligned(smallPhi,Phi:AVX) collapse(3)
372 for(
int i = 0; i<
kvol;i++)
373 for(
int idirac = 0; idirac<
ndirac; idirac++)
374 for(
int ic= 0; ic<
nc; ic++)
382 cuUpDownPart(na,X0,R1,dimBlock,dimGrid);
383 cudaDeviceSynchronise();
386#pragma omp parallel for simd collapse(2) aligned(X0,R1:AVX)
387 for(
int i=0; i<
kvol; i++)
388 for(
int idirac = 0; idirac <
ndirac; idirac++){
Header for routines related to lattice sites.
int Check_addr(unsigned int *table, int lns, int lnt, int imin, int imax)
int Addrc(unsigned int *iu, unsigned int *id)
Loads the addresses required during the update.
Matrix multiplication and related declarations.
#define UP
Flag for send up.
int Par_sread(const int iread, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t)
Reads and assigns the gauges from file.
int DHalo_swap_dir(double *d, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
int * pcoord
The processor grid.
Header for random number configuration.
int Par_ranset(long *seed, int iread)
Uses the rank to get a new seed. Copying from the FORTRAN description here c create new seeds in rang...
double ran2(long *idum)
Generates uniformly distributed random double between zero and one as described in numerical recipes....
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
#define kmom
sublattice momentum sites
#define ngorkov
Gor'kov indices.
#define ksizet
Sublattice t extent.
#define kvol
Sublattice volume.
#define Complex
Double precision complex number.
#define nthreads
Number of threads for OpenMP, which can be overwritten at runtime.
#define ksize
Sublattice spatial extent for a cubic lattice.
#define nf
Fermion flavours (double it)
#define ndirac
Dirac indices.
#define kvol3
Sublattice spatial volume.
#define halo
Total Halo size.
#define Complex_f
Single precision complex number.
#define kferm2
sublattice size including Dirac indices
#define npt
Processor grid t extent.
int Init(int istart, int ibound, int iread, float beta, float fmu, float akappa, Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, unsigned int *iu, unsigned int *id)
Initialises the system.
int Z_gather(Complex *x, Complex *y, int n, unsigned int *table, unsigned int mu)
Extracts all the double precision gauge links in the direction only.
int Hamilton(double *h, double *s, double res2, double *pp, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancgh, int traj)
Calculate the Hamiltonian.
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
Extracts all the single precision gauge links in the direction only.
Function declarations for most of the routines.
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves Implements up/down pa...
int Average_Plaquette(double *hg, double *avplaqs, double *avplaqt, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, float beta)
Calculates the gauge action using new (how new?) lookup table.
int Reunitarise(Complex *u11t, Complex *u12t)
Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.