19int Init(
int istart,
int ibound,
int iread,
float beta,
float fmu,
float akappa,
Complex_f ajq,\
24 int *gamin,
double *dk[2],
float *dk_f[2],
unsigned int *iu,
unsigned int *
id){
63 const char *funcname =
"Init";
77 printf(
"Checked addresses\n");
79 double chem1=exp(fmu);
double chem2 = 1/chem1;
81#pragma omp parallel for simd
82 for(
int i = 0; i<
kvol; i++){
83 dk[1][i]=akappa*chem1;
84 dk[0][i]=akappa*chem2;
90 printf(
"Implementing antiperiodic boundary conditions on rank %i\n",
rank);
92#pragma omp parallel for simd
107 cuReal_convert(dk_f[1],dk[1],
kvol+
halo,
true,dimBlock,dimGrid);
108 cuReal_convert(dk_f[0],dk[0],
kvol+
halo,
true,dimBlock,dimGrid);
110#pragma omp parallel for simd
112 dk_f[1][i]=(float)dk[1][i];
113 dk_f[0][i]=(float)dk[0][i];
117 int __attribute__((aligned(
AVX))) gamin_t[4][4] = {{3,2,1,0},{3,2,1,0},{2,3,0,1},{2,3,0,1}};
122 cudaMemcpy(gamin,gamin_t,4*4*
sizeof(
int),cudaMemcpyDefault);
124 memcpy(gamin,gamin_t,4*4*
sizeof(
int));
127 Complex __attribute__((aligned(
AVX))) gamval_t[5][4] = {{-I,-I,I,I},{-1,1,1,-1},{-I,I,I,-I},{1,1,1,1},{1,1,-1,-1}};
132 cblas_zdscal(5*4, akappa, gamval_t, 1);
134#pragma omp parallel for simd collapse(2) aligned(gamval,gamval_f:AVX)
137 gamval_t[i][j]*=akappa;
142 cudaMemcpy(gamval,gamval_t,5*4*
sizeof(
Complex),cudaMemcpyDefault);
143 cuComplex_convert(gamval_f,gamval,20,
true,dimBlockOne,dimGridOne);
145 memcpy(gamval,gamval_t,5*4*
sizeof(
Complex));
146 for(
int i=0;i<5*4;i++)
151 int __attribute__((aligned(
AVX))) sigin_t[7][4] = {{0,1,2,3},{0,1,2,3},{1,0,3,2},{1,0,3,2},{1,0,3,2},{1,0,3,2},{0,1,2,3}};
162 Complex __attribute__((aligned(
AVX))) sigval_t[7][4] = {{0,0,0,0},{-1,1,-1,1},{-I,I,-I,I},{1,1,-1,-1},{-1,-1,-1,-1},{-I,I,I,-I},{1,-1,-1,1}};
164 cblas_zdscal(6*4, akappa, sigval_t+4*
sizeof(
Complex), 1);
166#pragma omp parallel for simd collapse(2) aligned(sigval,sigval_f:AVX)
169 sigval_t[i][j]*=akappa;
173 cudaMemcpy(sigval,sigval_t,7*4*
sizeof(
Complex),cudaMemcpyDefault);
174 cuComplex_convert(sigval_f,sigval,28,
true,dimBlockOne,dimGridOne);
176 memcpy(sigval,sigval_t,7*4*
sizeof(
Complex));
177 for(
int i=0;i<7*4;i++)
183 if(!
rank) printf(
"Calling Par_sread() for configuration: %i\n", iread);
184 Par_sread(iread, beta, fmu, akappa, ajq,u[0],u[1],ut[0],ut[1]);
192#pragma omp parallel for simd
195 ut[0][i]=1; ut[1][i]=0;
202 ut[0][i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
203 ut[1][i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
206#elif (defined __INTEL_MKL__&&!defined USE_RAN2)
208 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*
ndim*
kvol, ut[0], -1, 1);
209 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*
ndim*
kvol, ut[1], -1, 1);
219 fprintf(stderr,
"Warning %i in %s: Gauge fields are not initialised.\n", NOINIT, funcname);
223 cudaGetDevice(&device);
224 cudaMemPrefetchAsync(ut[0],
ndim*
kvol*
sizeof(
Complex),device,streams[0]);
225 cudaMemPrefetchAsync(ut[1],
ndim*
kvol*
sizeof(
Complex),device,streams[1]);
231 cudaMemcpyAsync(u[0],ut[0],
ndim*
kvol*
sizeof(
Complex),cudaMemcpyDefault,streams[0]);
232 cudaMemPrefetchAsync(u[0],
ndim*
kvol*
sizeof(
Complex),device,streams[0]);
233 cudaMemcpyAsync(u[1],ut[1],
ndim*
kvol*
sizeof(
Complex),cudaMemcpyDefault,streams[1]);
234 cudaMemPrefetchAsync(u[1],
ndim*
kvol*
sizeof(
Complex),device,streams[1]);
241 printf(
"Initialisation Complete\n");
247 float *dk[2],
Complex_f jqq,
float akappa,
float beta,
double *ancgh,
int traj){
272 const char *funcname =
"Hamilton";
277 cudaGetDevice(&device);
278 cudaMemPrefetchAsync(pp,
kmom*
sizeof(
double),device,NULL);
279 cublasDnrm2(cublas_handle,
kmom, pp, 1,&hp);
281#elif defined USE_BLAS
282 double hp = cblas_dnrm2(
kmom, pp, 1);
286 for(
int i = 0; i<
kmom; i++)
290 double avplaqs, avplaqt;
295 double hf = 0;
int itercg = 0;
298 cudaMallocAsync((
void **)&smallPhi,
kferm2*
sizeof(
Complex),NULL);
303 for(
int na=0;na<
nf;na++){
306 cudaDeviceSynchronise();
308 cudaMemcpyAsync(X1,X0+na*
kferm2,
kferm2*
sizeof(
Complex),cudaMemcpyDeviceToDevice,streams[0]);
310 cudaDeviceSynchronise();
316 if(
Congradq(na,res2,X1,smallPhi,ut,iu,
id,gamval_f,gamin,dk,jqq,akappa,&itercg))
317 fprintf(stderr,
"Trajectory %d\n", traj);
321 cudaMemcpyAsync(X0+na*
kferm2,X1,
kferm2*
sizeof(
Complex),cudaMemcpyDeviceToDevice,streams[0]);
328 cublasZdotc(cublas_handle,
kferm2,(cuDoubleComplex *)smallPhi,1,(cuDoubleComplex *) X1,1,(cuDoubleComplex *) &dot);
330#elif defined USE_BLAS
332 cblas_zdotc_sub(
kferm2, smallPhi, 1, X1, 1, &dot);
338 hf+=creal(conj(smallPhi[j])*X1[j]);
342 cudaFreeAsync(smallPhi,NULL);
350 *s=hg+hf; *h=(*s)+hp;
353 printf(
"hg=%.5e; hf=%.5e; hp=%.5e; h=%.5e\n", hg, hf, hp, *h);
int Init(int istart, int ibound, int iread, float beta, float fmu, float akappa, Complex_f ajq, Complex *u[2], Complex *ut[2], Complex_f *ut_f[2], Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2], float *dk_f[2], unsigned int *iu, unsigned int *id)
Initialises the system.
int Hamilton(double *h, double *s, double res2, double *pp, Complex *X0, Complex *X1, Complex *Phi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, float beta, double *ancgh, int traj)
Calculate the Hamiltonian.
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves Implements up/down pa...