79int main(
int argc,
char *argv[]){
81 const char *funcname =
"main";
86 MPI_Comm_rank(comm, &
rank);
87 MPI_Comm_size(comm, &
size);
114 float akappa = 0.1780f;
119 float fmu = 0.0f;
int iread = 0;
int istart = 1;
123#ifdef USE_MATH_DEFINES
124 const double tpi = 2*
M_PI;
126 const double tpi = 2*acos(-1.0);
128 float dt=0.004;
float ajq = 0.0;
float delb=0;
129 float athq = 0.0;
int stepl = 250;
int ntraj = 10;
133 const char *filename = (argc!=2) ?
"midout":argv[1];
135 if( !(midout = fopen(filename, fileop) ) ){
136 fprintf(stderr,
"Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",\
137 OPENERROR, funcname, filename, fileop);
139 MPI_Abort(comm,OPENERROR);
145 fscanf(midout,
"%f %f %f %f %f %f %f %d %d %d %d %d", &dt, &beta, &akappa,\
146 &ajq, &athq, &fmu, &delb, &stepl, &ntraj, &istart, &icheck, &iread);
148 assert(stepl>0); assert(ntraj>0); assert(istart>=0); assert(icheck>0); assert(iread>=0);
157 jqq=ajq*cexp(athq*I);
161 cublasCreate(&cublas_handle);
163 blockInit(
nx,
ny,
nz,
nt, &dimBlock, &dimGrid);
166 cudaGetDevice(&device);
169 cudaDeviceGetDefaultMemPool(&mempool, device);
171 cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
174 printf(
"jqq=%f+(%f)I\n",creal(jqq),cimag(jqq));
199 unsigned int *iu, *id;
201 cudaMallocManaged((
void**)&iu,
ndim*
kvol*
sizeof(
int),cudaMemAttachGlobal);
202 cudaMallocManaged((
void**)&
id,
ndim*
kvol*
sizeof(
int),cudaMemAttachGlobal);
204 cudaMallocManaged(&dk[0],(
kvol+
halo)*
sizeof(
double),cudaMemAttachGlobal);
205 cudaMallocManaged(&dk[1],(
kvol+
halo)*
sizeof(
double),cudaMemAttachGlobal);
207 cudaMallocManaged(&dk_f[0],(
kvol+
halo)*
sizeof(
float),cudaMemAttachGlobal);
208 cudaMallocManaged(&dk_f[1],(
kvol+
halo)*
sizeof(
float),cudaMemAttachGlobal);
210 cudaMalloc(&dk_f[0],(
kvol+
halo)*
sizeof(
float));
211 cudaMalloc(&dk_f[1],(
kvol+
halo)*
sizeof(
float));
217 cudaMallocManaged(&gamin,4*4*
sizeof(
Complex),cudaMemAttachGlobal);
218 cudaMallocManaged(&gamval,5*4*
sizeof(
Complex),cudaMemAttachGlobal);
220 cudaMallocManaged(&gamval_f,5*4*
sizeof(
Complex_f),cudaMemAttachGlobal);
222 cudaMalloc(&gamval_f,5*4*
sizeof(
Complex_f));
224 cudaMallocManaged(&u[0],
ndim*
kvol*
sizeof(
Complex),cudaMemAttachGlobal);
225 cudaMallocManaged(&u[1],
ndim*
kvol*
sizeof(
Complex),cudaMemAttachGlobal);
236 id = (
unsigned int*)aligned_alloc(
AVX,
ndim*
kvol*
sizeof(
int));
237 iu = (
unsigned int*)aligned_alloc(
AVX,
ndim*
kvol*
sizeof(
int));
239 int *gamin = (
int *)aligned_alloc(
AVX,4*4*
sizeof(
int));
243 dk[0] = (
double *)aligned_alloc(
AVX,(
kvol+
halo)*
sizeof(
double));
244 dk[1] = (
double *)aligned_alloc(
AVX,(
kvol+
halo)*
sizeof(
double));
245 dk_f[0] = (
float *)aligned_alloc(
AVX,(
kvol+
halo)*
sizeof(
float));
246 dk_f[1] = (
float *)aligned_alloc(
AVX,(
kvol+
halo)*
sizeof(
float));
268 Init(istart,ibound,iread,beta,fmu,akappa,ajq,u,ut,ut_f,gamval,gamval_f,gamin,dk,dk_f,iu,
id);
271 Init_CUDA(ut[0],ut[1],gamval,gamval_f,gamin,dk[0],dk[1],iu,
id);
285 cudaDeviceSynchronise();
289 Diagnostics(istart, u[0], u[1], ut[0], ut[1], ut_f[0], ut_f[1], iu,
id,
hu,
hd, dk[0], dk[1],\
290 dk_f[0], dk_f[1], gamin, gamval, gamval_f, jqq, akappa, beta, ancg_diag);
298 if(!
rank) printf(
"Initial Polyakov loop evaluated as %e\n", poly);
300 double hg, avplaqs, avplaqt;
304 double traj=stepl*dt;
306 double proby = 2.5/stepl;
308 int buffer;
char buff2[7];
310 buffer = (int)round(100*beta);
311 sprintf(buff2,
"b%03d",buffer);
312 strcat(suffix,buff2);
314 buffer = (int)round(10000*akappa);
315 sprintf(buff2,
"k%04d",buffer);
316 strcat(suffix,buff2);
318 buffer = (int)round(1000*fmu);
319 sprintf(buff2,
"mu%04d",buffer);
320 strcat(suffix,buff2);
322 buffer = (int)round(1000*ajq);
323 sprintf(buff2,
"j%03d",buffer);
324 strcat(suffix,buff2);
326 sprintf(buff2,
"s%02d",
nx);
327 strcat(suffix,buff2);
329 sprintf(buff2,
"t%02d",
nt);
330 strcat(suffix,buff2);
331 char outname[
FILELEN] =
"Output.";
char *outop=
"a";
332 strcat(outname,suffix);
335 if(!(output=fopen(outname, outop) )){
336 fprintf(stderr,
"Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",OPENERROR,funcname,outname,outop);
338 MPI_Abort(comm,OPENERROR);
343 printf(
"hg = %e, <Ps> = %e, <Pt> = %e, <Poly> = %e\n", hg, avplaqs, avplaqt, poly);
344 fprintf(output,
"ksize = %i ksizet = %i Nf = %i Halo =%i\nTime step dt = %e Trajectory length = %e\n"\
345 "No. of Trajectories = %i β = %e\nκ = %e μ = %e\nDiquark source = %e Diquark phase angle = %e\n"\
346 "Stopping Residuals: Guidance: %e Acceptance: %e, Estimator: %e\nSeed = %ld\n",
347 ksize,
ksizet,
nf,
halo, dt, traj, ntraj, beta, akappa, fmu, ajq, athq,
rescgg,
rescga,
respbp,
seed);
350 printf(
"ksize = %i ksizet = %i Nf = %i Halo = %i\nTime step dt = %e Trajectory length = %e\n"\
351 "No. of Trajectories = %i β = %e\nκ = %e μ = %e\nDiquark source = %e Diquark phase angle = %e\n"\
352 "Stopping Residuals: Guidance: %e Acceptance: %e, Estimator: %e\nSeed = %ld\n",
353 ksize,
ksizet,
nf,
halo, dt, traj, ntraj, beta, akappa, fmu, ajq, athq,
rescgg,
rescga,
respbp,
seed);
358 double actiona = 0.0;
double vel2a = 0.0;
double pbpa = 0.0;
double endenfa = 0.0;
double denfa = 0.0;
360 double e_dH=0;
double e_dH_e=0;
362 double yav = 0.0;
double yyav = 0.0;
364 int naccp = 0;
int ipbp = 0;
int itot = 0;
380 cudaMallocManaged(&dSdpi,
kmom*
sizeof(
double),cudaMemAttachGlobal);
383 cudaMalloc(&dSdpi,
kmom*
sizeof(
double));
387 cudaMallocManaged(&pp,
kmom*
sizeof(
double),cudaMemAttachGlobal);
388 cudaDeviceSynchronise();
394 dSdpi = aligned_alloc(
AVX,
kmom*
sizeof(
double));
396 pp = aligned_alloc(
AVX,
kmom*
sizeof(
double));
407 start_time = MPI_Wtime();
409 start_time = omp_get_wtime();
415 double ancg,ancgh,totancg,totancgh=0;
416 for(
int itraj = iread+1; itraj <= ntraj+iread; itraj++){
421 printf(
"Starting itraj %i\n", itraj);
423 for(
int na=0; na<
nf; na++){
433 cudaMallocManaged(&R1_f,
kferm*
sizeof(
Complex_f),cudaMemAttachGlobal);
447#if (defined __NVCC__ && defined _DEBUG)
448 cudaMemPrefetchAsync(R1_f,
kferm*
sizeof(
Complex_f),device,streams[1]);
450#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
453 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm, R, 0, 1/sqrt(2));
461 cudaDeviceSynchronise();
463 Dslashd_f(R1_f,R,ut_f[0],ut_f[1],iu,
id,gamval_f,gamin,dk_f,jqq,akappa);
468 cuComplex_convert(R1_f,R1,
kferm,
false,dimBlock,dimGrid);
472 cudaFreeAsync(R1_f,NULL);
476 cudaDeviceSynchronise();
479#pragma omp simd aligned(R1_f,R1:AVX)
480 for(
int i=0;i<
kferm;i++)
486 UpDownPart(na, X0, R1);
493 cudaMemcpyAsync(ut[0], u[0],
ndim*
kvol*
sizeof(
Complex),cudaMemcpyHostToDevice,NULL);
494 cudaMemcpyAsync(ut[1], u[1],
ndim*
kvol*
sizeof(
Complex),cudaMemcpyHostToDevice,NULL);
503#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
506 vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream,
kmom, pp, 0, 1);
512 cudaMemPrefetchAsync(pp,
kmom*
sizeof(
double),device,streams[1]);
515 Hamilton(&H0,&S0,
rescga,pp,X0,X1,Phi,ut_f,iu,
id,gamval_f,gamin,dk_f,jqq,akappa,beta,&ancgh,itraj);
517 if(!
rank) printf(
"H0: %e S0: %e\n", H0, S0);
524#if (defined INT_LPFR && defined INT_OMF2) ||(defined INT_LPFR && defined INT_OMF4)||(defined INT_OMF2 && defined INT_OMF4)
525#error "Only one integrator may be defined"
526#elif defined INT_LPFR
527 Leapfrog(ut,ut_f,X0,X1,Phi,dk,dk_f,dSdpi,pp,iu,
id,gamval,gamval_f,gamin,jqq,beta,akappa,stepl,dt,&ancg,&itot,proby);
528#elif defined INT_OMF2
529 OMF2(ut,ut_f,X0,X1,Phi,dk,dk_f,dSdpi,pp,iu,
id,gamval,gamval_f,gamin,jqq,beta,akappa,stepl,dt,&ancg,&itot,proby);
530#elif defined INT_OMF4
531 OMF4(ut,ut_f,X0,X1,Phi,dk,dk_f,dSdpi,pp,iu,
id,gamval,gamval_f,gamin,jqq,beta,akappa,stepl,dt,&ancg,&itot,proby);
533#error "No integrator defined. Please define {INT_LPFR.INT_OMF2,INT_OMF4}"
541 Hamilton(&H1,&S1,
rescga,pp,X0,X1,Phi,ut_f,iu,
id,gamval_f,gamin,dk_f,jqq,akappa,beta,&ancgh,itraj);
545 printf(
"H0-H1=%f-%f",H0,H1);
553 fprintf(output,
"dH = %e dS = %e\n", dH, dS);
555 printf(
"dH = %e dS = %e\n", dH, dS);
558 e_dH+=dH; e_dH_e+=dH*dH;
568 printf(
"New configuration accepted on trajectory %i.\n", itraj);
585 printf(
"New configuration rejected on trajectory %i.\n", itraj);
591 cublasDnrm2(cublas_handle,
kmom, pp, 1,&vel2);
593#elif defined USE_BLAS
594 vel2 = cblas_dnrm2(
kmom, pp, 1);
598 for(
int i=0; i<
kmom; i++)
610 cudaMemcpyAsync(ut[0], u[0],
ndim*
kvol*
sizeof(
Complex),cudaMemcpyDefault,streams[0]);
611 cudaMemcpyAsync(ut[1], u[1],
ndim*
kvol*
sizeof(
Complex),cudaMemcpyDefault,streams[1]);
612 cudaDeviceSynchronise();
623 printf(
"Starting measurements\n");
632 measure_check =
Measure(&pbp,&endenf,&denf,&qq,&qbqb,
respbp,&itercg,ut,ut_f,iu,
id,\
633 gamval,gamval_f,gamin,dk,dk_f,jqq,akappa,Phi,R1);
636 printf(
"Finished measurements\n");
638 pbpa+=pbp; endenfa+=endenf; denfa+=denf; ipbp++;
648#pragma omp parallel for
649 for(
int i=0; i<4; i++)
658 fprintf(output,
"Measure (CG) %i Update (CG) %.3f Hamiltonian (CG) %.3f\n", itercg, ancg, ancgh);
664 char fortname[
FILELEN] =
"fermi.";
665 strcat(fortname,suffix);
666 const char *fortop= (itraj==1) ?
"w" :
"a";
667 if(!(fortout=fopen(fortname, fortop) )){
668 fprintf(stderr,
"Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",\
669 OPENERROR, funcname, fortname, fortop);
671 MPI_Abort(comm,OPENERROR);
677 fprintf(fortout,
"pbp\tendenf\tdenf\n");
679 fprintf(fortout,
"%e\t%e\t%e\n", NAN, NAN, NAN);
681 fprintf(fortout,
"%e\t%e\t%e\n", pbp, endenf, denf);
691 char fortname[
FILELEN] =
"bose.";
692 strcat(fortname,suffix);
693 const char *fortop= (itraj==1) ?
"w" :
"a";
694 if(!(fortout=fopen(fortname, fortop) )){
695 fprintf(stderr,
"Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",\
696 OPENERROR, funcname, fortname, fortop);
699 fprintf(fortout,
"avplaqs\tavplaqt\tpoly\n");
700 fprintf(fortout,
"%e\t%e\t%e\n", avplaqs, avplaqt, poly);
707 char fortname[
FILELEN] =
"diq.";
708 strcat(fortname,suffix);
709 const char *fortop= (itraj==1) ?
"w" :
"a";
710 if(!(fortout=fopen(fortname, fortop) )){
711 fprintf(stderr,
"Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",\
712 OPENERROR, funcname, fortname, fortop);
714 MPI_Abort(comm,OPENERROR);
720 fprintf(fortout,
"Re(qq)\n");
722 fprintf(fortout,
"%e\n", NAN);
724 fprintf(fortout,
"%e\n", creal(qq));
732 Par_swrite(itraj,icheck,beta,fmu,akappa,ajq,u[0],u[1]);
741 elapsed = MPI_Wtime()-start_time;
743 elapsed = omp_get_wtime()-start_time;
751 cudaFree(dk[0]); cudaFree(dk[1]); cudaFree(R1); cudaFree(dSdpi); cudaFree(pp);
752 cudaFree(Phi); cudaFree(ut[0]); cudaFree(ut[1]);
753 cudaFree(X0); cudaFree(X1); cudaFree(u[0]); cudaFree(u[1]);
754 cudaFree(
id); cudaFree(iu);
755 cudaFree(dk_f[0]); cudaFree(dk_f[1]); cudaFree(ut_f[0]); cudaFree(ut_f[1]);
756 cudaFree(gamin); cudaFree(gamval); cudaFree(gamval_f);
757 cublasDestroy(cublas_handle);
759 free(dk[0]); free(dk[1]); free(R1); free(dSdpi); free(pp);
760 free(Phi); free(ut[0]); free(ut[1]);
761 free(X0); free(X1); free(u[0]); free(u[1]);
763 free(dk_f[0]); free(dk_f[1]); free(ut_f[0]); free(ut_f[1]);
764 free(gamin); free(gamval); free(gamval_f);
768 gsl_rng_free(ranlux_instd);
769#elif (defined __INTEL_MKL__ &&!defined USE_RAN2)
770 vslDeleteStream(&stream);
774 FILE *sa3at = fopen(
"Bench_times.csv",
"a");
777 int cuversion; cudaRuntimeGetVersion(&cuversion);
778 sprintf(version,
"CUDA %d\tBlock: (%d,%d,%d)\tGrid: (%d,%d,%d)\n%s\n",cuversion,\
779 dimBlock.x,dimBlock.y,dimBlock.z,dimGrid.x,dimGrid.y,dimGrid.z,__VERSION__);
781 char *version=__VERSION__;
783 fprintf(sa3at,
"%s\nβ%0.3f κ:%0.4f μ:%0.4f j:%0.3f s:%i t:%i kvol:%ld\n"
784 "npx:%i npt:%i nthread:%i ncore:%i time:%f traj_time:%f\n\n",\
785 version,beta,akappa,fmu,ajq,
nx,
nt,
kvol,
npx,
npt,
nthreads,
npx*
npy*
npz*
npt*
nthreads,elapsed,elapsed/ntraj);
790 actiona/=ntraj; vel2a/=ntraj; pbpa/=ipbp; endenfa/=ipbp; denfa/=ipbp;
791 totancg/=ntraj; totancgh/=ntraj;
792 e_dH/=ntraj; e_dH_e=sqrt((e_dH_e/ntraj-e_dH*e_dH)/(ntraj-1));
793 yav/=ntraj; yyav=sqrt((yyav/ntraj - yav*yav)/(ntraj-1));
794 float traj_cost=totancg/dt;
795 double atraj=dt*itot/ntraj;
798 fprintf(output,
"Averages for the last %i trajectories\n"\
799 "Number of acceptances: %i\tAverage Trajectory Length = %e\n"\
800 "<dH>=%e+/-%e\t<exp(dH)>=%e+/-%e\tTrajectory cost=N_cg/dt =%e\n"\
801 "Average number of congrad iter guidance: %.3f acceptance %.3f\n"\
803 "Mean Square Velocity = %e\tAction Per Site = %e\n"\
804 "Energy Density = %e\tNumber Density %e\n\n\n",\
805 ntraj, naccp, atraj, e_dH,e_dH_e, yav, yyav, traj_cost, totancg, totancgh, pbpa, vel2a, actiona, endenfa, denfa);