su2hmc
main.c
43#include <assert.h>
44#include <coord.h>
45#include <math.h>
46#include <matrices.h>
47#include <par_mpi.h>
48#include <random.h>
49#include <string.h>
50#include <su2hmc.h>
51#ifdef __NVCC__
52#include <cublas_v2.h>
53#include <cuda.h>
54#include <cuda_runtime.h>
55cublasHandle_t cublas_handle;
56cublasStatus_t cublas_status;
57cudaMemPool_t mempool;
58//Fix this later
59#endif
79int main(int argc, char *argv[]){
80 //Store the function name once so the error messages are easier to write without hard-coding it everywhere
81 const char *funcname = "main";
82
83 Par_begin(argc, argv);
84 //Add error catching code...
85#if(nproc>1)
86 MPI_Comm_rank(comm, &rank);
87 MPI_Comm_size(comm, &size);
88#endif
89
113 float beta = 1.7f;
114 float akappa = 0.1780f;
115#ifdef __NVCC__
116 __managed__
117#endif
118 Complex_f jqq = 0;
119 float fmu = 0.0f;
120 int iread = 0;
121 int istart = 1;
122 int iprint = 1; //How often measurements are made (every iprint trajectories)
123 int icheck = 5; //How often configurations are saved (every icheck trajectories)
124 int ibound = -1;
125#ifdef USE_MATH_DEFINES
126 const double tpi = 2*M_PI;
127#else
128 const double tpi = 2*acos(-1.0);
129#endif
130 float dt=0.004; float ajq = 0.0;
131 float delb=0; //Not used?
132 float athq = 0.0;
133 int stepl = 250; int ntraj = 10;
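 //Parameter summary: dt is the molecular dynamics step size, stepl the nominal number of steps per
 //trajectory, ntraj the number of trajectories to generate, istart the type of starting configuration
 //(handled in Init), iread the index of a stored configuration to resume from, and ibound appears to be
 //the sign of the temporal boundary condition passed to Init.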
134 //A rank of zero means this is the "master" process
135 if(!rank){
136 FILE *midout;
137 const char *filename = (argc!=2) ?"midout":argv[1];
138 const char *fileop = "r";
139 if( !(midout = fopen(filename, fileop) ) ){
140 fprintf(stderr, "Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",\
141 OPENERROR, funcname, filename, fileop);
142#if(nproc>1)
143 MPI_Abort(comm,OPENERROR);
144#else
145 exit(OPENERROR);
146#endif
147 }
148 //See the README for what each entry means
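 //A midout file is a single line of whitespace-separated values in the order of the fscanf below;
 //one matching the defaults above would read:
 //0.004 1.7 0.1780 0.0 0.0 0.0 0.0 250 10 1 5 0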
149 fscanf(midout, "%f %f %f %f %f %f %f %d %d %d %d %d", &dt, &beta, &akappa,\
150 &ajq, &athq, &fmu, &delb, &stepl, &ntraj, &istart, &icheck, &iread);
151 fclose(midout);
152 assert(stepl>0); assert(ntraj>0); assert(istart>=0); assert(icheck>0); assert(iread>=0);
153 }
154 //Send inputs to other ranks
155#if(nproc>1)
156 Par_fcopy(&dt); Par_fcopy(&beta); Par_fcopy(&akappa); Par_fcopy(&ajq);
157 Par_fcopy(&athq); Par_fcopy(&fmu); Par_fcopy(&delb); //Not used?
158 Par_icopy(&stepl); Par_icopy(&ntraj); Par_icopy(&istart); Par_icopy(&icheck);
159 Par_icopy(&iread);
160#endif
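 //Combine the diquark source magnitude ajq and phase athq read from midout into one complex coupling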
161 jqq=ajq*cexp(athq*I);
162 //End of input
163#ifdef __NVCC__
164 //CUBLAS Handle
165 cublasCreate(&cublas_handle);
166 //Set up grid and blocks
167 blockInit(nx, ny, nz, nt, &dimBlock, &dimGrid);
168 //CUDA device
169 int device=-1;
170 cudaGetDevice(&device);
171 //For asynchronous memory pools, any unused memory in the pool is released back to the OS whenever the
172 //device synchronises, unless a release threshold is set. We base our threshold on the Congradq work arrays.
173 cudaDeviceGetDefaultMemPool(&mempool, device);
174 cuuint64_t threshold=2*kferm2*sizeof(Complex_f);
175 cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
176#endif
177#ifdef _DEBUG
178 printf("jqq=%f+(%f)I\n",creal(jqq),cimag(jqq));
179#endif
180#ifdef _DEBUG
181 seed = 967580161;
182#else
183 seed = time(NULL);
184#endif
185
186 //Gauge, trial and momentum fields
187 //You'll notice that there are two different allocation/free statements
188 //One for CUDA and one for everything else depending on what's
189 //being used
190 Complex *u11, *u12, *u11t, *u12t;
191 Complex_f *u11t_f, *u12t_f;
192 double *dk4m, *dk4p, *pp;
193 float *dk4m_f, *dk4p_f;
194 //Halo index arrays
195 unsigned int *iu, *id;
196#ifdef __NVCC__
197 cudaMallocManaged((void**)&iu,ndim*kvol*sizeof(int),cudaMemAttachGlobal);
198 cudaMallocManaged((void**)&id,ndim*kvol*sizeof(int),cudaMemAttachGlobal);
199
200 cudaMallocManaged(&dk4m,(kvol+halo)*sizeof(double),cudaMemAttachGlobal);
201 cudaMallocManaged(&dk4p,(kvol+halo)*sizeof(double),cudaMemAttachGlobal);
202#ifdef _DEBUG
203 cudaMallocManaged(&dk4m_f,(kvol+halo)*sizeof(float),cudaMemAttachGlobal);
204 cudaMallocManaged(&dk4p_f,(kvol+halo)*sizeof(float),cudaMemAttachGlobal);
205#else
206 cudaMalloc(&dk4m_f,(kvol+halo)*sizeof(float));
207 cudaMalloc(&dk4p_f,(kvol+halo)*sizeof(float));
208#endif
209
210 int *gamin;
211 Complex *gamval;
212 Complex_f *gamval_f;
213 cudaMallocManaged(&gamin,4*4*sizeof(int),cudaMemAttachGlobal);
214 cudaMallocManaged(&gamval,5*4*sizeof(Complex),cudaMemAttachGlobal);
215#ifdef _DEBUG
216 cudaMallocManaged(&gamval_f,5*4*sizeof(Complex_f),cudaMemAttachGlobal);
217#else
218 cudaMalloc(&gamval_f,5*4*sizeof(Complex_f));
219#endif
220 cudaMallocManaged(&u11,ndim*kvol*sizeof(Complex),cudaMemAttachGlobal);
221 cudaMallocManaged(&u12,ndim*kvol*sizeof(Complex),cudaMemAttachGlobal);
222 cudaMallocManaged(&u11t,ndim*(kvol+halo)*sizeof(Complex),cudaMemAttachGlobal);
223 cudaMallocManaged(&u12t,ndim*(kvol+halo)*sizeof(Complex),cudaMemAttachGlobal);
224#ifdef _DEBUG
225 cudaMallocManaged(&u11t_f,ndim*(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
226 cudaMallocManaged(&u12t_f,ndim*(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
227#else
228 cudaMalloc(&u11t_f,ndim*(kvol+halo)*sizeof(Complex_f));
229 cudaMalloc(&u12t_f,ndim*(kvol+halo)*sizeof(Complex_f));
230#endif
231#else
232 id = (unsigned int*)aligned_alloc(AVX,ndim*kvol*sizeof(int));
233 iu = (unsigned int*)aligned_alloc(AVX,ndim*kvol*sizeof(int));
234
235 int *gamin = (int *)aligned_alloc(AVX,4*4*sizeof(int));
236 Complex *gamval=(Complex *)aligned_alloc(AVX,5*4*sizeof(Complex));
237 Complex_f *gamval_f=(Complex_f *)aligned_alloc(AVX,5*4*sizeof(Complex_f));
238
239 dk4m = (double *)aligned_alloc(AVX,(kvol+halo)*sizeof(double));
240 dk4p = (double *)aligned_alloc(AVX,(kvol+halo)*sizeof(double));
241 dk4m_f = (float *)aligned_alloc(AVX,(kvol+halo)*sizeof(float));
242 dk4p_f = (float *)aligned_alloc(AVX,(kvol+halo)*sizeof(float));
243
244 u11 = (Complex *)aligned_alloc(AVX,ndim*kvol*sizeof(Complex));
245 u12 = (Complex *)aligned_alloc(AVX,ndim*kvol*sizeof(Complex));
246 u11t = (Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
247 u12t = (Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
248 u11t_f = (Complex_f *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex_f));
249 u12t_f = (Complex_f *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex_f));
250#endif
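 //Layout note: u11/u12 hold the accepted gauge links on the local sublattice only, while the trial fields
 //u11t/u12t and their single-precision copies carry an extra halo for the MPI exchange. dk4m/dk4p appear to
 //be the forward/backward temporal hopping terms (they carry the chemical potential) and are kept in both
 //double and float for the mixed-precision solver.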
264 Init(istart,ibound,iread,beta,fmu,akappa,ajq,u11,u12,u11t,u12t,u11t_f,u12t_f,gamval,gamval_f,gamin,dk4m,dk4p,dk4m_f,dk4p_f,iu,id);
265#ifdef __NVCC__
266 //GPU Initialisation stuff
267 Init_CUDA(u11t,u12t,gamval,gamval_f,gamin,dk4m,dk4p,iu,id);//&dimBlock,&dimGrid);
268#endif
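 //Reunitarise projects each trial link back onto SU(2), enforcing conj(u11t)*u11t+conj(u12t)*u12t=1
 //site by site; the unitarised trials are then copied back into u11/u12.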
269 //Send trials to accelerator for reunitarisation
270 Reunitarise(u11t,u12t);
271 //Get trials back
272 memcpy(u11, u11t, ndim*kvol*sizeof(Complex));
273 memcpy(u12, u12t, ndim*kvol*sizeof(Complex));
274#ifdef DIAGNOSTIC
275 double ancg_diag=0;
276 Diagnostics(istart, u11, u12, u11t, u12t, u11t_f, u12t_f, iu, id, hu, hd, dk4m, dk4p,\
277 dk4m_f, dk4p_f, gamin, gamval, gamval_f, jqq, akappa, beta, ancg_diag);
278#endif
279
280 //Initial Measurements
281 //====================
282 Trial_Exchange(u11t,u12t,u11t_f,u12t_f);
283 double poly = Polyakov(u11t_f,u12t_f);
284#ifdef _DEBUG
285 if(!rank) printf("Initial Polyakov loop evaluated as %e\n", poly);
286#endif
287 double hg, avplaqs, avplaqt;
288 //(The halo exchange of the trial fields was already done by Trial_Exchange above)
289 Average_Plaquette(&hg,&avplaqs,&avplaqt,u11t_f,u12t_f,iu,beta);
290 //Trajectory length
291 double traj=stepl*dt;
292 //Probability of ending the trajectory on a given step (used to randomise the trajectory length around stepl)
293 double proby = 2.5/stepl;
294 char suffix[FILELEN]="";
295 int buffer; char buff2[7];
296 //Add script for extracting correct mu, j etc.
297 buffer = (int)round(100*beta);
298 sprintf(buff2,"b%03d",buffer);
299 strcat(suffix,buff2);
300 //κ
301 buffer = (int)round(10000*akappa);
302 sprintf(buff2,"k%04d",buffer);
303 strcat(suffix,buff2);
304 //μ
305 buffer = (int)round(1000*fmu);
306 sprintf(buff2,"mu%04d",buffer);
307 strcat(suffix,buff2);
308 //J
309 buffer = (int)round(1000*ajq);
310 sprintf(buff2,"j%03d",buffer);
311 strcat(suffix,buff2);
312 //nx
313 sprintf(buff2,"s%02d",nx);
314 strcat(suffix,buff2);
315 //nt
316 sprintf(buff2,"t%02d",nt);
317 strcat(suffix,buff2);
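 //With the defaults above this builds a suffix of the form b170k1780mu0000j000s<nx>t<nt>
 //(e.g. b170k1780mu0000j000s16t32 on a 16^3x32 lattice), so runs with different parameters
 //write to different output files.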
318 char outname[FILELEN] = "Output."; char *outop="a";
319 strcat(outname,suffix);
320 FILE *output;
321 if(!rank){
322 if(!(output=fopen(outname, outop) )){
323 fprintf(stderr,"Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",OPENERROR,funcname,outname,outop);
324#if(nproc>1)
325 MPI_Abort(comm,OPENERROR);
326#else
327 exit(OPENERROR);
328#endif
329 }
330 printf("hg = %e, <Ps> = %e, <Pt> = %e, <Poly> = %e\n", hg, avplaqs, avplaqt, poly);
331 fprintf(output, "ksize = %i ksizet = %i Nf = %i Halo =%i\nTime step dt = %e Trajectory length = %e\n"\
332 "No. of Trajectories = %i β = %e\nκ = %e μ = %e\nDiquark source = %e Diquark phase angle = %e\n"\
333 "Stopping Residuals: Guidance: %e Acceptance: %e, Estimator: %e\nSeed = %ld\n",
334 ksize, ksizet, nf, halo, dt, traj, ntraj, beta, akappa, fmu, ajq, athq, rescgg, rescga, respbp, seed);
335#ifdef _DEBUG
336 //Print to terminal during debugging
337 printf("ksize = %i ksizet = %i Nf = %i Halo = %i\nTime step dt = %e Trajectory length = %e\n"\
338 "No. of Trajectories = %i β = %e\nκ = %e μ = %e\nDiquark source = %e Diquark phase angle = %e\n"\
339 "Stopping Residuals: Guidance: %e Acceptance: %e, Estimator: %e\nSeed = %ld\n",
340 ksize, ksizet, nf, halo, dt, traj, ntraj, beta, akappa, fmu, ajq, athq, rescgg, rescga, respbp, seed);
341#endif
342 }
343 //Initialise for averages
344 //======================
345 double actiona = 0.0; double vel2a = 0.0; double pbpa = 0.0; double endenfa = 0.0; double denfa = 0.0;
346 //Expected change in the Hamiltonian
347 double e_dH=0; double e_dH_e=0;
348 //Expected Metropolis acceptance probability. Skewed by cases where the Hamiltonian decreases.
349 double yav = 0.0; double yyav = 0.0;
350
351 int naccp = 0; int ipbp = 0; int itot = 0;
352
353 //Start of classical evolution
354 //===========================
355 double pbp;
356 Complex qq;
357 double *dSdpi;
358 //Field and related declarations
359 Complex *Phi, *R1, *X0, *X1;
360 //Initialise Arrays. Leaving it late for scoping
361 //check the sizes in sizes.h
362#ifdef __NVCC__
363 cudaMallocManaged(&R1, kfermHalo*sizeof(Complex),cudaMemAttachGlobal);
364 cudaMalloc(&Phi, nf*kferm*sizeof(Complex));
365#ifdef _DEBUG
366 cudaMallocManaged(&X0, nf*kferm2*sizeof(Complex),cudaMemAttachGlobal);
367#else
368 cudaMalloc(&X0, nf*kferm2*sizeof(Complex));
369#endif
370
371 cudaMallocManaged(&X1, kferm2Halo*sizeof(Complex),cudaMemAttachGlobal);
372 cudaMallocManaged(&pp, kmom*sizeof(double),cudaMemAttachGlobal);
373 cudaMalloc(&dSdpi, kmom*sizeof(double));
374 cudaDeviceSynchronise();
375#else
376 R1= aligned_alloc(AVX,kfermHalo*sizeof(Complex));
377 Phi= aligned_alloc(AVX,nf*kferm*sizeof(Complex));
378 X0= aligned_alloc(AVX,nf*kferm2*sizeof(Complex));
379 X1= aligned_alloc(AVX,kferm2Halo*sizeof(Complex));
380 dSdpi = aligned_alloc(AVX,kmom*sizeof(double));
381 //pp is the momentum field
382 pp = aligned_alloc(AVX,kmom*sizeof(double));
383#endif
389#if (defined SA3AT)
390 double start_time=0;
391 if(!rank){
392#if(nproc>1)
393 start_time = MPI_Wtime();
394#else
395 start_time = omp_get_wtime();
396#endif
397 }
398#endif
399 double action;
400 //Conjugate Gradient iteration counters
401 double ancg=0,ancgh=0,totancg=0,totancgh=0;
402 for(int itraj = iread+1; itraj <= ntraj+iread; itraj++){
403 //Reset conjugate gradient averages
404 ancg = 0; ancgh = 0;
405#ifdef _DEBUG
406 if(!rank)
407 printf("Starting itraj %i\n", itraj);
408#endif
409 for(int na=0; na<nf; na++){
410 //Probably makes sense to declare this outside the loop
411 //but I do like scoping/don't want to break anything else just yet
412 //
413 //How do we optimise this for use in CUDA? Do we use CUDA's PRNG
414 //or stick with MKL and synchronise/copy over the array?
415#ifdef __NVCC__
416 Complex_f *R1_f,*R;
417 cudaMallocManaged(&R,kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
418#ifdef _DEBUG
419 cudaMallocManaged(&R1_f,kferm*sizeof(Complex_f),cudaMemAttachGlobal);
420 cudaMemset(R1_f,0,kferm*sizeof(Complex_f));
421#else
422 cudaMallocAsync(&R1_f,kferm*sizeof(Complex_f),streams[0]);
423 cudaMemsetAsync(R1_f,0,kferm*sizeof(Complex_f),streams[0]);
424#endif
425#else
426 Complex_f *R1_f=aligned_alloc(AVX,kferm*sizeof(Complex_f));
427 Complex_f *R=aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
428 memset(R1_f,0,kferm*sizeof(Complex_f));
429#endif
430 //The FORTRAN code had two Gaussian routines.
431 //gaussp was the normal Box-Muller and gauss0 didn't have 2 inside the square root
432 //Using σ=1/sqrt(2) in these routines has the same effect as gauss0
433#if (defined __NVCC__ && defined _DEBUG)
434 cudaMemPrefetchAsync(R1_f,kferm*sizeof(Complex_f),device,streams[1]);
435#endif
436#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
437 Gauss_c(R, kferm, 0, 1/sqrt(2));
438#else
439 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, (float *)R, 0, 1/sqrt(2));
440#endif
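 //Pseudofermion heatbath: R holds Gaussian noise over the Gor'kov and colour indices, and the Dslashd
 //call below sets R1 = Mdagger*R. Storing that as Phi means Phi is drawn with weight
 //exp(-Phi^dag (Mdagger M)^-1 Phi), which should match the fermionic term used in the Hamiltonian.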
441#ifdef __NVCC__
442 cudaMemPrefetchAsync(R,kfermHalo*sizeof(Complex_f),device,NULL);
443 //Transpose needed here for Dslashd
444 Transpose_c(R1_f,ngorkov*nc,kvol,dimGrid,dimBlock);
445 Transpose_c(R,ngorkov*nc,kvol,dimGrid,dimBlock);
446 //R is random so this technically isn't required. But it does keep the code output consistent with previous
447 //versions.
448 //Flip all the gauge fields around so memory is coalesced
449 Transpose_c(u11t_f,ndim,kvol,dimGrid,dimBlock);
450 Transpose_c(u12t_f,ndim,kvol,dimGrid,dimBlock);
451 cudaDeviceSynchronise();
452#endif
453 Dslashd_f(R1_f,R,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa);
454#ifdef __NVCC__
455 //Make sure the multiplication is finished before freeing its input!!
456 cudaFree(R);//cudaDeviceSynchronise();
457 //cudaFree is blocking so don't need to synchronise
458 Transpose_c(R1_f,kvol,ngorkov*nc,dimGrid,dimBlock);
459 cuComplex_convert(R1_f,R1,kferm,false,dimBlock,dimGrid);
460 Transpose_c(u11t_f,kvol,ndim,dimGrid,dimBlock);
461 Transpose_c(u12t_f,kvol,ndim,dimGrid,dimBlock);
462 //cudaDeviceSynchronise();
463 //cudaFreeAsync(R1_f,NULL);
464 cudaMemcpyAsync(Phi+na*kferm,R1, kferm*sizeof(Complex),cudaMemcpyDefault,0);
465 //cudaMemcpyAsync(Phi+na*kferm,R1, kferm*sizeof(Complex),cudaMemcpyDefault,streams[1]);
466 cudaDeviceSynchronise();
467#ifdef _DEBUG
468 cudaFree(R1_f);
469#else
470 cudaFreeAsync(R1_f,0);
471#endif
472 //cudaFree is blocking so don't need cudaDeviceSynchronise()
473#else
474 free(R);
475#pragma omp simd aligned(R1_f,R1:AVX)
476 for(int i=0;i<kferm;i++)
477 R1[i]=(Complex)R1_f[i];
478 free(R1_f);
479 memcpy(Phi+na*kferm,R1, kferm*sizeof(Complex));
480 //Up/down partitioning (using only pseudofermions of flavour 1)
481#endif
482 UpDownPart(na, X0, R1);
483 }
484 //Heatbath
485 //========
486 //We're going to make the most of the new Gauss_d routine to send a flattened array
487 //and do this all in one step.
488#ifdef __NVCC__
489 cudaMemcpyAsync(u11t, u11, ndim*kvol*sizeof(Complex),cudaMemcpyHostToDevice,streams[1]);
490 cudaMemcpyAsync(u12t, u12, ndim*kvol*sizeof(Complex),cudaMemcpyHostToDevice,streams[2]);
491#else
492 memcpy(u11t, u11, ndim*kvol*sizeof(Complex));
493 memcpy(u12t, u12, ndim*kvol*sizeof(Complex));
494#endif
495#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
496 Gauss_d(pp, kmom, 0, 1);
497#else
498 vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, kmom, pp, 0, 1);
499#endif
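 //Momentum heatbath: every conjugate momentum component is drawn from a unit Gaussian, giving the
 //kinetic term of one half the sum of pp squared in the Hamiltonian.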
500 //Initialise Trial Fields
501#ifdef __NVCC__
502 cudaMemPrefetchAsync(pp,kmom*sizeof(double),device,streams[1]);
503 cudaMemcpy(u11t, u11, ndim*kvol*sizeof(Complex),cudaMemcpyDefault);
504 cudaMemcpy(u12t, u12, ndim*kvol*sizeof(Complex),cudaMemcpyDefault);
505#else
506 memcpy(u11t, u11, ndim*kvol*sizeof(Complex));
507 memcpy(u12t, u12, ndim*kvol*sizeof(Complex));
508#endif
509 Trial_Exchange(u11t,u12t,u11t_f,u12t_f);
510 double H0, S0;
511 Hamilton(&H0, &S0, rescga,pp,X0,X1,Phi,u11t,u12t,u11t_f,u12t_f,iu,id,gamval_f,gamin,\
512 dk4m_f,dk4p_f,jqq,akappa,beta,&ancgh,itraj);
513#ifdef _DEBUG
514 if(!rank) printf("H0: %e S0: %e\n", H0, S0);
515#endif
516 if(itraj==1)
517 action = S0/gvol;
518
519 //Integration
520 //TODO: Have this as a runtime parameter.
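 //Exactly one of the compile-time flags INT_LPFR (leapfrog), INT_OMF2 (second-order Omelyan) or
 //INT_OMF4 (fourth-order Omelyan) must be defined, e.g. by passing -DINT_OMF2 to the compiler if the
 //build system does not already set one.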
521#if (defined INT_LPFR && defined INT_OMF2) ||(defined INT_LPFR && defined INT_OMF4)||(defined INT_OMF2 && defined INT_OMF4)
522#error "Only one integrator may be defined"
523#elif defined INT_LPFR
524 Leapfrog(u11t, u12t, u11t_f, u12t_f, X0, X1, Phi, dk4m, dk4p, dk4m_f, dk4p_f, dSdpi, pp,iu, id, gamval,
525 gamval_f, gamin, jqq, beta,akappa,stepl,dt,&ancg,&itot,proby);
526#elif defined INT_OMF2
527 OMF2(u11t, u12t, u11t_f, u12t_f, X0, X1, Phi, dk4m, dk4p, dk4m_f, dk4p_f, dSdpi, pp,iu, id, gamval,
528 gamval_f, gamin, jqq, beta,akappa,stepl,dt,&ancg,&itot,proby);
529#elif defined INT_OMF4
530 OMF4(u11t, u12t, u11t_f, u12t_f, X0, X1, Phi, dk4m, dk4p, dk4m_f, dk4p_f, dSdpi, pp,iu, id, gamval,
531 gamval_f, gamin, jqq, beta,akappa,stepl,dt,&ancg,&itot,proby);
532#else
533#error "No integrator defined. Please define one of INT_LPFR, INT_OMF2 or INT_OMF4"
534#endif
535
536 totancg+=ancg;
537 //Monte Carlo step: Accept the new fields with probability min(1,exp(H0-H1))
538 //Kernel Call needed here?
539 Reunitarise(u11t,u12t);
540 double H1, S1;
541 Hamilton(&H1, &S1, rescga,pp,X0,X1,Phi,u11t,u12t,u11t_f,u12t_f,iu,id,gamval_f,gamin,\
542 dk4m_f,dk4p_f,jqq,akappa,beta,&ancgh,itraj);
543 ancgh/=2.0; //Hamilton is called at start and end of trajectory
544 totancgh+=ancgh;
545#ifdef _DEBUG
546 printf("H0-H1=%f-%f",H0,H1);
547#endif
548 double dH = H0 - H1;
549#ifdef _DEBUG
550 printf("=%f\n",dH);
551#endif
552 double dS = S0 - S1;
553 if(!rank){
554 fprintf(output, "dH = %e dS = %e\n", dH, dS);
555#ifdef _DEBUG
556 printf("dH = %e dS = %e\n", dH, dS);
557#endif
558 }
559 e_dH+=dH; e_dH_e+=dH*dH;
560 double y = exp(dH);
561 yav+=y;
562 yyav+=y*y;
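 //Since dH=H0-H1, the test below accepts with probability min(1,exp(dH)). For a reversible,
 //area-preserving integrator <exp(dH)> should average to one, so yav and yyav give a useful
 //consistency check on the integration.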
563 //The Monte-Carlo step
564 //Always accept if dH is positive (we've gone from higher to lower energy)
565 bool acc;
566 if(dH>0 || Par_granf()<=y){
567 //Step is accepted. Set s=st
568 if(!rank)
569 printf("New configuration accepted on trajectory %i.\n", itraj);
570 //Original FORTRAN Comment:
571 //JIS 20100525: write config here to preempt troubles during measurement!
572 //JIS 20100525: remove when all is ok....
573 memcpy(u11,u11t,ndim*kvol*sizeof(Complex));
574 memcpy(u12,u12t,ndim*kvol*sizeof(Complex));
575 naccp++;
576 //Divide by gvol since we've summed over all lattice sites
577 action=S1/gvol;
578 acc=true;
579 }
580 else{
581 if(!rank)
582 printf("New configuration rejected on trajectory %i.\n", itraj);
583 acc=false;
584 }
585 actiona+=action;
586 double vel2=0.0;
587#ifdef __NVCC__
588 cublasDnrm2(cublas_handle,kmom, pp, 1,&vel2);
589 vel2*=vel2;
590#elif defined USE_BLAS
591 vel2 = cblas_dnrm2(kmom, pp, 1);
592 vel2*=vel2;
593#else
594#pragma unroll
595 for(int i=0; i<kmom; i++)
596 vel2+=pp[i]*pp[i];
597#endif
598#if(nproc>1)
599 Par_dsum(&vel2);
600#endif
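 //vel2 is now the sum of pp^2 over every momentum component, so dividing by ndim*nadj*gvol gives the
 //mean square velocity per degree of freedom, which should sit near one by equipartition.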
601 vel2a+=vel2/(ndim*nadj*gvol);
602
603 if(itraj%iprint==0){
604 //If rejected, copy the previously accepted field in for measurements
605 if(!acc){
606#ifdef __NVCC__
607 cudaMemcpyAsync(u11t, u11, ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[0]);
608 cudaMemcpyAsync(u12t, u12, ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[1]);
609#else
610 memcpy(u11t, u11, ndim*kvol*sizeof(Complex));
611 memcpy(u12t, u12, ndim*kvol*sizeof(Complex));
612#endif
613 Trial_Exchange(u11t,u12t,u11t_f,u12t_f);
614 }
615#ifdef _DEBUG
616 if(!rank)
617 printf("Starting measurements\n");
618#endif
619 int itercg=0;
620 double endenf, denf;
621 Complex qbqb;
622 //Stop gap for measurement failure on Kay;
623 //If the Congrad in Measure fails, don't measure the Diquark or PBP-Density observables for
624 //that trajectory
625 int measure_check=0;
626 measure_check = Measure(&pbp,&endenf,&denf,&qq,&qbqb,respbp,&itercg,u11t,u12t,u11t_f,u12t_f,iu,id,\
627 gamval,gamval_f,gamin,dk4m,dk4p,dk4m_f,dk4p_f,jqq,akappa,Phi,R1);
628#ifdef _DEBUG
629 if(!rank)
630 printf("Finished measurements\n");
631#endif
632 pbpa+=pbp; endenfa+=endenf; denfa+=denf; ipbp++;
633 Average_Plaquette(&hg,&avplaqs,&avplaqt,u11t_f,u12t_f,iu,beta);
634 poly = Polyakov(u11t_f,u12t_f);
635 //We have four output files, so may as well get the other ranks to help out
636 //and abuse scoping rules while we're at it.
637 //Can use either OpenMP or MPI to do this
638#if (nproc>=4)
639 switch(rank)
640#else
641 if(!rank)
642#pragma omp parallel for
643 for(int i=0; i<4; i++)
644 switch(i)
645#endif
646 {
647 case(0):
648 //Output code... Some files weren't opened in the main loop of the FORTRAN code.
649 //That will need to be looked into for the C version.
650 //It would explain the weird names like fort.1X that looked like they were somehow
651 //FORTRAN related...
652 fprintf(output, "Measure (CG) %i Update (CG) %.3f Hamiltonian (CG) %.3f\n", itercg, ancg, ancgh);
653 fflush(output);
654 break;
655 case(1):
656 {
657 FILE *fortout;
658 char fortname[FILELEN] = "fermi.";
659 strcat(fortname,suffix);
660 const char *fortop= (itraj==1) ? "w" : "a";
661 if(!(fortout=fopen(fortname, fortop) )){
662 fprintf(stderr, "Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",\
663 OPENERROR, funcname, fortname, fortop);
664#if(nproc>1)
665 MPI_Abort(comm,OPENERROR);
666#else
667 exit(OPENERROR);
668#endif
669 }
670 if(itraj==1)
671 fprintf(fortout, "pbp\tendenf\tdenf\n");
672 if(measure_check)
673 fprintf(fortout, "%e\t%e\t%e\n", NAN, NAN, NAN);
674 else
675 fprintf(fortout, "%e\t%e\t%e\n", pbp, endenf, denf);
676 fclose(fortout);
677 break;
678 }
679 case(2):
680 //The original code implicitly created these files with the name
681 //fort.XX where XX is the file label
682 //from FORTRAN. This was fort.12
683 {
684 FILE *fortout;
685 char fortname[FILELEN] = "bose.";
686 strcat(fortname,suffix);
687 const char *fortop= (itraj==1) ? "w" : "a";
688 if(!(fortout=fopen(fortname, fortop) )){
689 fprintf(stderr, "Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",\
690 OPENERROR, funcname, fortname, fortop);
#if(nproc>1)
 MPI_Abort(comm,OPENERROR);
#else
 exit(OPENERROR);
#endif
691 }
692 if(itraj==1)
693 fprintf(fortout, "avplaqs\tavplaqt\tpoly\n");
694 fprintf(fortout, "%e\t%e\t%e\n", avplaqs, avplaqt, poly);
695 fclose(fortout);
696 break;
697 }
698 case(3):
699 {
700 FILE *fortout;
701 char fortname[FILELEN] = "diq.";
702 strcat(fortname,suffix);
703 const char *fortop= (itraj==1) ? "w" : "a";
704 if(!(fortout=fopen(fortname, fortop) )){
705 fprintf(stderr, "Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",\
706 OPENERROR, funcname, fortname, fortop);
707#if(nproc>1)
708 MPI_Abort(comm,OPENERROR);
709#else
710 exit(OPENERROR);
711#endif
712 }
713 if(itraj==1)
714 fprintf(fortout, "Re(qq)\n");
715 if(measure_check)
716 fprintf(fortout, "%e\n", NAN);
717 else
718 fprintf(fortout, "%e\n", creal(qq));
719 fclose(fortout);
720 break;
721 }
722 default: break;
723 }
724 }
725 if(itraj%icheck==0){
726 Par_swrite(itraj,icheck,beta,fmu,akappa,ajq,u11,u12);
727 }
728 if(!rank)
729 fflush(output);
730 }
731#if (defined SA3AT)
732 double elapsed = 0;
733 if(!rank){
734#if(nproc>1)
735 elapsed = MPI_Wtime()-start_time;
736#else
737 elapsed = omp_get_wtime()-start_time;
738#endif
739 }
740#endif
741 //End of main loop
742 //Free arrays
743#ifdef __NVCC__
744 //Make a routine that does this for us
745 cudaFree(dk4m); cudaFree(dk4p); cudaFree(R1); cudaFree(dSdpi); cudaFree(pp);
746 cudaFree(Phi); cudaFree(u11t); cudaFree(u12t);
747 cudaFree(X0); cudaFree(X1); cudaFree(u11); cudaFree(u12);
748 cudaFree(id); cudaFree(iu);
749 cudaFree(dk4m_f); cudaFree(dk4p_f); cudaFree(u11t_f); cudaFree(u12t_f);
750 cudaFree(gamin); cudaFree(gamval); cudaFree(gamval_f);
751 cublasDestroy(cublas_handle);
752#else
753 free(dk4m); free(dk4p); free(R1); free(dSdpi); free(pp);
754 free(Phi); free(u11t); free(u12t);
755 free(X0); free(X1); free(u11); free(u12);
756 free(id); free(iu);
757 free(dk4m_f); free(dk4p_f); free(u11t_f); free(u12t_f);
758 free(gamin); free(gamval); free(gamval_f);
759#endif
760 free(hd); free(hu);free(h1u); free(h1d); free(halosize); free(pcoord);
761#ifdef __RANLUX__
762 gsl_rng_free(ranlux_instd);
763#elif (defined __INTEL_MKL__ &&!defined USE_RAN2)
764 vslDeleteStream(&stream);
765#endif
766#if (defined SA3AT)
767 if(!rank){
768 FILE *sa3at = fopen("Bench_times.csv", "a");
769#ifdef __NVCC__
770 char version[256];
771 int cuversion; cudaRuntimeGetVersion(&cuversion);
772 sprintf(version,"CUDA %d\tBlock: (%d,%d,%d)\tGrid: (%d,%d,%d)\n%s\n",cuversion,\
773 dimBlock.x,dimBlock.y,dimBlock.z,dimGrid.x,dimGrid.y,dimGrid.z,__VERSION__);
774#else
775 char *version=__VERSION__;
776#endif
777 fprintf(sa3at, "%s\nβ%0.3f κ:%0.4f μ:%0.4f j:%0.3f s:%i t:%i kvol:%ld\n"
778 "npx:%i npt:%i nthread:%i ncore:%i time:%f traj_time:%f\n\n",\
779 version,beta,akappa,fmu,ajq,nx,nt,kvol,npx,npt,nthreads,npx*npy*npz*npt*nthreads,elapsed,elapsed/ntraj);
780 fclose(sa3at);
781 }
782#endif
783 //Get averages for final output
784 actiona/=ntraj; vel2a/=ntraj; pbpa/=ipbp; endenfa/=ipbp; denfa/=ipbp;
785 totancg/=ntraj; totancgh/=ntraj;
786 e_dH/=ntraj; e_dH_e=sqrt((e_dH_e/ntraj-e_dH*e_dH)/(ntraj-1));
787 yav/=ntraj; yyav=sqrt((yyav/ntraj - yav*yav)/(ntraj-1));
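 //traj_cost estimates the conjugate-gradient work per unit of molecular-dynamics time, while atraj is
 //the average trajectory length actually generated (itot counts the total number of MD steps taken).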
788 float traj_cost=totancg/dt;
789 double atraj=dt*itot/ntraj;
790
791 if(!rank){
792 fprintf(output, "Averages for the last %i trajectories\n"\
793 "Number of acceptances: %i\tAverage Trajectory Length = %e\n"\
794 "<dH>=%e+/-%e\t<exp(dH)>=%e+/-%e\tTrajectory cost=N_cg/dt =%e\n"\
795 "Average number of congrad iter guidance: %.3f acceptance %.3f\n"\
796 "psibarpsi = %e\n"\
797 "Mean Square Velocity = %e\tAction Per Site = %e\n"\
798 "Energy Density = %e\tNumber Density %e\n\n\n",\
799 ntraj, naccp, atraj, e_dH,e_dH_e, yav, yyav, traj_cost, totancg, totancgh, pbpa, vel2a, actiona, endenfa, denfa);
800 fclose(output);
801 }
802#if(nproc>1)
803 //Ensure writing is done before finalising just in case finalise segfaults and crashes the other ranks mid-write
804 MPI_Barrier(comm);
805 MPI_Finalise();
806#endif
807 fflush(stdout);
808 return 0;
809}