su2hmc
main.c
43#include <assert.h>
44#include <coord.h>
45#include <math.h>
46#include <matrices.h>
47#include <par_mpi.h>
48#include <random.h>
49#include <string.h>
50#include <su2hmc.h>
51#ifdef __NVCC__
52#include <cublas_v2.h>
53#include <cuda.h>
54#include <cuda_runtime.h>
55cublasHandle_t cublas_handle;
56cublasStatus_t cublas_status;
57cudaMemPool_t mempool;
58//Fix this later
59#endif
79int main(int argc, char *argv[]){
80 //Store the function name rather than hard-coding it, so error messages are easier to write
81 const char *funcname = "main";
82
83 Par_begin(argc, argv);
84 //Add error catching code...
85#if(nproc>1)
86 MPI_Comm_rank(comm, &rank);
87 MPI_Comm_size(comm, &size);
88#endif
89
113 float beta = 1.7f;
114 float akappa = 0.1780f;
115#ifdef __NVCC__
116 __managed__
117#endif
118 Complex_f jqq = 0;
119 float fmu = 0.0f; int iread = 0; int istart = 1;
120 int iprint = 1; //How often measurements are made
121 int icheck = 5; //How often configurations are saved
122 int ibound = -1;
123#ifdef USE_MATH_DEFINES
124 const double tpi = 2*M_PI;
125#else
126 const double tpi = 2*acos(-1.0);
127#endif
128 float dt=0.004; float ajq = 0.0; float delb=0; //Not used?
129 float athq = 0.0; int stepl = 250; int ntraj = 10;
130 //Rank zero is the "master process"
131 if(!rank){
132 FILE *midout;
133 const char *filename = (argc!=2) ?"midout":argv[1];
134 char *fileop = "r";
135 if( !(midout = fopen(filename, fileop) ) ){
136 fprintf(stderr, "Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",\
137 OPENERROR, funcname, filename, fileop);
138#if(nproc>1)
139 MPI_Abort(comm,OPENERROR);
140#else
141 exit(OPENERROR);
142#endif
143 }
144 //See the README for what each entry means
145 fscanf(midout, "%f %f %f %f %f %f %f %d %d %d %d %d", &dt, &beta, &akappa,\
146 &ajq, &athq, &fmu, &delb, &stepl, &ntraj, &istart, &icheck, &iread);
147 fclose(midout);
148 assert(stepl>0); assert(ntraj>0); assert(istart>=0); assert(icheck>0); assert(iread>=0);
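 /* A hypothetical midout file matching the fscanf format above and the compiled-in
 * defaults (order: dt beta akappa ajq athq fmu delb stepl ntraj istart icheck iread):
 *
 * 0.004 1.7 0.178 0.0 0.0 0.0 0.0 250 10 1 5 0
 */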
149 }
150 //Send inputs to other ranks
151#if(nproc>1)
152 Par_fcopy(&dt); Par_fcopy(&beta); Par_fcopy(&akappa); Par_fcopy(&ajq);
153 Par_fcopy(&athq); Par_fcopy(&fmu); Par_fcopy(&delb); //Not used?
154 Par_icopy(&stepl); Par_icopy(&ntraj); Par_icopy(&istart); Par_icopy(&icheck);
155 Par_icopy(&iread);
156#endif
157 jqq=ajq*cexp(athq*I);
158 //End of input
159#ifdef __NVCC__
160 //CUBLAS Handle
161 cublasCreate(&cublas_handle);
162 //Set up grid and blocks
163 blockInit(nx, ny, nz, nt, &dimBlock, &dimGrid);
164 //CUDA device
165 int device=-1;
166 cudaGetDevice(&device);
167 //For asynchronous memory allocations: when CUDA synchronises, any unused memory in the pool is released back to the OS
168 //unless a release threshold is set. We base our threshold on the size of Congradq's work arrays
169 cudaDeviceGetDefaultMemPool(&mempool, device);
170 cuuint64_t threshold=2*kferm2*sizeof(Complex_f); //cudaMemPoolAttrReleaseThreshold expects a 64-bit value
171 cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
172#endif
173#ifdef _DEBUG
174 printf("jqq=%f+(%f)I\n",creal(jqq),cimag(jqq));
175#endif
176#ifdef _DEBUG
177 seed = 967580161;
178#else
179 seed = time(NULL);
180#endif
181
182 //Gauge, trial and momentum fields
183 //You'll notice that there are two different allocation/free paths:
184 //one for CUDA and one for everything else, depending on what's
185 //being used
186 /*** Let's take a quick moment to compare this to the analysis code.
187 * The analysis code stores the gauge field as a 4 component real valued vector, whereas the production code
188 * used two complex numbers.
189 *
190 * Analysis code: u=(Re(u[0]),Im(u[1]),Re(u[1]),Im(u[0]))
191 * Production code: u[0]=u[0]+I*u[3] u[1]=u[2]+I*u[1]
192 *
193 */
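 /* Illustrative only (not in the original source): the mapping above written out for a single
 * link, assuming a real 4-vector a[4] stored in the analysis-code ordering:
 *
 *    Complex u0 = a[0] + I*a[3];   // production u[0]
 *    Complex u1 = a[2] + I*a[1];   // production u[1]
 *    // and conversely a[] = {creal(u0), cimag(u1), creal(u1), cimag(u0)}
 */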
194 Complex *u[2], *ut[2];
195 Complex_f *ut_f[2];
196 double *dk[2], *pp;
197 float *dk_f[2];
198 //Halo index arrays
199 unsigned int *iu, *id;
200#ifdef __NVCC__
201 cudaMallocManaged((void**)&iu,ndim*kvol*sizeof(int),cudaMemAttachGlobal);
202 cudaMallocManaged((void**)&id,ndim*kvol*sizeof(int),cudaMemAttachGlobal);
203
204 cudaMallocManaged(&dk[0],(kvol+halo)*sizeof(double),cudaMemAttachGlobal);
205 cudaMallocManaged(&dk[1],(kvol+halo)*sizeof(double),cudaMemAttachGlobal);
206#ifdef _DEBUG
207 cudaMallocManaged(&dk_f[0],(kvol+halo)*sizeof(float),cudaMemAttachGlobal);
208 cudaMallocManaged(&dk_f[1],(kvol+halo)*sizeof(float),cudaMemAttachGlobal);
209#else
210 cudaMalloc(&dk_f[0],(kvol+halo)*sizeof(float));
211 cudaMalloc(&dk_f[1],(kvol+halo)*sizeof(float));
212#endif
213
214 int *gamin;
215 Complex *gamval;
216 Complex_f *gamval_f;
217 cudaMallocManaged(&gamin,4*4*sizeof(int),cudaMemAttachGlobal);
218 cudaMallocManaged(&gamval,5*4*sizeof(Complex),cudaMemAttachGlobal);
219#ifdef _DEBUG
220 cudaMallocManaged(&gamval_f,5*4*sizeof(Complex_f),cudaMemAttachGlobal);
221#else
222 cudaMalloc(&gamval_f,5*4*sizeof(Complex_f));
223#endif
224 cudaMallocManaged(&u[0],ndim*kvol*sizeof(Complex),cudaMemAttachGlobal);
225 cudaMallocManaged(&u[1],ndim*kvol*sizeof(Complex),cudaMemAttachGlobal);
226 cudaMallocManaged(&ut[0],ndim*(kvol+halo)*sizeof(Complex),cudaMemAttachGlobal);
227 cudaMallocManaged(&ut[1],ndim*(kvol+halo)*sizeof(Complex),cudaMemAttachGlobal);
228#ifdef _DEBUG
229 cudaMallocManaged(&ut_f[0],ndim*(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
230 cudaMallocManaged(&ut_f[1],ndim*(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
231#else
232 cudaMalloc(&ut_f[0],ndim*(kvol+halo)*sizeof(Complex_f));
233 cudaMalloc(&ut_f[1],ndim*(kvol+halo)*sizeof(Complex_f));
234#endif
235#else
236 id = (unsigned int*)aligned_alloc(AVX,ndim*kvol*sizeof(int));
237 iu = (unsigned int*)aligned_alloc(AVX,ndim*kvol*sizeof(int));
238
239 int *gamin = (int *)aligned_alloc(AVX,4*4*sizeof(int));
240 Complex *gamval=(Complex *)aligned_alloc(AVX,5*4*sizeof(Complex));
241 Complex_f *gamval_f=(Complex_f *)aligned_alloc(AVX,5*4*sizeof(Complex_f));
242
243 dk[0] = (double *)aligned_alloc(AVX,(kvol+halo)*sizeof(double));
244 dk[1] = (double *)aligned_alloc(AVX,(kvol+halo)*sizeof(double));
245 dk_f[0] = (float *)aligned_alloc(AVX,(kvol+halo)*sizeof(float));
246 dk_f[1] = (float *)aligned_alloc(AVX,(kvol+halo)*sizeof(float));
247
248 u[0] = (Complex *)aligned_alloc(AVX,ndim*kvol*sizeof(Complex));
249 u[1] = (Complex *)aligned_alloc(AVX,ndim*kvol*sizeof(Complex));
250 ut[0] = (Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
251 ut[1] = (Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
252 ut_f[0] = (Complex_f *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex_f));
253 ut_f[1] = (Complex_f *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex_f));
254#endif
268 Init(istart,ibound,iread,beta,fmu,akappa,ajq,u,ut,ut_f,gamval,gamval_f,gamin,dk,dk_f,iu,id);
269#ifdef __NVCC__
270 //GPU Initialisation stuff
271 Init_CUDA(ut[0],ut[1],gamval,gamval_f,gamin,dk[0],dk[1],iu,id);//&dimBlock,&dimGrid);
272#endif
273 //Send trials to accelerator for reunitarisation
274 Reunitarise(ut);
275 //Get trials back
276 memcpy(u[0], ut[0], ndim*kvol*sizeof(Complex));
277 memcpy(u[1], ut[1], ndim*kvol*sizeof(Complex));
278#ifdef __NVCC__
279 //Flip all the gauge fields around so memory is coalesced
280 Transpose_z(ut[0],ndim,kvol);
281 Transpose_z(ut[1],ndim,kvol);
282 //And the index arrays
283 Transpose_U(iu,ndim,kvol);
284 Transpose_U(id,ndim,kvol);
285 cudaDeviceSynchronise();
286#endif
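 /* Sketch of the layout change assumed here (see the Transpose_* kernels for the real thing):
 * an AoS->SoA transpose takes in[site*ndim+mu] to out[mu*kvol+site], so that for a fixed
 * direction mu consecutive threads read consecutive sites:
 *
 *    for(int mu=0; mu<ndim; mu++)
 *       for(int site=0; site<kvol; site++)
 *          out[mu*kvol+site] = in[site*ndim+mu];
 */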
287#ifdef DIAGNOSTIC
288 double ancg_diag=0;
289 Diagnostics(istart, u[0], u[1], ut[0], ut[1], ut_f[0], ut_f[1], iu, id, hu, hd, dk[0], dk[1],\
290 dk_f[0], dk_f[1], gamin, gamval, gamval_f, jqq, akappa, beta, ancg_diag);
291#endif
292
293 //Initial Measurements
294 //====================
295 Trial_Exchange(ut,ut_f);
296 double poly = Polyakov(ut_f);
297#ifdef _DEBUG
298 if(!rank) printf("Initial Polyakov loop evaluated as %e\n", poly);
299#endif
300 double hg, avplaqs, avplaqt;
301 //Halo exchange of the trial fields
302 Average_Plaquette(&hg,&avplaqs,&avplaqt,ut_f,iu,beta);
303 //Trajectory length
304 double traj=stepl*dt;
305 //Per-step probability of ending the trajectory early (randomises the trajectory length in the integrators)
306 double proby = 2.5/stepl;
307 char suffix[FILELEN]="";
308 int buffer; char buff2[7];
309 //Add script for extracting correct mu, j etc.
310 buffer = (int)round(100*beta);
311 sprintf(buff2,"b%03d",buffer);
312 strcat(suffix,buff2);
313 //κ
314 buffer = (int)round(10000*akappa);
315 sprintf(buff2,"k%04d",buffer);
316 strcat(suffix,buff2);
317 //μ
318 buffer = (int)round(1000*fmu);
319 sprintf(buff2,"mu%04d",buffer);
320 strcat(suffix,buff2);
321 //J
322 buffer = (int)round(1000*ajq);
323 sprintf(buff2,"j%03d",buffer);
324 strcat(suffix,buff2);
325 //nx
326 sprintf(buff2,"s%02d",nx);
327 strcat(suffix,buff2);
328 //nt
329 sprintf(buff2,"t%02d",nt);
330 strcat(suffix,buff2);
331 char outname[FILELEN] = "Output."; char *outop="a";
332 strcat(outname,suffix);
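 /* Example (hypothetical lattice extents): with the defaults above (β=1.7, κ=0.178, μ=0, j=0)
 * and nx=12, nt=16, the suffix is "b170k1780mu0000j000s12t16", so measurements append to
 * "Output.b170k1780mu0000j000s12t16". */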
333 FILE *output;
334 if(!rank){
335 if(!(output=fopen(outname, outop) )){
336 fprintf(stderr,"Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",OPENERROR,funcname,outname,outop);
337#if(nproc>1)
338 MPI_Abort(comm,OPENERROR);
339#else
340 exit(OPENERROR);
341#endif
342 }
343 printf("hg = %e, <Ps> = %e, <Pt> = %e, <Poly> = %e\n", hg, avplaqs, avplaqt, poly);
344 fprintf(output, "ksize = %i ksizet = %i Nf = %i Halo = %i\nTime step dt = %e Trajectory length = %e\n"\
345 "No. of Trajectories = %i β = %e\nκ = %e μ = %e\nDiquark source = %e Diquark phase angle = %e\n"\
346 "Stopping Residuals: Guidance: %e Acceptance: %e, Estimator: %e\nSeed = %ld\n",
347 ksize, ksizet, nf, halo, dt, traj, ntraj, beta, akappa, fmu, ajq, athq, rescgg, rescga, respbp, seed);
348#ifdef _DEBUG
349 //Print to terminal during debugging
350 printf("ksize = %i ksizet = %i Nf = %i Halo = %i\nTime step dt = %e Trajectory length = %e\n"\
351 "No. of Trajectories = %i β = %e\nκ = %e μ = %e\nDiquark source = %e Diquark phase angle = %e\n"\
352 "Stopping Residuals: Guidance: %e Acceptance: %e, Estimator: %e\nSeed = %ld\n",
353 ksize, ksizet, nf, halo, dt, traj, ntraj, beta, akappa, fmu, ajq, athq, rescgg, rescga, respbp, seed);
354#endif
355 }
356 //Initialise for averages
357 //======================
358 double actiona = 0.0; double vel2a = 0.0; double pbpa = 0.0; double endenfa = 0.0; double denfa = 0.0;
359 //Expected change in Hamiltonian
360 double e_dH=0; double e_dH_e=0;
361 //Expected Metropolis acceptance probability. Skewed by cases where the Hamiltonian decreases.
362 double yav = 0.0; double yyav = 0.0;
363
364 int naccp = 0; int ipbp = 0; int itot = 0;
365
366 //Start of classical evolution
367 //===========================
368 double pbp;
369 Complex qq;
370 double *dSdpi;
371 //Field and related declarations
372 Complex *Phi, *R1, *X0, *X1;
373 //Initialise Arrays. Leaving it late for scoping
374 //check the sizes in sizes.h
375#ifdef __NVCC__
376 cudaMallocManaged(&R1, kfermHalo*sizeof(Complex),cudaMemAttachGlobal);
377 cudaMalloc(&Phi, nf*kferm*sizeof(Complex));
378#ifdef _DEBUG
379 cudaMallocManaged(&X0, nf*kferm2*sizeof(Complex),cudaMemAttachGlobal);
380 cudaMallocManaged(&dSdpi, kmom*sizeof(double),cudaMemAttachGlobal);
381#else
382 cudaMalloc(&X0, nf*kferm2*sizeof(Complex));
383 cudaMalloc(&dSdpi, kmom*sizeof(double));
384#endif
385
386 cudaMallocManaged(&X1, kferm2Halo*sizeof(Complex),cudaMemAttachGlobal);
387 cudaMallocManaged(&pp, kmom*sizeof(double),cudaMemAttachGlobal);
388 cudaDeviceSynchronise();
389#else
390 R1= aligned_alloc(AVX,kfermHalo*sizeof(Complex));
391 Phi= aligned_alloc(AVX,nf*kferm*sizeof(Complex));
392 X0= aligned_alloc(AVX,nf*kferm2*sizeof(Complex));
393 X1= aligned_alloc(AVX,kferm2Halo*sizeof(Complex));
394 dSdpi = aligned_alloc(AVX,kmom*sizeof(double));
395 //pp is the momentum field
396 pp = aligned_alloc(AVX,kmom*sizeof(double));
397#endif
403#if (defined SA3AT)
404 double start_time=0;
405 if(!rank){
406#if(nproc>1)
407 start_time = MPI_Wtime();
408#else
409 start_time = omp_get_wtime();
410#endif
411 }
412#endif
413 double action;
414 //Conjugate Gradient iteration counters
415 double ancg,ancgh,totancg=0,totancgh=0;
416 for(int itraj = iread+1; itraj <= ntraj+iread; itraj++){
417 //Reset conjugate gradient averages
418 ancg = 0; ancgh = 0;
419#ifdef _DEBUG
420 if(!rank)
421 printf("Starting itraj %i\n", itraj);
422#endif
423 for(int na=0; na<nf; na++){
424 //Probably makes sense to declare this outside the loop
425 //but I do like scoping/don't want to break anything else just yet
426 //
427 //How do we optimise this for use in CUDA? Do we use CUDA's PRNG
428 //or stick with MKL and synchronise/copy over the array
429#ifdef __NVCC__
430 Complex_f *R1_f,*R;
431 cudaMallocManaged(&R,kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
432#ifdef _DEBUG
433 cudaMallocManaged(&R1_f,kferm*sizeof(Complex_f),cudaMemAttachGlobal);
434 cudaMemset(R1_f,0,kferm*sizeof(Complex_f));
435#else
436 cudaMallocAsync(&R1_f,kferm*sizeof(Complex_f),streams[0]);
437 cudaMemsetAsync(R1_f,0,kferm*sizeof(Complex_f),streams[0]);
438#endif
439#else
440 Complex_f *R1_f=aligned_alloc(AVX,kferm*sizeof(Complex_f));
441 Complex_f *R=aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
442 memset(R1_f,0,kferm*sizeof(Complex_f));
443#endif
444 //The FORTRAN code had two Gaussian routines.
445 //gaussp was the normal Box-Muller and gauss0 didn't have 2 inside the square root
446 //Using σ=1/sqrt(2) in these routines has the same effect as gauss0
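 //(Box-Muller gives x=σ*sqrt(-2*log(r1))*cos(2π*r2), so σ=1/sqrt(2) reduces it to sqrt(-log(r1))*cos(2π*r2), i.e. gauss0)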
447#if (defined __NVCC__ && defined _DEBUG)
448 cudaMemPrefetchAsync(R1_f,kferm*sizeof(Complex_f),device,streams[1]);
449#endif
450#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
451 Gauss_c(R, kferm, 0, 1/sqrt(2));
452#else
453 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, (float *)R, 0, 1/sqrt(2));
454#endif
455#ifdef __NVCC__
456 cudaMemPrefetchAsync(R,kfermHalo*sizeof(Complex_f),device,NULL);
457 //Transpose needed here for Dslashd
458 Transpose_c(R,ngorkov*nc,kvol);
459 //R is random so this technically isn't required. But it does keep the code output consistent with previous
460 //versions.
461 cudaDeviceSynchronise();
462#endif
463 Dslashd_f(R1_f,R,ut_f[0],ut_f[1],iu,id,gamval_f,gamin,dk_f,jqq,akappa);
464#ifdef __NVCC__
465 //Make sure the multiplication is finished before freeing its input!!
466 cudaFree(R);//cudaDeviceSynchronise();
467 //cudaFree is blocking so don't need to synchronise
468 cuComplex_convert(R1_f,R1,kferm,false,dimBlock,dimGrid);
469#ifdef _DEBUG
470 cudaFree(R1_f);
471#else
472 cudaFreeAsync(R1_f,NULL);
473#endif
474 cudaMemcpyAsync(Phi+na*kferm,R1, kferm*sizeof(Complex),cudaMemcpyDefault,NULL);
475 //cudaMemcpyAsync(Phi+na*kferm,R1, kferm*sizeof(Complex),cudaMemcpyDefault,streams[1]);
476 cudaDeviceSynchronise();
477#else
478 free(R);
479#pragma omp simd aligned(R1_f,R1:AVX)
480 for(int i=0;i<kferm;i++)
481 R1[i]=(Complex)R1_f[i];
482 free(R1_f);
483 memcpy(Phi+na*kferm,R1, kferm*sizeof(Complex));
484 //Up/down partitioning (using only pseudofermions of flavour 1)
485#endif
486 UpDownPart(na, X0, R1);
487 }
488 //Heatbath
489 //========
490 //We're going to make the most of the new Gauss_d routine to send a flattened array
491 //and do this all in one step.
492#ifdef __NVCC__
493 cudaMemcpyAsync(ut[0], u[0], ndim*kvol*sizeof(Complex),cudaMemcpyHostToDevice,NULL);
494 cudaMemcpyAsync(ut[1], u[1], ndim*kvol*sizeof(Complex),cudaMemcpyHostToDevice,NULL);
495 //Convert to SoA
496 Transpose_z(ut[0],ndim,kvol);
497 Transpose_z(ut[1],ndim,kvol);
498#else
499 memcpy(ut[0], u[0], ndim*kvol*sizeof(Complex));
500 memcpy(ut[1], u[1], ndim*kvol*sizeof(Complex));
501#endif
502 Trial_Exchange(ut,ut_f);
503#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
504 Gauss_d(pp, kmom, 0, 1);
505#else
506 vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, kmom, pp, 0, 1);
507#endif
508 //Initialise Trial Fields
509#ifdef __NVCC__
510 //pp is random at this point so swapping the order isn't really necessary. But it does ensure that it matches for CPU and GPU
511 Transpose_d(pp,nadj*ndim,kvol);
512 cudaMemPrefetchAsync(pp,kmom*sizeof(double),device,streams[1]);
513#endif
514 double H0, S0;
515 Hamilton(&H0,&S0,rescga,pp,X0,X1,Phi,ut_f,iu,id,gamval_f,gamin,dk_f,jqq,akappa,beta,&ancgh,itraj);
516#ifdef _DEBUG
517 if(!rank) printf("H0: %e S0: %e\n", H0, S0);
518#endif
519 if(itraj==1)
520 action = S0/gvol;
521
522 //Integration
523 //TODO: Have this as a runtime parameter.
524#if (defined INT_LPFR && defined INT_OMF2) ||(defined INT_LPFR && defined INT_OMF4)||(defined INT_OMF2 && defined INT_OMF4)
525#error "Only one integrator may be defined"
526#elif defined INT_LPFR
527 Leapfrog(ut,ut_f,X0,X1,Phi,dk,dk_f,dSdpi,pp,iu,id,gamval,gamval_f,gamin,jqq,beta,akappa,stepl,dt,&ancg,&itot,proby);
528#elif defined INT_OMF2
529 OMF2(ut,ut_f,X0,X1,Phi,dk,dk_f,dSdpi,pp,iu,id,gamval,gamval_f,gamin,jqq,beta,akappa,stepl,dt,&ancg,&itot,proby);
530#elif defined INT_OMF4
531 OMF4(ut,ut_f,X0,X1,Phi,dk,dk_f,dSdpi,pp,iu,id,gamval,gamval_f,gamin,jqq,beta,akappa,stepl,dt,&ancg,&itot,proby);
532#else
533#error "No integrator defined. Please define one of INT_LPFR, INT_OMF2 or INT_OMF4"
534#endif
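 //The integrator is therefore fixed at compile time by defining exactly one of INT_LPFR, INT_OMF2 or INT_OMF4
 //(e.g. -DINT_OMF2 on the compiler command line; the exact build flag used is an assumption here).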
535
536 totancg+=ancg;
537 //Monte Carlo step: Accept the new fields with probability min(1,exp(H0-H1))
538 //Kernel Call needed here?
539 Reunitarise(ut);
540 double H1, S1;
541 Hamilton(&H1,&S1,rescga,pp,X0,X1,Phi,ut_f,iu,id,gamval_f,gamin,dk_f,jqq,akappa,beta,&ancgh,itraj);
542 ancgh/=2.0; //Hamilton is called at start and end of trajectory
543 totancgh+=ancgh;
544#ifdef _DEBUG
545 printf("H0-H1=%f-%f",H0,H1);
546#endif
547 double dH = H0 - H1;
548#ifdef _DEBUG
549 printf("=%f\n",dH);
550#endif
551 double dS = S0 - S1;
552 if(!rank){
553 fprintf(output, "dH = %e dS = %e\n", dH, dS);
554#ifdef _DEBUG
555 printf("dH = %e dS = %e\n", dH, dS);
556#endif
557 }
558 e_dH+=dH; e_dH_e+=dH*dH;
559 double y = exp(dH);
560 yav+=y;
561 yyav+=y*y;
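 //For an exact, reversible integrator <exp(dH)>=<exp(-(H1-H0))> should average to 1, so yav (with
 //an error estimate from yyav below) is a standard HMC consistency check.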
562 //The Monte-Carlo
563 //Always accept if dH is positive (gone from higher to lower energy)
564 bool acc;
565 if(dH>0 || Par_granf()<=y){
566 //Step is accepted. Set s=st
567 if(!rank)
568 printf("New configuration accepted on trajectory %i.\n", itraj);
569 //Original FORTRAN Comment:
570 //JIS 20100525: write config here to preempt troubles during measurement!
571 //JIS 20100525: remove when all is ok....
572 memcpy(u[0],ut[0],ndim*kvol*sizeof(Complex));
573 memcpy(u[1],ut[1],ndim*kvol*sizeof(Complex));
574#ifdef __NVCC__
575 Transpose_z(u[0],kvol,ndim);
576 Transpose_z(u[1],kvol,ndim);
577#endif
578 naccp++;
579 //Divide by gvol since we've summed over all lattice sites
580 action=S1/gvol;
581 acc=true;
582 }
583 else{
584 if(!rank)
585 printf("New configuration rejected on trajectory %i.\n", itraj);
586 acc=false;
587 }
588 actiona+=action;
589 double vel2=0.0;
590#ifdef __NVCC__
591 cublasDnrm2(cublas_handle,kmom, pp, 1,&vel2);
592 vel2*=vel2;
593#elif defined USE_BLAS
594 vel2 = cblas_dnrm2(kmom, pp, 1);
595 vel2*=vel2;
596#else
597#pragma unroll
598 for(int i=0; i<kmom; i++)
599 vel2+=pp[i]*pp[i];
600#endif
601#if(nproc>1)
602 Par_dsum(&vel2);
603#endif
604 vel2a+=vel2/(ndim*nadj*gvol);
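 //Since the momenta are drawn from a unit Gaussian, <p*p> per component is 1, so vel2a should fluctuate about 1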
605
606 if(itraj%iprint==0){
607 //If rejected, copy the previously accepted field in for measurements
608 if(!acc){
609#ifdef __NVCC__
610 cudaMemcpyAsync(ut[0], u[0], ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[0]);
611 cudaMemcpyAsync(ut[1], u[1], ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[1]);
612 cudaDeviceSynchronise();
613 Transpose_z(ut[0],ndim,kvol);
614 Transpose_z(ut[1],ndim,kvol);
615#else
616 memcpy(ut[0], u[0], ndim*kvol*sizeof(Complex));
617 memcpy(ut[1], u[1], ndim*kvol*sizeof(Complex));
618#endif
619 Trial_Exchange(ut,ut_f);
620 }
621#ifdef _DEBUG
622 if(!rank)
623 printf("Starting measurements\n");
624#endif
625 int itercg=0;
626 double endenf, denf;
627 Complex qbqb;
628 //Stopgap for measurement failure on Kay:
629 //If the Congrad in Measure fails, don't measure the Diquark or PBP-Density observables for
630 //that trajectory
631 int measure_check=0;
632 measure_check = Measure(&pbp,&endenf,&denf,&qq,&qbqb,respbp,&itercg,ut,ut_f,iu,id,\
633 gamval,gamval_f,gamin,dk,dk_f,jqq,akappa,Phi,R1);
634#ifdef _DEBUG
635 if(!rank)
636 printf("Finished measurements\n");
637#endif
638 pbpa+=pbp; endenfa+=endenf; denfa+=denf; ipbp++;
639 Average_Plaquette(&hg,&avplaqs,&avplaqt,ut_f,iu,beta);
640 poly = Polyakov(ut_f);
641 //We have four output files, so may as well get the other ranks to help out
642 //and abuse scoping rules while we're at it.
643 //Can use either OpenMP or MPI to do this
644#if (nproc>=4)
645 switch(rank)
646#else
647 if(!rank)
648#pragma omp parallel for
649 for(int i=0; i<4; i++)
650 switch(i)
651#endif
652 {
653 case(0):
654 //Output code... Some files weren't opened in the main loop of the FORTRAN code
655 //That will need to be looked into for the C version
656 //It would explain the weird names like fort.1X that looked like they were somehow
657 //FORTRAN related...
658 fprintf(output, "Measure (CG) %i Update (CG) %.3f Hamiltonian (CG) %.3f\n", itercg, ancg, ancgh);
659 fflush(output);
660 break;
661 case(1):
662 {
663 FILE *fortout;
664 char fortname[FILELEN] = "fermi.";
665 strcat(fortname,suffix);
666 const char *fortop= (itraj==1) ? "w" : "a";
667 if(!(fortout=fopen(fortname, fortop) )){
668 fprintf(stderr, "Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",\
669 OPENERROR, funcname, fortname, fortop);
670#if(nproc>1)
671 MPI_Abort(comm,OPENERROR);
672#else
673 exit(OPENERROR);
674#endif
675 }
676 if(itraj==1)
677 fprintf(fortout, "pbp\tendenf\tdenf\n");
678 if(measure_check)
679 fprintf(fortout, "%e\t%e\t%e\n", NAN, NAN, NAN);
680 else
681 fprintf(fortout, "%e\t%e\t%e\n", pbp, endenf, denf);
682 fclose(fortout);
683 break;
684 }
685 case(2):
686 //The original code implicitly created these files with the name
687 //fort.XX where XX is the file label
688 //from FORTRAN. This was fort.12
689 {
690 FILE *fortout;
691 char fortname[FILELEN] = "bose.";
692 strcat(fortname,suffix);
693 const char *fortop= (itraj==1) ? "w" : "a";
694 if(!(fortout=fopen(fortname, fortop) )){
695 fprintf(stderr, "Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",\
696 OPENERROR, funcname, fortname, fortop);
697 }
698 if(itraj==1)
699 fprintf(fortout, "avplaqs\tavplaqt\tpoly\n");
700 fprintf(fortout, "%e\t%e\t%e\n", avplaqs, avplaqt, poly);
701 fclose(fortout);
702 break;
703 }
704 case(3):
705 {
706 FILE *fortout;
707 char fortname[FILELEN] = "diq.";
708 strcat(fortname,suffix);
709 const char *fortop= (itraj==1) ? "w" : "a";
710 if(!(fortout=fopen(fortname, fortop) )){
711 fprintf(stderr, "Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",\
712 OPENERROR, funcname, fortname, fortop);
713#if(nproc>1)
714 MPI_Abort(comm,OPENERROR);
715#else
716 exit(OPENERROR);
717#endif
718 }
719 if(itraj==1)
720 fprintf(fortout, "Re(qq)\n");
721 if(measure_check)
722 fprintf(fortout, "%e\n", NAN);
723 else
724 fprintf(fortout, "%e\n", creal(qq));
725 fclose(fortout);
726 break;
727 }
728 default: break;
729 }
730 }
731 if(itraj%icheck==0){
732 Par_swrite(itraj,icheck,beta,fmu,akappa,ajq,u[0],u[1]);
733 }
734 if(!rank)
735 fflush(output);
736 }
737#if (defined SA3AT)
738 double elapsed = 0;
739 if(!rank){
740#if(nproc>1)
741 elapsed = MPI_Wtime()-start_time;
742#else
743 elapsed = omp_get_wtime()-start_time;
744#endif
745 }
746#endif
747 //End of main loop
748 //Free arrays
749#ifdef __NVCC__
750 //Make a routine that does this for us
751 cudaFree(dk[0]); cudaFree(dk[1]); cudaFree(R1); cudaFree(dSdpi); cudaFree(pp);
752 cudaFree(Phi); cudaFree(ut[0]); cudaFree(ut[1]);
753 cudaFree(X0); cudaFree(X1); cudaFree(u[0]); cudaFree(u[1]);
754 cudaFree(id); cudaFree(iu);
755 cudaFree(dk_f[0]); cudaFree(dk_f[1]); cudaFree(ut_f[0]); cudaFree(ut_f[1]);
756 cudaFree(gamin); cudaFree(gamval); cudaFree(gamval_f);
757 cublasDestroy(cublas_handle);
758#else
759 free(dk[0]); free(dk[1]); free(R1); free(dSdpi); free(pp);
760 free(Phi); free(ut[0]); free(ut[1]);
761 free(X0); free(X1); free(u[0]); free(u[1]);
762 free(id); free(iu);
763 free(dk_f[0]); free(dk_f[1]); free(ut_f[0]); free(ut_f[1]);
764 free(gamin); free(gamval); free(gamval_f);
765#endif
766 free(hd); free(hu); free(h1u); free(h1d); free(halosize); free(pcoord);
767#ifdef __RANLUX__
768 gsl_rng_free(ranlux_instd);
769#elif (defined __INTEL_MKL__ &&!defined USE_RAN2)
770 vslDeleteStream(&stream);
771#endif
772#if (defined SA3AT)
773 if(!rank){
774 FILE *sa3at = fopen("Bench_times.csv", "a");
775#ifdef __NVCC__
776 char version[256];
777 int cuversion; cudaRuntimeGetVersion(&cuversion);
778 sprintf(version,"CUDA %d\tBlock: (%d,%d,%d)\tGrid: (%d,%d,%d)\n%s\n",cuversion,\
779 dimBlock.x,dimBlock.y,dimBlock.z,dimGrid.x,dimGrid.y,dimGrid.z,__VERSION__);
780#else
781 char *version=__VERSION__;
782#endif
783 fprintf(sa3at, "%s\nβ%0.3f κ:%0.4f μ:%0.4f j:%0.3f s:%i t:%i kvol:%ld\n"
784 "npx:%i npt:%i nthread:%i ncore:%i time:%f traj_time:%f\n\n",\
785 version,beta,akappa,fmu,ajq,nx,nt,kvol,npx,npt,nthreads,npx*npy*npz*npt*nthreads,elapsed,elapsed/ntraj);
786 fclose(sa3at);
787 }
788#endif
789 //Get averages for final output
790 actiona/=ntraj; vel2a/=ntraj; pbpa/=ipbp; endenfa/=ipbp; denfa/=ipbp;
791 totancg/=ntraj; totancgh/=ntraj;
792 e_dH/=ntraj; e_dH_e=sqrt((e_dH_e/ntraj-e_dH*e_dH)/(ntraj-1));
793 yav/=ntraj; yyav=sqrt((yyav/ntraj - yav*yav)/(ntraj-1));
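 //e_dH_e and yyav are now the standard errors of the mean, sqrt((<x*x>-<x>*<x>)/(N-1)), of dH and exp(dH)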
794 float traj_cost=totancg/dt;
795 double atraj=dt*itot/ntraj;
796
797 if(!rank){
798 fprintf(output, "Averages for the last %i trajectories\n"\
799 "Number of acceptances: %i\tAverage Trajectory Length = %e\n"\
800 "<dH>=%e+/-%e\t<exp(dH)>=%e+/-%e\tTrajectory cost=N_cg/dt =%e\n"\
801 "Average number of congrad iter guidance: %.3f acceptance %.3f\n"\
802 "psibarpsi = %e\n"\
803 "Mean Square Velocity = %e\tAction Per Site = %e\n"\
804 "Energy Density = %e\tNumber Density %e\n\n\n",\
805 ntraj, naccp, atraj, e_dH,e_dH_e, yav, yyav, traj_cost, totancg, totancgh, pbpa, vel2a, actiona, endenfa, denfa);
806 fclose(output);
807 }
808#if(nproc>1)
809 //Ensure writing is done before finalising just in case finalise segfaults and crashes the other ranks mid-write
810 MPI_Barrier(comm);
811 MPI_Finalise();
812#endif
813 fflush(stdout);
814 return 0;
815}
Header for routines related to lattice sites.
unsigned int * h1u
Up halo starting element.
Definition coord.c:15
unsigned int * h1d
Down halo starting element.
Definition coord.c:15
unsigned int * halosize
Array containing the size of the halo in each direction.
Definition coord.c:15
unsigned int * hd
Down halo indices.
Definition coord.c:15
unsigned int * hu
Up halo indices.
Definition coord.c:15
int OMF2(Complex *ut[2], Complex_f *ut_f[2], Complex *X0, Complex *X1, Complex *Phi, double *dk[2], float *dk_f[2], double *dSdpi, double *pp, int *iu, int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, Complex jqq, float beta, float akappa, int stepl, float dt, double *ancg, int *itot, float proby)
OMF second order five step integrator.
Definition integrate.c:116
int Leapfrog(Complex *ut[2], Complex_f *ut_f[2], Complex *X0, Complex *X1, Complex *Phi, double *dk[2], float *dk_f[2], double *dSdpi, double *pp, int *iu, int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, Complex jqq, float beta, float akappa, int stepl, float dt, double *ancg, int *itot, float proby)
Leapfrog integrator. Each trajectory step takes the form of p->p+dt/2,u->u+dt,p->p+dt/2 In practice t...
Definition integrate.c:65
int OMF4(Complex *ut[2], Complex_f *ut_f[2], Complex *X0, Complex *X1, Complex *Phi, double *dk[2], float *dk_f[2], double *dSdpi, double *pp, int *iu, int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, Complex jqq, float beta, float akappa, int stepl, float dt, double *ancg, int *itot, float proby)
OMF fourth order eleven step integrator.
Definition integrate.c:194
int main(int argc, char *argv[])
Definition main.c:79
Matrix multiplication and related declarations.
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates the Dslash-dagger operation in single precision.
Definition matrices.c:585
MPI headers.
int Par_swrite(const int itraj, const int icheck, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12)
Copies u11 and u12 into arrays without halos which then get written to output.
Definition par_mpi.c:341
int Par_fcopy(float *fval)
Broadcasts a float to the other processes.
int size
The number of MPI ranks in total.
Definition par_mpi.c:22
int rank
The MPI rank.
Definition par_mpi.c:22
int Par_begin(int argc, char *argv[])
Initialises the MPI configuration.
Definition par_mpi.c:25
int Par_icopy(int *ival)
Broadcasts an integer to the other processes.
int Trial_Exchange(Complex *ut[2], Complex_f *ut_f[2])
Exchanges the trial fields.
Definition par_mpi.c:1178
#define MPI_Finalise()
Avoid any accidents with US/UK spelling.
Definition par_mpi.h:30
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
int * pcoord
The processor grid.
Definition par_mpi.c:19
#define M_PI
if not defined elsewhere
Definition random.c:41
Header for random number configuration.
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single precision complex numbers using the Box-Mull...
Definition random.c:260
int Gauss_d(double *ps, unsigned int n, const double mu, const double sigma)
Generates a vector of normally distributed random double precision numbers using the Box-Muller Metho...
Definition random.c:306
double Par_granf()
Generates a random double which is then sent to the other ranks.
Definition random.c:190
long seed
RAN2 seed.
Definition random.c:27
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define rescgg
Conjugate gradient residue for update.
Definition sizes.h:240
#define nt
Lattice temporal extent. This also corresponds to the inverse temperature.
Definition sizes.h:86
#define kmom
sublattice momentum sites
Definition sizes.h:184
#define nx
Lattice x extent.
Definition sizes.h:66
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define ksizet
Sublattice t extent.
Definition sizes.h:149
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define npx
Processor grid x extent. This must be a divisor of nx.
Definition sizes.h:97
#define nadj
adjacent spatial indices
Definition sizes.h:175
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define rescga
Conjugate gradient residue for acceptance.
Definition sizes.h:242
#define npz
Processor grid z extent.
Definition sizes.h:116
#define nthreads
Number of threads for OpenMP, which can be overwritten at runtime.
Definition sizes.h:135
#define ksize
Sublattice spatial extent for a cubic lattice.
Definition sizes.h:146
#define nf
Fermion flavours (double it)
Definition sizes.h:151
#define respbp
Conjugate gradient residue for the noisy ⟨ψ̄ψ⟩ estimator.
Definition sizes.h:238
#define gvol
Lattice volume.
Definition sizes.h:92
#define FILELEN
Default file name length.
Definition sizes.h:62
#define halo
Total Halo size.
Definition sizes.h:222
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define npy
Processor grid y extent.
Definition sizes.h:108
#define ndim
Dimensions.
Definition sizes.h:179
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188
#define npt
Processor grid t extent.
Definition sizes.h:124
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225
#define nz
Lattice z extent. We normally use cubic lattices so this is the same as nx.
Definition sizes.h:80
#define ny
Lattice y extent. We normally use cubic lattices so this is the same as nx.
Definition sizes.h:74
Function declarations for most of the routines.
double Polyakov(Complex_f *ut[2])
Calculate the Polyakov loop (no prizes for guessing that one...)
Definition bosonic.c:127
int Average_Plaquette(double *hg, double *avplaqs, double *avplaqt, Complex_f *ut[2], unsigned int *iu, float beta)
Calculates the gauge action using new (how new?) lookup table.
Definition bosonic.c:20
int Measure(double *pbp, double *endenf, double *denf, Complex *qq, Complex *qbqb, double res, int *itercg, Complex *ut[2], Complex_f *ut_f[2], unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2], float *dk_f[2], Complex_f jqq, float akappa, Complex *Phi, Complex *R1)
Calculate fermion expectation values via a noisy estimator.
Definition fermionic.c:8
int Init(int istart, int ibound, int iread, float beta, float fmu, float akappa, Complex_f ajq, Complex *u[2], Complex *ut[2], Complex_f *ut_f[2], Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2], float *dk_f[2], unsigned int *iu, unsigned int *id)
Initialises the system.
Definition su2hmc.c:19
int Hamilton(double *h, double *s, double res2, double *pp, Complex *X0, Complex *X1, Complex *Phi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, float beta, double *ancgh, int traj)
Calculate the Hamiltonian.
Definition su2hmc.c:245
int Reunitarise(Complex *ut[2])
Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
Definition matrices.c:1012