su2hmc/main_8c_source.html

#include <assert.h>

#include <coord.h>

#include <math.h>

#include <matrices.h>

#include <par_mpi.h>

#include <random.h>

#include <string.h>

#include <su2hmc.h>

#ifdef   __NVCC__

#include <cublas_v2.h>

#include <cuda.h>

#include <cuda_runtime.h>

//File-scope CUDA state, only compiled when building with NVCC.
//cuBLAS context: created with cublasCreate() in main, used for the momentum norm (cublasDnrm2)
//and released with cublasDestroy() at the end of main
cublasHandle_t cublas_handle;

//cuBLAS status variable. Not written or read anywhere in main itself
cublasStatus_t cublas_status;

//Memory pool handle: main fills it with the current device's default pool
//(cudaDeviceGetDefaultMemPool) and sets that pool's release threshold
cudaMemPool_t mempool;

//Fix this later

#endif

/**
 * @brief Driver of the hybrid Monte Carlo run.
 *
 * In order, the routine
 *   -# starts the parallel environment (Par_begin) and reads the run parameters on rank 0 from
 *      the file named by argv[1] (or "midout" unless exactly one argument is given), then
 *      broadcasts them when nproc>1,
 *   -# allocates the gauge, trial, momentum and pseudofermion arrays (CUDA or aligned host memory),
 *   -# calls Init, reunitarises the trial fields and takes the initial plaquette/Polyakov measurements,
 *   -# loops over trajectories: pseudofermion and momentum refresh, Hamilton, the compile-time
 *      selected integrator (INT_LPFR, INT_OMF2 or INT_OMF4), Hamilton again, Metropolis accept/reject,
 *      then measurements every iprint trajectories and Par_swrite every icheck trajectories,
 *   -# frees everything and writes the run averages to the "Output.<suffix>" file on rank 0.
 *
 * @param argc  Number of command line arguments.
 * @param argv  Command line arguments; argv[1] is the optional input file name.
 *
 * @return 0 on normal completion. On a file-open failure the run is ended through
 *         MPI_Abort(comm,OPENERROR) (nproc>1) or exit(OPENERROR).
 */
int main(int argc, char *argv[]){
   //Instead of hard coding the function name so the error messages are easier to implement
   const char *funcname = "main";

   Par_begin(argc, argv);
   //Add error catching code...
#if(nproc>1)
   MPI_Comm_rank(comm, &rank);
   MPI_Comm_size(comm, &size);
#endif

   //Run parameters. The values below are the defaults used when the input file does not supply them
   float beta = 1.7f;
   float akappa = 0.1780f;
   //NOTE(review): __managed__ on a function-local variable; jqq is only passed by value below,
   //so confirm the specifier is accepted/needed by the CUDA compiler in use
#ifdef __NVCC__
   __managed__
#endif
      Complex_f jqq = 0;
   float fmu = 0.0f;
   int iread = 0;
   int istart = 1;
   int iprint = 1; //How often are measurements made
   int icheck = 5; //How often are configurations saved
   int ibound = -1;
#ifdef USE_MATH_DEFINES
   const double tpi = 2*M_PI;
#else
   const double tpi = 2*acos(-1.0);
#endif
   float dt=0.004; float ajq = 0.0;
   float delb=0; //Not used?
   float athq = 0.0;
   int stepl = 250; int ntraj = 10;
   //rank is zero means it must be the "master process"
   if(!rank){
      FILE *midout;
      const char *filename = (argc!=2) ?"midout":argv[1];
      const char *fileop = "r";
      if( !(midout = fopen(filename, fileop) ) ){
         fprintf(stderr, "Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",
               OPENERROR, funcname, filename, fileop);
#if(nproc>1)
         MPI_Abort(comm,OPENERROR);
#else
         exit(OPENERROR);
#endif
      }
      //See the README for what each entry means
      const int nread = fscanf(midout, "%f %f %f %f %f %f %f %d %d %d %d %d", &dt, &beta, &akappa,
            &ajq, &athq, &fmu, &delb, &stepl, &ntraj, &istart, &icheck, &iread);
      fclose(midout);
      //A short or malformed file leaves the remaining parameters at their defaults. Say so instead
      //of silently running with values the user did not ask for
      if(nread!=12)
         fprintf(stderr, "Warning in %s: Read %i of the 12 expected input parameters from %s. "
               "Default values are used for the rest.\n", funcname, nread, filename);
      assert(stepl>0);  assert(ntraj>0);    assert(istart>=0);  assert(icheck>0);  assert(iread>=0);
   }
   //Send inputs to other ranks
#if(nproc>1)
   Par_fcopy(&dt); Par_fcopy(&beta); Par_fcopy(&akappa); Par_fcopy(&ajq);
   Par_fcopy(&athq); Par_fcopy(&fmu); Par_fcopy(&delb); //Not used?
   Par_icopy(&stepl); Par_icopy(&ntraj); Par_icopy(&istart); Par_icopy(&icheck);
   Par_icopy(&iread);
#endif
   //Complex diquark source built from its magnitude ajq and phase athq
   jqq=ajq*cexp(athq*I);
   //End of input
#ifdef __NVCC__
   //CUBLAS Handle
   cublasCreate(&cublas_handle);
   //Set up grid and blocks
   blockInit(nx, ny, nz, nt, &dimBlock, &dimGrid);
   //CUDA device
   int device=-1;
   cudaGetDevice(&device);
   //For asynchronous memory, when CUDA syncs any unused memory in the pool is released back to the OS
   //unless a threshold is given. We'll base our threshold off of Congradq
   cudaDeviceGetDefaultMemPool(&mempool, device);
   //The release threshold attribute is read as a 64 bit unsigned integer, so the variable whose
   //address is handed over must be that wide
   cuuint64_t threshold=2*kferm2*sizeof(Complex_f);
   cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
#endif
#ifdef _DEBUG
   printf("jqq=%f+(%f)I\n",creal(jqq),cimag(jqq));
   //Fixed seed so debug runs are reproducible
   seed = 967580161;
#else
   seed = time(NULL);
#endif

   //Gauge, trial and momentum fields
   //You'll notice that there are two different allocation/free statements
   //One for CUDA and one for everything else depending on what's
   //being used
   Complex *u11, *u12, *u11t, *u12t;
   Complex_f *u11t_f, *u12t_f;
   double *dk4m, *dk4p, *pp;
   float *dk4m_f, *dk4p_f;
   //Halo index arrays
   unsigned int *iu, *id;
#ifdef __NVCC__
   cudaMallocManaged((void**)&iu,ndim*kvol*sizeof(int),cudaMemAttachGlobal);
   cudaMallocManaged((void**)&id,ndim*kvol*sizeof(int),cudaMemAttachGlobal);

   cudaMallocManaged(&dk4m,(kvol+halo)*sizeof(double),cudaMemAttachGlobal);
   cudaMallocManaged(&dk4p,(kvol+halo)*sizeof(double),cudaMemAttachGlobal);
#ifdef _DEBUG
   cudaMallocManaged(&dk4m_f,(kvol+halo)*sizeof(float),cudaMemAttachGlobal);
   cudaMallocManaged(&dk4p_f,(kvol+halo)*sizeof(float),cudaMemAttachGlobal);
#else
   cudaMalloc(&dk4m_f,(kvol+halo)*sizeof(float));
   cudaMalloc(&dk4p_f,(kvol+halo)*sizeof(float));
#endif

   int   *gamin;
   Complex  *gamval;
   Complex_f *gamval_f;
   //NOTE(review): gamin holds int but is sized with sizeof(Complex) here (the host branch uses
   //sizeof(int)). Over-allocation only; left as is in case Init_CUDA relies on the size
   cudaMallocManaged(&gamin,4*4*sizeof(Complex),cudaMemAttachGlobal);
   cudaMallocManaged(&gamval,5*4*sizeof(Complex),cudaMemAttachGlobal);
#ifdef _DEBUG
   cudaMallocManaged(&gamval_f,5*4*sizeof(Complex_f),cudaMemAttachGlobal);
#else
   cudaMalloc(&gamval_f,5*4*sizeof(Complex_f));
#endif
   cudaMallocManaged(&u11,ndim*kvol*sizeof(Complex),cudaMemAttachGlobal);
   cudaMallocManaged(&u12,ndim*kvol*sizeof(Complex),cudaMemAttachGlobal);
   cudaMallocManaged(&u11t,ndim*(kvol+halo)*sizeof(Complex),cudaMemAttachGlobal);
   cudaMallocManaged(&u12t,ndim*(kvol+halo)*sizeof(Complex),cudaMemAttachGlobal);
#ifdef _DEBUG
   cudaMallocManaged(&u11t_f,ndim*(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
   cudaMallocManaged(&u12t_f,ndim*(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
#else
   cudaMalloc(&u11t_f,ndim*(kvol+halo)*sizeof(Complex_f));
   cudaMalloc(&u12t_f,ndim*(kvol+halo)*sizeof(Complex_f));
#endif
#else
   id = (unsigned int*)aligned_alloc(AVX,ndim*kvol*sizeof(int));
   iu = (unsigned int*)aligned_alloc(AVX,ndim*kvol*sizeof(int));

   int   *gamin = (int *)aligned_alloc(AVX,4*4*sizeof(int));
   Complex  *gamval=(Complex *)aligned_alloc(AVX,5*4*sizeof(Complex));
   Complex_f *gamval_f=(Complex_f *)aligned_alloc(AVX,5*4*sizeof(Complex_f));

   dk4m = (double *)aligned_alloc(AVX,(kvol+halo)*sizeof(double));
   dk4p = (double *)aligned_alloc(AVX,(kvol+halo)*sizeof(double));
   dk4m_f = (float *)aligned_alloc(AVX,(kvol+halo)*sizeof(float));
   dk4p_f = (float *)aligned_alloc(AVX,(kvol+halo)*sizeof(float));

   u11 = (Complex *)aligned_alloc(AVX,ndim*kvol*sizeof(Complex));
   u12 = (Complex *)aligned_alloc(AVX,ndim*kvol*sizeof(Complex));
   u11t = (Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
   u12t = (Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
   u11t_f = (Complex_f *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex_f));
   u12t_f = (Complex_f *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex_f));
#endif
   Init(istart,ibound,iread,beta,fmu,akappa,ajq,u11,u12,u11t,u12t,u11t_f,u12t_f,gamval,gamval_f,gamin,dk4m,dk4p,dk4m_f,dk4p_f,iu,id);
#ifdef __NVCC__
   //GPU Initialisation stuff
   Init_CUDA(u11t,u12t,gamval,gamval_f,gamin,dk4m,dk4p,iu,id);//&dimBlock,&dimGrid);
#endif
   //Send trials to accelerator for reunitarisation
   Reunitarise(u11t,u12t);
   //Get trials back
   memcpy(u11, u11t, ndim*kvol*sizeof(Complex));
   memcpy(u12, u12t, ndim*kvol*sizeof(Complex));
#ifdef DIAGNOSTIC
   double ancg_diag=0;
   Diagnostics(istart, u11, u12, u11t, u12t, u11t_f, u12t_f, iu, id, hu, hd, dk4m, dk4p,
         dk4m_f, dk4p_f, gamin, gamval, gamval_f, jqq, akappa, beta, ancg_diag);
#endif

   //Initial Measurements
   //====================
   Trial_Exchange(u11t,u12t,u11t_f,u12t_f);
   double poly = Polyakov(u11t_f,u12t_f);
#ifdef _DEBUG
   if(!rank) printf("Initial Polyakov loop evaluated as %e\n", poly);
#endif
   double hg, avplaqs, avplaqt;
   //Halo exchange of the trial fields
   Average_Plaquette(&hg,&avplaqs,&avplaqt,u11t_f,u12t_f,iu,beta);
   //Trajectory length
   double traj=stepl*dt;
   //Acceptance probability
   double proby = 2.5/stepl;

   //Build the file name suffix shared by all output files from the run parameters.
   //snprintf keeps every piece inside buff2 even for negative or unexpectedly large parameters
   char suffix[FILELEN]="";
   int buffer; char buff2[7];
   //Add script for extracting correct mu, j etc.
   buffer = (int)round(100*beta);
   snprintf(buff2,sizeof(buff2),"b%03d",buffer);
   strcat(suffix,buff2);
   //κ
   buffer = (int)round(10000*akappa);
   snprintf(buff2,sizeof(buff2),"k%04d",buffer);
   strcat(suffix,buff2);
   //μ
   buffer = (int)round(1000*fmu);
   snprintf(buff2,sizeof(buff2),"mu%04d",buffer);
   strcat(suffix,buff2);
   //J
   buffer = (int)round(1000*ajq);
   snprintf(buff2,sizeof(buff2),"j%03d",buffer);
   strcat(suffix,buff2);
   //nx
   snprintf(buff2,sizeof(buff2),"s%02d",nx);
   strcat(suffix,buff2);
   //nt
   snprintf(buff2,sizeof(buff2),"t%02d",nt);
   strcat(suffix,buff2);

   char outname[FILELEN] = "Output."; const char *outop="a";
   strcat(outname,suffix);
   //Only rank 0 opens and writes this file; the other ranks keep a NULL handle
   FILE *output = NULL;
   if(!rank){
      if(!(output=fopen(outname, outop) )){
         fprintf(stderr,"Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",OPENERROR,funcname,outname,outop);
#if(nproc>1)
         MPI_Abort(comm,OPENERROR);
#else
         exit(OPENERROR);
#endif
      }
      printf("hg = %e, <Ps> = %e, <Pt> = %e, <Poly> = %e\n", hg, avplaqs, avplaqt, poly);
      fprintf(output, "ksize = %i ksizet = %i Nf = %i Halo =%i\nTime step dt = %e Trajectory length = %e\n"
            "No. of Trajectories = %i β = %e\nκ = %e μ = %e\nDiquark source = %e Diquark phase angle = %e\n"
            "Stopping Residuals: Guidance: %e Acceptance: %e, Estimator: %e\nSeed = %ld\n",
            ksize, ksizet, nf, halo, dt, traj, ntraj, beta, akappa, fmu, ajq, athq, rescgg, rescga, respbp, seed);
#ifdef _DEBUG
      //Print to terminal during debugging
      printf("ksize = %i ksizet = %i Nf = %i Halo = %i\nTime step dt = %e Trajectory length = %e\n"
            "No. of Trajectories = %i β = %e\nκ = %e μ = %e\nDiquark source = %e Diquark phase angle = %e\n"
            "Stopping Residuals: Guidance: %e Acceptance: %e, Estimator: %e\nSeed = %ld\n",
            ksize, ksizet, nf, halo, dt, traj, ntraj, beta, akappa, fmu, ajq, athq, rescgg, rescga, respbp, seed);
#endif
   }
   //Initialise for averages
   //======================
   double actiona = 0.0; double vel2a = 0.0; double pbpa = 0.0; double endenfa = 0.0; double denfa = 0.0;
   //Expected change in Hamiltonian
   double e_dH=0; double e_dH_e=0;
   //Expected Metropolis accept probability. Skewed by cases where the hamiltonian decreases.
   double yav = 0.0; double yyav = 0.0;

   int naccp = 0; int ipbp = 0; int itot = 0;

   //Start of classical evolution
   //===========================
   double pbp;
   Complex qq;
   double *dSdpi;
   //Field and related declarations
   Complex *Phi, *R1, *X0, *X1;
   //Initialise Arrays. Leaving it late for scoping
   //check the sizes in sizes.h
#ifdef __NVCC__
   cudaMallocManaged(&R1, kfermHalo*sizeof(Complex),cudaMemAttachGlobal);
   cudaMalloc(&Phi, nf*kferm*sizeof(Complex));
#ifdef _DEBUG
   cudaMallocManaged(&X0, nf*kferm2*sizeof(Complex),cudaMemAttachGlobal);
#else
   cudaMalloc(&X0, nf*kferm2*sizeof(Complex));
#endif

   cudaMallocManaged(&X1, kferm2Halo*sizeof(Complex),cudaMemAttachGlobal);
   cudaMallocManaged(&pp, kmom*sizeof(double),cudaMemAttachGlobal);
   cudaMalloc(&dSdpi, kmom*sizeof(double));
   //NOTE(review): British spelling; presumably a project-provided alias of cudaDeviceSynchronize — confirm
   cudaDeviceSynchronise();
#else
   R1= aligned_alloc(AVX,kfermHalo*sizeof(Complex));
   Phi= aligned_alloc(AVX,nf*kferm*sizeof(Complex));
   X0= aligned_alloc(AVX,nf*kferm2*sizeof(Complex));
   X1= aligned_alloc(AVX,kferm2Halo*sizeof(Complex));
   dSdpi = aligned_alloc(AVX,kmom*sizeof(double));
   //pp is the momentum field
   pp = aligned_alloc(AVX,kmom*sizeof(double));
#endif
#if (defined SA3AT)
   double start_time=0;
   if(!rank){
#if(nproc>1)
      start_time = MPI_Wtime();
#else
      start_time = omp_get_wtime();
#endif
   }
#endif
   //Set on the first trajectory of this run and after every accepted update
   double action = 0.0;
   //Conjugate Gradient iteration counters. All four must start from zero: the per trajectory ones
   //are reset in the loop but the totals are only ever accumulated into
   double ancg=0,ancgh=0,totancg=0,totancgh=0;
   for(int itraj = iread+1; itraj <= ntraj+iread; itraj++){
      //Reset conjugate gradient averages
      ancg = 0; ancgh = 0;
#ifdef _DEBUG
      if(!rank)
         printf("Starting itraj %i\n", itraj);
#endif
      for(int na=0; na<nf; na++){
         //Probably makes sense to declare this outside the loop
         //but I do like scoping/don't want to break anything else just yet
         //
         //How do we optimise this for use in CUDA? Do we use CUDA's PRNG
         //or stick with MKL and synchronise/copy over the array
#ifdef __NVCC__
         Complex_f *R1_f,*R;
         cudaMallocManaged(&R,kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
#ifdef _DEBUG
         cudaMallocManaged(&R1_f,kferm*sizeof(Complex_f),cudaMemAttachGlobal);
         cudaMemset(R1_f,0,kferm*sizeof(Complex_f));
#else
         cudaMallocAsync(&R1_f,kferm*sizeof(Complex_f),streams[0]);
         cudaMemsetAsync(R1_f,0,kferm*sizeof(Complex_f),streams[0]);
#endif
#else
         Complex_f *R1_f=aligned_alloc(AVX,kferm*sizeof(Complex_f));
         Complex_f *R=aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
         memset(R1_f,0,kferm*sizeof(Complex_f));
#endif
         //The FORTRAN code had two Gaussian routines.
         //gaussp was the normal Box-Muller and gauss0 didn't have 2 inside the square root
         //Using σ=1/sqrt(2) in these routines has the same effect as gauss0
#if (defined __NVCC__ && defined _DEBUG)
         cudaMemPrefetchAsync(R1_f,kferm*sizeof(Complex_f),device,streams[1]);
#endif
#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
         Gauss_c(R, kferm, 0, 1/sqrt(2));
#else
         vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, R, 0, 1/sqrt(2));
#endif
#ifdef __NVCC__
         cudaMemPrefetchAsync(R,kfermHalo*sizeof(Complex_f),device,NULL);
         //Transpose needed here for Dslashd
         Transpose_c(R1_f,ngorkov*nc,kvol,dimGrid,dimBlock);
         Transpose_c(R,ngorkov*nc,kvol,dimGrid,dimBlock);
         //R is random so this technically isn't required. But it does keep the code output consistent with previous
         //versions.
         //Flip all the gauge fields around so memory is coalesced
         Transpose_c(u11t_f,ndim,kvol,dimGrid,dimBlock);
         Transpose_c(u12t_f,ndim,kvol,dimGrid,dimBlock);
         cudaDeviceSynchronise();
#endif
         Dslashd_f(R1_f,R,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa);
#ifdef __NVCC__
         //Make sure the multiplication is finished before freeing its input!!
         cudaFree(R);//cudaDeviceSynchronise();
                     //cudaFree is blocking so don't need to synchronise
         Transpose_c(R1_f,kvol,ngorkov*nc,dimGrid,dimBlock);
         cuComplex_convert(R1_f,R1,kferm,false,dimBlock,dimGrid);
         Transpose_c(u11t_f,kvol,ndim,dimGrid,dimBlock);
         Transpose_c(u12t_f,kvol,ndim,dimGrid,dimBlock);
         //cudaDeviceSynchronise();
         //cudaFreeAsync(R1_f,NULL);
         cudaMemcpyAsync(Phi+na*kferm,R1, kferm*sizeof(Complex),cudaMemcpyDefault,0);
         //cudaMemcpyAsync(Phi+na*kferm,R1, kferm*sizeof(Complex),cudaMemcpyDefault,streams[1]);
         cudaDeviceSynchronise();
#ifdef _DEBUG
         cudaFree(R1_f);
#else
         cudaFreeAsync(R1_f,0);
#endif
         //cudaFree is blocking so don't need cudaDeviceSynchronise()
#else
         free(R);
         //Promote the single precision result to double precision
#pragma omp simd aligned(R1_f,R1:AVX)
         for(int i=0;i<kferm;i++)
            R1[i]=(Complex)R1_f[i];
         free(R1_f);
         memcpy(Phi+na*kferm,R1, kferm*sizeof(Complex));
         //Up/down partitioning (using only pseudofermions of flavour 1)
#endif
         UpDownPart(na, X0, R1);
      }
      //Heatbath
      //========
      //We're going to make the most of the new Gauss_d routine to send a flattened array
      //and do this all in one step.
#ifdef __NVCC__
      cudaMemcpyAsync(u11t, u11, ndim*kvol*sizeof(Complex),cudaMemcpyHostToDevice,streams[1]);
      cudaMemcpyAsync(u12t, u12, ndim*kvol*sizeof(Complex),cudaMemcpyHostToDevice,streams[2]);
#else
      memcpy(u11t, u11, ndim*kvol*sizeof(Complex));
      memcpy(u12t, u12, ndim*kvol*sizeof(Complex));
#endif
#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
      Gauss_d(pp, kmom, 0, 1);
#else
      vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, kmom, pp, 0, 1);
#endif
      //Initialise Trial Fields
#ifdef __NVCC__
      cudaMemPrefetchAsync(pp,kmom*sizeof(double),device,streams[1]);
      cudaMemcpy(u11t, u11, ndim*kvol*sizeof(Complex),cudaMemcpyDefault);
      cudaMemcpy(u12t, u12, ndim*kvol*sizeof(Complex),cudaMemcpyDefault);
#else
      memcpy(u11t, u11, ndim*kvol*sizeof(Complex));
      memcpy(u12t, u12, ndim*kvol*sizeof(Complex));
#endif
      Trial_Exchange(u11t,u12t,u11t_f,u12t_f);
      double H0, S0;
      Hamilton(&H0, &S0, rescga,pp,X0,X1,Phi,u11t,u12t,u11t_f,u12t_f,iu,id,gamval_f,gamin,
            dk4m_f,dk4p_f,jqq,akappa,beta,&ancgh,itraj);
#ifdef _DEBUG
      if(!rank) printf("H0: %e S0: %e\n", H0, S0);
#endif
      //Seed the action on the first trajectory of this run. The loop starts at iread+1, so testing
      //against 1 would leave action unset on a continued run whose first trajectory is rejected
      if(itraj==iread+1)
         action = S0/gvol;

      //Integration
      //TODO: Have this as a runtime parameter.
#if (defined INT_LPFR && defined INT_OMF2) ||(defined INT_LPFR && defined INT_OMF4)||(defined INT_OMF2 && defined INT_OMF4)
#error "Only one integrator may be defined"
#elif defined INT_LPFR
      Leapfrog(u11t, u12t, u11t_f, u12t_f, X0, X1, Phi, dk4m, dk4p, dk4m_f, dk4p_f, dSdpi, pp,iu, id, gamval,
            gamval_f, gamin, jqq, beta,akappa,stepl,dt,&ancg,&itot,proby);
#elif defined INT_OMF2
      OMF2(u11t, u12t, u11t_f, u12t_f, X0, X1, Phi, dk4m, dk4p, dk4m_f, dk4p_f, dSdpi, pp,iu, id, gamval,
            gamval_f, gamin, jqq, beta,akappa,stepl,dt,&ancg,&itot,proby);
#elif defined INT_OMF4
      OMF4(u11t, u12t, u11t_f, u12t_f, X0, X1, Phi, dk4m, dk4p, dk4m_f, dk4p_f, dSdpi, pp,iu, id, gamval,
            gamval_f, gamin, jqq, beta,akappa,stepl,dt,&ancg,&itot,proby);
#else
#error "No integrator defined. Please define {INT_LPFR.INT_OMF2,INT_OMF4}"
#endif

      totancg+=ancg;
      //Monte Carlo step: Accept new fields with the probability of min(1,exp(H0-X0))
      //Kernel Call needed here?
      Reunitarise(u11t,u12t);
      double H1, S1;
      Hamilton(&H1, &S1, rescga,pp,X0,X1,Phi,u11t,u12t,u11t_f,u12t_f,iu,id,gamval_f,gamin,
            dk4m_f,dk4p_f,jqq,akappa,beta,&ancgh,itraj);
      ancgh/=2.0; //Hamilton is called at start and end of trajectory
      totancgh+=ancgh;
#ifdef _DEBUG
      printf("H0-H1=%f-%f",H0,H1);
#endif
      double dH = H0 - H1;
#ifdef _DEBUG
      printf("=%f\n",dH);
#endif
      double dS = S0 - S1;
      if(!rank){
         fprintf(output, "dH = %e dS = %e\n", dH, dS);
#ifdef _DEBUG
         printf("dH = %e dS = %e\n", dH, dS);
#endif
      }
      e_dH+=dH; e_dH_e+=dH*dH;
      double y = exp(dH);
      yav+=y;
      yyav+=y*y;
      //The Monte-Carlo
      //Always update  dH is positive (gone from higher to lower energy)
      bool acc;
      if(dH>0 || Par_granf()<=y){
         //Step is accepted. Set s=st
         if(!rank)
            printf("New configuration accepted on trajectory %i.\n", itraj);
         //Original FORTRAN Comment:
         //JIS 20100525: write config here to preempt troubles during measurement!
         //JIS 20100525: remove when all is ok....
         memcpy(u11,u11t,ndim*kvol*sizeof(Complex));
         memcpy(u12,u12t,ndim*kvol*sizeof(Complex));
         naccp++;
         //Divide by gvol since we've summed over all lattice sites
         action=S1/gvol;
         acc=true;
      }
      else{
         if(!rank)
            printf("New configuration rejected on trajectory %i.\n", itraj);
         acc=false;
      }
      actiona+=action;
      //Squared norm of the momentum field
      double vel2=0.0;
#ifdef __NVCC__
      cublasDnrm2(cublas_handle,kmom, pp, 1,&vel2);
      vel2*=vel2;
#elif defined USE_BLAS
      vel2 = cblas_dnrm2(kmom, pp, 1);
      vel2*=vel2;
#else
#pragma unroll
      for(int i=0; i<kmom; i++)
         vel2+=pp[i]*pp[i];
#endif
#if(nproc>1)
      Par_dsum(&vel2);
#endif
      vel2a+=vel2/(ndim*nadj*gvol);

      if(itraj%iprint==0){
         //If rejected, copy the previously accepted field in for measurements
         if(!acc){
#ifdef __NVCC__
            cudaMemcpyAsync(u11t, u11, ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[0]);
            cudaMemcpyAsync(u12t, u12, ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[1]);
#else
            memcpy(u11t, u11, ndim*kvol*sizeof(Complex));
            memcpy(u12t, u12, ndim*kvol*sizeof(Complex));
#endif
            Trial_Exchange(u11t,u12t,u11t_f,u12t_f);
         }
#ifdef _DEBUG
         if(!rank)
            printf("Starting measurements\n");
#endif
         int itercg=0;
         double endenf, denf;
         Complex qbqb;
         //Stop gap for measurement failure on Kay;
         //If the Congrad in Measure fails, don't measure the Diquark or PBP-Density observables for
         //that trajectory
         int measure_check=0;
         measure_check = Measure(&pbp,&endenf,&denf,&qq,&qbqb,respbp,&itercg,u11t,u12t,u11t_f,u12t_f,iu,id,
               gamval,gamval_f,gamin,dk4m,dk4p,dk4m_f,dk4p_f,jqq,akappa,Phi,R1);
#ifdef _DEBUG
         if(!rank)
            printf("Finished measurements\n");
#endif
         pbpa+=pbp; endenfa+=endenf; denfa+=denf; ipbp++;
         Average_Plaquette(&hg,&avplaqs,&avplaqt,u11t_f,u12t_f,iu,beta);
         poly = Polyakov(u11t_f,u12t_f);
         //We have four output files, so may as well get the other ranks to help out
         //and abuse scoping rules while we're at it.
         //Can use either OpenMP or MPI to do this
         //With four or more ranks, ranks 0-3 each write one file. Otherwise rank 0 writes all four,
         //one per iteration of the loop below
#if (nproc>=4)
         switch(rank)
#else
            if(!rank)
#pragma omp parallel for
               for(int i=0; i<4; i++)
                  switch(i)
#endif
                  {
                     case(0):
                        //Output code... Some files weren't opened in the main loop of the FORTRAN code
                        //That will need to be looked into for the C version
                        //It would explain the weird names like fort.1X that looked like they were somehow
                        //FORTRAN related...
                        fprintf(output, "Measure (CG) %i Update (CG) %.3f Hamiltonian (CG) %.3f\n", itercg, ancg, ancgh);
                        fflush(output);
                        break;
                     case(1):
                        {
                           FILE *fortout;
                           char fortname[FILELEN] = "fermi.";
                           strcat(fortname,suffix);
                           const char *fortop= (itraj==1) ? "w" : "a";
                           if(!(fortout=fopen(fortname, fortop) )){
                              fprintf(stderr, "Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",
                                    OPENERROR, funcname, fortname, fortop);
#if(nproc>1)
                              MPI_Abort(comm,OPENERROR);
#else
                              exit(OPENERROR);
#endif
                           }
                           if(itraj==1)
                              fprintf(fortout, "pbp\tendenf\tdenf\n");
                           if(measure_check)
                              fprintf(fortout, "%e\t%e\t%e\n", NAN, NAN, NAN);
                           else
                              fprintf(fortout, "%e\t%e\t%e\n", pbp, endenf, denf);
                           fclose(fortout);
                           break;
                        }
                     case(2):
                        //The original code implicitly created these files with the name
                        //fort.XX where XX is the file label
                        //from FORTRAN. This was fort.12
                        {
                           FILE *fortout;
                           char fortname[FILELEN] = "bose.";
                           strcat(fortname,suffix);
                           const char *fortop= (itraj==1) ? "w" : "a";
                           if(!(fortout=fopen(fortname, fortop) )){
                              fprintf(stderr, "Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",
                                    OPENERROR, funcname, fortname, fortop);
                              //Stop here like the other output files do, rather than writing to a NULL stream
#if(nproc>1)
                              MPI_Abort(comm,OPENERROR);
#else
                              exit(OPENERROR);
#endif
                           }
                           if(itraj==1)
                              fprintf(fortout, "avplaqs\tavplaqt\tpoly\n");
                           fprintf(fortout, "%e\t%e\t%e\n", avplaqs, avplaqt, poly);
                           fclose(fortout);
                           break;
                        }
                     case(3):
                        {
                           FILE *fortout;
                           char fortname[FILELEN] = "diq.";
                           strcat(fortname,suffix);
                           const char *fortop= (itraj==1) ? "w" : "a";
                           if(!(fortout=fopen(fortname, fortop) )){
                              fprintf(stderr, "Error %i in %s: Failed to open file %s for %s.\nExiting\n\n",
                                    OPENERROR, funcname, fortname, fortop);
#if(nproc>1)
                              MPI_Abort(comm,OPENERROR);
#else
                              exit(OPENERROR);
#endif
                           }
                           if(itraj==1)
                              fprintf(fortout, "Re(qq)\n");
                           if(measure_check)
                              fprintf(fortout, "%e\n", NAN);
                           else
                              fprintf(fortout, "%e\n", creal(qq));
                           fclose(fortout);
                           break;
                        }
                     default: break;
                  }
      }
      if(itraj%icheck==0){
         Par_swrite(itraj,icheck,beta,fmu,akappa,ajq,u11,u12);
      }
      if(!rank)
         fflush(output);
   }
#if (defined SA3AT)
   double elapsed = 0;
   if(!rank){
#if(nproc>1)
      elapsed = MPI_Wtime()-start_time;
#else
      elapsed = omp_get_wtime()-start_time;
#endif
   }
#endif
   //End of main loop
   //Free arrays
#ifdef __NVCC__
   //Make a routine that does this for us
   cudaFree(dk4m); cudaFree(dk4p); cudaFree(R1); cudaFree(dSdpi); cudaFree(pp);
   cudaFree(Phi); cudaFree(u11t); cudaFree(u12t);
   cudaFree(X0); cudaFree(X1); cudaFree(u11); cudaFree(u12);
   cudaFree(id); cudaFree(iu);
   cudaFree(dk4m_f); cudaFree(dk4p_f); cudaFree(u11t_f); cudaFree(u12t_f);
   cudaFree(gamin); cudaFree(gamval); cudaFree(gamval_f);
   cublasDestroy(cublas_handle);
#else
   free(dk4m); free(dk4p); free(R1); free(dSdpi); free(pp);
   free(Phi); free(u11t); free(u12t);
   free(X0); free(X1); free(u11); free(u12);
   free(id); free(iu);
   free(dk4m_f); free(dk4p_f); free(u11t_f); free(u12t_f);
   free(gamin); free(gamval); free(gamval_f);
#endif
   free(hd); free(hu);free(h1u); free(h1d); free(halosize); free(pcoord);
#ifdef __RANLUX__
   gsl_rng_free(ranlux_instd);
#elif (defined __INTEL_MKL__ &&!defined USE_RAN2)
   vslDeleteStream(&stream);
#endif
#if (defined SA3AT)
   if(!rank){
      FILE *sa3at = fopen("Bench_times.csv", "a");
      //The timing record is optional, so a failure to open it is reported but does not end the run
      if(!sa3at)
         fprintf(stderr, "Warning in %s: Failed to open file Bench_times.csv for a. "
               "Skipping the timing record.\n", funcname);
      else{
#ifdef __NVCC__
         //A character buffer, not an array of pointers
         char version[256];
         int cuversion; cudaRuntimeGetVersion(&cuversion);
         snprintf(version,sizeof(version),"CUDA %d\tBlock: (%d,%d,%d)\tGrid: (%d,%d,%d)\n%s\n",cuversion,
               dimBlock.x,dimBlock.y,dimBlock.z,dimGrid.x,dimGrid.y,dimGrid.z,__VERSION__);
#else
         const char *version=__VERSION__;
#endif
         fprintf(sa3at, "%s\nβ%0.3f κ:%0.4f μ:%0.4f j:%0.3f s:%i t:%i kvol:%ld\n"
               "npx:%i npt:%i nthread:%i ncore:%i time:%f traj_time:%f\n\n",
               version,beta,akappa,fmu,ajq,nx,nt,kvol,npx,npt,nthreads,npx*npy*npz*npt*nthreads,elapsed,elapsed/ntraj);
         fclose(sa3at);
      }
   }
#endif
   //Get averages for final output
   actiona/=ntraj; vel2a/=ntraj; pbpa/=ipbp; endenfa/=ipbp; denfa/=ipbp;
   totancg/=ntraj; totancgh/=ntraj;
   //Mean and standard error of dH and exp(dH) over the run
   e_dH/=ntraj; e_dH_e=sqrt((e_dH_e/ntraj-e_dH*e_dH)/(ntraj-1));
   yav/=ntraj; yyav=sqrt((yyav/ntraj - yav*yav)/(ntraj-1));
   float traj_cost=totancg/dt;
   double atraj=dt*itot/ntraj;

   if(!rank){
      fprintf(output, "Averages for the last %i trajectories\n"
            "Number of acceptances: %i\tAverage Trajectory Length = %e\n"
            "<dH>=%e+/-%e\t<exp(dH)>=%e+/-%e\tTrajectory cost=N_cg/dt =%e\n"
            "Average number of congrad iter guidance: %.3f acceptance %.3f\n"
            "psibarpsi = %e\n"
            "Mean Square Velocity = %e\tAction Per Site = %e\n"
            "Energy Density = %e\tNumber Density %e\n\n\n",
            ntraj, naccp, atraj, e_dH,e_dH_e, yav, yyav, traj_cost, totancg, totancgh, pbpa, vel2a, actiona, endenfa, denfa);
      fclose(output);
   }
#if(nproc>1)
   //Ensure writing is done before finalising just in case finalise segfaults and crashes the other ranks mid-write
   MPI_Barrier(comm);
   //NOTE(review): British spelling; presumably a project-provided alias of MPI_Finalize — confirm
   MPI_Finalise();
#endif
   fflush(stdout);
   return 0;
}

coord.h
Header for routines related to lattice sites.

hd
unsigned int * hd
Down halo indices.
Definition coord.c:15

hu
unsigned int * hu
Up halo indices.
Definition coord.c:15

main
int main(int argc, char *argv[])
Definition main.c:79

matrices.h
Matrix multiplication and related declarations.

Dslashd_f
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates $\Phi = M^\dagger r$ in single precision.
Definition matrices.c:584

par_mpi.h
MPI headers.

Par_fcopy
int Par_fcopy(float *fval)
Broadcasts a float to the other processes.

size
int size
The number of MPI ranks in total.
Definition par_mpi.c:22

rank
int rank
The MPI rank.
Definition par_mpi.c:22

Par_begin
int Par_begin(int argc, char *argv[])
Initialises the MPI configuration.
Definition par_mpi.c:25

Par_icopy
int Par_icopy(int *ival)
Broadcasts an integer to the other processes.

Trial_Exchange
int Trial_Exchange(Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f)
Exchanges the trial fields.
Definition par_mpi.c:1178

M_PI
#define M_PI
if not defined elsewhere
Definition random.c:41

random.h
Header for random number configuration.

Gauss_c
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single precision complex numbers using the Box-Muller Method.
Definition random.c:260

Gauss_d
int Gauss_d(double *ps, unsigned int n, const double mu, const double sigma)
Generates a vector of normally distributed random double precision numbers using the Box-Muller Method.
Definition random.c:306

seed
long seed
RAN2 seed.
Definition random.c:27

AVX
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268

nc
#define nc
Colours.
Definition sizes.h:173

rescgg
#define rescgg
Conjugate gradient residue for update.
Definition sizes.h:240

nt
#define nt
Lattice temporal extent. This also corresponds to the inverse temperature.
Definition sizes.h:86

kmom
#define kmom
sublattice momentum sites
Definition sizes.h:184

nx
#define nx
Lattice x extent.
Definition sizes.h:66

ngorkov
#define ngorkov
Gor'kov indices.
Definition sizes.h:181

ksizet
#define ksizet
Sublattice t extent.
Definition sizes.h:149

kferm2Halo
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227

kvol
#define kvol
Sublattice volume.
Definition sizes.h:154

Complex
#define Complex
Double precision complex number.
Definition sizes.h:58

kferm
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186

rescga
#define rescga
Conjugate gradient residue for acceptance.
Definition sizes.h:242

ksize
#define ksize
Sublattice spatial extent for a cubic lattice.
Definition sizes.h:146

nf
#define nf
Fermion flavours (double it)
Definition sizes.h:151

respbp
#define respbp
Conjugate gradient residue for $\langle\bar{\psi}\psi\rangle$.
Definition sizes.h:238

gvol
#define gvol
Lattice volume.
Definition sizes.h:92

FILELEN
#define FILELEN
Default file name length.
Definition sizes.h:62

halo
#define halo
Total Halo size.
Definition sizes.h:222

Complex_f
#define Complex_f
Single precision complex number.
Definition sizes.h:56

ndim
#define ndim
Dimensions.
Definition sizes.h:179

kferm2
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188

kfermHalo
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225

nz
#define nz
Lattice z extent. We normally use cubic lattices so this is the same as nx.
Definition sizes.h:80

ny
#define ny
Lattice y extent. We normally use cubic lattices so this is the same as nx.
Definition sizes.h:74

su2hmc.h
Function declarations for most of the routines.

Init
int Init(int istart, int ibound, int iread, float beta, float fmu, float akappa, Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, unsigned int *iu, unsigned int *id)
Initialises the system.
Definition su2hmc.c:19

Hamilton
int Hamilton(double *h, double *s, double res2, double *pp, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancgh, int traj)
Calculate the Hamiltonian.
Definition su2hmc.c:208

Average_Plaquette
int Average_Plaquette(double *hg, double *avplaqs, double *avplaqt, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, float beta)
Calculates the gauge action using new (how new?) lookup table.
Definition bosonic.c:8

Reunitarise
int Reunitarise(Complex *u11t, Complex *u12t)
Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
Definition matrices.c:904

Polyakov
double Polyakov(Complex_f *u11t, Complex_f *u12t)
Calculate the Polyakov loop (no prizes for guessing that one...)
Definition bosonic.c:105