su2hmc
par_mpi.c File Reference

MPI routines.

#include <par_mpi.h>
#include <random.h>
#include <su2hmc.h>

Functions

int Par_begin (int argc, char *argv[])
 Initialises the MPI configuration.
 
int Par_sread (const int iread, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t)
 Reads and assigns the gauges from file.
 
int Par_swrite (const int itraj, const int icheck, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12)
 Copies u11 and u12 into arrays without halos, which are then written to output.
 
int Trial_Exchange (Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f)
 Exchanges the trial fields.
 

Variables

int * pcoord
 The processor grid.
 
int pstart[ndim][nproc]
 The initial lattice site on each sublattice in a given direction.
 
int rank
 The MPI rank.
 
int size
 The number of MPI ranks in total.
 

Detailed Description

MPI routines.

Definition in file par_mpi.c.
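
For orientation, the sketch below shows the order in which the routines documented here are typically combined in a driver. It is a hedged illustration only: the allocation sizes follow the listings below (ndim*(kvol+halo) components per field), but the parameter values, trajectory numbers and the driver itself are placeholders rather than the programme's actual setup code.

#include <par_mpi.h>
#include <su2hmc.h>
#include <stdlib.h>
//Hedged sketch of the calling order implied by the briefs above; not the real main()
int main(int argc, char *argv[]){
	//Set up MPI, the Cartesian topology and the rank/size/pcoord/pstart/pstop globals
	Par_begin(argc, argv);
	//Gauge and trial fields, sized with room for the halo
	Complex *u11 =(Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
	Complex *u12 =(Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
	Complex *u11t=(Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
	Complex *u12t=(Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
	Complex_f *u11t_f=(Complex_f *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex_f));
	Complex_f *u12t_f=(Complex_f *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex_f));
	//Read configuration 100 into u11/u12; Par_sread also copies it into the trial fields
	Par_sread(100, 1.7f, 0.0f, 0.1780f, 0.001f, u11, u12, u11t, u12t);
	//After any change to the trial fields, exchange halos and refresh the float copies
	Trial_Exchange(u11t, u12t, u11t_f, u12t_f);
	//Checkpoint the gauge fields as trajectory 101 (icheck is currently unused)
	Par_swrite(101, 1, 1.7f, 0.0f, 0.1780f, 0.001f, u11, u12);
	//Frees and MPI shutdown omitted
	return 0;
}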

Function Documentation

◆ Par_begin()

int Par_begin ( int argc,
char * argv[] )

Initialises the MPI configuration.

Parameters
argc: Number of arguments given to the programme
argv: Array of arguments
Returns
Zero on success, integer error code otherwise.
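
As a concrete illustration of the sub-lattice decomposition that the listing below records in pstart and pstop, suppose (hypothetically, these are not the build's sizes.h values) nx=12 and npx=3, so ksizex=4. The rank with Cartesian x-coordinate 2 then owns global x sites 8 to 11:

//Hedged worked example with made-up extents: nx=12, npx=3, so ksizex=nx/npx=4
int lsize_x = 12/3;               //ksizex
int coord_x = 2;                  //this rank's Cartesian coordinate in x
int start_x = coord_x*lsize_x;    //pstart[0][iproc] = 8
int stop_x  = start_x + lsize_x;  //pstop[0][iproc]  = 12 (exclusive)
//Par_sread and Par_swrite loop this rank over ix = 8, 9, 10, 11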

Definition at line 25 of file par_mpi.c.

25 {
26 /* Initialises the MPI configuration
27 *
28 * Parameters:
29 * ---------
30 * int argc Number of arguments given to the programme
31 * char *argv[] Array of arguments
32 *
33 * Returns:
34 * -------
35 * Zero on success, integer error code otherwise.
36 */
37
38 //TODO: Remove as much non-MPI stuff from here as possible
39 const char *funcname = "Par_begin";
40 int size;
41#if(nproc>1)
42 if(MPI_Init(&argc, &argv)){
43 fprintf(stderr, "Error %i in %s: Failed to initialise MPI\nExiting\n\n", NO_MPI_INIT, funcname);
44 MPI_Abort(comm,NO_MPI_INIT);
45 exit(NO_MPI_INIT);
46 }
47
48 if(MPI_Comm_rank(comm, &rank)){
49 fprintf(stderr, "Error %i in %s: Failed to find rank.\nExiting...\n\n", NO_MPI_RANK, funcname);
50 MPI_Abort(comm,NO_MPI_RANK);
51 }
52 if(MPI_Comm_size(comm, &size)){
53 fprintf(stderr, "Error %i in %s: Failed to find size\nExiting...\n\n", NO_MPI_SIZE, funcname);
54 MPI_Abort(comm,NO_MPI_SIZE);
55 }
56#else
57 size=1; rank=0;
58#endif
59 //If size isn't the same as the max allowed number of processes, then there's a problem somewhere.
60 if(size!=nproc){
61 fprintf(stderr, "Error %i in %s: For process %i, size %i is not equal to nproc %i.\n"
62 "Exiting...\n\n", SIZEPROC, funcname, rank, size, nproc);
63#if(nproc>1)
64 MPI_Abort(comm,SIZEPROC);
65#else
66 exit(SIZEPROC);
67#endif
68 }
69 //gsize is the size of the system, lsize is the size of each MPI Grid
70 int gsize[4], lsize[4];
71 gsize[0]=nx; gsize[1]=ny; gsize[2]=nz; gsize[3]=nt;
72 lsize[0]=ksizex; lsize[1]=ksizey; lsize[2]=ksizez; lsize[3]=ksizet;
73
74 //Topology layout
75 int cartsize[ndim] __attribute__((aligned(AVX)));
76 cartsize[0]=npx; cartsize[1]=npy; cartsize[2]=npz; cartsize[3]=npt;
77
78 //For the topology, says if each dimension is periodic or not
79 //Probably for us everything will be but using the four vector
80 //gives the choice at least
81 int periods[ndim] __attribute__((aligned(AVX)));
82#pragma unroll
83 for(int i=0; i<ndim; i++)
84 periods[i] = true;
85 //Not going to change the rank order
86 int reorder = false;
87 //Declare the topology
88#if(nproc>1)
89 MPI_Comm commcart;
90 MPI_Cart_create(comm, ndim, cartsize, periods, reorder, &commcart);
91#endif
92
93 //Get nearest neighbours of processors
94#if(nproc>1)
95#pragma unroll
96 for(int i= 0; i<ndim; i++)
97 MPI_Cart_shift(commcart, i, 1, &pd[i], &pu[i]);
98#endif
99 //Get coordinates of processors in the grid
100 pcoord = (int*)aligned_alloc(AVX,ndim*nproc*sizeof(int));
101 memset(pcoord,0,sizeof(int)*ndim*nproc);
102#if(nproc>1)
103 for(int iproc = 0; iproc<nproc; iproc++){
104 MPI_Cart_coords(commcart, iproc, ndim, pcoord+iproc*ndim);
105#pragma omp simd aligned(pcoord:AVX)
106 for(int idim = 0; idim<ndim; idim++){
107 pstart[idim][iproc] = pcoord[idim+ndim*iproc]*lsize[idim];
108 pstop[idim][iproc] = pstart[idim][iproc] + lsize[idim];
109 }
110 }
111#else
112 //Set iproc=0 because we only have one proc
113 for(int idim = 0; idim<ndim; idim++){
114 pstart[idim][0] = 0;
115 pstop[idim][0] = lsize[idim];
116 }
117#endif
118#ifdef _DEBUG
119 if(!rank)
120 printf("Running on %i processors.\nGrid layout is %ix%ix%ix%i\n",
121 nproc, npx,npy,npz,npt);
122 printf("Rank: %i pu: %i %i %i %i pd: %i %i %i %i\n", rank, pu[0], pu[1], pu[2], pu[3],
123 pd[0], pd[1], pd[2], pd[3]);
124#endif
125 return 0;
126}
int size
The number of MPI ranks in total.
Definition par_mpi.c:22
int rank
The MPI rank.
Definition par_mpi.c:22
int * pcoord
The processor grid.
Definition par_mpi.c:19
int pstart[ndim][nproc]
The initial lattice site on each sublattice in a given direction.
int pu[ndim]
Processors in the up direction.
Definition par_mpi.c:23
int pstop[ndim][nproc]
The final lattice site on each sublattice in a given direction.
Definition par_mpi.c:21
int pd[ndim]
Processors in the down direction.
Definition par_mpi.c:24
#define ksizex
Sublattice x extent.
Definition sizes.h:139
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nt
Lattice temporal extent. This also corresponds to the inverse temperature.
Definition sizes.h:86
#define nproc
Number of processors for MPI.
Definition sizes.h:132
#define nx
Lattice x extent.
Definition sizes.h:66
#define ksizet
Sublattice t extent.
Definition sizes.h:149
#define npx
Processor grid x extent. This must be a divisor of nx.
Definition sizes.h:97
#define npz
Processor grid z extent.
Definition sizes.h:116
#define ksizez
Sublattice z extent.
Definition sizes.h:143
#define npy
Processor grid y extent.
Definition sizes.h:108
#define ndim
Dimensions.
Definition sizes.h:179
#define npt
Processor grid t extent.
Definition sizes.h:124
#define ksizey
Sublattice y extent.
Definition sizes.h:141
#define nz
Lattice z extent. We normally use cubic lattices so this is the same as nx.
Definition sizes.h:80
#define ny
Lattice y extent. We normally use cubic lattices so this is the same as nx.
Definition sizes.h:74

References AVX, ksizet, ksizex, ksizey, ksizez, ndim, nproc, npt, npx, npy, npz, nt, nx, ny, nz, pcoord, pd, pstart, pstop, pu, rank, and size.
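
Note that pcoord is left as a flat ndim*nproc array: rank iproc's Cartesian coordinate in direction idim sits at pcoord[idim+ndim*iproc], the layout filled in by MPI_Cart_coords above. A small hedged helper (not part of par_mpi.c) that reads it back:

//Hypothetical convenience wrapper around the pcoord layout set up by Par_begin
static inline int Proc_coord(const int iproc, const int idim){
	return pcoord[idim+ndim*iproc];
}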


◆ Par_sread()

int Par_sread ( const int iread,
const float beta,
const float fmu,
const float akappa,
const Complex_f ajq,
Complex * u11,
Complex * u12,
Complex * u11t,
Complex * u12t )

Reads and assigns the gauges from file.

Parameters
iread: Configuration to read in
beta: Inverse gauge coupling
fmu: Chemical potential
akappa: Hopping parameter
ajq: Diquark source
u11,u12: Gauge fields
u11t,u12t: Trial fields
Returns
Zero on success, integer error code otherwise
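
The routine builds the input file name from the run parameters via the strcat chain in the listing below. As a hedged worked example with purely illustrative values (beta=1.7, akappa=0.178, fmu=0.0, ajq=0.001, nx=16, nt=32, iread=500), a single sprintf equivalent to that chain gives:

//Hedged illustration only; the values are not defaults of the code
char name[FILELEN];
sprintf(name, "config.b%03dk%04dmu%04dj%03ds%02dt%02d.%06d",
		(int)round(100*1.7), (int)round(10000*0.178), (int)round(1000*0.0),
		(int)round(1000*0.001), 16, 32, 500);
//name is now "config.b170k1780mu0000j001s16t32.000500"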

Definition at line 127 of file par_mpi.c.

128 {
129 /*
130 * @brief Reads and assigns the gauges from file
131 *
132 * @param iread: Configuration to read in
133 * @param beta: Inverse gauge coupling
134 * @param fmu: Chemical potential
135 * @param akappa: Hopping parameter
136 * @param ajq: Diquark source
137 * @param u11,u12: Gauge fields
138 * @param u11t,u12t: Trial fields
139 *
140 * @return Zero on success, integer error code otherwise
141 */
142 const char *funcname = "Par_sread";
143#if(nproc>1)
144 MPI_Status status;
145 //For sending the seeds later
146 MPI_Datatype MPI_SEED_TYPE = (sizeof(seed)==sizeof(int)) ? MPI_INT:MPI_LONG;
147#endif
148 //We shall allow the almighty master thread to open the file
149 Complex *u1buff = (Complex *)aligned_alloc(AVX,kvol*sizeof(Complex));
150 Complex *u2buff = (Complex *)aligned_alloc(AVX,kvol*sizeof(Complex));
151 if(!rank){
152 //Containers for input. Only needed by the master rank
153 Complex *u11Read = (Complex *)aligned_alloc(AVX,ndim*gvol*sizeof(Complex));
154 Complex *u12Read = (Complex *)aligned_alloc(AVX,ndim*gvol*sizeof(Complex));
155 static char gauge_file[FILELEN]="config.";
156 int buffer; char buff2[7];
157 //Add script for extracting correct mu, j etc.
158 buffer = (int)round(100*beta);
159 sprintf(buff2,"b%03d",buffer);
160 strcat(gauge_file,buff2);
161 //κ
162 buffer = (int)round(10000*akappa);
163 sprintf(buff2,"k%04d",buffer);
164 strcat(gauge_file,buff2);
165 //μ
166 buffer = (int)round(1000*fmu);
167 sprintf(buff2,"mu%04d",buffer);
168 strcat(gauge_file,buff2);
169 //J
170 buffer = (int)round(1000*creal(ajq));
171 sprintf(buff2,"j%03d",buffer);
172 strcat(gauge_file,buff2);
173 //nx
174 sprintf(buff2,"s%02d",nx);
175 strcat(gauge_file,buff2);
176 //nt
177 sprintf(buff2,"t%02d",nt);
178 strcat(gauge_file,buff2);
179 //nconfig
180 char c[8];
181 sprintf(c,".%06d", iread);
182 strcat(gauge_file, c);
183
184 char *fileop = "rb";
185 printf("Opening gauge file on processor: %i\n",rank);
186 FILE *con;
187 if(!(con = fopen(gauge_file, fileop))){
188 fprintf(stderr, "Error %i in %s: Failed to open %s for %s.\
189 \nExiting...\n\n", OPENERROR, funcname, gauge_file, fileop);
190#if(nproc>1)
191 MPI_Abort(comm,OPENERROR);
192#endif
193 exit(OPENERROR);
194 }
195 //TODO: SAFETY CHECKS FOR EACH READ OPERATION
196 int old_nproc;
197 //What was previously the FORTRAN integer is now used to store the number of processors used to
198 //generate the configuration
199 fread(&old_nproc, sizeof(int), 1, con);
200 if(old_nproc!=nproc)
201 fprintf(stderr, "Warning %i in %s: Previous run was done on %i processors, current run uses %i.\n",\
202 DIFNPROC,funcname,old_nproc,nproc);
203 fread(u11Read, ndim*gvol*sizeof(Complex), 1, con);
204 fread(u12Read, ndim*gvol*sizeof(Complex), 1, con);
205 //The seed array will be used to gather and sort the seeds from each rank so they can be in a continuation run
206 //If less processors are used then only nproc seeds are used (breaking the Markov Chain)
207 //If more processors are used then we use the first seed to generate the rest as in Par_ranset
208#ifdef __RANLUX__
209 unsigned long *seed_array=(unsigned long*)calloc(nproc,sizeof(seed));
210#elif defined __INTEL_MKL__ && !defined USE_RAN2
211 int *seed_array=(int *)calloc(nproc,sizeof(seed));
212#else
213 long *seed_array=(long*)calloc(nproc,sizeof(seed));
214#endif
215 for(int i=0; i<fmin(old_nproc,nproc);i++)
216 fread(seed_array+i, sizeof(seed), 1, con);
217 fclose(con);
218 //Any remaining processors get their initial value set as is done in Par_ranset
219 for(int i=old_nproc; i<nproc; i++)
220 seed_array[i] = seed_array[0]*(1.0f+8.0f*(float)i/(float)(size-1));
221 if(!rank)
222 seed=seed_array[0];
223#if(nproc>1)
224 for(int iproc = 1; iproc<nproc; iproc++)
225 if(MPI_Send(&seed_array[iproc], 1, MPI_SEED_TYPE,iproc, 1, comm)){
226 fprintf(stderr, "Error %i in %s: Failed to send seed to process %i.\nExiting...\n\n",
227 CANTSEND, funcname, iproc);
228 MPI_Abort(comm,CANTSEND);
229 }
230#endif
231
232 for(int iproc = 0; iproc < nproc; iproc++)
233 for(int idim = 0; idim < ndim; idim++){
234 int i = 0;
235 //Index order is reversed from FORTRAN for performance
236 //Going to split up assigning icoord[i] to reduce the
237 //number of assignments.
238 //We're weaving our way through the memory here, converting
239 //between lattice and memory coordinates
240 for(int it=pstart[3][iproc]; it<pstop[3][iproc]; it++)
241 for(int iz=pstart[2][iproc]; iz<pstop[2][iproc]; iz++)
242 for(int iy=pstart[1][iproc]; iy<pstop[1][iproc]; iy++)
243 for(int ix=pstart[0][iproc]; ix<pstop[0][iproc]; ix++){
244 //j is the relative memory index of icoord
245 int j = Coord2gindex(ix,iy,iz,it);
246 u1buff[i]=u11Read[idim*gvol+j];
247 u2buff[i]=u12Read[idim*gvol+j];
248 //C starts counting from zero, not 1 so increment afterwards or start at int i=-1
249 i++;
250 }
251 if(i!=kvol){
252 fprintf(stderr, "Error %i in %s: Number of elements %i is not equal to\
253 kvol %i.\nExiting...\n\n", NUMELEM, funcname, i, kvol);
254#if(nproc>1)
255 MPI_Abort(comm,NUMELEM);
256#else
257 exit(NUMELEM);
258#endif
259 }
260 if(!iproc){
261#if defined USE_BLAS
262 cblas_zcopy(kvol,u1buff,1,u11+idim,ndim);
263 cblas_zcopy(kvol,u2buff,1,u12+idim,ndim);
264#else
265#pragma omp simd aligned(u11,u12,u1buff,u2buff:AVX)
266 for(i=0;i<kvol;i++){
267 u11[i*ndim+idim]=u1buff[i];
268 u12[i*ndim+idim]=u2buff[i];
269 }
270#endif
271 }
272#if(nproc>1)
273 else{
274 //The master thread did all the hard work, the minions just need to receive their
275 //data and go.
276 if(MPI_Send(u1buff, kvol, MPI_C_DOUBLE_COMPLEX,iproc, 2*idim, comm)){
277 fprintf(stderr, "Error %i in %s: Failed to send ubuff to process %i.\nExiting...\n\n",
278 CANTSEND, funcname, iproc);
279#if(nproc>1)
280 MPI_Abort(comm,CANTSEND);
281#else
282 exit(CANTSEND);
283#endif
284 }
285 if(MPI_Send(u2buff, kvol, MPI_C_DOUBLE_COMPLEX,iproc, 2*idim+1, comm)){
286 fprintf(stderr, "Error %i in %s: Failed to send ubuff to process %i.\nExiting...\n\n",
287 CANTSEND, funcname, iproc);
288#if(nproc>1)
289 MPI_Abort(comm,CANTSEND);
290#else
291 exit(CANTSEND);
292#endif
293 }
294 }
295#endif
296 }
297 free(u11Read); free(u12Read);
298 free(seed_array);
299 }
300#if(nproc>1)
301 else{
302 if(MPI_Recv(&seed, 1, MPI_SEED_TYPE, masterproc, 1, comm, &status)){
 303 fprintf(stderr, "Error %i in %s: Failed to receive seed on process %i.\nExiting...\n\n",
304 CANTRECV, funcname, rank);
305#if(nproc>1)
306 MPI_Abort(comm,CANTRECV);
307#else
308 exit(CANTRECV);
309#endif
310 }
311 for(int idim = 0; idim<ndim; idim++){
312 //Receiving the data from the master threads.
313 if(MPI_Recv(u1buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim, comm, &status)){
 314 fprintf(stderr, "Error %i in %s: Failed to receive u11 on process %i.\nExiting...\n\n",
315 CANTRECV, funcname, rank);
316 MPI_Abort(comm,CANTRECV);
317 }
318 if(MPI_Recv(u2buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim+1, comm, &status)){
 319 fprintf(stderr, "Error %i in %s: Failed to receive u12 on process %i.\nExiting...\n\n",
320 CANTRECV, funcname, rank);
321 MPI_Abort(comm,CANTRECV);
322 }
323#if defined USE_BLAS
324 cblas_zcopy(kvol,u1buff,1,u11+idim,ndim);
325 cblas_zcopy(kvol,u2buff,1,u12+idim,ndim);
326#else
327#pragma omp parallel for simd aligned(u11,u12,u1buff,u2buff:AVX)
328 for(int i=0;i<kvol;i++){
329 u11[i*ndim+idim]=u1buff[i];
330 u12[i*ndim+idim]=u2buff[i];
331 }
332#endif
333 }
334 }
335#endif
336 free(u1buff); free(u2buff);
337 memcpy(u11t, u11, ndim*kvol*sizeof(Complex));
338 memcpy(u12t, u12, ndim*kvol*sizeof(Complex));
339 return 0;
340}
int Coord2gindex(int ix, int iy, int iz, int it)
Converts the coordinates of a global lattice point to its index in the computer memory.
Definition coord.c:469
#define masterproc
The main rank. Used for serial tasks.
Definition par_mpi.h:40
long seed
RAN2 seed.
Definition random.c:27
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define gvol
Lattice volume.
Definition sizes.h:92
#define FILELEN
Default file name length.
Definition sizes.h:62

References AVX, Complex, Coord2gindex(), FILELEN, gvol, kvol, masterproc, ndim, nproc, nt, nx, pstart, pstop, rank, seed, and size.


◆ Par_swrite()

int Par_swrite ( const int itraj,
const int icheck,
const float beta,
const float fmu,
const float akappa,
const Complex_f ajq,
Complex * u11,
Complex * u12 )

Copies u11 and u12 into arrays without halos, which are then written to output.

Modified from an original version of swrite in FORTRAN

Parameters
itraj: Trajectory to write
icheck: Not currently used; it has not yet been removed
beta: Inverse gauge coupling
fmu: Chemical potential
akappa: Hopping parameter
ajq: Diquark source
u11,u12: Gauge fields
Returns
Zero on success, integer error code otherwise
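
The checkpoint written below is a flat binary file: one int recording how many MPI ranks wrote it, then the full u11 and u12 fields without halos (ndim*gvol Complex entries each, indexed by direction and Coord2gindex site), then one RNG seed per rank. A hedged reader sketch, assuming buffers allocated as in Par_sread and skipping the error checks and the __RANLUX__/MKL seed-type variants:

//Hedged sketch of reading back the layout Par_swrite produces
FILE *con = fopen(gauge_file, "rb");
int old_nproc;
fread(&old_nproc, sizeof(int), 1, con);              //ranks used for the write
fread(u11Read, ndim*gvol*sizeof(Complex), 1, con);   //gauge fields, halo-free
fread(u12Read, ndim*gvol*sizeof(Complex), 1, con);
fread(seed_array, nproc*sizeof(seed), 1, con);       //per-rank seeds for continuation runs
fclose(con);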

Definition at line 341 of file par_mpi.c.

342 {
343 /*
344 * @brief Copies u11 and u12 into arrays without halos which then get written to output
345 *
346 * Modified from an original version of swrite in FORTRAN
347 *
348 * @param itraj: Trajectory to write
349 * @param icheck: Not currently used but haven't gotten around to removing it
350 * @param beta: Inverse gauge coupling
351 * @param fmu: Chemical potential
352 * @param akappa: Hopping parameter
353 * @param ajq: Diquark source
354 * @param u11,u12: Gauge fields
355 *
356 * @return Zero on success, integer error code otherwise
357 */
358 const char *funcname = "par_swrite";
359 #if (nproc>1)
360 MPI_Status status;
361 //Used for seed array later on
362 MPI_Datatype MPI_SEED_TYPE = (sizeof(seed)==sizeof(int)) ? MPI_INT:MPI_LONG;
363 #endif
364 Complex *u1buff = (Complex *)aligned_alloc(AVX,kvol*sizeof(Complex));
365 Complex *u2buff = (Complex *)aligned_alloc(AVX,kvol*sizeof(Complex));
366#ifdef _DEBUG
367 char dump_prefix[FILELEN]="u11.";
368 char dump_buff[32];
369 sprintf(dump_buff,"r%01d_c%06d",rank,itraj);
370 strcat(dump_prefix,dump_buff);
371 FILE *gauge_dump=fopen(dump_prefix,"wb");
372 //Print the local trial field in the order it is stored in memory.
373 //This is not the same order as it is stored in secondary storage
374 fwrite(u11,ndim*kvol*sizeof(Complex),1,gauge_dump);
375 fclose(gauge_dump);
376#endif
377#ifdef __RANLUX__
378 seed=gsl_rng_get(ranlux_instd);
379#endif
380 if(!rank){
381 //Array to store the seeds. nth index is the nth processor
382#ifdef __RANLUX__
383 unsigned long *seed_array=(unsigned long*)calloc(nproc,sizeof(seed));
384#elif defined __INTEL_MKL__ && !defined USE_RAN2
385 int *seed_array=(int *)calloc(nproc,sizeof(seed));
386#else
387 long *seed_array=(long*)calloc(nproc,sizeof(seed));
388#endif
389 seed_array[0]=seed;
390#if(nproc>1)
391 for(int iproc = 1; iproc<nproc; iproc++)
392 if(MPI_Recv(&seed_array[iproc], 1, MPI_SEED_TYPE,iproc, 1, comm, &status)){
393 fprintf(stderr, "Error %i in %s: Failed to receive seed from process %i.\nExiting...\n\n",
394 CANTRECV, funcname, iproc);
395 MPI_Abort(comm,CANTRECV);
396 }
397#endif
398 Complex *u11Write = (Complex *)aligned_alloc(AVX,ndim*gvol*sizeof(Complex));
399 Complex *u12Write = (Complex *)aligned_alloc(AVX,ndim*gvol*sizeof(Complex));
400 //Get correct parts of u11read etc from remote processors
401 for(int iproc=0;iproc<nproc;iproc++)
402 for(int idim=0;idim<ndim;idim++){
403#if(nproc>1)
404 if(iproc){
405 if(MPI_Recv(u1buff, kvol, MPI_C_DOUBLE_COMPLEX, iproc, 2*idim, comm, &status)){
 406 fprintf(stderr, "Error %i in %s: Failed to receive u11 from process %i.\nExiting...\n\n",
407 CANTRECV, funcname, iproc);
408 MPI_Abort(comm,CANTRECV);
409 }
410 if(MPI_Recv(u2buff, kvol, MPI_C_DOUBLE_COMPLEX, iproc, 2*idim+1, comm, &status)){
 411 fprintf(stderr, "Error %i in %s: Failed to receive u12 from process %i.\nExiting...\n\n",
412 CANTRECV, funcname, iproc);
413 MPI_Abort(comm,CANTRECV);
414 }
415 }
416 else{
417#endif
418 //No need to do MPI Send/Receive on the master rank
419 //Array looping is slow so we use memcpy instead
420#if defined USE_BLAS
421 cblas_zcopy(kvol,u11+idim,ndim,u1buff,1);
422 cblas_zcopy(kvol,u12+idim,ndim,u2buff,1);
423#else
424#pragma omp parallel for simd aligned(u11,u12,u1buff,u2buff:AVX)
425 for(int i=0;i<kvol;i++){
426 u1buff[i]=u11[i*ndim+idim];
427 u2buff[i]=u12[i*ndim+idim];
428 }
429#endif
430#ifdef _DEBUG
431 char part_dump[FILELEN]="";
432 strcat(part_dump,dump_prefix);
433 sprintf(dump_buff,"_d%d",idim);
434 strcat(part_dump,dump_buff);
435 FILE *pdump=fopen(part_dump,"wb");
436 fwrite(u1buff,ndim*kvol*sizeof(Complex),1,pdump);
437 fclose(pdump);
438#endif
439#if(nproc>1)
440 }
441#endif
442 int i=0;
443 for(int it=pstart[3][iproc]; it<pstop[3][iproc]; it++)
444 for(int iz=pstart[2][iproc]; iz<pstop[2][iproc]; iz++)
445 for(int iy=pstart[1][iproc]; iy<pstop[1][iproc]; iy++)
446 for(int ix=pstart[0][iproc]; ix<pstop[0][iproc]; ix++){
447 //j is the relative memory index of icoord
448 int j = Coord2gindex(ix, iy, iz, it);
449 u11Write[idim*gvol+j] = u1buff[i];
450 u12Write[idim*gvol+j] = u2buff[i];
451 //C starts counting from zero, not 1 so increment afterwards or start at int i=-1
452 i++;
453 }
454 if(i!=kvol){
455 fprintf(stderr, "Error %i in %s: Number of elements %i is not equal to\
456 kvol %i.\nExiting...\n\n", NUMELEM, funcname, i, kvol);
457#if(nproc>1)
458 MPI_Abort(comm,NUMELEM);
459#else
460 exit(NUMELEM);
461#endif
462 }
463 }
464 free(u1buff); free(u2buff);
465
466 char gauge_title[FILELEN]="config.";
467 int buffer; char buff2[7];
468 //Add script for extracting correct mu, j etc.
469 buffer = (int)round(100*beta);
470 sprintf(buff2,"b%03d",buffer);
471 strcat(gauge_title,buff2);
472 //κ
473 buffer = (int)round(10000*akappa);
474 sprintf(buff2,"k%04d",buffer);
475 strcat(gauge_title,buff2);
476 //μ
477 buffer = (int)round(1000*fmu);
478 sprintf(buff2,"mu%04d",buffer);
479 strcat(gauge_title,buff2);
480 //J
481 buffer = (int)round(1000*creal(ajq));
482 sprintf(buff2,"j%03d",buffer);
483 strcat(gauge_title,buff2);
484 //nx
485 sprintf(buff2,"s%02d",nx);
486 strcat(gauge_title,buff2);
487 //nt
488 sprintf(buff2,"t%02d",nt);
489 strcat(gauge_title,buff2);
490
491 char gauge_file[FILELEN];
492 strcpy(gauge_file,gauge_title);
493 char c[8];
494 sprintf(c,".%06d", itraj);
495 strcat(gauge_file, c);
496 printf("Gauge file name is %s\n", gauge_file);
497 printf("Writing the gauge file on processor %i.\n", rank);
498 FILE *con;
499 char *fileop = "wb";
500 if(!(con=fopen(gauge_file, fileop))){
501 fprintf(stderr, "Error %i in %s: Failed to open %s for %s.\
502 \nExiting...\n\n", OPENERROR, funcname, gauge_file, fileop);
503#if(nproc>1)
504 MPI_Abort(comm,OPENERROR);
505#else
506 exit(OPENERROR);
507#endif
508 }
509 //TODO: SAFETY CHECKS FOR EACH WRITE OPERATION
510 //Write the number of processors used in the previous run. This takes the place of the FORTRAN integer rather nicely
511#if(nproc==1)
512 int size=nproc;
513#endif
514 fwrite(&size,sizeof(int),1,con);
515 fwrite(u11Write, ndim*gvol*sizeof(Complex), 1, con);
516 fwrite(u12Write, ndim*gvol*sizeof(Complex), 1, con);
517 //TODO
518 //Make a seed array, where the nth component is the seed on the nth rank for continuation runs.
519 fwrite(seed_array, nproc*sizeof(seed), 1, con);
520 fclose(con);
521 free(u11Write); free(u12Write);
522 free(seed_array);
523 }
524#if(nproc>1)
525 else{
526 if(MPI_Send(&seed, 1, MPI_SEED_TYPE, masterproc, 1, comm)){
 527 fprintf(stderr, "Error %i in %s: Failed to send seed from process %i.\nExiting...\n\n",
528 CANTSEND, funcname, rank);
529 MPI_Abort(comm,CANTSEND);
530 }
531 for(int idim = 0; idim<ndim; idim++){
532#if defined USE_BLAS
533 cblas_zcopy(kvol,u11+idim,ndim,u1buff,1);
534 cblas_zcopy(kvol,u12+idim,ndim,u2buff,1);
535#else
536#pragma omp parallel for simd aligned(u11,u12,u1buff,u2buff:AVX)
537 for(int i=0;i<kvol;i++){
538 u1buff[i]=u11[i*ndim+idim];
539 u2buff[i]=u12[i*ndim+idim];
540 }
541#endif
542#ifdef _DEBUG
543 char part_dump[FILELEN]="";
544 strcat(part_dump,dump_prefix);
545 sprintf(dump_buff,"_d%d",idim);
546 strcat(part_dump,dump_buff);
547 FILE *pdump=fopen(part_dump,"wb");
548 fwrite(u1buff,ndim*kvol*sizeof(Complex),1,pdump);
549 fclose(pdump);
550#endif
551 int i=0;
552 if(MPI_Send(u1buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim, comm)){
 553 fprintf(stderr, "Error %i in %s: Failed to send u11 from process %i.\nExiting...\n\n",
554 CANTSEND, funcname, rank);
555 MPI_Abort(comm,CANTSEND);
556 }
557 if(MPI_Send(u2buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim+1, comm)){
 558 fprintf(stderr, "Error %i in %s: Failed to send u12 from process %i.\nExiting...\n\n",
559 CANTSEND, funcname, rank);
560 MPI_Abort(comm,CANTSEND);
561 }
562 }
563 free(u1buff); free(u2buff);
564 }
565#endif
566 return 0;
567}

References AVX, Complex, Coord2gindex(), FILELEN, gvol, kvol, masterproc, ndim, nproc, nt, nx, pstart, pstop, rank, seed, and size.


◆ Trial_Exchange()

int Trial_Exchange ( Complex * u11t,
Complex * u12t,
Complex_f * u11t_f,
Complex_f * u12t_f )

Exchanges the trial fields.

I noticed that this halo exchange was happening even though the trial fields hadn't been updated. To get around this I'm making a function that does the halo exchange and only calling it after the trial fields get updated.

Parameters
u11t,u12t: Double precision trial fields
u11t_f,u12t_f: Single precision trial fields
Returns
Zero on success, integer error code otherwise
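
A hedged usage sketch of the pattern described above; Some_trial_update() is a stand-in for whatever routine actually modifies the trial fields and is not part of this code:

//Update the trial fields locally first...
Some_trial_update(u11t, u12t);
//...then do the halo exchange and single-precision refresh exactly once
Trial_Exchange(u11t, u12t, u11t_f, u12t_f);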

Definition at line 1178 of file par_mpi.c.

1178 {
1179 /*
1180 * Exchanges the trial fields. I noticed that this halo exchange was happening
1181 * even though the trial fields hadn't been updated. To get around this
1182 * I'm making a function that does the halo exchange and only calling it after
1183 * the trial fields get updated.
1184 */
 1185 const char *funcname = "Trial_Exchange";
1186 //Prefetch the trial fields from the GPU, halos come later
1187#if(nproc>1)
1188#ifdef __NVCC__
1189 int device=-1;
1190 cudaGetDevice(&device);
1191 cudaMemPrefetchAsync(u11t, ndim*kvol*sizeof(Complex),cudaCpuDeviceId,NULL);
1192 cudaMemPrefetchAsync(u12t, ndim*kvol*sizeof(Complex),cudaCpuDeviceId,NULL);
1193#endif
1194 Complex *z = (Complex *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex));
1195 for(int mu=0;mu<ndim;mu++){
1196 //Copy the column from u11t
1197#ifdef USE_BLAS
1198 cblas_zcopy(kvol, &u11t[mu], ndim, z, 1);
1199#else
1200 for(int i=0; i<kvol;i++)
1201 z[i]=u11t[i*ndim+mu];
1202#endif
1203 //Halo exchange on that column
1204 ZHalo_swap_all(z, 1);
1205 //And the swap back
1206#ifdef USE_BLAS
1207 cblas_zcopy(kvol+halo, z, 1, &u11t[mu], ndim);
1208 //Now we prefetch the halo
1209#ifdef __NVCC__
1210 cudaMemPrefetchAsync(u11t+ndim*kvol, ndim*halo*sizeof(Complex),device,NULL);
1211#endif
1212 //Repeat for u12t
1213 cblas_zcopy(kvol, &u12t[mu], ndim, z, 1);
1214#else
1215 for(int i=0; i<kvol+halo;i++){
1216 u11t[i*ndim+mu]=z[i];
1217 z[i]=u12t[i*ndim+mu];
1218 }
1219#endif
1220 ZHalo_swap_all(z, 1);
1221#ifdef USE_BLAS
1222 cblas_zcopy(kvol+halo, z, 1, &u12t[mu], ndim);
1223#else
1224 for(int i=0; i<kvol+halo;i++)
1225 u12t[i*ndim+mu]=z[i];
1226#endif
1227 }
1228 //Now we prefetch the halo
1229#ifdef __NVCC__
1230 cudaMemPrefetchAsync(u12t+ndim*kvol, ndim*halo*sizeof(Complex),device,NULL);
1231#endif
1232 free(z);
1233#endif
 1234//And get the single precision gauge fields prepped
1235#ifdef __NVCC__
1236 cuComplex_convert(u11t_f,u11t,ndim*(kvol+halo),true,dimBlock,dimGrid);
1237 cuComplex_convert(u12t_f,u12t,ndim*(kvol+halo),true,dimBlock,dimGrid);
1238 cudaDeviceSynchronise();
1239#else
1240#pragma omp parallel for simd aligned(u11t_f,u12t_f,u11t,u12t:AVX)
1241 for(int i=0;i<ndim*(kvol+halo);i++){
1242 u11t_f[i]=(Complex_f)u11t[i];
1243 u12t_f[i]=(Complex_f)u12t[i];
1244 }
1245#endif
1246 return 0;
1247}
int ZHalo_swap_all(Complex *z, int ncpt)
Calls the functions to send data to both the up and down halos.
#define halo
Total Halo size.
Definition sizes.h:222
#define Complex_f
Single precision complex number.
Definition sizes.h:56

References AVX, Complex, Complex_f, halo, kvol, ndim, and ZHalo_swap_all().


Variable Documentation

◆ pstart

int pstart[ndim][nproc]

The initial lattice site on each sublattice in a given direction.

Definition at line 20 of file par_mpi.c.

◆ pcoord

int* pcoord

The processor grid.

Definition at line 19 of file par_mpi.c.

◆ rank

int rank

The MPI rank.

Definition at line 22 of file par_mpi.c.

◆ size

int size

The number of MPI ranks in total.

Definition at line 22 of file par_mpi.c.