su2hmc
par_mpi.c File Reference

MPI routines.

#include <par_mpi.h>
#include <random.h>
#include <su2hmc.h>

Functions

int Par_begin (int argc, char *argv[])
 Initialises the MPI configuration.
 
int Par_sread (const int iread, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t)
 Reads and assigns the gauges from file.
 
int Par_swrite (const int itraj, const int icheck, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12)
 Copies u11 and u12 into arrays without halos, which are then written to output.
 
int Trial_Exchange (Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f)
 Exchanges the trial fields.
 

Variables

int * pcoord
 The processor grid.
 
int pstart[ndim][nproc]
 The initial lattice site on each sublattice in a given direction.
 
int rank
 The MPI rank.
 
int size
 The number of MPI ranks in total.
 

Detailed Description

MPI routines.

Definition in file par_mpi.c.
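
For orientation, the sketch below shows the order in which the routines documented here are typically combined in a driver. It is a hedged illustration only: the allocation sizes follow the listings below (ndim*(kvol+halo) components per field), but the parameter values, trajectory numbers and the driver itself are placeholders rather than the programme's actual setup code.

#include <par_mpi.h>
#include <su2hmc.h>
#include <stdlib.h>
//Hedged sketch of the calling order implied by the briefs above; not the real main()
int main(int argc, char *argv[]){
	//Set up MPI, the Cartesian topology and the rank/size/pcoord/pstart/pstop globals
	Par_begin(argc, argv);
	//Gauge and trial fields, sized with room for the halo
	Complex *u11 =(Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
	Complex *u12 =(Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
	Complex *u11t=(Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
	Complex *u12t=(Complex *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex));
	Complex_f *u11t_f=(Complex_f *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex_f));
	Complex_f *u12t_f=(Complex_f *)aligned_alloc(AVX,ndim*(kvol+halo)*sizeof(Complex_f));
	//Read configuration 100 into u11/u12; Par_sread also copies it into the trial fields
	Par_sread(100, 1.7f, 0.0f, 0.1780f, 0.001f, u11, u12, u11t, u12t);
	//After any change to the trial fields, exchange halos and refresh the float copies
	Trial_Exchange(u11t, u12t, u11t_f, u12t_f);
	//Checkpoint the gauge fields as trajectory 101 (icheck is currently unused)
	Par_swrite(101, 1, 1.7f, 0.0f, 0.1780f, 0.001f, u11, u12);
	//Frees and MPI shutdown omitted
	return 0;
}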

Function Documentation

◆ Par_begin()

int Par_begin ( int argc,
char * argv[] )

Initialises the MPI configuration.

Parameters
argc: Number of arguments given to the programme
argv: Array of arguments
Returns
Zero on success, integer error code otherwise.
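
As a concrete illustration of the sub-lattice decomposition that the listing below records in pstart and pstop, suppose (hypothetically, these are not the build's sizes.h values) nx=12 and npx=3, so ksizex=4. The rank with Cartesian x-coordinate 2 then owns global x sites 8 to 11:

//Hedged worked example with made-up extents: nx=12, npx=3, so ksizex=nx/npx=4
int lsize_x = 12/3;               //ksizex
int coord_x = 2;                  //this rank's Cartesian coordinate in x
int start_x = coord_x*lsize_x;    //pstart[0][iproc] = 8
int stop_x  = start_x + lsize_x;  //pstop[0][iproc]  = 12 (exclusive)
//Par_sread and Par_swrite loop this rank over ix = 8, 9, 10, 11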

Definition at line 25 of file par_mpi.c.

25 {
26 /* Initialises the MPI configuration
27 *
28 * Parameters:
29 * ---------
30 * int argc Number of arguments given to the programme
31 * char *argv[] Array of arguments
32 *
33 * Returns:
34 * -------
35 * Zero on success, integer error code otherwise.
36 */
37
38 //TODO: Remove as much non-MPI stuff from here as possible
39 const char *funcname = "Par_begin";
40 int size;
41#if(nproc>1)
42 if(MPI_Init(&argc, &argv)){
43 fprintf(stderr, "Error %i in %s: Failed to initialise MPI\nExiting\n\n", NO_MPI_INIT, funcname);
44 MPI_Abort(comm,NO_MPI_INIT);
45 exit(NO_MPI_INIT);
46 }
47
48 if(MPI_Comm_rank(comm, &rank)){
49 fprintf(stderr, "Error %i in %s: Failed to find rank.\nExiting...\n\n", NO_MPI_RANK, funcname);
50 MPI_Abort(comm,NO_MPI_RANK);
51 }
52 if(MPI_Comm_size(comm, &size)){
53 fprintf(stderr, "Error %i in %s: Failed to find size\nExiting...\n\n", NO_MPI_SIZE, funcname);
54 MPI_Abort(comm,NO_MPI_SIZE);
55 }
56#else
57 size=1; rank=0;
58#endif
59 //If size isn't the same as the max allowed number of processes, then there's a problem somewhere.
60 if(size!=nproc){
61 fprintf(stderr, "Error %i in %s: For process %i, size %i is not equal to nproc %i.\n"
62 "Exiting...\n\n", SIZEPROC, funcname, rank, size, nproc);
63#if(nproc>1)
64 MPI_Abort(comm,SIZEPROC);
65#else
66 exit(SIZEPROC);
67#endif
68 }
69 //gsize is the size of the system, lsize is the size of each MPI Grid
70 int gsize[4], lsize[4];
71 gsize[0]=nx; gsize[1]=ny; gsize[2]=nz; gsize[3]=nt;
72 lsize[0]=ksizex; lsize[1]=ksizey; lsize[2]=ksizez; lsize[3]=ksizet;
73
74 //Topology layout
75 int cartsize[ndim] __attribute__((aligned(AVX)));
76 cartsize[0]=npx; cartsize[1]=npy; cartsize[2]=npz; cartsize[3]=npt;
77
78 //For the topology, says if each dimension is periodic or not
79 //Probably for us everything will be but using the four vector
80 //gives the choice at least
81 int periods[ndim] __attribute__((aligned(AVX)));
82#pragma unroll
83 for(int i=0; i<ndim; i++)
84 periods[i] = true;
85 //Not going to change the rank order
86 int reorder = false;
87 //Declare the topology
88#if(nproc>1)
89 MPI_Comm commcart;
90 MPI_Cart_create(comm, ndim, cartsize, periods, reorder, &commcart);
91#endif
92
93 //Get nearest neighbours of processors
94#if(nproc>1)
95#pragma unroll
96 for(int i= 0; i<ndim; i++)
97 MPI_Cart_shift(commcart, i, 1, &pd[i], &pu[i]);
98#endif
99 //Get coordinates of processors in the grid
100 pcoord = (int*)aligned_alloc(AVX,ndim*nproc*sizeof(int));
101 memset(pcoord,0,sizeof(int)*ndim*nproc);
102#if(nproc>1)
103 for(int iproc = 0; iproc<nproc; iproc++){
104 MPI_Cart_coords(commcart, iproc, ndim, pcoord+iproc*ndim);
105#pragma omp simd aligned(pcoord:AVX)
106 for(int idim = 0; idim<ndim; idim++){
107 pstart[idim][iproc] = pcoord[idim+ndim*iproc]*lsize[idim];
108 pstop[idim][iproc] = pstart[idim][iproc] + lsize[idim];
109 }
110 }
111#else
112 //Set iproc=0 because we only have one proc
113 for(int idim = 0; idim<ndim; idim++){
114 pstart[idim][0] = 0;
115 pstop[idim][0] = lsize[idim];
116 }
117#endif
118#ifdef _DEBUG
119 if(!rank)
120 printf("Running on %i processors.\nGrid layout is %ix%ix%ix%i\n",
121 nproc, npx,npy,npz,npt);
122 printf("Rank: %i pu: %i %i %i %i pd: %i %i %i %i\n", rank, pu[0], pu[1], pu[2], pu[3],
123 pd[0], pd[1], pd[2], pd[3]);
124#endif
125 return 0;
126}
int size
The number of MPI ranks in total.
Definition par_mpi.c:22
int rank
The MPI rank.
Definition par_mpi.c:22
int * pcoord
The processor grid.
Definition par_mpi.c:19
int pstart[ndim][nproc]
The initial lattice site on each sublattice in a given direction.
int pu[ndim]
Processors in the up direction.
Definition par_mpi.c:23
int pstop[ndim][nproc]
The final lattice site on each sublattice in a given direction.
Definition par_mpi.c:21
int pd[ndim]
Processors in the down direction.
Definition par_mpi.c:24
#define ksizex
Sublattice x extent.
Definition sizes.h:139
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nt
Lattice temporal extent. This also corresponds to the inverse temperature.
Definition sizes.h:86
#define nproc
Number of processors for MPI.
Definition sizes.h:132
#define nx
Lattice x extent.
Definition sizes.h:66
#define ksizet
Sublattice t extent.
Definition sizes.h:149
#define npx
Processor grid x extent. This must be a divisor of nx.
Definition sizes.h:97
#define npz
Processor grid z extent.
Definition sizes.h:116
#define ksizez
Sublattice z extent.
Definition sizes.h:143
#define npy
Processor grid y extent.
Definition sizes.h:108
#define ndim
Dimensions.
Definition sizes.h:179
#define npt
Processor grid t extent.
Definition sizes.h:124
#define ksizey
Sublattice y extent.
Definition sizes.h:141
#define nz
Lattice z extent. We normally use cubic lattices so this is the same as nx.
Definition sizes.h:80
#define ny
Lattice y extent. We normally use cubic lattices so this is the same as nx.
Definition sizes.h:74

References AVX, ksizet, ksizex, ksizey, ksizez, ndim, nproc, npt, npx, npy, npz, nt, nx, ny, nz, pcoord, pd, pstart, pstop, pu, rank, and size.
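
Note that pcoord is left as a flat ndim*nproc array: rank iproc's Cartesian coordinate in direction idim sits at pcoord[idim+ndim*iproc], the layout filled in by MPI_Cart_coords above. A small hedged helper (not part of par_mpi.c) that reads it back:

//Hypothetical convenience wrapper around the pcoord layout set up by Par_begin
static inline int Proc_coord(const int iproc, const int idim){
	return pcoord[idim+ndim*iproc];
}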


◆ Par_sread()

int Par_sread ( const int iread,
const float beta,
const float fmu,
const float akappa,
const Complex_f ajq,
Complex * u11,
Complex * u12,
Complex * u11t,
Complex * u12t )

Reads and assigns the gauges from file.

Parameters
iread: Configuration to read in
beta: Inverse gauge coupling
fmu: Chemical potential
akappa: Hopping parameter
ajq: Diquark source
u11,u12: Gauge fields
u11t,u12t: Trial fields
Returns
Zero on success, integer error code otherwise
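
The routine builds the input file name from the run parameters via the strcat chain in the listing below. As a hedged worked example with purely illustrative values (beta=1.7, akappa=0.178, fmu=0.0, ajq=0.001, nx=16, nt=32, iread=500), a single sprintf equivalent to that chain gives:

//Hedged illustration only; the values are not defaults of the code
char name[FILELEN];
sprintf(name, "config.b%03dk%04dmu%04dj%03ds%02dt%02d.%06d",
		(int)round(100*1.7), (int)round(10000*0.178), (int)round(1000*0.0),
		(int)round(1000*0.001), 16, 32, 500);
//name is now "config.b170k1780mu0000j001s16t32.000500"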

Definition at line 127 of file par_mpi.c.

128 {
129 /*
130 * @brief Reads and assigns the gauges from file
131 *
132 * @param iread: Configuration to read in
133 * @param beta: Inverse gauge coupling
134 * @param fmu: Chemical potential
135 * @param akappa: Hopping parameter
136 * @param ajq: Diquark source
137 * @param u11,u12: Gauge fields
138 * @param u11t,u12t: Trial fields
139 *
140 * @return Zero on success, integer error code otherwise
141 */
142 const char *funcname = "Par_sread";
143#if(nproc>1)
144 MPI_Status status;
145 //For sending the seeds later
146 MPI_Datatype MPI_SEED_TYPE = (sizeof(seed)==sizeof(int)) ? MPI_INT:MPI_LONG;
147#endif
148 //We shall allow the almighty master thread to open the file
149 Complex *u1buff = (Complex *)aligned_alloc(AVX,kvol*sizeof(Complex));
150 Complex *u2buff = (Complex *)aligned_alloc(AVX,kvol*sizeof(Complex));
151 if(!rank){
152 //Containers for input. Only needed by the master rank
153 Complex *u11Read = (Complex *)aligned_alloc(AVX,ndim*gvol*sizeof(Complex));
154 Complex *u12Read = (Complex *)aligned_alloc(AVX,ndim*gvol*sizeof(Complex));
155 static char gauge_file[FILELEN]="config.";
156 int buffer; char buff2[7];
157 //Add script for extracting correct mu, j etc.
158 buffer = (int)round(100*beta);
159 sprintf(buff2,"b%03d",buffer);
160 strcat(gauge_file,buff2);
161 //κ
162 buffer = (int)round(10000*akappa);
163 sprintf(buff2,"k%04d",buffer);
164 strcat(gauge_file,buff2);
165 //μ
166 buffer = (int)round(1000*fmu);
167 sprintf(buff2,"mu%04d",buffer);
168 strcat(gauge_file,buff2);
169 //J
170 buffer = (int)round(1000*creal(ajq));
171 sprintf(buff2,"j%03d",buffer);
172 strcat(gauge_file,buff2);
173 //nx
174 sprintf(buff2,"s%02d",nx);
175 strcat(gauge_file,buff2);
176 //nt
177 sprintf(buff2,"t%02d",nt);
178 strcat(gauge_file,buff2);
179 //nconfig
180 char c[8];
181 sprintf(c,".%06d", iread);
182 strcat(gauge_file, c);
183
184 char *fileop = "rb";
185 printf("Opening gauge file on processor: %i\n",rank);
186 FILE *con;
187 if(!(con = fopen(gauge_file, fileop))){
188 fprintf(stderr, "Error %i in %s: Failed to open %s for %s.\
189 \nExiting...\n\n", OPENERROR, funcname, gauge_file, fileop);
190#if(nproc>1)
191 MPI_Abort(comm,OPENERROR);
192#endif
193 exit(OPENERROR);
194 }
195 //TODO: SAFETY CHECKS FOR EACH READ OPERATION
196 int old_nproc;
197 //What was previously the FORTRAN integer is now used to store the number of processors used to
198 //generate the configuration
199 fread(&old_nproc, sizeof(int), 1, con);
200 if(old_nproc!=nproc)
201 fprintf(stderr, "Warning %i in %s: Previous run was done on %i processors, current run uses %i.\n",\
202 DIFNPROC,funcname,old_nproc,nproc);
203 fread(u11Read, ndim*gvol*sizeof(Complex), 1, con);
204 fread(u12Read, ndim*gvol*sizeof(Complex), 1, con);
205 //The seed array will be used to gather and sort the seeds from each rank so they can be in a continuation run
206 //If less processors are used then only nproc seeds are used (breaking the Markov Chain)
207 //If more processors are used then we use the first seed to generate the rest as in Par_ranset
208#ifdef __RANLUX__
209 unsigned long *seed_array=(unsigned long*)calloc(nproc,sizeof(seed));
210#elif defined __INTEL_MKL__ && !defined USE_RAN2
211 int *seed_array=(int *)calloc(nproc,sizeof(seed));
212#else
213 long *seed_array=(long*)calloc(nproc,sizeof(seed));
214#endif
215 for(int i=0; i<fmin(old_nproc,nproc);i++)
216 fread(seed_array+i, sizeof(seed), 1, con);
217 fclose(con);
218 //Any remaining processors get their initial value set as is done in Par_ranset
219 for(int i=old_nproc; i<nproc; i++)
220 seed_array[i] = seed_array[0]*(1.0f+8.0f*(float)i/(float)(size-1));
221 if(!rank)
222 seed=seed_array[0];
223#if(nproc>1)
224 for(int iproc = 1; iproc<nproc; iproc++)
225 if(MPI_Send(&seed_array[iproc], 1, MPI_SEED_TYPE,iproc, 1, comm)){
226 fprintf(stderr, "Error %i in %s: Failed to send seed to process %i.\nExiting...\n\n",
227 CANTSEND, funcname, iproc);
228 MPI_Abort(comm,CANTSEND);
229 }
230#endif
231
232 for(int iproc = 0; iproc < nproc; iproc++)
233 for(int idim = 0; idim < ndim; idim++){
234 int i = 0;
235 //Index order is reversed from FORTRAN for performance
236 //Going to split up assigning icoord[i] to reduce the
237 //number of assignments.
238 //We're weaving our way through the memory here, converting
239 //between lattice and memory coordinates
240 for(int it=pstart[3][iproc]; it<pstop[3][iproc]; it++)
241 for(int iz=pstart[2][iproc]; iz<pstop[2][iproc]; iz++)
242 for(int iy=pstart[1][iproc]; iy<pstop[1][iproc]; iy++)
243 for(int ix=pstart[0][iproc]; ix<pstop[0][iproc]; ix++){
244 //j is the relative memory index of icoord
245 int j = Coord2gindex(ix,iy,iz,it);
246 u1buff[i]=u11Read[idim*gvol+j];
247 u2buff[i]=u12Read[idim*gvol+j];
248 //C starts counting from zero, not 1 so increment afterwards or start at int i=-1
249 i++;
250 }
251 if(i!=kvol){
252 fprintf(stderr, "Error %i in %s: Number of elements %i is not equal to\
253 kvol %i.\nExiting...\n\n", NUMELEM, funcname, i, kvol);
254#if(nproc>1)
255 MPI_Abort(comm,NUMELEM);
256#else
257 exit(NUMELEM);
258#endif
259 }
260 if(!iproc){
261#if defined USE_BLAS
262 cblas_zcopy(kvol,u1buff,1,u11+idim,ndim);
263 cblas_zcopy(kvol,u2buff,1,u12+idim,ndim);
264#else
265#pragma omp simd aligned(u11,u12,u1buff,u2buff:AVX)
266 for(i=0;i<kvol;i++){
267 u11[i*ndim+idim]=u1buff[i];
268 u12[i*ndim+idim]=u2buff[i];
269 }
270#endif
271 }
272#if(nproc>1)
273 else{
274 //The master thread did all the hard work, the minions just need to receive their
275 //data and go.
276 if(MPI_Send(u1buff, kvol, MPI_C_DOUBLE_COMPLEX,iproc, 2*idim, comm)){
277 fprintf(stderr, "Error %i in %s: Failed to send ubuff to process %i.\nExiting...\n\n",
278 CANTSEND, funcname, iproc);
279#if(nproc>1)
280 MPI_Abort(comm,CANTSEND);
281#else
282 exit(CANTSEND);
283#endif
284 }
285 if(MPI_Send(u2buff, kvol, MPI_C_DOUBLE_COMPLEX,iproc, 2*idim+1, comm)){
286 fprintf(stderr, "Error %i in %s: Failed to send ubuff to process %i.\nExiting...\n\n",
287 CANTSEND, funcname, iproc);
288#if(nproc>1)
289 MPI_Abort(comm,CANTSEND);
290#else
291 exit(CANTSEND);
292#endif
293 }
294 }
295#endif
296 }
297 free(u11Read); free(u12Read);
298 free(seed_array);
299 }
300#if(nproc>1)
301 else{
302 if(MPI_Recv(&seed, 1, MPI_SEED_TYPE, masterproc, 1, comm, &status)){
 303 fprintf(stderr, "Error %i in %s: Failed to receive seed on process %i.\nExiting...\n\n",
304 CANTRECV, funcname, rank);
305#if(nproc>1)
306 MPI_Abort(comm,CANTRECV);
307#else
308 exit(CANTRECV);
309#endif
310 }
311 for(int idim = 0; idim<ndim; idim++){
312 //Receiving the data from the master threads.
313 if(MPI_Recv(u1buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim, comm, &status)){
 314 fprintf(stderr, "Error %i in %s: Failed to receive u11 on process %i.\nExiting...\n\n",
315 CANTRECV, funcname, rank);
316 MPI_Abort(comm,CANTRECV);
317 }
318 if(MPI_Recv(u2buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim+1, comm, &status)){
 319 fprintf(stderr, "Error %i in %s: Failed to receive u12 on process %i.\nExiting...\n\n",
320 CANTRECV, funcname, rank);
321 MPI_Abort(comm,CANTRECV);
322 }
323#if defined USE_BLAS
324 cblas_zcopy(kvol,u1buff,1,u11+idim,ndim);
325 cblas_zcopy(kvol,u2buff,1,u12+idim,ndim);
326#else
327#pragma omp parallel for simd aligned(u11,u12,u1buff,u2buff:AVX)
328 for(int i=0;i<kvol;i++){
329 u11[i*ndim+idim]=u1buff[i];
330 u12[i*ndim+idim]=u2buff[i];
331 }
332#endif
333 }
334 }
335#endif
336 free(u1buff); free(u2buff);
337 memcpy(u11t, u11, ndim*kvol*sizeof(Complex));
338 memcpy(u12t, u12, ndim*kvol*sizeof(Complex));
339 return 0;
340}
int Coord2gindex(int ix, int iy, int iz, int it)
Converts the coordinates of a global lattice point to its index in the computer memory.
Definition coord.c:469
#define masterproc
The main rank. Used for serial tasks.
Definition par_mpi.h:40
long seed
RAN2 seed.
Definition random.c:27
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define gvol
Lattice volume.
Definition sizes.h:92
#define FILELEN
Default file name length.
Definition sizes.h:62

References AVX, Complex, Coord2gindex(), FILELEN, gvol, kvol, masterproc, ndim, nproc, nt, nx, pstart, pstop, rank, seed, and size.


◆ Par_swrite()

int Par_swrite ( const int itraj,
const int icheck,
const float beta,
const float fmu,
const float akappa,
const Complex_f ajq,
Complex * u11,
Complex * u12 )

Copies u11 and u12 into arrays without halos, which are then written to output.

Modified from an original version of swrite in FORTRAN

Parameters
itraj: Trajectory to write
icheck: Not currently used; it has not yet been removed
beta: Inverse gauge coupling
fmu: Chemical potential
akappa: Hopping parameter
ajq: Diquark source
u11,u12: Gauge fields
Returns
Zero on success, integer error code otherwise
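
The checkpoint written below is a flat binary file: one int recording how many MPI ranks wrote it, then the full u11 and u12 fields without halos (ndim*gvol Complex entries each, indexed by direction and Coord2gindex site), then one RNG seed per rank. A hedged reader sketch, assuming buffers allocated as in Par_sread and skipping the error checks and the __RANLUX__/MKL seed-type variants:

//Hedged sketch of reading back the layout Par_swrite produces
FILE *con = fopen(gauge_file, "rb");
int old_nproc;
fread(&old_nproc, sizeof(int), 1, con);              //ranks used for the write
fread(u11Read, ndim*gvol*sizeof(Complex), 1, con);   //gauge fields, halo-free
fread(u12Read, ndim*gvol*sizeof(Complex), 1, con);
fread(seed_array, nproc*sizeof(seed), 1, con);       //per-rank seeds for continuation runs
fclose(con);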

Definition at line 341 of file par_mpi.c.

342 {
343 /*
344 * @brief Copies u11 and u12 into arrays without halos which then get written to output
345 *
346 * Modified from an original version of swrite in FORTRAN
347 *
348 * @param itraj: Trajectory to write
349 * @param icheck: Not currently used but haven't gotten around to removing it
350 * @param beta: Inverse gauge coupling
351 * @param fmu: Chemical potential
352 * @param akappa: Hopping parameter
353 * @param ajq: Diquark source
354 * @param u11,u12: Gauge fields
355 *
356 * @return Zero on success, integer error code otherwise
357 */
358 const char *funcname = "par_swrite";
359 #if (nproc>1)
360 MPI_Status status;
361 //Used for seed array later on
362 MPI_Datatype MPI_SEED_TYPE = (sizeof(seed)==sizeof(int)) ? MPI_INT:MPI_LONG;
363 #endif
364 Complex *u1buff = (Complex *)aligned_alloc(AVX,kvol*sizeof(Complex));
365 Complex *u2buff = (Complex *)aligned_alloc(AVX,kvol*sizeof(Complex));
366#ifdef _DEBUG
367 char dump_prefix[FILELEN]="u11.";
368 char dump_buff[32];
369 sprintf(dump_buff,"r%01d_c%06d",rank,itraj);
370 strcat(dump_prefix,dump_buff);
371 FILE *gauge_dump=fopen(dump_prefix,"wb");
372 //Print the local trial field in the order it is stored in memory.
373 //This is not the same order as it is stored in secondary storage
374 fwrite(u11,ndim*kvol*sizeof(Complex),1,gauge_dump);
375 fclose(gauge_dump);
376#endif
377#ifdef __RANLUX__
378 seed=gsl_rng_get(ranlux_instd);
379#endif
380 if(!rank){
381 //Array to store the seeds. nth index is the nth processor
382#ifdef __RANLUX__
383 unsigned long *seed_array=(unsigned long*)calloc(nproc,sizeof(seed));
384#elif defined __INTEL_MKL__ && !defined USE_RAN2
385 int *seed_array=(int *)calloc(nproc,sizeof(seed));
386#else
387 long *seed_array=(long*)calloc(nproc,sizeof(seed));
388#endif
389 seed_array[0]=seed;
390#if(nproc>1)
391 for(int iproc = 1; iproc<nproc; iproc++)
392 if(MPI_Recv(&seed_array[iproc], 1, MPI_SEED_TYPE,iproc, 1, comm, &status)){
393 fprintf(stderr, "Error %i in %s: Failed to receive seed from process %i.\nExiting...\n\n",
394 CANTRECV, funcname, iproc);
395 MPI_Abort(comm,CANTRECV);
396 }
397#endif
398 Complex *u11Write = (Complex *)aligned_alloc(AVX,ndim*gvol*sizeof(Complex));
399 Complex *u12Write = (Complex *)aligned_alloc(AVX,ndim*gvol*sizeof(Complex));
400 //Get correct parts of u11read etc from remote processors
401 for(int iproc=0;iproc<nproc;iproc++)
402 for(int idim=0;idim<ndim;idim++){
403#if(nproc>1)
404 if(iproc){
405 if(MPI_Recv(u1buff, kvol, MPI_C_DOUBLE_COMPLEX, iproc, 2*idim, comm, &status)){
 406 fprintf(stderr, "Error %i in %s: Failed to receive u11 from process %i.\nExiting...\n\n",
407 CANTRECV, funcname, iproc);
408 MPI_Abort(comm,CANTRECV);
409 }
410 if(MPI_Recv(u2buff, kvol, MPI_C_DOUBLE_COMPLEX, iproc, 2*idim+1, comm, &status)){
 411 fprintf(stderr, "Error %i in %s: Failed to receive u12 from process %i.\nExiting...\n\n",
412 CANTRECV, funcname, iproc);
413 MPI_Abort(comm,CANTRECV);
414 }
415 }
416 else{
417#endif
418 //No need to do MPI Send/Receive on the master rank
419 //Array looping is slow so we use memcpy instead
420#if defined USE_BLAS
421 cblas_zcopy(kvol,u11+idim,ndim,u1buff,1);
422 cblas_zcopy(kvol,u12+idim,ndim,u2buff,1);
423#else
424#pragma omp parallel for simd aligned(u11,u12,u1buff,u2buff:AVX)
425 for(int i=0;i<kvol;i++){
426 u1buff[i]=u11[i*ndim+idim];
427 u2buff[i]=u12[i*ndim+idim];
428 }
429#endif
430#ifdef _DEBUG
431 char part_dump[FILELEN]="";
432 strcat(part_dump,dump_prefix);
433 sprintf(dump_buff,"_d%d",idim);
434 strcat(part_dump,dump_buff);
435 FILE *pdump=fopen(part_dump,"wb");
436 fwrite(u1buff,ndim*kvol*sizeof(Complex),1,pdump);
437 fclose(pdump);
438#endif
439#if(nproc>1)
440 }
441#endif
442 int i=0;
443 for(int it=pstart[3][iproc]; it<pstop[3][iproc]; it++)
444 for(int iz=pstart[2][iproc]; iz<pstop[2][iproc]; iz++)
445 for(int iy=pstart[1][iproc]; iy<pstop[1][iproc]; iy++)
446 for(int ix=pstart[0][iproc]; ix<pstop[0][iproc]; ix++){
447 //j is the relative memory index of icoord
448 int j = Coord2gindex(ix, iy, iz, it);
449 u11Write[idim*gvol+j] = u1buff[i];
450 u12Write[idim*gvol+j] = u2buff[i];
451 //C starts counting from zero, not 1 so increment afterwards or start at int i=-1
452 i++;
453 }
454 if(i!=kvol){
455 fprintf(stderr, "Error %i in %s: Number of elements %i is not equal to\
456 kvol %i.\nExiting...\n\n", NUMELEM, funcname, i, kvol);
457#if(nproc>1)
458 MPI_Abort(comm,NUMELEM);
459#else
460 exit(NUMELEM);
461#endif
462 }
463 }
464 free(u1buff); free(u2buff);
465
466 char gauge_title[FILELEN]="config.";
467 int buffer; char buff2[7];
468 //Add script for extracting correct mu, j etc.
469 buffer = (int)round(100*beta);
470 sprintf(buff2,"b%03d",buffer);
471 strcat(gauge_title,buff2);
472 //κ
473 buffer = (int)round(10000*akappa);
474 sprintf(buff2,"k%04d",buffer);
475 strcat(gauge_title,buff2);
476 //μ
477 buffer = (int)round(1000*fmu);
478 sprintf(buff2,"mu%04d",buffer);
479 strcat(gauge_title,buff2);
480 //J
481 buffer = (int)round(1000*creal(ajq));
482 sprintf(buff2,"j%03d",buffer);
483 strcat(gauge_title,buff2);
484 //nx
485 sprintf(buff2,"s%02d",nx);
486 strcat(gauge_title,buff2);
487 //nt
488 sprintf(buff2,"t%02d",nt);
489 strcat(gauge_title,buff2);
490
491 char gauge_file[FILELEN];
492 strcpy(gauge_file,gauge_title);
493 char c[8];
494 sprintf(c,".%06d", itraj);
495 strcat(gauge_file, c);
496 printf("Gauge file name is %s\n", gauge_file);
497 printf("Writing the gauge file on processor %i.\n", rank);
498 FILE *con;
499 char *fileop = "wb";
500 if(!(con=fopen(gauge_file, fileop))){
501 fprintf(stderr, "Error %i in %s: Failed to open %s for %s.\
502 \nExiting...\n\n", OPENERROR, funcname, gauge_file, fileop);
503#if(nproc>1)
504 MPI_Abort(comm,OPENERROR);
505#else
506 exit(OPENERROR);
507#endif
508 }
509 //TODO: SAFETY CHECKS FOR EACH WRITE OPERATION
510 //Write the number of processors used in the previous run. This takes the place of the FORTRAN integer rather nicely
511#if(nproc==1)
512 int size=nproc;
513#endif
514 fwrite(&size,sizeof(int),1,con);
515 fwrite(u11Write, ndim*gvol*sizeof(Complex), 1, con);
516 fwrite(u12Write, ndim*gvol*sizeof(Complex), 1, con);
517 //TODO
518 //Make a seed array, where the nth component is the seed on the nth rank for continuation runs.
519 fwrite(seed_array, nproc*sizeof(seed), 1, con);
520 fclose(con);
521 free(u11Write); free(u12Write);
522 free(seed_array);
523 }
524#if(nproc>1)
525 else{
526 if(MPI_Send(&seed, 1, MPI_SEED_TYPE, masterproc, 1, comm)){
 527 fprintf(stderr, "Error %i in %s: Failed to send seed from process %i.\nExiting...\n\n",
528 CANTSEND, funcname, rank);
529 MPI_Abort(comm,CANTSEND);
530 }
531 for(int idim = 0; idim<ndim; idim++){
532#if defined USE_BLAS
533 cblas_zcopy(kvol,u11+idim,ndim,u1buff,1);
534 cblas_zcopy(kvol,u12+idim,ndim,u2buff,1);
535#else
536#pragma omp parallel for simd aligned(u11,u12,u1buff,u2buff:AVX)
537 for(int i=0;i<kvol;i++){
538 u1buff[i]=u11[i*ndim+idim];
539 u2buff[i]=u12[i*ndim+idim];
540 }
541#endif
542#ifdef _DEBUG
543 char part_dump[FILELEN]="";
544 strcat(part_dump,dump_prefix);
545 sprintf(dump_buff,"_d%d",idim);
546 strcat(part_dump,dump_buff);
547 FILE *pdump=fopen(part_dump,"wb");
548 fwrite(u1buff,ndim*kvol*sizeof(Complex),1,pdump);
549 fclose(pdump);
550#endif
551 int i=0;
552 if(MPI_Send(u1buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim, comm)){
 553 fprintf(stderr, "Error %i in %s: Failed to send u11 from process %i.\nExiting...\n\n",
554 CANTSEND, funcname, rank);
555 MPI_Abort(comm,CANTSEND);
556 }
557 if(MPI_Send(u2buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim+1, comm)){
 558 fprintf(stderr, "Error %i in %s: Failed to send u12 from process %i.\nExiting...\n\n",
559 CANTSEND, funcname, rank);
560 MPI_Abort(comm,CANTSEND);
561 }
562 }
563 free(u1buff); free(u2buff);
564 }
565#endif
566 return 0;
567}

References AVX, Complex, Coord2gindex(), FILELEN, gvol, kvol, masterproc, ndim, nproc, nt, nx, pstart, pstop, rank, seed, and size.


◆ Trial_Exchange()

int Trial_Exchange ( Complex * u11t,
Complex * u12t,
Complex_f * u11t_f,
Complex_f * u12t_f )

Exchanges the trial fields.

I noticed that this halo exchange was happening even though the trial fields hadn't been updated. To get around this I'm making a function that does the halo exchange and only calling it after the trial fields get updated.

Parameters
u11t,u12t: Double precision trial fields
u11t_f,u12t_f: Single precision trial fields
Returns
Zero on success, integer error code otherwise
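
A hedged usage sketch of the pattern described above; Some_trial_update() is a stand-in for whatever routine actually modifies the trial fields and is not part of this code:

//Update the trial fields locally first...
Some_trial_update(u11t, u12t);
//...then do the halo exchange and single-precision refresh exactly once
Trial_Exchange(u11t, u12t, u11t_f, u12t_f);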

Definition at line 1178 of file par_mpi.c.

1178 {
1179 /*
1180 * Exchanges the trial fields. I noticed that this halo exchange was happening
1181 * even though the trial fields hadn't been updated. To get around this
1182 * I'm making a function that does the halo exchange and only calling it after
1183 * the trial fields get updated.
1184 */
 1185 const char *funcname = "Trial_Exchange";
1186 //Prefetch the trial fields from the GPU, halos come later
1187#if(nproc>1)
1188#ifdef __NVCC__
1189 int device=-1;
1190 cudaGetDevice(&device);
1191 cudaMemPrefetchAsync(u11t, ndim*kvol*sizeof(Complex),cudaCpuDeviceId,NULL);
1192 cudaMemPrefetchAsync(u12t, ndim*kvol*sizeof(Complex),cudaCpuDeviceId,NULL);
1193#endif
1194 Complex *z = (Complex *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex));
1195 for(int mu=0;mu<ndim;mu++){
1196 //Copy the column from u11t
1197#ifdef USE_BLAS
1198 cblas_zcopy(kvol, &u11t[mu], ndim, z, 1);
1199#else
1200 for(int i=0; i<kvol;i++)
1201 z[i]=u11t[i*ndim+mu];
1202#endif
1203 //Halo exchange on that column
1204 ZHalo_swap_all(z, 1);
1205 //And the swap back
1206#ifdef USE_BLAS
1207 cblas_zcopy(kvol+halo, z, 1, &u11t[mu], ndim);
1208 //Now we prefetch the halo
1209#ifdef __NVCC__
1210 cudaMemPrefetchAsync(u11t+ndim*kvol, ndim*halo*sizeof(Complex),device,NULL);
1211#endif
1212 //Repeat for u12t
1213 cblas_zcopy(kvol, &u12t[mu], ndim, z, 1);
1214#else
1215 for(int i=0; i<kvol+halo;i++){
1216 u11t[i*ndim+mu]=z[i];
1217 z[i]=u12t[i*ndim+mu];
1218 }
1219#endif
1220 ZHalo_swap_all(z, 1);
1221#ifdef USE_BLAS
1222 cblas_zcopy(kvol+halo, z, 1, &u12t[mu], ndim);
1223#else
1224 for(int i=0; i<kvol+halo;i++)
1225 u12t[i*ndim+mu]=z[i];
1226#endif
1227 }
1228 //Now we prefetch the halo
1229#ifdef __NVCC__
1230 cudaMemPrefetchAsync(u12t+ndim*kvol, ndim*halo*sizeof(Complex),device,NULL);
1231#endif
1232 free(z);
1233#endif
 1234//And get the single precision gauge fields prepped
1235#ifdef __NVCC__
1236 cuComplex_convert(u11t_f,u11t,ndim*(kvol+halo),true,dimBlock,dimGrid);
1237 cuComplex_convert(u12t_f,u12t,ndim*(kvol+halo),true,dimBlock,dimGrid);
1238 cudaDeviceSynchronise();
1239#else
1240#pragma omp parallel for simd aligned(u11t_f,u12t_f,u11t,u12t:AVX)
1241 for(int i=0;i<ndim*(kvol+halo);i++){
1242 u11t_f[i]=(Complex_f)u11t[i];
1243 u12t_f[i]=(Complex_f)u12t[i];
1244 }
1245#endif
1246 return 0;
1247}
int ZHalo_swap_all(Complex *z, int ncpt)
Calls the functions to send data to both the up and down halos.
#define halo
Total Halo size.
Definition sizes.h:222
#define Complex_f
Single precision complex number.
Definition sizes.h:56

References AVX, Complex, Complex_f, halo, kvol, ndim, and ZHalo_swap_all().


Variable Documentation

◆ pstart

int pstart[ndim][nproc]

The initial lattice site on each sublattice in a given direction.

Definition at line 20 of file par_mpi.c.

◆ pcoord

int* pcoord

The processor grid.

Definition at line 19 of file par_mpi.c.

◆ rank

int rank

The MPI rank.

Definition at line 22 of file par_mpi.c.

◆ size

int size

The number of MPI ranks in total.

Definition at line 22 of file par_mpi.c.