su2hmc
par_mpi.c
6#include <par_mpi.h>
7#include <random.h>
8#include <su2hmc.h>
9
10//NOTE: In FORTRAN code everything was capitalised (despite being case insensitive)
11//C is case sensitive, so the equivalent C command has the case format MPI_Xyz_abc
12//Non-commands (like MPI_COMM_WORLD) don't always change
13
14#if(nproc>1)
15MPI_Comm comm = MPI_COMM_WORLD;
16MPI_Request request;
17#endif
18
19int *pcoord;
20int pstart[ndim][nproc] __attribute__((aligned(AVX)));
21int pstop [ndim][nproc] __attribute__((aligned(AVX)));
22int rank, size;
23int pu[ndim] __attribute__((aligned(AVX)));
24int pd[ndim] __attribute__((aligned(AVX)));
25int Par_begin(int argc, char *argv[]){
26 /* Initialises the MPI configuration
27 *
28 * Parameters:
29 * ---------
30 * int argc Number of arguments given to the programme
31 * char *argv[] Array of arguments
32 *
33 * Returns:
34 * -------
35 * Zero on success, integer error code otherwise.
36 */
37
38 //TODO: Remove as much non-MPI stuff from here as possible
39 const char *funcname = "Par_begin";
40 //rank and size are file-scope globals (declared above) so later routines such as Par_swrite can reuse them
41#if(nproc>1)
42 if(MPI_Init(&argc, &argv)){
43 fprintf(stderr, "Error %i in %s: Failed to initialise MPI\nExiting\n\n", NO_MPI_INIT, funcname);
44 MPI_Abort(comm,NO_MPI_INIT);
45 exit(NO_MPI_INIT);
46 }
47
48 if(MPI_Comm_rank(comm, &rank)){
49 fprintf(stderr, "Error %i in %s: Failed to find rank.\nExiting...\n\n", NO_MPI_RANK, funcname);
50 MPI_Abort(comm,NO_MPI_RANK);
51 }
52 if(MPI_Comm_size(comm, &size)){
53 fprintf(stderr, "Error %i in %s: Failed to find size\nExiting...\n\n", NO_MPI_SIZE, funcname);
54 MPI_Abort(comm,NO_MPI_SIZE);
55 }
56#else
57 size=1; rank=0;
58#endif
59 //If size isn't the same as the max allowed number of processes, then there's a problem somewhere.
60 if(size!=nproc){
61 fprintf(stderr, "Error %i in %s: For process %i, size %i is not equal to nproc %i.\n"
62 "Exiting...\n\n", SIZEPROC, funcname, rank, size, nproc);
63#if(nproc>1)
64 MPI_Abort(comm,SIZEPROC);
65#else
66 exit(SIZEPROC);
67#endif
68 }
69 //gsize is the size of the system, lsize is the size of each MPI Grid
70 int gsize[4], lsize[4];
71 gsize[0]=nx; gsize[1]=ny; gsize[2]=nz; gsize[3]=nt;
72 lsize[0]=ksizex; lsize[1]=ksizey; lsize[2]=ksizez; lsize[3]=ksizet;
73
74 //Topology layout
75 int cartsize[ndim] __attribute__((aligned(AVX)));
76 cartsize[0]=npx; cartsize[1]=npy; cartsize[2]=npz; cartsize[3]=npt;
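 //The Cartesian grid is assumed to use every rank exactly once, i.e. npx*npy*npz*npt==nproc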
77
78 //For the topology, says if each dimension is periodic or not
79 //Probably for us everything will be periodic, but using the four vector
80 //gives the choice at least
81 int periods[ndim] __attribute__((aligned(AVX)));
82#pragma unroll
83 for(int i=0; i<ndim; i++)
84 periods[i] = true;
85 //Not going to change the rank order
86 int reorder = false;
87 //Declare the topology
88#if(nproc>1)
89 MPI_Comm commcart;
90 MPI_Cart_create(comm, ndim, cartsize, periods, reorder, &commcart);
91#endif
92
93 //Get nearest neighbours of processors
94#if(nproc>1)
95#pragma unroll
96 for(int i= 0; i<ndim; i++)
97 MPI_Cart_shift(commcart, i, 1, &pd[i], &pu[i]);
98#endif
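 //With periodic boundaries these shifts never return MPI_PROC_NULL, so pd[i] and pu[i] always
 //hold the ranks one step down and one step up along direction i of the Cartesian grid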
99 //Get coordinates of processors in the grid
100 pcoord = (int*)aligned_alloc(AVX,ndim*nproc*sizeof(int));
101 memset(pcoord,0,sizeof(int)*ndim*nproc);
102#if(nproc>1)
103 for(int iproc = 0; iproc<nproc; iproc++){
104 MPI_Cart_coords(commcart, iproc, ndim, pcoord+iproc*ndim);
105#pragma omp simd aligned(pcoord:AVX)
106 for(int idim = 0; idim<ndim; idim++){
107 pstart[idim][iproc] = pcoord[idim+ndim*iproc]*lsize[idim];
108 pstop[idim][iproc] = pstart[idim][iproc] + lsize[idim];
109 }
110 }
111#else
112 //Set iproc=0 because we only have one proc
113 for(int idim = 0; idim<ndim; idim++){
114 pstart[idim][0] = 0;
115 pstop[idim][0] = lsize[idim];
116 }
117#endif
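 //pstart/pstop now hold, for every rank, the first global lattice coordinate of its sublattice
 //and one past its last in each direction; Par_sread and Par_swrite below rely on this layout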
118#ifdef _DEBUG
119 if(!rank)
120 printf("Running on %i processors.\nGrid layout is %ix%ix%ix%i\n",
121 nproc, npx,npy,npz,npt);
122 printf("Rank: %i pu: %i %i %i %i pd: %i %i %i %i\n", rank, pu[0], pu[1], pu[2], pu[3],
123 pd[0], pd[1], pd[2], pd[3]);
124#endif
125 return 0;
126}
127int Par_sread(const int iread, const float beta, const float fmu, const float akappa, const Complex_f ajq,\
128 Complex *u11, Complex *u12, Complex *u11t, Complex *u12t){
129 /*
130 * @brief Reads and assigns the gauges from file
131 *
132 * @param iread: Configuration to read in
133 * @param beta: Inverse gauge coupling
134 * @param fmu: Chemical potential
135 * @param akappa: Hopping parameter
136 * @param ajq: Diquark source
137 * @param u11,u12: Gauge fields
138 * @param u11t,u12t: Trial fields
139 *
140 * @return Zero on success, integer error code otherwise
141 */
142 const char *funcname = "Par_sread";
143#if(nproc>1)
144 MPI_Status status;
145 //For sending the seeds later
146 MPI_Datatype MPI_SEED_TYPE = (sizeof(seed)==sizeof(int)) ? MPI_INT:MPI_LONG;
147#endif
148 //We shall allow the almighty master thread to open the file
149 Complex *u1buff = (Complex *)aligned_alloc(AVX,kvol*sizeof(Complex));
150 Complex *u2buff = (Complex *)aligned_alloc(AVX,kvol*sizeof(Complex));
151 if(!rank){
152 //Containers for input. Only needed by the master rank
153 Complex *u11Read = (Complex *)aligned_alloc(AVX,ndim*gvol*sizeof(Complex));
154 Complex *u12Read = (Complex *)aligned_alloc(AVX,ndim*gvol*sizeof(Complex));
155 static char gauge_file[FILELEN]="config.";
156 int buffer; char buff2[7];
157 //Add script for extracting correct mu, j etc.
158 buffer = (int)round(100*beta);
159 sprintf(buff2,"b%03d",buffer);
160 strcat(gauge_file,buff2);
161 //κ
162 buffer = (int)round(10000*akappa);
163 sprintf(buff2,"k%04d",buffer);
164 strcat(gauge_file,buff2);
165 //μ
166 buffer = (int)round(1000*fmu);
167 sprintf(buff2,"mu%04d",buffer);
168 strcat(gauge_file,buff2);
169 //J
170 buffer = (int)round(1000*creal(ajq));
171 sprintf(buff2,"j%03d",buffer);
172 strcat(gauge_file,buff2);
173 //nx
174 sprintf(buff2,"s%02d",nx);
175 strcat(gauge_file,buff2);
176 //nt
177 sprintf(buff2,"t%02d",nt);
178 strcat(gauge_file,buff2);
179 //nconfig
180 char c[8];
181 sprintf(c,".%06d", iread);
182 strcat(gauge_file, c);
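 //As an illustration, beta=1.7, akappa=0.178, fmu=0, ajq=0, nx=12, nt=16 and iread=42 would give
 //gauge_file = "config.b170k1780mu0000j000s12t16.000042"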
183
184 char *fileop = "rb";
185 printf("Opening gauge file on processor: %i\n",rank);
186 FILE *con;
187 if(!(con = fopen(gauge_file, fileop))){
188 fprintf(stderr, "Error %i in %s: Failed to open %s for %s.\
189 \nExiting...\n\n", OPENERROR, funcname, gauge_file, fileop);
190#if(nproc>1)
191 MPI_Abort(comm,OPENERROR);
192#endif
193 exit(OPENERROR);
194 }
195 //TODO: SAFETY CHECKS FOR EACH READ OPERATION
196 int old_nproc;
197 //What was previously the FORTRAN integer is now used to store the number of processors used to
198 //generate the configuration
199 fread(&old_nproc, sizeof(int), 1, con);
200 if(old_nproc!=nproc)
201 fprintf(stderr, "Warning %i in %s: Previous run was done on %i processors, current run uses %i.\n",\
202 DIFNPROC,funcname,old_nproc,nproc);
203 fread(u11Read, ndim*gvol*sizeof(Complex), 1, con);
204 fread(u12Read, ndim*gvol*sizeof(Complex), 1, con);
205 //The seed array will be used to gather and sort the seeds from each rank so they can be reused in a continuation run
206 //If fewer processors are used then only nproc seeds are used (breaking the Markov Chain)
207 //If more processors are used then we use the first seed to generate the rest as in Par_ranset
208#ifdef __RANLUX__
209 unsigned long *seed_array=(unsigned long*)calloc(nproc,sizeof(seed));
210#elif defined __INTEL_MKL__ && !defined USE_RAN2
211 int *seed_array=(int *)calloc(nproc,sizeof(seed));
212#else
213 long *seed_array=(long*)calloc(nproc,sizeof(seed));
214#endif
215 for(int i=0; i<fmin(old_nproc,nproc);i++)
216 fread(seed_array+i, sizeof(seed), 1, con);
217 fclose(con);
218 //Any remaining processors get their initial value set as is done in Par_ranset
219 for(int i=old_nproc; i<nproc; i++)
220 seed_array[i] = seed_array[0]*(1.0f+8.0f*(float)i/(float)(size-1));
221 if(!rank)
222 seed=seed_array[0];
223#if(nproc>1)
224 for(int iproc = 1; iproc<nproc; iproc++)
225 if(MPI_Send(&seed_array[iproc], 1, MPI_SEED_TYPE,iproc, 1, comm)){
226 fprintf(stderr, "Error %i in %s: Failed to send seed to process %i.\nExiting...\n\n",
227 CANTSEND, funcname, iproc);
228 MPI_Abort(comm,CANTSEND);
229 }
230#endif
231
232 for(int iproc = 0; iproc < nproc; iproc++)
233 for(int idim = 0; idim < ndim; idim++){
234 int i = 0;
235 //Index order is reversed from FORTRAN for performance
236 //Going to split up assigning icoord[i] to reduce the
237 //number of assignments.
238 //We're weaving our way through the memory here, converting
239 //between lattice and memory coordinates
240 for(int it=pstart[3][iproc]; it<pstop[3][iproc]; it++)
241 for(int iz=pstart[2][iproc]; iz<pstop[2][iproc]; iz++)
242 for(int iy=pstart[1][iproc]; iy<pstop[1][iproc]; iy++)
243 for(int ix=pstart[0][iproc]; ix<pstop[0][iproc]; ix++){
244 //j is the relative memory index of icoord
245 int j = Coord2gindex(ix,iy,iz,it);
246 u1buff[i]=u11Read[idim*gvol+j];
247 u2buff[i]=u12Read[idim*gvol+j];
248 //C starts counting from zero, not 1 so increment afterwards or start at int i=-1
249 i++;
250 }
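 //u1buff/u2buff now hold rank iproc's kvol sites for direction idim in local site order
 //(ix fastest), whereas the file stores the same data in global Coord2gindex order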
251 if(i!=kvol){
252 fprintf(stderr, "Error %i in %s: Number of elements %i is not equal to\
253 kvol %i.\nExiting...\n\n", NUMELEM, funcname, i, kvol);
254#if(nproc>1)
255 MPI_Abort(comm,NUMELEM);
256#else
257 exit(NUMELEM);
258#endif
259 }
260 if(!iproc){
261#if defined USE_BLAS
262 cblas_zcopy(kvol,u1buff,1,u11+idim,ndim);
263 cblas_zcopy(kvol,u2buff,1,u12+idim,ndim);
264#else
265#pragma omp simd aligned(u11,u12,u1buff,u2buff:AVX)
266 for(i=0;i<kvol;i++){
267 u11[i*ndim+idim]=u1buff[i];
268 u12[i*ndim+idim]=u2buff[i];
269 }
270#endif
271 }
272#if(nproc>1)
273 else{
274 //The master thread did all the hard work, the minions just need to receive their
275 //data and go.
276 if(MPI_Send(u1buff, kvol, MPI_C_DOUBLE_COMPLEX,iproc, 2*idim, comm)){
277 fprintf(stderr, "Error %i in %s: Failed to send ubuff to process %i.\nExiting...\n\n",
278 CANTSEND, funcname, iproc);
279#if(nproc>1)
280 MPI_Abort(comm,CANTSEND);
281#else
282 exit(CANTSEND);
283#endif
284 }
285 if(MPI_Send(u2buff, kvol, MPI_C_DOUBLE_COMPLEX,iproc, 2*idim+1, comm)){
286 fprintf(stderr, "Error %i in %s: Failed to send ubuff to process %i.\nExiting...\n\n",
287 CANTSEND, funcname, iproc);
288#if(nproc>1)
289 MPI_Abort(comm,CANTSEND);
290#else
291 exit(CANTSEND);
292#endif
293 }
294 }
295#endif
296 }
297 free(u11Read); free(u12Read);
298 free(seed_array);
299 }
300#if(nproc>1)
301 else{
302 if(MPI_Recv(&seed, 1, MPI_SEED_TYPE, masterproc, 1, comm, &status)){
303 fprintf(stderr, "Error %i in %s: Failed to receive seed on process %i.\nExiting...\n\n",
304 CANTRECV, funcname, rank);
305#if(nproc>1)
306 MPI_Abort(comm,CANTRECV);
307#else
308 exit(CANTRECV);
309#endif
310 }
311 for(int idim = 0; idim<ndim; idim++){
312 //Receiving the data from the master threads.
313 if(MPI_Recv(u1buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim, comm, &status)){
314 fprintf(stderr, "Error %i in %s: Failed to receive u11 on process %i.\nExiting...\n\n",
315 CANTRECV, funcname, rank);
316 MPI_Abort(comm,CANTRECV);
317 }
318 if(MPI_Recv(u2buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim+1, comm, &status)){
319 fprintf(stderr, "Error %i in %s: Failed to receive u12 on process %i.\nExiting...\n\n",
320 CANTRECV, funcname, rank);
321 MPI_Abort(comm,CANTRECV);
322 }
323#if defined USE_BLAS
324 cblas_zcopy(kvol,u1buff,1,u11+idim,ndim);
325 cblas_zcopy(kvol,u2buff,1,u12+idim,ndim);
326#else
327#pragma omp parallel for simd aligned(u11,u12,u1buff,u2buff:AVX)
328 for(int i=0;i<kvol;i++){
329 u11[i*ndim+idim]=u1buff[i];
330 u12[i*ndim+idim]=u2buff[i];
331 }
332#endif
333 }
334 }
335#endif
336 free(u1buff); free(u2buff);
337 memcpy(u11t, u11, ndim*kvol*sizeof(Complex));
338 memcpy(u12t, u12, ndim*kvol*sizeof(Complex));
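 //The trial fields start out as copies of the gauge fields just read in; their halos are
 //only filled once Trial_Exchange is called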
339 return 0;
340}
341int Par_swrite(const int itraj, const int icheck, const float beta, const float fmu, const float akappa,
342 const Complex_f ajq, Complex *u11, Complex *u12){
343 /*
344 * @brief Copies u11 and u12 into arrays without halos which then get written to output
345 *
346 * Modified from an original version of swrite in FORTRAN
347 *
348 * @param itraj: Trajectory to write
349 * @param icheck: Not currently used but haven't gotten around to removing it
350 * @param beta: Inverse gauge coupling
351 * @param fmu: Chemical potential
352 * @param akappa: Hopping parameter
353 * @param ajq: Diquark source
354 * @param u11,u12: Gauge fields
355 *
356 * @return Zero on success, integer error code otherwise
357 */
358 const char *funcname = "par_swrite";
359 #if (nproc>1)
360 MPI_Status status;
361 //Used for seed array later on
362 MPI_Datatype MPI_SEED_TYPE = (sizeof(seed)==sizeof(int)) ? MPI_INT:MPI_LONG;
363 #endif
364 Complex *u1buff = (Complex *)aligned_alloc(AVX,kvol*sizeof(Complex));
365 Complex *u2buff = (Complex *)aligned_alloc(AVX,kvol*sizeof(Complex));
366#ifdef _DEBUG
367 char dump_prefix[FILELEN]="u11.";
368 char dump_buff[32];
369 sprintf(dump_buff,"r%01d_c%06d",rank,itraj);
370 strcat(dump_prefix,dump_buff);
371 FILE *gauge_dump=fopen(dump_prefix,"wb");
372 //Print the local trial field in the order it is stored in memory.
373 //This is not the same order as it is stored in secondary storage
374 fwrite(u11,ndim*kvol*sizeof(Complex),1,gauge_dump);
375 fclose(gauge_dump);
376#endif
377#ifdef __RANLUX__
378 seed=gsl_rng_get(ranlux_instd);
379#endif
380 if(!rank){
381 //Array to store the seeds. nth index is the nth processor
382#ifdef __RANLUX__
383 unsigned long *seed_array=(unsigned long*)calloc(nproc,sizeof(seed));
384#elif defined __INTEL_MKL__ && !defined USE_RAN2
385 int *seed_array=(int *)calloc(nproc,sizeof(seed));
386#else
387 long *seed_array=(long*)calloc(nproc,sizeof(seed));
388#endif
389 seed_array[0]=seed;
390#if(nproc>1)
391 for(int iproc = 1; iproc<nproc; iproc++)
392 if(MPI_Recv(&seed_array[iproc], 1, MPI_SEED_TYPE,iproc, 1, comm, &status)){
393 fprintf(stderr, "Error %i in %s: Failed to receive seed from process %i.\nExiting...\n\n",
394 CANTRECV, funcname, iproc);
395 MPI_Abort(comm,CANTRECV);
396 }
397#endif
398 Complex *u11Write = (Complex *)aligned_alloc(AVX,ndim*gvol*sizeof(Complex));
399 Complex *u12Write = (Complex *)aligned_alloc(AVX,ndim*gvol*sizeof(Complex));
400 //Gather the correct parts of u11 and u12 from the remote processors
401 for(int iproc=0;iproc<nproc;iproc++)
402 for(int idim=0;idim<ndim;idim++){
403#if(nproc>1)
404 if(iproc){
405 if(MPI_Recv(u1buff, kvol, MPI_C_DOUBLE_COMPLEX, iproc, 2*idim, comm, &status)){
406 fprintf(stderr, "Error %i in %s: Failed to receive u11 from process %i.\nExiting...\n\n",
407 CANTRECV, funcname, iproc);
408 MPI_Abort(comm,CANTRECV);
409 }
410 if(MPI_Recv(u2buff, kvol, MPI_C_DOUBLE_COMPLEX, iproc, 2*idim+1, comm, &status)){
411 fprintf(stderr, "Error %i in %s: Failed to receive u12 from process %i.\nExiting...\n\n",
412 CANTRECV, funcname, iproc);
413 MPI_Abort(comm,CANTRECV);
414 }
415 }
416 else{
417#endif
418 //No need to do MPI Send/Receive on the master rank
419 //Array looping is slow so we use cblas_zcopy instead where BLAS is available
420#if defined USE_BLAS
421 cblas_zcopy(kvol,u11+idim,ndim,u1buff,1);
422 cblas_zcopy(kvol,u12+idim,ndim,u2buff,1);
423#else
424#pragma omp parallel for simd aligned(u11,u12,u1buff,u2buff:AVX)
425 for(int i=0;i<kvol;i++){
426 u1buff[i]=u11[i*ndim+idim];
427 u2buff[i]=u12[i*ndim+idim];
428 }
429#endif
430#ifdef _DEBUG
431 char part_dump[FILELEN]="";
432 strcat(part_dump,dump_prefix);
433 sprintf(dump_buff,"_d%d",idim);
434 strcat(part_dump,dump_buff);
435 FILE *pdump=fopen(part_dump,"wb");
436 fwrite(u1buff,ndim*kvol*sizeof(Complex),1,pdump);
437 fclose(pdump);
438#endif
439#if(nproc>1)
440 }
441#endif
442 int i=0;
443 for(int it=pstart[3][iproc]; it<pstop[3][iproc]; it++)
444 for(int iz=pstart[2][iproc]; iz<pstop[2][iproc]; iz++)
445 for(int iy=pstart[1][iproc]; iy<pstop[1][iproc]; iy++)
446 for(int ix=pstart[0][iproc]; ix<pstop[0][iproc]; ix++){
447 //j is the relative memory index of icoord
448 int j = Coord2gindex(ix, iy, iz, it);
449 u11Write[idim*gvol+j] = u1buff[i];
450 u12Write[idim*gvol+j] = u2buff[i];
451 //C starts counting from zero, not 1 so increment afterwards or start at int i=-1
452 i++;
453 }
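 //u11Write/u12Write are filled in global Coord2gindex order, so the file layout is
 //independent of the processor decomposition that produced it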
454 if(i!=kvol){
455 fprintf(stderr, "Error %i in %s: Number of elements %i is not equal to\
456 kvol %i.\nExiting...\n\n", NUMELEM, funcname, i, kvol);
457#if(nproc>1)
458 MPI_Abort(comm,NUMELEM);
459#else
460 exit(NUMELEM);
461#endif
462 }
463 }
464 free(u1buff); free(u2buff);
465
466 char gauge_title[FILELEN]="config.";
467 int buffer; char buff2[7];
468 //Add script for extracting correct mu, j etc.
469 buffer = (int)round(100*beta);
470 sprintf(buff2,"b%03d",buffer);
471 strcat(gauge_title,buff2);
472 //κ
473 buffer = (int)round(10000*akappa);
474 sprintf(buff2,"k%04d",buffer);
475 strcat(gauge_title,buff2);
476 //μ
477 buffer = (int)round(1000*fmu);
478 sprintf(buff2,"mu%04d",buffer);
479 strcat(gauge_title,buff2);
480 //J
481 buffer = (int)round(1000*creal(ajq));
482 sprintf(buff2,"j%03d",buffer);
483 strcat(gauge_title,buff2);
484 //nx
485 sprintf(buff2,"s%02d",nx);
486 strcat(gauge_title,buff2);
487 //nt
488 sprintf(buff2,"t%02d",nt);
489 strcat(gauge_title,buff2);
490
491 char gauge_file[FILELEN];
492 strcpy(gauge_file,gauge_title);
493 char c[8];
494 sprintf(c,".%06d", itraj);
495 strcat(gauge_file, c);
496 printf("Gauge file name is %s\n", gauge_file);
497 printf("Writing the gauge file on processor %i.\n", rank);
498 FILE *con;
499 char *fileop = "wb";
500 if(!(con=fopen(gauge_file, fileop))){
501 fprintf(stderr, "Error %i in %s: Failed to open %s for %s.\
502 \nExiting...\n\n", OPENERROR, funcname, gauge_file, fileop);
503#if(nproc>1)
504 MPI_Abort(comm,OPENERROR);
505#else
506 exit(OPENERROR);
507#endif
508 }
509 //TODO: SAFETY CHECKS FOR EACH WRITE OPERATION
510 //Write the number of processors used for this run (Par_sread reads it back as old_nproc). This takes the place of the FORTRAN integer rather nicely
511#if(nproc==1)
512 int size=nproc;
513#endif
514 fwrite(&size,sizeof(int),1,con);
515 fwrite(u11Write, ndim*gvol*sizeof(Complex), 1, con);
516 fwrite(u12Write, ndim*gvol*sizeof(Complex), 1, con);
517 //TODO
518 //Make a seed array, where the nth component is the seed on the nth rank for continuation runs.
519 fwrite(seed_array, nproc*sizeof(seed), 1, con);
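 //Resulting file layout: one int (the process count), ndim*gvol u11 entries, ndim*gvol u12
 //entries, then one seed per rank. Par_sread expects exactly this ordering.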
520 fclose(con);
521 free(u11Write); free(u12Write);
522 free(seed_array);
523 }
524#if(nproc>1)
525 else{
526 if(MPI_Send(&seed, 1, MPI_SEED_TYPE, masterproc, 1, comm)){
527 fprintf(stderr, "Error %i in %s: Failed to send seed from process %i.\nExiting...\n\n",
528 CANTSEND, funcname, rank);
529 MPI_Abort(comm,CANTSEND);
530 }
531 for(int idim = 0; idim<ndim; idim++){
532#if defined USE_BLAS
533 cblas_zcopy(kvol,u11+idim,ndim,u1buff,1);
534 cblas_zcopy(kvol,u12+idim,ndim,u2buff,1);
535#else
536#pragma omp parallel for simd aligned(u11,u12,u1buff,u2buff:AVX)
537 for(int i=0;i<kvol;i++){
538 u1buff[i]=u11[i*ndim+idim];
539 u2buff[i]=u12[i*ndim+idim];
540 }
541#endif
542#ifdef _DEBUG
543 char part_dump[FILELEN]="";
544 strcat(part_dump,dump_prefix);
545 sprintf(dump_buff,"_d%d",idim);
546 strcat(part_dump,dump_buff);
547 FILE *pdump=fopen(part_dump,"wb");
548 fwrite(u1buff,ndim*kvol*sizeof(Complex),1,pdump);
549 fclose(pdump);
550#endif
551 int i=0;
552 if(MPI_Send(u1buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim, comm)){
553 fprintf(stderr, "Error %i in %s: Failed to send u11 from process %i.\nExiting...\n\n",
554 CANTSEND, funcname, rank);
555 MPI_Abort(comm,CANTSEND);
556 }
557 if(MPI_Send(u2buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim+1, comm)){
558 fprintf(stderr, "Error %i in %s: Failed to send u12 from process %i.\nExiting...\n\n",
559 CANTSEND, funcname, rank);
560 MPI_Abort(comm,CANTSEND);
561 }
562 }
563 free(u1buff); free(u2buff);
564 }
565#endif
566 return 0;
567}
568//To be lazy, we've got modules to help us do reductions and broadcasts with a single argument
569//rather than type them all every single time
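//For example, a double accumulated over the local kvol sites becomes the lattice-wide total on
//every rank after a call like Par_dsum(&local_sum); (local_sum here is just an illustrative name)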
570#if(nproc>1)
571inline int Par_isum(int *ival){
572 /*
573 * Performs a reduction on an integer ival to get a sum which is
574 * then distributed to all ranks.
575 *
576 * Parameters:
577 * -----------
578 * int *ival: The pointer to the element being summed, and
579 * the container for said sum.
580 *
581 * Returns:
582 * --------
583 * Zero on success. Integer error code otherwise.
584 *
585 */
586 const char *funcname = "Par_isum";
587 //Container to receive data.
588 int itmp;
589
590 if(MPI_Allreduce(ival, &itmp, 1, MPI_INT, MPI_SUM, comm)){
591 fprintf(stderr,"Error %i in %s: Couldn't complete reduction for %i.\nExiting...\n\n", REDUCERR, funcname, *ival);
592 MPI_Abort(comm,REDUCERR);
593 }
594 *ival = itmp; return 0;
595}
596inline int Par_dsum(double *dval){
597 /*
598 * Performs a reduction on a double dval to get a sum which is
599 * then distributed to all ranks.
600 *
601 * Parameters:
602 * -----------
603 * double *dval: The pointer to the element being summed, and
604 * the container for said sum.
605 *
606 * Returns:
607 * --------
608 * Zero on success. Integer error code otherwise.
609 *
610 */
611 const char *funcname = "Par_dsum";
612 //Container to receive data.
613 double dtmp;
614
615 if(MPI_Allreduce(dval, &dtmp, 1, MPI_DOUBLE, MPI_SUM, comm)){
616 fprintf(stderr,"Error %i in %s: Couldn't complete reduction for %f.\nExiting...\n\n", REDUCERR, funcname, *dval);
617 MPI_Abort(comm,REDUCERR);
618 }
619 *dval = dtmp;
620 return 0;
621}
622inline int Par_fsum(float *fval){
623 /*
624 * Performs a reduction on a float fval to get a sum which is
625 * then distributed to all ranks.
626 *
627 * Parameters:
628 * -----------
629 * float *fval: The pointer to the element being summed, and
630 * the container for said sum.
631 *
632 * Returns:
633 * --------
634 * Zero on success. Integer error code otherwise.
635 *
636 */
637 const char *funcname = "far_dsum";
638 //Container to receive data.
639 float ftmp;
640
641 if(MPI_Allreduce(fval, &ftmp, 1, MPI_FLOAT, MPI_SUM, comm)){
642 fprintf(stderr,"Error %i in %s: Couldn't complete reduction for %f.\nExiting...\n\n", REDUCERR, funcname, *fval);
643 MPI_Abort(comm,REDUCERR);
644 }
645 *fval = ftmp;
646 return 0;
647}
648inline int Par_csum(Complex_f *cval){
649 /*
650 * Performs a reduction on a Complex_f cval to get a sum which is
651 * then distributed to all ranks.
652 *
653 * Parameters:
654 * -----------
655 * Complex_f *cval: The pointer to the element being summed, and
656 * the container for said sum.
657 *
658 * Returns:
659 * --------
660 * Zero on success. Integer error code otherwise.
661 *
662 */
663 const char *funcname = "Par_csum";
664 //Container to receive data.
665 Complex_f ctmp;
666
667 if(MPI_Allreduce(cval, &ctmp, 1, MPI_C_FLOAT_COMPLEX, MPI_SUM, comm)){
668#ifndef __NVCC__
669 fprintf(stderr, "Error %i in %s: Couldn't complete reduction for %f+%f i.\nExiting...\n\n",
670 REDUCERR, funcname, creal(*cval), cimag(*cval));
671#endif
672 MPI_Abort(comm,REDUCERR);
673 }
674 *cval = ctmp;
675 return 0;
676}
677inline int Par_zsum(Complex *zval){
678 /*
679 * Performs a reduction on a Complex zval to get a sum which is
680 * then distributed to all ranks.
681 *
682 * Parameters:
683 * -----------
684 * Complex *zval: The pointer to the element being summed, and
685 * the container for said sum.
686 *
687 * Returns:
688 * --------
689 * Zero on success. Integer error code otherwise.
690 *
691 */
692 const char *funcname = "Par_zsum";
693 //Container to receive data.
694 Complex ztmp;
695
696 if(MPI_Allreduce(zval, &ztmp, 1, MPI_C_DOUBLE_COMPLEX, MPI_SUM, comm)){
697#ifndef __NVCC__
698 fprintf(stderr, "Error %i in %s: Couldn't complete reduction for %f+%f i.\nExiting...\n\n",
699 REDUCERR, funcname, creal(*zval), cimag(*zval));
700#endif
701 MPI_Abort(comm,REDUCERR);
702 }
703 *zval = ztmp;
704 return 0;
705}
706inline int Par_icopy(int *ival){
707 /*
708 * Broadcasts an integer to the other processes
709 *
710 * Parameters:
711 * ----------
712 * int ival
713 *
714 * Returns:
715 * -------
716 * Zero on success, integer error code otherwise
717 */
718 const char *funcname = "Par_icopy";
719 if(MPI_Bcast(ival,1,MPI_INT,masterproc,comm)){
720 fprintf(stderr, "Error %i in %s: Failed to broadcast %i from %i.\nExiting...\n\n",
721 BROADERR, funcname, *ival, rank);
722 MPI_Abort(comm,BROADERR);
723 }
724 return 0;
725}
726inline int Par_dcopy(double *dval){
727 /*
728 * Broadcasts a double to the other processes
729 *
730 * Parameters:
731 * ----------
732 * double dval
733 *
734 * Returns:
735 * -------
736 * Zero on success, integer error code otherwise
737 */
738 const char *funcname = "Par_dcopy";
739 if(MPI_Bcast(dval,1,MPI_DOUBLE,masterproc,comm)){
740 fprintf(stderr, "Error %i in %s: Failed to broadcast %f from %i.\nExiting...\n\n",
741 BROADERR, funcname, *dval, rank);
742 MPI_Abort(comm,BROADERR);
743 }
744 return 0;
745}
746inline int Par_fcopy(float *fval){
747 /*
748 * Broadcasts a float to the other processes
749 *
750 * Parameters:
751 * ----------
752 * float fval
753 *
754 * Returns:
755 * -------
756 * Zero on success, integer error code otherwise
757 */
758 const char *funcname = "Par_dfopy";
759 if(MPI_Bcast(fval,1,MPI_FLOAT,masterproc,comm)){
760 fprintf(stderr, "Error %i in %s: Failed to broadcast %f from %i.\nExiting...\n\n",
761 BROADERR, funcname, *fval, rank);
762 MPI_Abort(comm,BROADERR);
763 }
764 return 0;
765}
766inline int Par_ccopy(Complex *cval){
767 /*
768 * Broadcasts a Complex value to the other processes
769 *
770 * Parameters:
771 * ----------
772 * Complex *cval
773 *
774 * Returns:
775 * -------
776 * Zero on success, integer error code otherwise
777 */
778 const char *funcname = "Par_ccopy";
779 if(MPI_Bcast(cval,1,MPI_C_FLOAT_COMPLEX,masterproc,comm)){
780#ifndef __NVCC__
781 fprintf(stderr, "Error %i in %s: Failed to broadcast %f+i%f from %i.\nExiting...\n\n",
782 BROADERR, funcname, creal(*cval), cimag(*cval), rank);
783#endif
784 MPI_Abort(comm,BROADERR);
785 }
786 return 0;
787}
788inline int Par_zcopy(Complex *zval){
789 /*
790 * Broadcasts a Complex value to the other processes
791 *
792 * Parameters:
793 * ----------
794 * Complex *zval
795 *
796 * Returns:
797 * -------
798 * Zero on success, integer error code otherwise
799 */
800 const char *funcname = "Par_zcopy";
801 if(MPI_Bcast(zval,1,MPI_C_DOUBLE_COMPLEX,masterproc,comm)){
802#ifndef __NVCC__
803 fprintf(stderr, "Error %i in %s: Failed to broadcast %f+i%f from %i.\nExiting...\n\n",
804 BROADERR, funcname, creal(*zval), cimag(*zval), rank);
805#endif
806 MPI_Abort(comm,BROADERR);
807 }
808 return 0;
809}
810
811/* Code for swapping halos.
812 * In the original FORTRAN there were separate subroutines for up and down halos
813 * To make code maintenance easier I'm going to implement this with switches
814 * and common functions
815 * We will define in su2hmc UP and DOWN. And add a parameter called layer to
816 * functions. layer will be used to tell us if we wanted to call the up FORTRAN
817 * function or DOWN FORTRAN function
818 */
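//For example, a field with one component per site is exchanged with ZHalo_swap_all(z, 1);
//(as Trial_Exchange does below, one gauge-field column at a time), while a field stored as
//[kvol+halo][ncpt] passes its inner dimension as ncpt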
819inline int ZHalo_swap_all(Complex *z, int ncpt){
820 /*
821 * Calls the functions to send data to both the up and down halos
822 *
823 * Parameters:
824 * -----------
825 * Complex z: The data being sent
826 * int ncpt: Number of components being sent
827 *
828 * Returns:
829 * -------
830 * Zero on success, integer error code otherwise
831 */
832 const char *funcname = "ZHalo_swap_all";
833
834 //FORTRAN called zdnhaloswapall and zuphaloswapall here
835 //Those functions looped over the directions and called zXXhaloswapdir
836 //As the only place they are called in the FORTRAN code is right here,
837 //I'm going to omit them entirely and just put the direction loop here
838 //instead
839 //Unrolling the loop so we can have pre-processor directives for each dimension
840#if(npx>1)
841 ZHalo_swap_dir(z, ncpt, 0, DOWN);
842 ZHalo_swap_dir(z, ncpt, 0, UP);
843#endif
844#if(npy>1)
845 ZHalo_swap_dir(z, ncpt, 1, DOWN);
846 ZHalo_swap_dir(z, ncpt, 1, UP);
847#endif
848#if(npz>1)
849 ZHalo_swap_dir(z, ncpt, 2, DOWN);
850 ZHalo_swap_dir(z, ncpt, 2, UP);
851#endif
852#if(npt>1)
853 ZHalo_swap_dir(z, ncpt, 3, DOWN);
854 ZHalo_swap_dir(z, ncpt, 3, UP);
855#endif
856 return 0;
857}
858int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer){
859 /*
860 * Swaps the halos along the axis given by idir in the direction
861 * given by layer
862 *
863 * Parameters:
864 * -----------
865 * Complex *z: The data being moved about. It should be an array of dimension [kvol+halo][something else]
866 * int ncpt: Number of components being sent
867 * int idir: The axis being moved along in C Indexing
868 * int layer: Either DOWN (0) or UP (1)
869 *
870 * Returns:
871 * -------
872 * Zero on success, Integer Error code otherwise
873 */
874 const char *funcname = "ZHalo_swap_dir";
875 MPI_Status status;
876 if(layer!=DOWN && layer!=UP){
877 fprintf(stderr, "Error %i in %s: Cannot swap in the direction given by %i.\nExiting...\n\n",
878 LAYERROR, funcname, layer);
879 MPI_Abort(comm,LAYERROR);
880 }
881 //How big is the data being sent and received
882 int msg_size=ncpt*halosize[idir];
883 Complex *sendbuf = (Complex *)aligned_alloc(AVX,msg_size*sizeof(Complex));
884 //In each case we set up the data being sent then do the exchange
885 switch(layer){
886 case(DOWN):
887 if(halosize[idir]+h1u[idir]>kvol+halo){
888 fprintf(stderr, "Error %i in %s: Writing a message of size %i to flattened index %i will cause "\
889 "a memory leak on rank %i.\nExiting...\n\n"
890 ,BOUNDERROR, funcname, msg_size, ncpt*h1u[idir], rank);
891 MPI_Abort(comm,BOUNDERROR);
892 }
893#pragma omp parallel for
894 for(int ihalo = 0; ihalo < halosize[idir]; ihalo++)
895#pragma omp simd aligned(z, sendbuf:AVX)
896 for(int icpt = 0; icpt <ncpt; icpt++)
897 sendbuf[ihalo*ncpt+icpt]=z[ncpt*hd[ndim*ihalo+idir]+icpt];
898 //For the zdnhaloswapdir we send off the down halo and receive into the up halo
899 if(MPI_Isend(sendbuf, msg_size, MPI_C_DOUBLE_COMPLEX, pd[idir], tag, comm, &request)){
900 fprintf(stderr,"Error %i in %s: Failed to send off the down halo from rank %i to rank %i.\nExiting...\n"
901 ,CANTSEND, funcname, rank, pd[idir]);
902 MPI_Abort(comm,CANTSEND);
903 }
904 if(MPI_Recv(&z[ncpt*h1u[idir]], msg_size, MPI_C_DOUBLE_COMPLEX, pu[idir], tag, comm, &status)){
905 fprintf(stderr,"Error %i in %s: Rank %i failed to receive into up halo from rank %i.\nExiting...\n",
906 CANTRECV, funcname, rank, pu[idir]);
907 MPI_Abort(comm,CANTRECV);
908 }
909 break;
910 case(UP):
911 if(halosize[idir]+h1d[idir]>kvol+halo){
912 fprintf(stderr, "Error %i in %s: Writing a message of size %i to flattened index %i will cause "\
913 "a memory leak on rank %i.\nExiting...\n\n"
914 ,BOUNDERROR, funcname, msg_size, ncpt*h1d[idir], rank);
915 MPI_Abort(comm,BOUNDERROR);
916 }
917#pragma omp parallel for
918 for(int ihalo = 0; ihalo < halosize[idir]; ihalo++)
919#pragma omp simd aligned(z, sendbuf:AVX)
920 for(int icpt = 0; icpt <ncpt; icpt++)
921 sendbuf[ihalo*ncpt+icpt]=z[ncpt*hu[ndim*ihalo+idir]+icpt];
922 //For the zuphaloswapdir we send off the up halo and receive into the down halo
923 if(MPI_Isend(sendbuf, msg_size, MPI_C_DOUBLE_COMPLEX, pu[idir], tag, comm, &request)){
924 fprintf(stderr,"Error %i in %s: Failed to send off the up halo from rank %i to rank %i.\nExiting...\n",
925 CANTSEND, funcname, rank, pu[idir]);
926 MPI_Abort(comm,CANTSEND);
927 }
928 if(MPI_Recv(&z[ncpt*h1d[idir]], msg_size, MPI_C_DOUBLE_COMPLEX, pd[idir], tag, comm, &status)){
929 fprintf(stderr,"Error %i in %s: Rank %i failed to receive into down halo from rank %i.\nExiting...\n",
930 CANTRECV, funcname, rank, pd[idir]);
931 MPI_Abort(comm,CANTRECV);
932 }
933 break;
934 }
935 MPI_Wait(&request, &status);
936 free(sendbuf);
937 return 0;
938}
939inline int CHalo_swap_all(Complex_f *c, int ncpt){
940 /*
941 * Calls the functions to send data to both the up and down halos
942 *
943 * Parameters:
944 * -----------
945 * Complex_f c: The data being sent
946 * int ncpt: Number of components being sent
947 *
948 * Returns:
949 * -------
950 * Zero on success, integer error code otherwise
951 */
952 const char *funcname = "ZHalo_swap_all";
953
954 //FORTRAN called zdnhaloswapall and zuphaloswapall here
955 //Those functions looped over the directions and called zXXhaloswapdir
956 //As the only place they are called in the FORTRAN code is right here,
957 //I'm going to omit them entirely and just put the direction loop here
958 //instead
959 //Unrolling the loop so we can have pre-processor directives for each dimension
960#if(npx>1)
961 CHalo_swap_dir(c, ncpt, 0, DOWN);
962 CHalo_swap_dir(c, ncpt, 0, UP);
963#endif
964#if(npy>1)
965 CHalo_swap_dir(c, ncpt, 1, DOWN);
966 CHalo_swap_dir(c, ncpt, 1, UP);
967#endif
968#if(npz>1)
969 CHalo_swap_dir(c, ncpt, 2, DOWN);
970 CHalo_swap_dir(c, ncpt, 2, UP);
971#endif
972#if(npt>1)
973 CHalo_swap_dir(c, ncpt, 3, DOWN);
974 CHalo_swap_dir(c, ncpt, 3, UP);
975#endif
976 return 0;
977}
978int CHalo_swap_dir(Complex_f *c, int ncpt, int idir, int layer){
979 /*
980 * Swaps the halos along the axis given by idir in the direction
981 * given by layer
982 *
983 * Parameters:
984 * -----------
985 * Complex *z: The data being moved about. It should be an array of dimension [kvol+halo][something else]
986 * int ncpt: The size of something else above.
987 * int idir: The axis being moved along in C Indexing
988 * int layer: Either DOWN (0) or UP (1)
989 *
990 * Returns:
991 * -------
992 * Zero on success, Integer Error code otherwise
993 */
994 const char *funcname = "CHalo_swap_dir";
995 MPI_Status status;
996 if(layer!=DOWN && layer!=UP){
997 fprintf(stderr, "Error %i in %s: Cannot swap in the direction given by %i.\nExiting...\n\n",
998 LAYERROR, funcname, layer);
999 MPI_Abort(comm,LAYERROR);
1000 }
1001 //How big is the data being sent and received
1002 int msg_size=ncpt*halosize[idir];
1003 Complex_f *sendbuf = (Complex_f *)aligned_alloc(AVX,msg_size*sizeof(Complex_f));
1004 //In each case we set up the data being sent then do the exchange
1005 switch(layer){
1006 case(DOWN):
1007 if(halosize[idir]+h1u[idir]>kvol+halo){
1008 fprintf(stderr, "Error %i in %s: Writing a message of size %i to flattened index %i will cause "\
1009 "a memory leak on rank %i.\nExiting...\n\n"
1010 ,BOUNDERROR, funcname, msg_size, ncpt*h1u[idir], rank);
1011 MPI_Abort(comm,BOUNDERROR);
1012 }
1013#pragma omp parallel for
1014 for(int ihalo = 0; ihalo < halosize[idir]; ihalo++)
1015#pragma omp simd aligned(c, sendbuf:AVX)
1016 for(int icpt = 0; icpt <ncpt; icpt++)
1017 sendbuf[ihalo*ncpt+icpt]=c[ncpt*hd[ndim*ihalo+idir]+icpt];
1018 //For the zdnhaloswapdir we send off the down halo and receive into the up halo
1019 if(MPI_Isend(sendbuf, msg_size, MPI_C_FLOAT_COMPLEX, pd[idir], tag, comm, &request)){
1020 fprintf(stderr,"Error %i in %s: Failed to send off the down halo from rank %i to rank %i.\nExiting...\n"
1021 ,CANTSEND, funcname, rank, pd[idir]);
1022 MPI_Abort(comm,CANTSEND);
1023 }
1024 if(MPI_Recv(&c[ncpt*h1u[idir]], msg_size, MPI_C_FLOAT_COMPLEX, pu[idir], tag, comm, &status)){
1025 fprintf(stderr,"Error %i in %s: Rank %i failed to receive into up halo from rank %i.\nExiting...\n",
1026 CANTRECV, funcname, rank, pu[idir]);
1027 MPI_Abort(comm,CANTRECV);
1028 }
1029 break;
1030 case(UP):
1031 if(halosize[idir]+h1d[idir]>kvol+halo){
1032 fprintf(stderr, "Error %i in %s: Writing a message of size %i to flattened index %i will cause "\
1033 "a memory leak on rank %i.\nExiting...\n\n"
1034 ,BOUNDERROR, funcname, msg_size, ncpt*h1d[idir], rank);
1035 MPI_Abort(comm,BOUNDERROR);
1036 }
1037#pragma omp parallel for
1038 for(int ihalo = 0; ihalo < halosize[idir]; ihalo++)
1039#pragma omp simd aligned(c, sendbuf:AVX)
1040 for(int icpt = 0; icpt <ncpt; icpt++)
1041 sendbuf[ihalo*ncpt+icpt]=c[ncpt*hu[ndim*ihalo+idir]+icpt];
1042 //For the zuphaloswapdir we send off the up halo and receive into the down halo
1043 if(MPI_Isend(sendbuf, msg_size, MPI_C_FLOAT_COMPLEX, pu[idir], tag, comm, &request)){
1044 fprintf(stderr,"Error %i in %s: Failed to send off the up halo from rank %i to rank %i.\nExiting...\n",
1045 CANTSEND, funcname, rank, pu[idir]);
1046 MPI_Abort(comm,CANTSEND);
1047 }
1048 if(MPI_Recv(&c[ncpt*h1d[idir]], msg_size, MPI_C_FLOAT_COMPLEX, pd[idir], tag, comm, &status)){
1049 fprintf(stderr,"Error %i in %s: Rank %i failed to receive into down halo from rank %i.\nExiting...\n",
1050 CANTRECV, funcname, rank, pd[idir]);
1051 MPI_Abort(comm,CANTRECV);
1052 }
1053 break;
1054 }
1055 MPI_Wait(&request, &status);
1056 free(sendbuf);
1057 return 0;
1058}
1059inline int DHalo_swap_all(double *d, int ncpt){
1060 /*
1061 * Calls the functions to send data to both the up and down halos
1062 *
1063 * Parameters:
1064 * -----------
1065 * double d: The data being sent
1066 * int ncpt: Number of components being sent
1067 *
1068 * Returns:
1069 * -------
1070 * Zero on success, integer error code otherwise
1071 */
1072 const char *funcname = "DHalo_swap_all";
1073
1074 //FORTRAN called zdnhaloswapall and zuphaloswapall here
1075 //Those functions looped over the directions and called zXXhaloswapdir
1076 //As the only place they are called in the FORTRAN code is right here,
1077 //I'm going to omit them entirely and just put the direction loop here
1078 //instead
1079 //Unrolling the loop so we can have pre-processor directives for each dimension
1080#if(npx>1)
1081 DHalo_swap_dir(d, ncpt, 0, DOWN);
1082 DHalo_swap_dir(d, ncpt, 0, UP);
1083#endif
1084#if(npy>1)
1085 DHalo_swap_dir(d, ncpt, 1, DOWN);
1086 DHalo_swap_dir(d, ncpt, 1, UP);
1087#endif
1088#if(npz>1)
1089 DHalo_swap_dir(d, ncpt, 2, DOWN);
1090 DHalo_swap_dir(d, ncpt, 2, UP);
1091#endif
1092#if(npt>1)
1093 DHalo_swap_dir(d, ncpt, 3, DOWN);
1094 DHalo_swap_dir(d, ncpt, 3, UP);
1095#endif
1096 return 0;
1097}
1098int DHalo_swap_dir(double *d, int ncpt, int idir, int layer){
1099 /*
1100 * Swaps the halos along the axis given by idir in the direction
1101 * given by layer
1102 *
1103 * Parameters:
1104 * -----------
1105 * double *d: The data being moved about
1106 * int ncpt: Number of components being sent
1107 * int idir: The axis being moved along
1108 * int layer: Either DOWN (0) or UP (1)
1109 *
1110 * Returns:
1111 * -------
1112 * Zero on success, Integer Error code otherwise
1113 */
1114 const char *funcname = "DHalo_swap_dir";
1115 MPI_Status status;
1116 //How big is the data being sent and received
1117 int msg_size=ncpt*halosize[idir];
1118 double *sendbuf = (double *)aligned_alloc(AVX,msg_size*sizeof(double));
1119 if(layer!=DOWN && layer!=UP){
1120 fprintf(stderr, "Error %i in %s: Cannot swap in the direction given by %i.\nExiting...\n\n",
1121 LAYERROR, funcname, layer);
1122 MPI_Abort(comm,LAYERROR);
1123 }
1124 //Implement the switch. The code is taken from the end of the dedicated functions in the FORTRAN code.
1125 switch(layer){
1126 case(DOWN):
1127 if(halosize[idir]+h1u[idir]>kvol+halo){
1128 fprintf(stderr, "Error %i in %s: Writing a message of size %i to flattened index %i will cause "\
1129 "a memory leak on rank %i.\nExiting...\n\n"
1130 ,BOUNDERROR, funcname, msg_size, ncpt*h1u[idir], rank);
1131 MPI_Abort(comm,BOUNDERROR);
1132 }
1133#pragma omp parallel for
1134 for(int ihalo = 0; ihalo < halosize[idir]; ihalo++)
1135#pragma omp simd aligned(d,sendbuf:AVX)
1136 for(int icpt = 0; icpt <ncpt; icpt++)
1137 sendbuf[ihalo*ncpt+icpt]=d[ncpt*hd[ndim*ihalo+idir]+icpt];
1138 //For the cdnhaloswapdir we send off the down halo and receive into the up halo
1139 if(MPI_Isend(sendbuf, msg_size, MPI_DOUBLE, pd[idir], tag, comm, &request)){
1140 fprintf(stderr, "Error %i in %s: Failed to send off the down halo from rank %i to rank %i.\nExiting...\n\n",
1141 CANTSEND, funcname, rank, pd[idir]);
1142 MPI_Abort(comm,CANTSEND);
1143 }
1144 if(MPI_Recv(&d[ncpt*h1u[idir]], msg_size, MPI_DOUBLE, pu[idir], tag, comm, &status)){
1145 fprintf(stderr, "Error %i in %s: Rank %i failed to receive into up halo from rank %i.\nExiting...\n\n",
1146 CANTRECV, funcname, rank, pu[idir]);
1147 MPI_Abort(comm,CANTRECV);
1148 }
			break;
1149 case(UP):
1150 if(halosize[idir]+h1d[idir]>kvol+halo){
1151 fprintf(stderr, "Error %i in %s: Writing a message of size %i to flattened index %i will cause "\
1152 "a memory leak on rank %i.\nExiting...\n\n"
1153 ,BOUNDERROR, funcname, msg_size, ncpt*h1d[idir], rank);
1154 MPI_Abort(comm,BOUNDERROR);
1155 }
1156#pragma omp parallel for
1157 for(int ihalo = 0; ihalo < halosize[idir]; ihalo++)
1158#pragma omp simd aligned(d,sendbuf:AVX)
1159 for(int icpt = 0; icpt <ncpt; icpt++)
1160 sendbuf[ihalo*ncpt+icpt]=d[ncpt*hu[ndim*ihalo+idir]+icpt];
1161 //For the cuphaloswapdir we send off the up halo and receive into the down halo
1162 if(MPI_Isend(sendbuf, msg_size, MPI_DOUBLE, pu[idir], tag, comm, &request)){
1163 fprintf(stderr,"Error %i in %s: Failed to send off the up halo from rank %i to rank %i.\nExiting...\n\n",
1164 CANTSEND, funcname, rank, pu[idir]);
1165 MPI_Abort(comm,CANTSEND);
1166 }
1167 if(MPI_Recv(&d[ncpt*h1d[idir]], msg_size, MPI_DOUBLE, pd[idir], tag, comm, &status)){
1168 fprintf(stderr, "Error %i in %s: Rank %i failed to receive into down halo from rank %i.\nExiting...\n\n",
1169 CANTRECV, funcname, rank, pd[idir]);
1170 MPI_Abort(comm,CANTRECV);
1171 }
1172 }
1173 MPI_Wait(&request, &status);
1174 free(sendbuf);
1175 return 0;
1176}
1177#endif
1178int Trial_Exchange(Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f){
1179 /*
1180 * Exchanges the trial fields. I noticed that this halo exchange was happening
1181 * even though the trial fields hadn't been updated. To get around this
1182 * I'm making a function that does the halo exchange and only calling it after
1183 * the trial fields get updated.
1184 */
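 //The halo routines exchange contiguous per-site data, so each of the ndim columns of the
 //[kvol+halo][ndim] trial arrays is copied into the scratch vector z, swapped, then copied back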
1185 const char *funcname = "Trial_Exchange";
1186 //Prefetch the trial fields from the GPU, halos come later
1187#if(nproc>1)
1188#ifdef __NVCC__
1189 int device=-1;
1190 cudaGetDevice(&device);
1191 cudaMemPrefetchAsync(u11t, ndim*kvol*sizeof(Complex),cudaCpuDeviceId,NULL);
1192 cudaMemPrefetchAsync(u12t, ndim*kvol*sizeof(Complex),cudaCpuDeviceId,NULL);
1193#endif
1194 Complex *z = (Complex *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex));
1195 for(int mu=0;mu<ndim;mu++){
1196 //Copy the column from u11t
1197#ifdef USE_BLAS
1198 cblas_zcopy(kvol, &u11t[mu], ndim, z, 1);
1199#else
1200 for(int i=0; i<kvol;i++)
1201 z[i]=u11t[i*ndim+mu];
1202#endif
1203 //Halo exchange on that column
1204 ZHalo_swap_all(z, 1);
1205 //And the swap back
1206#ifdef USE_BLAS
1207 cblas_zcopy(kvol+halo, z, 1, &u11t[mu], ndim);
1208 //Now we prefetch the halo
1209#ifdef __NVCC__
1210 cudaMemPrefetchAsync(u11t+ndim*kvol, ndim*halo*sizeof(Complex),device,NULL);
1211#endif
1212 //Repeat for u12t
1213 cblas_zcopy(kvol, &u12t[mu], ndim, z, 1);
1214#else
1215 for(int i=0; i<kvol+halo;i++){
1216 u11t[i*ndim+mu]=z[i];
1217 z[i]=u12t[i*ndim+mu];
1218 }
1219#endif
1220 ZHalo_swap_all(z, 1);
1221#ifdef USE_BLAS
1222 cblas_zcopy(kvol+halo, z, 1, &u12t[mu], ndim);
1223#else
1224 for(int i=0; i<kvol+halo;i++)
1225 u12t[i*ndim+mu]=z[i];
1226#endif
1227 }
1228 //Now we prefetch the halo
1229#ifdef __NVCC__
1230 cudaMemPrefetchAsync(u12t+ndim*kvol, ndim*halo*sizeof(Complex),device,NULL);
1231#endif
1232 free(z);
1233#endif
1234//And get the single precision gauge fields prepped
1235#ifdef __NVCC__
1236 cuComplex_convert(u11t_f,u11t,ndim*(kvol+halo),true,dimBlock,dimGrid);
1237 cuComplex_convert(u12t_f,u12t,ndim*(kvol+halo),true,dimBlock,dimGrid);
1238 cudaDeviceSynchronise();
1239#else
1240#pragma omp parallel for simd aligned(u11t_f,u12t_f,u11t,u12t:AVX)
1241 for(int i=0;i<ndim*(kvol+halo);i++){
1242 u11t_f[i]=(Complex_f)u11t[i];
1243 u12t_f[i]=(Complex_f)u12t[i];
1244 }
1245#endif
1246 return 0;
1247}
1248#if(npt>1)
1249int Par_tmul(Complex_f *z11, Complex_f *z12){
1250 /*
1251 * Parameters:
1252 * ===========
1253 * Complex *z11
1254 * Complex *z12
1255 *
1256 * Returns:
1257 * =======
1258 * Zero on success, integer error code otherwise.
1259 */
1260#ifdef __NVCC__
1261#error Par_tmul is not yet implemented in CUDA as Sigma12 in Polyakov is device only memory
1262#endif
1263 MPI_Status status;
1264 const char *funcname = "Par_tmul";
1265 Complex_f *a11, *a12, *t11, *t12;
1266 int i, itime;
1267
1268 a11=(Complex_f *)aligned_alloc(AVX,kvol3*sizeof(Complex_f));
1269 a12=(Complex_f *)aligned_alloc(AVX,kvol3*sizeof(Complex_f));
1270 t11=(Complex_f *)aligned_alloc(AVX,kvol3*sizeof(Complex_f));
1271 t12=(Complex_f *)aligned_alloc(AVX,kvol3*sizeof(Complex_f));
1272 //Initialise for the first loop
1273 memcpy(a11, z11, kvol3*sizeof(Complex_f));
1274 memcpy(a12, z12, kvol3*sizeof(Complex_f));
1275
1276 //Since the index of the outer loop isn't used as an array index anywhere
1277 //I'm going to format it exactly like the original FORTRAN
1278#ifdef _DEBUG
1279 if(!rank) printf("Sending between halos in the time direction. For rank %i pu[3]=%i and pd[3] = %i\n",
1280 rank, pu[3], pd[3]);
1281#endif
1282 for(itime=1;itime<npt; itime++){
1283 memcpy(t11, a11, kvol3*sizeof(Complex_f));
1284 memcpy(t12, a12, kvol3*sizeof(Complex_f));
1285#ifdef _DEBUG
1286 if(!rank) printf("t11 and t12 assigned. Getting ready to send to other processes.\n");
1287#endif
1288 //Send results to other processes down the line
1289 //What I don't quite get (except possibly avoiding race conditions) is
1290 //why we send t11 and not a11. Surely by eliminating the assignment of a11 to t11
1291 //and using a blocking send we would have one fewer loop to worry about and improve performance?
1292 if(MPI_Isend(t11, kvol3, MPI_C_FLOAT_COMPLEX, pd[3], tag, comm, &request)){
1293 fprintf(stderr, "Error %i in %s: Failed to send t11 to process %i.\nExiting...\n\n",
1294 CANTSEND, funcname, pd[3]);
1295 MPI_Abort(comm,CANTSEND);
1296 }
1297#ifdef _DEBUG
1298 printf("Sent t11 from rank %i to the down halo on rank %i\n", rank, pd[3]);
1299#endif
1300 if(MPI_Recv(a11, kvol3, MPI_C_FLOAT_COMPLEX, pu[3], tag, comm, &status)){
1301 fprintf(stderr, "Error %i in %s: Failed to receive a11 from process %i.\nExiting...\n\n",
1302 CANTSEND, funcname, pu[3]);
1303 MPI_Abort(comm,CANTSEND);
1304 }
1305#ifdef _DEBUG
1306 printf("Received t11 from rank %i in the up halo on rank %i\n", pu[3], rank);
1307#endif
1308 MPI_Wait(&request, &status);
1309 if(MPI_Isend(t12, kvol3, MPI_C_FLOAT_COMPLEX, pd[3], tag, comm, &request)){
1310 fprintf(stderr, "Error %i in %s: Failed to send t12 to process %i.\nExiting...\n\n",
1311 CANTSEND, funcname, pd[3]);
1312 MPI_Abort(comm,CANTSEND);
1313 }
1314 if(MPI_Recv(a12, kvol3, MPI_C_FLOAT_COMPLEX, pu[3], tag, comm, &status)){
1315 fprintf(stderr, "Error %i in %s: Failed to receive a12 from process %i.\nExiting...\n\n",
1316 CANTSEND, funcname, pu[3]);
1317 MPI_Abort(comm,CANTSEND);
1318 }
1319#ifdef _DEBUG
1320 printf("Finished sending and receiving on rank %i\n", rank);
1321#endif
1322 MPI_Wait(&request, &status);
1323
1324 //Post-multiply current loop by incoming one.
1325 //This is begging to be done in CUDA or BLAS
1326#pragma omp parallel for simd aligned(a11,a12,t11,t12,z11,z12:AVX)
1327 for(i=0;i<kvol3;i++){
1328 t11[i]=z11[i]*a11[i]-z12[i]*conj(a12[i]);
1329 t12[i]=z11[i]*a12[i]+z12[i]*conj(a11[i]);
1330 }
1331 memcpy(z11, t11, kvol3*sizeof(Complex_f));
1332 memcpy(z12, t12, kvol3*sizeof(Complex_f));
1333 }
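 //After the npt-1 exchanges above, z11/z12 on every rank hold the ordered product of the
 //contributions from every rank in its time column, as used for the Polyakov loop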
1334 free(a11); free(a12); free(t11); free(t12);
1335 return 0;
1336}
1337#endif