MPI_Comm comm = MPI_COMM_WORLD;

	const char *funcname = "Par_begin";

	if(MPI_Init(&argc, &argv)){
		fprintf(stderr, "Error %i in %s: Failed to initialise MPI\nExiting\n\n", NO_MPI_INIT, funcname);
		MPI_Abort(comm,NO_MPI_INIT);
	}
	if(MPI_Comm_rank(comm, &rank)){
		fprintf(stderr, "Error %i in %s: Failed to find rank.\nExiting...\n\n", NO_MPI_RANK, funcname);
		MPI_Abort(comm,NO_MPI_RANK);
	}
	if(MPI_Comm_size(comm, &size)){
		fprintf(stderr, "Error %i in %s: Failed to find size\nExiting...\n\n", NO_MPI_SIZE, funcname);
		MPI_Abort(comm,NO_MPI_SIZE);
	}
	if(size!=nproc){
		fprintf(stderr, "Error %i in %s: For process %i, size %i is not equal to nproc %i.\n",
				SIZEPROC, funcname, rank, size, nproc);
		MPI_Abort(comm,SIZEPROC);
	}
	int gsize[4], lsize[4];
	gsize[0]=nx; gsize[1]=ny; gsize[2]=nz; gsize[3]=nt;

	int cartsize[ndim] __attribute__((aligned(AVX)));
	cartsize[0]=npx; cartsize[1]=npy; cartsize[2]=npz; cartsize[3]=npt;

	int periods[ndim] __attribute__((aligned(AVX)));
	for(int i=0; i<ndim; i++)
		/* ... */;

	MPI_Cart_create(comm, ndim, cartsize, periods, reorder, &commcart);

	for(int i=0; i<ndim; i++)
		MPI_Cart_shift(commcart, i, 1, &pd[i], &pu[i]);

	for(int iproc = 0; iproc<nproc; iproc++){
#pragma omp simd aligned(pcoord:AVX)
		for(int idim = 0; idim<ndim; idim++){
			pstop[idim][iproc] = pstart[idim][iproc] + lsize[idim];
		}
	}

	for(int idim = 0; idim<ndim; idim++){
		pstop[idim][0] = lsize[idim];
	}
	printf("Running on %i processors.\nGrid layout is %ix%ix%ix%i\n",
			nproc, npx, npy, npz, npt);
	printf("Rank: %i pu: %i %i %i %i pd: %i %i %i %i\n",
			rank, pu[0], pu[1], pu[2], pu[3], pd[0], pd[1], pd[2], pd[3]);
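The fragment above is the topology setup in Par_begin: MPI_Cart_create turns MPI_COMM_WORLD into a four-dimensional periodic Cartesian communicator sized by the npx, npy, npz and npt macros, and MPI_Cart_shift records each rank's down (pd) and up (pu) neighbour in every direction. The following self-contained sketch shows the same pattern in isolation; unlike the source it lets MPI_Dims_create choose the grid, so it runs on any rank count, and the variable names are illustrative.

/* Sketch: periodic 4D Cartesian communicator with neighbour lookup.
 * MPI_Dims_create picks the grid here instead of the compile-time
 * npx/npy/npz/npt macros, so the example runs on any number of ranks. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[]){
	MPI_Init(&argc, &argv);
	int rank, size;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	int dims[4] = {0, 0, 0, 0};
	MPI_Dims_create(size, 4, dims);       /* factorise size over 4 dimensions */
	int periods[4] = {1, 1, 1, 1};        /* periodic in every direction      */

	MPI_Comm commcart;
	MPI_Cart_create(MPI_COMM_WORLD, 4, dims, periods, 1, &commcart);

	int pd[4], pu[4];                     /* down/up neighbours per direction */
	for(int i = 0; i < 4; i++)
		MPI_Cart_shift(commcart, i, 1, &pd[i], &pu[i]);

	printf("Rank %i: pd %i %i %i %i pu %i %i %i %i\n",
			rank, pd[0], pd[1], pd[2], pd[3], pu[0], pu[1], pu[2], pu[3]);
	MPI_Finalize();
	return 0;
}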
int Par_sread(const int iread, const float beta, const float fmu, const float akappa, const Complex_f ajq,
		Complex *u11, Complex *u12, Complex *u11t, Complex *u12t){

	const char *funcname = "Par_sread";
	MPI_Datatype MPI_SEED_TYPE = (sizeof(seed)==sizeof(int)) ? MPI_INT : MPI_LONG;

	static char gauge_file[FILELEN]="config.";
	int buffer; char buff2[7];
	buffer = (int)round(100*beta);
	sprintf(buff2,"b%03d",buffer);
	strcat(gauge_file,buff2);

	buffer = (int)round(10000*akappa);
	sprintf(buff2,"k%04d",buffer);
	strcat(gauge_file,buff2);

	buffer = (int)round(1000*fmu);
	sprintf(buff2,"mu%04d",buffer);
	strcat(gauge_file,buff2);

	buffer = (int)round(1000*creal(ajq));
	sprintf(buff2,"j%03d",buffer);
	strcat(gauge_file,buff2);

	sprintf(buff2,"s%02d",nx);
	strcat(gauge_file,buff2);

	sprintf(buff2,"t%02d",nt);
	strcat(gauge_file,buff2);

	sprintf(c,".%06d", iread);
	strcat(gauge_file, c);
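To make the naming scheme concrete, the snippet below assembles the same string in a single snprintf for made-up parameters beta=1.7, akappa=0.178, fmu=0, ajq=0, nx=nt=16 and iread=40; it prints config.b170k1780mu0000j000s16t16.000040. The values are illustrative only.

/* Illustrative only: the filename scheme above applied to made-up parameters. */
#include <stdio.h>
#include <math.h>

int main(void){
	const float beta=1.7f, akappa=0.178f, fmu=0.0f, ajq_re=0.0f;
	const int nx=16, nt=16, iread=40;
	char gauge_file[64];
	snprintf(gauge_file, sizeof gauge_file, "config.b%03dk%04dmu%04dj%03ds%02dt%02d.%06d",
			(int)round(100*beta), (int)round(10000*akappa), (int)round(1000*fmu),
			(int)round(1000*ajq_re), nx, nt, iread);
	puts(gauge_file);   /* config.b170k1780mu0000j000s16t16.000040 */
	return 0;
}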
	printf("Opening gauge file on processor: %i\n", rank);
	if(!(con = fopen(gauge_file, fileop))){
		fprintf(stderr, "Error %i in %s: Failed to open %s for %s.\nExiting...\n\n",
				OPENERROR, funcname, gauge_file, fileop);
		MPI_Abort(comm,OPENERROR);
	}
	fread(&old_nproc, sizeof(int), 1, con);
	fprintf(stderr, "Warning %i in %s: Previous run was done on %i processors, current run uses %i.\n",
			DIFNPROC, funcname, old_nproc, nproc);
#ifdef __RANLUX__
	unsigned long *seed_array=(unsigned long*)calloc(nproc,sizeof(seed));
#elif defined __INTEL_MKL__ && !defined USE_RAN2
	int *seed_array=(int *)calloc(nproc,sizeof(seed));
#else
	long *seed_array=(long*)calloc(nproc,sizeof(seed));
#endif
	for(int i=0; i<fmin(old_nproc,nproc); i++)
		fread(seed_array+i, sizeof(seed), 1, con);

	for(int i=old_nproc; i<nproc; i++)
		seed_array[i] = seed_array[0]*(1.0f+8.0f*(float)i/(float)(size-1));
	for(int iproc = 1; iproc<nproc; iproc++)
		if(MPI_Send(&seed_array[iproc], 1, MPI_SEED_TYPE, iproc, 1, comm)){
			fprintf(stderr, "Error %i in %s: Failed to send seed to process %i.\nExiting...\n\n",
					CANTSEND, funcname, iproc);
			MPI_Abort(comm,CANTSEND);
		}
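Here the master hands one seed to each remote rank with a loop of blocking MPI_Send calls; the matching MPI_Recv appears further down on the other ranks. The same job can be done with a single collective. The sketch below uses MPI_Scatter instead; it assumes long seeds and the seed values themselves are made up.

/* Sketch: one seed per rank via MPI_Scatter instead of the send loop above.
 * The seeds generated on the root are invented for illustration. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[]){
	MPI_Init(&argc, &argv);
	int rank, size;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	long *seed_array = NULL;
	if(rank == 0){                            /* root prepares one seed per rank */
		seed_array = (long*)calloc(size, sizeof(long));
		for(int i = 0; i < size; i++)
			seed_array[i] = 967580161L + 8L*i;
	}
	long seed;
	MPI_Scatter(seed_array, 1, MPI_LONG, &seed, 1, MPI_LONG, 0, MPI_COMM_WORLD);
	printf("Rank %i received seed %ld\n", rank, seed);

	free(seed_array);
	MPI_Finalize();
	return 0;
}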
	for(int iproc = 0; iproc < nproc; iproc++)
		for(int idim = 0; idim < ndim; idim++){
			for(int it=pstart[3][iproc]; it<pstop[3][iproc]; it++)
				for(int iz=pstart[2][iproc]; iz<pstop[2][iproc]; iz++)
					for(int iy=pstart[1][iproc]; iy<pstop[1][iproc]; iy++)
						for(int ix=pstart[0][iproc]; ix<pstop[0][iproc]; ix++){
							u1buff[i]=u11Read[idim*gvol+j];
							u2buff[i]=u12Read[idim*gvol+j];
						}

			if(i!=kvol){
				fprintf(stderr, "Error %i in %s: Number of elements %i is not equal to kvol %i.\nExiting...\n\n",
						NUMELEM, funcname, i, kvol);
				MPI_Abort(comm,NUMELEM);
			}

			cblas_zcopy(kvol, u1buff, 1, u11+idim, ndim);
			cblas_zcopy(kvol, u2buff, 1, u12+idim, ndim);

#pragma omp simd aligned(u11,u12,u1buff,u2buff:AVX)
			for(int i=0; i<kvol; i++){
				u11[i*ndim+idim]=u1buff[i];
				u12[i*ndim+idim]=u2buff[i];
			}

			if(MPI_Send(u1buff, kvol, MPI_C_DOUBLE_COMPLEX, iproc, 2*idim, comm)){
				fprintf(stderr, "Error %i in %s: Failed to send ubuff to process %i.\nExiting...\n\n",
						CANTSEND, funcname, iproc);
				MPI_Abort(comm,CANTSEND);
			}
			if(MPI_Send(u2buff, kvol, MPI_C_DOUBLE_COMPLEX, iproc, 2*idim+1, comm)){
				fprintf(stderr, "Error %i in %s: Failed to send ubuff to process %i.\nExiting...\n\n",
						CANTSEND, funcname, iproc);
				MPI_Abort(comm,CANTSEND);
			}
		}
	free(u11Read); free(u12Read);
	if(MPI_Recv(&seed, 1, MPI_SEED_TYPE, masterproc, 1, comm, &status)){
		fprintf(stderr, "Error %i in %s: Failed to receive seed on process %i.\nExiting...\n\n",
				CANTRECV, funcname, rank);
		MPI_Abort(comm,CANTRECV);
	}
	for(int idim = 0; idim<ndim; idim++){
		if(MPI_Recv(u1buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim, comm, &status)){
			fprintf(stderr, "Error %i in %s: Failed to receive u11 on process %i.\nExiting...\n\n",
					CANTRECV, funcname, rank);
			MPI_Abort(comm,CANTRECV);
		}
		if(MPI_Recv(u2buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim+1, comm, &status)){
			fprintf(stderr, "Error %i in %s: Failed to receive u12 on process %i.\nExiting...\n\n",
					CANTRECV, funcname, rank);
			MPI_Abort(comm,CANTRECV);
		}

		cblas_zcopy(kvol, u1buff, 1, u11+idim, ndim);
		cblas_zcopy(kvol, u2buff, 1, u12+idim, ndim);

#pragma omp parallel for simd aligned(u11,u12,u1buff,u2buff:AVX)
		for(int i=0; i<kvol; i++){
			u11[i*ndim+idim]=u1buff[i];
			u12[i*ndim+idim]=u2buff[i];
		}
	}
	free(u1buff); free(u2buff);
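Throughout Par_sread and Par_swrite the gauge fields keep the four direction components of each site interleaved (u11[i*ndim+idim]), while the communication buffers hold one direction contiguously; the conversion is either a strided cblas_zcopy or the explicit copy loop shown above. A minimal sketch of that equivalence, assuming Complex is the C99 double complex and a CBLAS header is available:

/* Sketch: gathering one direction out of an interleaved field and scattering
 * it back. NDIM and KVOL are small illustrative values here. */
#include <complex.h>
#include <cblas.h>

enum { NDIM = 4, KVOL = 8 };

void gather_direction(const double complex *u, double complex *buf, int mu){
	/* buf[i] = u[i*NDIM + mu], i.e. a copy with source stride NDIM */
	cblas_zcopy(KVOL, u + mu, NDIM, buf, 1);
}

void scatter_direction(double complex *u, const double complex *buf, int mu){
	/* u[i*NDIM + mu] = buf[i], i.e. a copy with destination stride NDIM */
	cblas_zcopy(KVOL, buf, 1, u + mu, NDIM);
}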
int Par_swrite(const int itraj, const int icheck, const float beta, const float fmu, const float akappa,
		const Complex_f ajq, Complex *u11, Complex *u12){

	const char *funcname = "Par_swrite";
	MPI_Datatype MPI_SEED_TYPE = (sizeof(seed)==sizeof(int)) ? MPI_INT : MPI_LONG;

	char dump_prefix[FILELEN]="u11.";
	sprintf(dump_buff,"r%01d_c%06d",rank,itraj);
	strcat(dump_prefix,dump_buff);
	FILE *gauge_dump=fopen(dump_prefix,"wb");

	seed=gsl_rng_get(ranlux_instd);
#ifdef __RANLUX__
	unsigned long *seed_array=(unsigned long*)calloc(nproc,sizeof(seed));
#elif defined __INTEL_MKL__ && !defined USE_RAN2
	int *seed_array=(int *)calloc(nproc,sizeof(seed));
#else
	long *seed_array=(long*)calloc(nproc,sizeof(seed));
#endif
	for(int iproc = 1; iproc<nproc; iproc++)
		if(MPI_Recv(&seed_array[iproc], 1, MPI_SEED_TYPE, iproc, 1, comm, &status)){
			fprintf(stderr, "Error %i in %s: Failed to receive seed from process %i.\nExiting...\n\n",
					CANTRECV, funcname, iproc);
			MPI_Abort(comm,CANTRECV);
		}
	for(int iproc=0; iproc<nproc; iproc++)
		for(int idim=0; idim<ndim; idim++){
			if(MPI_Recv(u1buff, kvol, MPI_C_DOUBLE_COMPLEX, iproc, 2*idim, comm, &status)){
				fprintf(stderr, "Error %i in %s: Failed to receive u11 from process %i.\nExiting...\n\n",
						CANTRECV, funcname, iproc);
				MPI_Abort(comm,CANTRECV);
			}
			if(MPI_Recv(u2buff, kvol, MPI_C_DOUBLE_COMPLEX, iproc, 2*idim+1, comm, &status)){
				fprintf(stderr, "Error %i in %s: Failed to receive u12 from process %i.\nExiting...\n\n",
						CANTRECV, funcname, iproc);
				MPI_Abort(comm,CANTRECV);
			}

			cblas_zcopy(kvol, u11+idim, ndim, u1buff, 1);
			cblas_zcopy(kvol, u12+idim, ndim, u2buff, 1);

#pragma omp parallel for simd aligned(u11,u12,u1buff,u2buff:AVX)
			for(int i=0; i<kvol; i++){
				u1buff[i]=u11[i*ndim+idim];
				u2buff[i]=u12[i*ndim+idim];
			}

			strcat(part_dump,dump_prefix);
			sprintf(dump_buff,"_d%d",idim);
			strcat(part_dump,dump_buff);
			FILE *pdump=fopen(part_dump,"wb");
			for(int it=pstart[3][iproc]; it<pstop[3][iproc]; it++)
				for(int iz=pstart[2][iproc]; iz<pstop[2][iproc]; iz++)
					for(int iy=pstart[1][iproc]; iy<pstop[1][iproc]; iy++)
						for(int ix=pstart[0][iproc]; ix<pstop[0][iproc]; ix++){
							u11Write[idim*gvol+j] = u1buff[i];
							u12Write[idim*gvol+j] = u2buff[i];
						}

			if(i!=kvol){
				fprintf(stderr, "Error %i in %s: Number of elements %i is not equal to kvol %i.\nExiting...\n\n",
						NUMELEM, funcname, i, kvol);
				MPI_Abort(comm,NUMELEM);
			}
		}
	free(u1buff); free(u2buff);
	char gauge_title[FILELEN]="config.";
	int buffer; char buff2[7];
	buffer = (int)round(100*beta);
	sprintf(buff2,"b%03d",buffer);
	strcat(gauge_title,buff2);

	buffer = (int)round(10000*akappa);
	sprintf(buff2,"k%04d",buffer);
	strcat(gauge_title,buff2);

	buffer = (int)round(1000*fmu);
	sprintf(buff2,"mu%04d",buffer);
	strcat(gauge_title,buff2);

	buffer = (int)round(1000*creal(ajq));
	sprintf(buff2,"j%03d",buffer);
	strcat(gauge_title,buff2);

	sprintf(buff2,"s%02d",nx);
	strcat(gauge_title,buff2);

	sprintf(buff2,"t%02d",nt);
	strcat(gauge_title,buff2);

	strcpy(gauge_file,gauge_title);
	sprintf(c,".%06d", itraj);
	strcat(gauge_file, c);
	printf("Gauge file name is %s\n", gauge_file);
	printf("Writing the gauge file on processor %i.\n", rank);
	if(!(con=fopen(gauge_file, fileop))){
		fprintf(stderr, "Error %i in %s: Failed to open %s for %s.\nExiting...\n\n",
				OPENERROR, funcname, gauge_file, fileop);
		MPI_Abort(comm,OPENERROR);
	}
	fwrite(&size, sizeof(int), 1, con);
	fwrite(seed_array, nproc*sizeof(seed), 1, con);
	free(u11Write); free(u12Write);
	fprintf(stderr, "Error %i in %s: Failed to send u11 from process %i.\nExiting...\n\n",
			CANTSEND, funcname, rank);
	MPI_Abort(comm,CANTSEND);
	for(int idim = 0; idim<ndim; idim++){
		cblas_zcopy(kvol, u11+idim, ndim, u1buff, 1);
		cblas_zcopy(kvol, u12+idim, ndim, u2buff, 1);

#pragma omp parallel for simd aligned(u11,u12,u1buff,u2buff:AVX)
		for(int i=0; i<kvol; i++){
			u1buff[i]=u11[i*ndim+idim];
			u2buff[i]=u12[i*ndim+idim];
		}

		strcat(part_dump,dump_prefix);
		sprintf(dump_buff,"_d%d",idim);
		strcat(part_dump,dump_buff);
		FILE *pdump=fopen(part_dump,"wb");
		if(MPI_Send(u1buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim, comm)){
			fprintf(stderr, "Error %i in %s: Failed to send u11 from process %i.\nExiting...\n\n",
					CANTSEND, funcname, rank);
			MPI_Abort(comm,CANTSEND);
		}
		if(MPI_Send(u2buff, kvol, MPI_C_DOUBLE_COMPLEX, masterproc, 2*idim+1, comm)){
			fprintf(stderr, "Error %i in %s: Failed to send u12 from process %i.\nExiting...\n\n",
					CANTSEND, funcname, rank);
			MPI_Abort(comm,CANTSEND);
		}
	}
	free(u1buff); free(u2buff);
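Par_swrite funnels every sublattice through the master before a single serial fwrite. For large lattices a common alternative is to let every rank write its own block with MPI-IO; the sketch below shows that pattern in isolation. It is not how this code writes configurations, and the flat file layout is an assumption made for illustration.

/* Sketch of an MPI-IO alternative to gather-then-fwrite: each rank writes its
 * kvol complex doubles at its own fixed offset. Illustrative layout only. */
#include <mpi.h>
#include <complex.h>

void write_block(const double complex *buf, long kvol, int rank, const char *fname){
	MPI_File fh;
	MPI_File_open(MPI_COMM_WORLD, fname, MPI_MODE_CREATE | MPI_MODE_WRONLY,
			MPI_INFO_NULL, &fh);
	MPI_Offset offset = (MPI_Offset)rank * kvol * sizeof(double complex);
	MPI_File_write_at_all(fh, offset, buf, (int)kvol, MPI_C_DOUBLE_COMPLEX,
			MPI_STATUS_IGNORE);
	MPI_File_close(&fh);
}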
	const char *funcname = "Par_isum";

	if(MPI_Allreduce(ival, itmp, 1, MPI_INT, MPI_SUM, comm)){
		fprintf(stderr, "Error %i in %s: Couldn't complete reduction for %i.\nExiting...\n\n", REDUCERR, funcname, *ival);
		MPI_Abort(comm,REDUCERR);
	}
	const char *funcname = "Par_dsum";

	if(MPI_Allreduce(dval, &dtmp, 1, MPI_DOUBLE, MPI_SUM, comm)){
		fprintf(stderr, "Error %i in %s: Couldn't complete reduction for %f.\nExiting...\n\n", REDUCERR, funcname, *dval);
		MPI_Abort(comm,REDUCERR);
	}
	const char *funcname = "Par_fsum";

	if(MPI_Allreduce(fval, &ftmp, 1, MPI_FLOAT, MPI_SUM, comm)){
		fprintf(stderr, "Error %i in %s: Couldn't complete reduction for %f.\nExiting...\n\n", REDUCERR, funcname, *fval);
		MPI_Abort(comm,REDUCERR);
	}
	const char *funcname = "Par_csum";

	if(MPI_Allreduce(cval, &ctmp, 1, MPI_C_FLOAT_COMPLEX, MPI_SUM, comm)){
		fprintf(stderr, "Error %i in %s: Couldn't complete reduction for %f+%f i.\nExiting...\n\n",
				REDUCERR, funcname, creal(*cval), cimag(*cval));
		MPI_Abort(comm,REDUCERR);
	}

	const char *funcname = "Par_zsum";

	if(MPI_Allreduce(zval, &ztmp, 1, MPI_C_DOUBLE_COMPLEX, MPI_SUM, comm)){
		fprintf(stderr, "Error %i in %s: Couldn't complete reduction for %f+%f i.\nExiting...\n\n",
				REDUCERR, funcname, creal(*zval), cimag(*zval));
		MPI_Abort(comm,REDUCERR);
	}
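Each Par_*sum wrapper is a thin shell around MPI_Allreduce: every rank passes in its local partial result and gets the sum over all ranks back in the same variable. A typical call site looks like the sketch below; some_local_observable is a hypothetical per-site function, not part of this code.

	/* Sketch: typical use of the reduction wrappers. some_local_observable() is
	 * hypothetical; only the Par_dsum call reflects the API documented here. */
	double obs = 0.0;
	for(int i = 0; i < kvol; i++)
		obs += some_local_observable(i);   /* local sublattice contribution */
	Par_dsum(&obs);                        /* obs now holds the global sum  */
	obs /= gvol;                           /* e.g. normalise per site       */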
	const char *funcname = "Par_icopy";
	if(MPI_Bcast(ival, 1, MPI_INT, masterproc, comm)){
		fprintf(stderr, "Error %i in %s: Failed to broadcast %i from %i.\nExiting...\n\n",
				BROADERR, funcname, *ival, rank);
		MPI_Abort(comm,BROADERR);
	}

	const char *funcname = "Par_dcopy";
	if(MPI_Bcast(dval, 1, MPI_DOUBLE, masterproc, comm)){
		fprintf(stderr, "Error %i in %s: Failed to broadcast %f from %i.\nExiting...\n\n",
				BROADERR, funcname, *dval, rank);
		MPI_Abort(comm,BROADERR);
	}
	const char *funcname = "Par_fcopy";
	if(MPI_Bcast(fval, 1, MPI_FLOAT, masterproc, comm)){
		fprintf(stderr, "Error %i in %s: Failed to broadcast %f from %i.\nExiting...\n\n",
				BROADERR, funcname, *fval, rank);
		MPI_Abort(comm,BROADERR);
	}
	const char *funcname = "Par_ccopy";
	if(MPI_Bcast(cval, 1, MPI_C_FLOAT_COMPLEX, masterproc, comm)){
		fprintf(stderr, "Error %i in %s: Failed to broadcast %f+i%f from %i.\nExiting...\n\n",
				BROADERR, funcname, creal(*cval), cimag(*cval), rank);
		MPI_Abort(comm,BROADERR);
	}

	const char *funcname = "Par_zcopy";
	if(MPI_Bcast(zval, 1, MPI_C_DOUBLE_COMPLEX, masterproc, comm)){
		fprintf(stderr, "Error %i in %s: Failed to broadcast %f+i%f from %i.\nExiting...\n\n",
				BROADERR, funcname, creal(*zval), cimag(*zval), rank);
		MPI_Abort(comm,BROADERR);
	}
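The Par_*copy wrappers broadcast a single value from masterproc to every other rank, typically right after the master has parsed an input file. A typical call site, with illustrative local variables:

	/* Sketch: broadcasting freshly read parameters with the wrappers above.
	 * beta and istart are illustrative locals, not the globals of this file. */
	double beta = 0.0;
	int istart = 0;
	if(!rank){          /* only the master read the input file */
		beta = 1.7;
		istart = 1;
	}
	Par_dcopy(&beta);   /* every rank now has beta */
	Par_icopy(&istart); /* ...and istart           */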
	const char *funcname = "ZHalo_swap_all";
	const char *funcname = "ZHalo_swap_dir";

	if(layer!=DOWN && layer!=UP){
		fprintf(stderr, "Error %i in %s: Cannot swap in the direction given by %i.\nExiting...\n\n",
				LAYERROR, funcname, layer);
		MPI_Abort(comm,LAYERROR);
	}
		fprintf(stderr, "Error %i in %s: Writing a message of size %i to flattened index %i will cause "
				"a memory leak on rank %i.\nExiting...\n\n",
				BOUNDERROR, funcname, msg_size, ncpt*h1u[idir], rank);
		MPI_Abort(comm,BOUNDERROR);

#pragma omp parallel for
	for(int ihalo = 0; ihalo < halosize[idir]; ihalo++)
#pragma omp simd aligned(z, sendbuf:AVX)
		for(int icpt = 0; icpt < ncpt; icpt++)
			sendbuf[ihalo*ncpt+icpt]=z[ncpt*hd[ndim*ihalo+idir]+icpt];

	if(MPI_Isend(sendbuf, msg_size, MPI_C_DOUBLE_COMPLEX, pd[idir], tag, comm, &request)){
		fprintf(stderr, "Error %i in %s: Failed to send off the down halo from rank %i to rank %i.\nExiting...\n",
				CANTSEND, funcname, rank, pd[idir]);
		MPI_Abort(comm,CANTSEND);
	}
	if(MPI_Recv(&z[ncpt*h1u[idir]], msg_size, MPI_C_DOUBLE_COMPLEX, pu[idir], tag, comm, &status)){
		fprintf(stderr, "Error %i in %s: Rank %i failed to receive into up halo from rank %i.\nExiting...\n",
				CANTRECV, funcname, rank, pu[idir]);
		MPI_Abort(comm,CANTRECV);
	}

		fprintf(stderr, "Error %i in %s: Writing a message of size %i to flattened index %i will cause "
				"a memory leak on rank %i.\nExiting...\n\n",
				BOUNDERROR, funcname, msg_size, ncpt*h1d[idir], rank);
		MPI_Abort(comm,BOUNDERROR);

#pragma omp parallel for
	for(int ihalo = 0; ihalo < halosize[idir]; ihalo++)
#pragma omp simd aligned(z, sendbuf:AVX)
		for(int icpt = 0; icpt < ncpt; icpt++)
			sendbuf[ihalo*ncpt+icpt]=z[ncpt*hu[ndim*ihalo+idir]+icpt];

	if(MPI_Isend(sendbuf, msg_size, MPI_C_DOUBLE_COMPLEX, pu[idir], 0, comm, &request)){
		fprintf(stderr, "Error %i in %s: Failed to send off the up halo from rank %i to rank %i.\nExiting...\n",
				CANTSEND, funcname, rank, pu[idir]);
		MPI_Abort(comm,CANTSEND);
	}
	if(MPI_Recv(&z[ncpt*h1d[idir]], msg_size, MPI_C_DOUBLE_COMPLEX, pd[idir], tag, comm, &status)){
		fprintf(stderr, "Error %i in %s: Rank %i failed to receive into down halo from rank %i.\nExiting...\n",
				CANTRECV, funcname, rank, pd[idir]);
		MPI_Abort(comm,CANTRECV);
	}
	MPI_Wait(&request, &status);
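ZHalo_swap_dir follows the usual pack, non-blocking send, blocking receive, wait sequence, once for the down halo and once for the up halo. Stripped of the index arrays and packing, the exchange along one direction reduces to the sketch below for a 1D field of doubles with one halo site at each end (layout and names are illustrative).

/* Sketch: a single down-going halo exchange for a 1D array of n interior
 * sites with one halo site at each end: field[0] is the down halo,
 * field[1..n] the interior, field[n+1] the up halo. pd/pu come from
 * MPI_Cart_shift as above. */
#include <mpi.h>

void halo_swap_down(double *field, int n, int pd, int pu, MPI_Comm comm){
	MPI_Request request;
	MPI_Status status;
	/* send my lowest interior site down; fill my up halo from the rank above */
	MPI_Isend(&field[1], 1, MPI_DOUBLE, pd, 0, comm, &request);
	MPI_Recv(&field[n+1], 1, MPI_DOUBLE, pu, 0, comm, &status);
	MPI_Wait(&request, &status);
}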
	const char *funcname = "CHalo_swap_all";
	const char *funcname = "CHalo_swap_dir";

	if(layer!=DOWN && layer!=UP){
		fprintf(stderr, "Error %i in %s: Cannot swap in the direction given by %i.\nExiting...\n\n",
				LAYERROR, funcname, layer);
		MPI_Abort(comm,LAYERROR);
	}

		fprintf(stderr, "Error %i in %s: Writing a message of size %i to flattened index %i will cause "
				"a memory leak on rank %i.\nExiting...\n\n",
				BOUNDERROR, funcname, msg_size, ncpt*h1u[idir], rank);
		MPI_Abort(comm,BOUNDERROR);

#pragma omp parallel for
	for(int ihalo = 0; ihalo < halosize[idir]; ihalo++)
#pragma omp simd aligned(c, sendbuf:AVX)
		for(int icpt = 0; icpt < ncpt; icpt++)
			sendbuf[ihalo*ncpt+icpt]=c[ncpt*hd[ndim*ihalo+idir]+icpt];

	if(MPI_Isend(sendbuf, msg_size, MPI_C_FLOAT_COMPLEX, pd[idir], tag, comm, &request)){
		fprintf(stderr, "Error %i in %s: Failed to send off the down halo from rank %i to rank %i.\nExiting...\n",
				CANTSEND, funcname, rank, pd[idir]);
		MPI_Abort(comm,CANTSEND);
	}
	if(MPI_Recv(&c[ncpt*h1u[idir]], msg_size, MPI_C_FLOAT_COMPLEX, pu[idir], tag, comm, &status)){
		fprintf(stderr, "Error %i in %s: Rank %i failed to receive into up halo from rank %i.\nExiting...\n",
				CANTRECV, funcname, rank, pu[idir]);
		MPI_Abort(comm,CANTRECV);
	}

		fprintf(stderr, "Error %i in %s: Writing a message of size %i to flattened index %i will cause "
				"a memory leak on rank %i.\nExiting...\n\n",
				BOUNDERROR, funcname, msg_size, ncpt*h1d[idir], rank);
		MPI_Abort(comm,BOUNDERROR);

#pragma omp parallel for
	for(int ihalo = 0; ihalo < halosize[idir]; ihalo++)
#pragma omp simd aligned(c, sendbuf:AVX)
		for(int icpt = 0; icpt < ncpt; icpt++)
			sendbuf[ihalo*ncpt+icpt]=c[ncpt*hu[ndim*ihalo+idir]+icpt];

	if(MPI_Isend(sendbuf, msg_size, MPI_C_FLOAT_COMPLEX, pu[idir], 0, comm, &request)){
		fprintf(stderr, "Error %i in %s: Failed to send off the up halo from rank %i to rank %i.\nExiting...\n",
				CANTSEND, funcname, rank, pu[idir]);
		MPI_Abort(comm,CANTSEND);
	}
	if(MPI_Recv(&c[ncpt*h1d[idir]], msg_size, MPI_C_FLOAT_COMPLEX, pd[idir], tag, comm, &status)){
		fprintf(stderr, "Error %i in %s: Rank %i failed to receive into down halo from rank %i.\nExiting...\n",
				CANTRECV, funcname, rank, pd[idir]);
		MPI_Abort(comm,CANTRECV);
	}
	MPI_Wait(&request, &status);
	const char *funcname = "DHalo_swap_all";

	const char *funcname = "DHalo_swap_dir";

	double *sendbuf = (double *)aligned_alloc(AVX, msg_size*sizeof(double));
	if(layer!=DOWN && layer!=UP){
		fprintf(stderr, "Error %i in %s: Cannot swap in the direction given by %i.\nExiting...\n\n",
				LAYERROR, funcname, layer);
		MPI_Abort(comm,LAYERROR);
	}

		fprintf(stderr, "Error %i in %s: Writing a message of size %i to flattened index %i will cause "
				"a memory leak on rank %i.\nExiting...\n\n",
				BOUNDERROR, funcname, msg_size, ncpt*h1u[idir], rank);
		MPI_Abort(comm,BOUNDERROR);

#pragma omp parallel for
	for(int ihalo = 0; ihalo < halosize[idir]; ihalo++)
#pragma omp simd aligned(d,sendbuf:AVX)
		for(int icpt = 0; icpt < ncpt; icpt++)
			sendbuf[ihalo*ncpt+icpt]=d[ncpt*hd[ndim*ihalo+idir]+icpt];

	if(MPI_Isend(sendbuf, msg_size, MPI_DOUBLE, pd[idir], tag, comm, &request)){
		fprintf(stderr, "Error %i in %s: Failed to send off the down halo from rank %i to rank %i.\nExiting...\n\n",
				CANTSEND, funcname, rank, pd[idir]);
		MPI_Abort(comm,CANTSEND);
	}
	if(MPI_Recv(&d[ncpt*h1u[idir]], msg_size, MPI_DOUBLE, pu[idir], tag, comm, &status)){
		fprintf(stderr, "Error %i in %s: Rank %i failed to receive into up halo from rank %i.\nExiting...\n\n",
				CANTRECV, funcname, rank, pu[idir]);
		MPI_Abort(comm,CANTRECV);
	}

		fprintf(stderr, "Error %i in %s: Writing a message of size %i to flattened index %i will cause "
				"a memory leak on rank %i.\nExiting...\n\n",
				BOUNDERROR, funcname, msg_size, ncpt*h1d[idir], rank);
		MPI_Abort(comm,BOUNDERROR);

#pragma omp parallel for
	for(int ihalo = 0; ihalo < halosize[idir]; ihalo++)
#pragma omp simd aligned(d,sendbuf:AVX)
		for(int icpt = 0; icpt < ncpt; icpt++)
			sendbuf[ihalo*ncpt+icpt]=d[ncpt*hu[ndim*ihalo+idir]+icpt];

	if(MPI_Isend(sendbuf, msg_size, MPI_DOUBLE, pu[idir], 0, comm, &request)){
		fprintf(stderr, "Error %i in %s: Failed to send off the up halo from rank %i to rank %i.\nExiting...\n\n",
				CANTSEND, funcname, rank, pu[idir]);
		MPI_Abort(comm,CANTSEND);
	}
	if(MPI_Recv(&d[ncpt*h1d[idir]], msg_size, MPI_DOUBLE, pd[idir], tag, comm, &status)){
		fprintf(stderr, "Error %i in %s: Rank %i failed to receive into down halo from rank %i.\nExiting...\n\n",
				CANTRECV, funcname, rank, pd[idir]);
		MPI_Abort(comm,CANTRECV);
	}
	MPI_Wait(&request, &status);
	const char *funcname = "Trial_Exchange";

	cudaGetDevice(&device);
	cudaMemPrefetchAsync(u11t, ndim*kvol*sizeof(Complex), cudaCpuDeviceId, NULL);
	cudaMemPrefetchAsync(u12t, ndim*kvol*sizeof(Complex), cudaCpuDeviceId, NULL);

	for(int mu=0; mu<ndim; mu++){
		cblas_zcopy(kvol, &u11t[mu], ndim, z, 1);

		for(int i=0; i<kvol; i++)
			z[i]=u11t[i*ndim+mu];

		cblas_zcopy(kvol, &u12t[mu], ndim, z, 1);

		u11t[i*ndim+mu]=z[i];
		z[i]=u12t[i*ndim+mu];

		u12t[i*ndim+mu]=z[i];
	}

	cuComplex_convert(u11t_f, u11t, ndim*(kvol+halo), true, dimBlock, dimGrid);
	cuComplex_convert(u12t_f, u12t, ndim*(kvol+halo), true, dimBlock, dimGrid);
	cudaDeviceSynchronise();

#pragma omp parallel for simd aligned(u11t_f,u12t_f,u11t,u12t:AVX)
#error Par_tmul is not yet implemented in CUDA as Sigma12 in Polyakov is device only memory

	const char *funcname = "Par_tmul";

	if(!rank) printf("Sending between halos in the time direction. For rank %i pu[3]=%i and pd[3] = %i\n",
			rank, pu[3], pd[3]);
	for(itime=1; itime<npt; itime++){
		if(!rank) printf("t11 and t12 assigned. Getting ready to send to other processes.\n");

		if(MPI_Isend(t11, kvol3, MPI_C_FLOAT_COMPLEX, pd[3], tag, comm, &request)){
			fprintf(stderr, "Error %i in %s: Failed to send t11 to process %i.\nExiting...\n\n",
					CANTSEND, funcname, pd[3]);
			MPI_Abort(comm,CANTSEND);
		}
		printf("Sent t11 from rank %i to the down halo on rank %i\n", rank, pd[3]);

		if(MPI_Recv(a11, kvol3, MPI_C_FLOAT_COMPLEX, pu[3], tag, comm, &status)){
			fprintf(stderr, "Error %i in %s: Failed to receive a11 from process %i.\nExiting...\n\n",
					CANTSEND, funcname, pu[3]);
			MPI_Abort(comm,CANTSEND);
		}
		printf("Received t11 from rank %i in the up halo on rank %i\n", pu[3], rank);

		MPI_Wait(&request, &status);
		if(MPI_Isend(t12, kvol3, MPI_C_FLOAT_COMPLEX, pd[3], tag, comm, &request)){
			fprintf(stderr, "Error %i in %s: Failed to send t12 to process %i.\nExiting...\n\n",
					CANTSEND, funcname, pd[3]);
			MPI_Abort(comm,CANTSEND);
		}
		if(MPI_Recv(a12, kvol3, MPI_C_FLOAT_COMPLEX, pu[3], tag, comm, &status)){
			fprintf(stderr, "Error %i in %s: Failed to receive a12 from process %i.\nExiting...\n\n",
					CANTSEND, funcname, pu[3]);
			MPI_Abort(comm,CANTSEND);
		}
		printf("Finished sending and receiving on rank %i\n", rank);

		MPI_Wait(&request, &status);

#pragma omp parallel for simd aligned(a11,a12,t11,t12,z11,z12:AVX)
		for(i=0; i<kvol3; i++){
			t11[i]=z11[i]*a11[i]-z12[i]*conj(a12[i]);
			t12[i]=z11[i]*a12[i]+z12[i]*conj(a11[i]);
		}
	}
	free(a11); free(a12); free(t11); free(t12);
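The accumulation t11 = z11*a11 - z12*conj(a12), t12 = z11*a12 + z12*conj(a11) in the loop above is just the product of two SU(2) matrices stored in the compact pair form U = ((a, b), (-b*, a*)). A small sketch of that identity, assuming Complex_f matches the C99 float complex:

/* Sketch: product of two SU(2) matrices stored as the pair (a,b), where
 *   U = (  a   b  )
 *       ( -b*  a* )
 * The first row of U*V is exactly the update used in the loop above. */
#include <complex.h>

typedef float complex Complex_f;   /* assumption: matches the project's typedef */

void su2_mul(Complex_f *r11, Complex_f *r12,
		const Complex_f z11, const Complex_f z12,
		const Complex_f a11, const Complex_f a12){
	*r11 = z11*a11 - z12*conjf(a12);
	*r12 = z11*a12 + z12*conjf(a11);
}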
int Coord2gindex(int ix, int iy, int iz, int it)
Converts the coordinates of a global lattice point to its index in the computer memory.
unsigned int * h1u
Up halo starting element.
unsigned int * h1d
Down halo starting element.
unsigned int * halosize
Array containing the size of the halo in each direction.
unsigned int * hd
Down halo indices.
unsigned int * hu
Up halo indices.
int Par_swrite(const int itraj, const int icheck, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12)
Copies u11 and u12 into arrays without halos which then get written to output.
int size
The number of MPI ranks in total.
int Par_sread(const int iread, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t)
Reads and assigns the gauges from file.
int Par_begin(int argc, char *argv[])
Initialises the MPI configuration.
int Trial_Exchange(Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f)
Exchanges the trial fields.
int * pcoord
The processor grid.
int CHalo_swap_all(Complex_f *c, int ncpt)
Calls the functions to send data to both the up and down halos.
#define UP
Flag for send up.
int pstart[ndim][nproc]
The initial lattice site on each sublattice in a given direction.
int Par_fcopy(float *fval)
Broadcasts a float to the other processes.
int ZHalo_swap_all(Complex *z, int ncpt)
Calls the functions to send data to both the up and down halos.
#define DOWN
Flag for send down.
int Par_fsum(float *dval)
Performs a reduction on a float dval to get a sum which is then distributed to all ranks.
int Par_dcopy(double *dval)
Broadcasts a double to the other processes.
int Par_isum(int *ival)
Performs a reduction on an integer ival to get a sum which is then distributed to all ranks.
int Par_icopy(int *ival)
Broadcasts an integer to the other processes.
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int Par_ccopy(Complex *cval)
Broadcasts a complex float to the other processes.
int Par_zcopy(Complex *zval)
Broadcasts a complex double to the other processes.
int DHalo_swap_all(double *d, int ncpt)
Calls the functions to send data to both the up and down halos.
int pu[ndim]
Processors in the up direction.
#define masterproc
The main rank. Used for serial tasks.
int CHalo_swap_dir(Complex_f *c, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int pstop[ndim][nproc]
The final lattice site on each sublattice in a given direction.
int Par_csum(Complex_f *cval)
Performs a reduction on a complex float cval to get a sum which is then distributed to all ranks.
int Par_zsum(Complex *zval)
Performs a reduction on a complex double zval to get a sum which is then distributed to all ranks.
int pd[ndim]
Processors in the down direction.
int DHalo_swap_dir(double *d, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
#define tag
Default MPI tag.
Header for random number configuration.
#define ksizex
Sublattice x extent.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
#define nt
Lattice temporal extent. This also corresponds to the inverse temperature.
#define nproc
Number of processors for MPI.
#define nx
Lattice x extent.
#define ksizet
Sublattice t extent.
#define npx
Processor grid x extent. This must be a divisor of nx.
#define kvol
Sublattice volume.
#define Complex
Double precision complex number.
#define npz
Processor grid z extent.
#define gvol
Lattice volume.
#define FILELEN
Default file name length.
#define kvol3
Sublattice spatial volume.
#define halo
Total Halo size.
#define Complex_f
Single precision complex number.
#define ksizez
Sublattice z extent.
#define npy
Processor grid y extent.
#define npt
Processor grid t extent.
#define ksizey
Sublattice y extent.
#define nz
Lattice z extent. We normally use cubic lattices so this is the same as nx.
#define ny
Lattice y extent. We normally use cubic lattices so this is the same as nx.
Function declarations for most of the routines.
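For orientation, the macros listed above are tied together by a few simple relations: each processor-grid extent divides the matching lattice extent, the sublattice extents are the quotients, and the volumes are their products. The definitions below are an illustrative reconstruction consistent with these descriptions, not a copy of the project's headers.

/* Assumed relations between the documented macros; the numbers are
 * illustrative, not the project's actual settings. */
#define nx  16
#define ny  16
#define nz  16
#define nt  32
#define npx 2
#define npy 2
#define npz 2
#define npt 4
#define nproc  (npx*npy*npz*npt)               /* total MPI ranks            */
#define ksizex (nx/npx)                        /* sublattice extents         */
#define ksizey (ny/npy)
#define ksizez (nz/npz)
#define ksizet (nt/npt)
#define kvol   (ksizex*ksizey*ksizez*ksizet)   /* sublattice volume          */
#define kvol3  (ksizex*ksizey*ksizez)          /* sublattice spatial volume  */
#define gvol   (nx*ny*nz*nt)                   /* global lattice volume      */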