su2hmc
bosonic.c
#include <par_mpi.h>
#include <su2hmc.h>
int Average_Plaquette(double *hg, double *avplaqs, double *avplaqt, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, float beta){
	/*
	 * Calculates the gauge action using the pre-computed index lookup table
	 * Follows a routine called qedplaq in some QED3 code
	 *
	 * Parameters:
	 * =========
	 * hg			Gauge component of the Hamiltonian
	 * avplaqs	Average spatial plaquette
	 * avplaqt	Average temporal plaquette
	 * u11t,u12t	The trial fields
	 * iu			Upper halo indices
	 * beta		Inverse gauge coupling
	 *
	 * Calls:
	 * =====
	 * Par_dsum
	 *
	 * Return:
	 * ======
	 * Zero on success, integer error code otherwise
	 */
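	/*	In standard lattice conventions, each call to SU2plaq below returns half the
	 *	trace of the plaquette (the trace of an SU(2) matrix product is real):
	 *		P_{mu,nu}(i) = 0.5*Tr[U_mu(i) U_nu(i+mu) U_mu^dagger(i+nu) U_nu^dagger(i)]
	 *	so the gauge component of the Hamiltonian accumulated in hg works out as
	 *		hg = -beta*sum_i sum_{mu>nu} P_{mu,nu}(i)
	 */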
	const char *funcname = "Average_Plaquette";
	/*	There was a halo exchange here but it was moved outside this routine
		The FORTRAN code used several consecutive loops to get the plaquette
		Instead we'll just make the arrays variables and do everything in one loop
		Should work since in the FORTRAN Sigma11[i] only depends on the i components, for example
		Since the ν loop doesn't get called for μ=0 we'll start at μ=1
	 */
#ifdef __NVCC__
	__managed__ double hgs = 0; __managed__ double hgt = 0;
	cuAverage_Plaquette(&hgs, &hgt, u11t, u12t, iu, dimGrid, dimBlock);
#else
	double hgs = 0; double hgt = 0;
	for(int mu=1;mu<ndim;mu++)
		for(int nu=0;nu<mu;nu++)
			//Don't merge into a single loop. Makes vectorisation easier?
			//Or merge into a single loop and dispense with the a arrays?
#pragma omp parallel for simd aligned(u11t,u12t,iu:AVX) reduction(+:hgs,hgt)
			for(int i=0;i<kvol;i++){
				//SU2plaq looks up iu[mu+ndim*i] etc. internally, so we only pass the site index
				switch(mu){
					//Time component
					case(ndim-1):	hgt -= SU2plaq(u11t,u12t,iu,i,mu,nu);
										break;
					//Space component
					default:			hgs -= SU2plaq(u11t,u12t,iu,i,mu,nu);
										break;
				}
			}
#endif
#if(nproc>1)
	Par_dsum(&hgs); Par_dsum(&hgt);
#endif
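	//In four dimensions there are three spatial planes (1,0),(2,0),(2,1) and three
	//temporal planes (3,0),(3,1),(3,2), hence the normalisation factor of 3.0 below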
	*avplaqs=-hgs/(3.0*gvol); *avplaqt=-hgt/(3.0*gvol);
	*hg=(hgs+hgt)*beta;
#ifdef _DEBUG
	if(!rank)
		printf("hgs=%e hgt=%e hg=%e\n", hgs, hgt, *hg);
#endif
	return 0;
}
#if (!defined __NVCC__ && !defined __HIPCC__)
#pragma omp declare simd
inline float SU2plaq(Complex_f *u11t, Complex_f *u12t, unsigned int *iu, int i, int mu, int nu){
	/*
	 * Calculates the plaquette at site i in the μ-ν direction
	 *
	 * Parameters:
	 * ==========
	 * u11t, u12t:	Trial fields
	 * int *iu:	Upper halo indices
	 * mu, nu:	Plaquette direction. Note that mu and nu can be negative
	 * 			to facilitate calculating plaquettes for Clover terms. No
	 * 			sanity checks are conducted on them in this routine.
	 *
	 * Return:
	 * =======
	 * float corresponding to the plaquette value
	 *
	 */
	const char *funcname = "SU2plaq";
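	/*	The SU(2) matrices are stored as two complex numbers, with the full matrix
	 *	given by
	 *		U = (  u11   u12  )
	 *		    ( -u12*  u11* )
	 *	so any product of such matrices is fixed by its first row, which is all the
	 *	Sigma and a variables below need to track
	 */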
	int uidm = iu[mu+ndim*i];

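	//Sigma = U_mu(i)*U_nu(i+mu)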
	Complex_f Sigma11=u11t[i*ndim+mu]*u11t[uidm*ndim+nu]-u12t[i*ndim+mu]*conj(u12t[uidm*ndim+nu]);
	Complex_f Sigma12=u11t[i*ndim+mu]*u12t[uidm*ndim+nu]+u12t[i*ndim+mu]*conj(u11t[uidm*ndim+nu]);

	int uidn = iu[nu+ndim*i];
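	//a = Sigma*U_mu^dagger(i+nu)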
	Complex_f a11=Sigma11*conj(u11t[uidn*ndim+mu])+Sigma12*conj(u12t[uidn*ndim+mu]);
	Complex_f a12=-Sigma11*u12t[uidn*ndim+mu]+Sigma12*u11t[uidn*ndim+mu];

	Sigma11=a11*conj(u11t[i*ndim+nu])+a12*conj(u12t[i*ndim+nu]);
	//	Sigma12=-a11*u12t[i*ndim+nu]+a12*u11t[i*ndim+nu];
	//	Not needed in the final result as it traces out
	return creal(Sigma11);
}
#endif
double Polyakov(Complex_f *u11t, Complex_f *u12t){
	/*
	 * Calculate the Polyakov loop (no prizes for guessing that one...)
	 *
	 * Parameters:
	 * =========
	 * u11t, u12t	The gauge fields
	 *
	 * Calls:
	 * ======
	 * Par_tmul, Par_dsum
	 *
	 * Return:
	 * ======
	 * Double corresponding to the Polyakov loop
	 */
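	/*	In standard notation the quantity evaluated below is
	 *		P = (1/gvol3)*sum_x 0.5*Tr[prod_{t=0}^{T-1} U_3(x,t)]
	 *	i.e. the traced ordered product of the time-links at each spatial site,
	 *	averaged over spatial sites (the trace of an SU(2) product is real)
	 */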
	const char *funcname = "Polyakov";
	double poly = 0;
#ifdef __NVCC__
	int device=-1;
	cudaGetDevice(&device);
	Complex_f *Sigma11,*Sigma12;
	cudaMallocManaged((void **)&Sigma11,kvol3*sizeof(Complex_f),cudaMemAttachGlobal);
#ifdef _DEBUG
	cudaMallocManaged((void **)&Sigma12,kvol3*sizeof(Complex_f),cudaMemAttachGlobal);
#else
	cudaMallocAsync((void **)&Sigma12,kvol3*sizeof(Complex_f),streams[0]);
#endif
#else
	Complex_f *Sigma11 = aligned_alloc(AVX,kvol3*sizeof(Complex_f));
	Complex_f *Sigma12 = aligned_alloc(AVX,kvol3*sizeof(Complex_f));
#endif

	//Extract the time component of each site on the first (t=0) timeslice and save it in the corresponding Sigma
#ifdef __NVCC__
	cublasCcopy(cublas_handle,kvol3, (cuComplex *)(u11t+3), ndim, (cuComplex *)Sigma11, 1);
	cublasCcopy(cublas_handle,kvol3, (cuComplex *)(u12t+3), ndim, (cuComplex *)Sigma12, 1);
#elif defined USE_BLAS
	cblas_ccopy(kvol3, u11t+3, ndim, Sigma11, 1);
	cblas_ccopy(kvol3, u12t+3, ndim, Sigma12, 1);
#else
	for(int i=0; i<kvol3; i++){
		Sigma11[i]=u11t[i*ndim+3];
		Sigma12[i]=u12t[i*ndim+3];
	}
#endif
	/*	Some Fortran commentary
		Changed this routine.
		u11t and u12t now defined as normal ie (kvol+halo,4).
		Copy of Sigma11 and Sigma12 is changed so that it copies
		in blocks of ksizet.
		Variable indexu also used to select correct element of u11t and u12t
		in loop 10 below.

		Change the order of multiplication so that it can
		be done in parallel. Start at t=1 and go up to t=T:
		previously started at t+T and looped back to 1, 2, ... T-1
		Buffers
		There is a dependency. Can only parallelise the inner loop
	 */
#ifdef __NVCC__
	cudaDeviceSynchronise();
	cuPolyakov(Sigma11,Sigma12,u11t,u12t,dimGrid,dimBlock);
	cudaMemPrefetchAsync(Sigma11,kvol3*sizeof(Complex_f),cudaCpuDeviceId,NULL);
#else
#pragma unroll
	for(int it=1;it<ksizet;it++)
#pragma omp parallel for simd aligned(u11t,u12t,Sigma11,Sigma12:AVX)
		for(int i=0;i<kvol3;i++){
			//indexu selects the time-link at spatial site i on timeslice it
			int indexu=it*kvol3+i;
			Complex_f a11=Sigma11[i]*u11t[indexu*ndim+3]-Sigma12[i]*conj(u12t[indexu*ndim+3]);
			//Instead of having to store a second buffer just assign it directly
			Sigma12[i]=Sigma11[i]*u12t[indexu*ndim+3]+Sigma12[i]*conj(u11t[indexu*ndim+3]);
			Sigma11[i]=a11;
		}
	//Multiply this partial loop with the contributions of the other cores in the
	//time-like dimension
#endif
	//End of CUDA-CPU pre-processor for evaluating Polyakov
	//
	//Par_tmul does nothing if there is only a single processor in the time direction, so we only compile
	//its call if it is required
#if (npt>1)
#ifdef __NVCC__
#error Par_tmul is not yet implemented in CUDA as Sigma12 is device-only memory
#endif
#ifdef _DEBUG
	printf("Multiplying with MPI\n");
#endif
	Par_tmul(Sigma11, Sigma12);
	//end of #if(npt>1)
#endif
	/*	Now all cores have the value for the complete Polyakov line at all spatial sites
		We need to globally sum over spatial processors but not across time as these
		are duplicates. So we zero the value for all but t=0
		This is (according to the FORTRAN code) a bit of a hack
		I will expand on this hack and completely avoid any work
		for this case rather than calculating everything just to set it to zero
	 */
	if(!pcoord[3+rank*ndim]){
#ifdef __NVCC__
		cudaDeviceSynchronise();
#pragma omp parallel for simd reduction(+:poly)
#else
#pragma omp parallel for simd reduction(+:poly) aligned(Sigma11:AVX)
#endif
		for(int i=0;i<kvol3;i++)
			poly+=creal(Sigma11[i]);
	}
#ifdef __NVCC__
	cudaFree(Sigma11);
#ifdef _DEBUG
	cudaFree(Sigma12);
#else
	cudaFreeAsync(Sigma12,streams[0]);
#endif
#else
	free(Sigma11); free(Sigma12);
#endif

#if(nproc>1)
	Par_dsum(&poly);
#endif
	poly/=gvol3;
	return poly;
}
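As a usage illustration (not part of bosonic.c): a minimal calling sketch, assuming the trial fields u11t and u12t, the index table iu and the coupling beta have already been set up elsewhere in the program.

	double hg, avplaqs, avplaqt;
	Average_Plaquette(&hg, &avplaqs, &avplaqt, u11t, u12t, iu, beta);
	double poly = Polyakov(u11t, u12t);
	if(!rank)
		printf("avplaqs=%e avplaqt=%e poly=%e\n", avplaqs, avplaqt, poly);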
int Average_Plaquette(double *hg, double *avplaqs, double *avplaqt, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, float beta)
Calculates the gauge action using the pre-computed index lookup table.
Definition bosonic.c:8
float SU2plaq(Complex_f *u11t, Complex_f *u12t, unsigned int *iu, int i, int mu, int nu)
Calculates the plaquette at site i in the μ-ν direction.
Definition bosonic.c:72
double Polyakov(Complex_f *u11t, Complex_f *u12t)
Calculate the Polyakov loop (no prizes for guessing that one...)
Definition bosonic.c:105
MPI headers.
int rank
The MPI rank.
Definition par_mpi.c:22
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
int * pcoord
The processor grid.
Definition par_mpi.c:19
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define gvol3
Lattice spatial volume.
Definition sizes.h:94
#define ksizet
Sublattice t extent.
Definition sizes.h:149
#define kvol
Sublattice volume.
Definition sizes.h:154
#define gvol
Lattice volume.
Definition sizes.h:92
#define kvol3
Sublattice spatial volume.
Definition sizes.h:156
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
Function declarations for most of the routines.