su2hmc
su2hmc.c
1
6#include <assert.h>
7#include <coord.h>
8#ifdef __NVCC__
9#include <cuda.h>
10#include <cuda_runtime.h>
11//Fix this later
12#endif
13#include <matrices.h>
14#include <par_mpi.h>
15#include <random.h>
16#include <string.h>
17#include <su2hmc.h>
18
19int Init(int istart, int ibound, int iread, float beta, float fmu, float akappa, Complex_f ajq,\
20 Complex *u[2], Complex *ut[2], Complex_f *ut_f[2], Complex *gamval, Complex_f *gamval_f,
21#ifdef __CLOVER__
22 Complex *sigval, Complex_f *sigval_f,
23#endif
24 int *gamin, double *dk[2], float *dk_f[2], unsigned int *iu, unsigned int *id){
25 /*
26 * Initialises the system
27 *
28 * Calls:
29 * ======
30 * Addrc, Check_addr, ran2, DHalo_swap_dir, Par_sread, Par_ranset, Reunitarise
31 *
32 * Globals:
33 * =======
34 * Complex gamval: Gamma Matrices
35 * Complex_f gamval_f: Float Gamma matrices
36 *
37 * Parameters:
38 * ==========
39 * int istart: Zero for a cold start, positive for a hot start, negative for no initialisation
40 * int ibound: Temporal boundary conditions (-1 for antiperiodic, otherwise periodic)
41 * int iread: Configuration number to read from file (zero to skip reading)
42 * float beta: Inverse gauge coupling
43 * float fmu: Chemical potential
44 * float akappa: Hopping parameter
45 * Complex_f ajq: Diquark source
46 * Complex *u[0]: First colour field
47 * Complex *u[1]: Second colour field
48 * Complex *ut[0]: First colour trial field
49 * Complex *ut[1]: Second colour trial field
50 * Complex_f *ut_f[0]: First float trial field
51 * Complex_f *ut_f[1]: Second float trial field
52 * double *dk[0]: $exp(-\mu)$
53 * double *dk[1]: $exp(\mu)$
54 * float *dk_f[0]: $exp(-\mu)$ float
55 * float *dk_f[1]: $exp(\mu)$ float
56 * unsigned int *iu: Up halo indices
57 * unsigned int *id: Down halo indices
58 *
59 * Returns:
60 * =======
61 * Zero on success, integer error code otherwise
62 */
63 const char *funcname = "Init";
64
65#ifdef _OPENMP
66 omp_set_num_threads(nthreads);
67#ifdef __INTEL_MKL__
68 mkl_set_num_threads(nthreads);
69#endif
70#endif
71 //First things first, calculate a few constants for coordinates
72 Addrc(iu, id);
73 //And confirm they're legit
74 Check_addr(iu, ksize, ksizet, 0, kvol+halo);
75 Check_addr(id, ksize, ksizet, 0, kvol+halo);
76#ifdef _DEBUG
77 printf("Checked addresses\n");
78#endif
79 double chem1=exp(fmu); double chem2 = 1/chem1;
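 //The chemical potential enters through these factors: the temporal hopping terms are weighted by
 //akappa*exp(+mu) and akappa*exp(-mu) respectively (the usual exponential prescription for mu).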
80 //CUDA this. Only limit will be the bus speed
81#pragma omp parallel for simd //aligned(dk[0],dk[1]:AVX)
82 for(int i = 0; i<kvol; i++){
83 dk[1][i]=akappa*chem1;
84 dk[0][i]=akappa*chem2;
85 }
86 //Anti periodic Boundary Conditions. Flip the terms at the edge of the time
87 //direction
88 if(ibound == -1 && pcoord[3+ndim*rank]==npt-1){
89#ifdef _DEBUG
90 printf("Implementing antiperiodic boundary conditions on rank %i\n", rank);
91#endif
92#pragma omp parallel for simd //aligned(dk[0],dk[1]:AVX)
93 for(int k= kvol-1; k>=kvol-kvol3; k--){
94 //int k = kvol - kvol3 + i;
95 dk[1][k]*=-1;
96 dk[0][k]*=-1;
97 }
98 }
99 //These are constant so swap the halos when initialising and be done with it
100 //May need to add a synchronisation statement here first
101#if(npt>1)
102 DHalo_swap_dir(dk[1], 1, 3, UP);
103 DHalo_swap_dir(dk[0], 1, 3, UP);
104#endif
105 //Float versions
106#ifdef __NVCC__
107 cuReal_convert(dk_f[1],dk[1],kvol+halo,true,dimBlock,dimGrid);
108 cuReal_convert(dk_f[0],dk[0],kvol+halo,true,dimBlock,dimGrid);
109#else
110#pragma omp parallel for simd //aligned(dk[0],dk[1],dk_f[0],dk_f[1]:AVX)
111 for(int i=0;i<kvol+halo;i++){
112 dk_f[1][i]=(float)dk[1][i];
113 dk_f[0][i]=(float)dk[0][i];
114 }
115#endif
116 //What row of each dirac/sigma matrix contains the entry acting on element i of the spinor
117 int __attribute__((aligned(AVX))) gamin_t[4][4] = {{3,2,1,0},{3,2,1,0},{2,3,0,1},{2,3,0,1}};
118 //Gamma Matrices in Chiral Representation
119 //See Appendix 8.1.2 of Montvay and Munster
120 //_t is for temp. We copy these into the real gamvals later
121#ifdef __NVCC__
122 cudaMemcpy(gamin,gamin_t,4*4*sizeof(int),cudaMemcpyDefault);
123#else
124 memcpy(gamin,gamin_t,4*4*sizeof(int));
125#endif
126 //Each row of the dirac matrix contains only one non-zero entry, so that's all we encode here
127 Complex __attribute__((aligned(AVX))) gamval_t[5][4] = {{-I,-I,I,I},{-1,1,1,-1},{-I,I,I,-I},{1,1,1,1},{1,1,-1,-1}};
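 //For example, in this representation row 0 of gamma_1 has its only non-zero entry, -i, in column 3,
 //encoded as gamin_t[0][0]=3 and gamval_t[0][0]=-I (before the akappa rescaling below).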
128 //Each gamma matrix is rescaled by akappa by flattening the gamval array
129#if defined USE_BLAS
130 //Don't cuBLAS this. It is small and won't saturate the GPU. Let the CPU handle
131 //it and just copy it later
132 cblas_zdscal(5*4, akappa, gamval_t, 1);
133#else
134#pragma omp parallel for simd collapse(2) aligned(gamval_t:AVX)
135 for(int i=0;i<5;i++)
136 for(int j=0;j<4;j++)
137 gamval_t[i][j]*=akappa;
138#endif
139
140
141#ifdef __NVCC__
142 cudaMemcpy(gamval,gamval_t,5*4*sizeof(Complex),cudaMemcpyDefault);
143 cuComplex_convert(gamval_f,gamval,20,true,dimBlockOne,dimGridOne);
144#else
145 memcpy(gamval,gamval_t,5*4*sizeof(Complex));
146 for(int i=0;i<5*4;i++)
147 gamval_f[i]=(Complex_f)gamval[i];
148#endif
149
150#ifdef __CLOVER__
151 int __attribute__((aligned(AVX))) sigin_t[7][4] = {{0,1,2,3},{0,1,2,3},{1,0,3,2},{1,0,3,2},{1,0,3,2},{1,0,3,2},{0,1,2,3}};
152 //The sigma matrices are the commutators of the gamma matrices. These are antisymmetric when you swap the indices
153 //Zero is the identical index case.
154 //1 is sigma_1,2
155 //2 is sigma_1,3
156 //3 is sigma_1,4
157 //4 is sigma_2,3
158 //5 is sigma_2,4
159 //6 is sigma_3,4
160 //TODO: Do we multiply by 1/2 and kappa here or not? I say yes, to be consistent with gamma. It means the factor of 2
161 //in the sigma matrices gets dropped here
162 Complex __attribute__((aligned(AVX))) sigval_t[7][4] = {{0,0,0,0},{-1,1,-1,1},{-I,I,-I,I},{1,1,-1,-1},{-1,-1,-1,-1},{-I,I,I,-I},{1,-1,-1,1}};
163#if defined USE_BLAS
164 cblas_zdscal(6*4, akappa, sigval_t[1], 1);
165#else
166#pragma omp parallel for simd collapse(2) aligned(sigval_t:AVX)
167 for(int i=1;i<7;i++)
168 for(int j=0;j<4;j++)
169 sigval_t[i][j]*=akappa;
170#endif
171
172#ifdef __NVCC__
173 cudaMemcpy(sigval,sigval_t,7*4*sizeof(Complex),cudaMemcpyDefault);
174 cuComplex_convert(sigval_f,sigval,28,true,dimBlockOne,dimGridOne);
175#else
176 memcpy(sigval,sigval_t,7*4*sizeof(Complex));
177 for(int i=0;i<7*4;i++)
178 sigval_f[i]=(Complex_f)sigval[i];
179#endif
180#endif
181
182 if(iread){
183 if(!rank) printf("Calling Par_sread() for configuration: %i\n", iread);
184 Par_sread(iread, beta, fmu, akappa, ajq,u[0],u[1],ut[0],ut[1]);
185 Par_ranset(&seed,iread);
186 }
187 else{
188 Par_ranset(&seed,iread);
189 if(istart==0){
190 //Initialise a cold start to the unit gauge field:
191 //u11t=1 and u12t=0 on every link
192#pragma omp parallel for simd //aligned(ut[0]:AVX)
193 //Leave it to the GPU?
194 for(int i=0; i<kvol*ndim;i++){
195 ut[0][i]=1; ut[1][i]=0;
196 }
197 }
198 else if(istart>0){
199 //Ideally, we can use gsl_ranlux as the PRNG
200#ifdef __RANLUX__
201 for(int i=0; i<kvol*ndim;i++){
202 ut[0][i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
203 ut[1][i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
204 }
205 //If not, the Intel Vectorise Mersenne Twister
206#elif (defined __INTEL_MKL__&&!defined USE_RAN2)
207 //Good news, casting works for using a double to create random complex numbers
208 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*ndim*kvol, ut[0], -1, 1);
209 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*ndim*kvol, ut[1], -1, 1);
210 //Last resort, Numerical Recipes' Ran2
211#else
212 for(int i=0; i<kvol*ndim;i++){
213 ut[0][i]=2*(ran2(&seed)-0.5+I*(ran2(&seed)-0.5));
214 ut[1][i]=2*(ran2(&seed)-0.5+I*(ran2(&seed)-0.5));
215 }
216#endif
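 //The real and imaginary components drawn above lie in [-1,1]; Reunitarise below
 //projects the trial links back onto SU(2).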
217 }
218 else
219 fprintf(stderr,"Warning %i in %s: Gauge fields are not initialised.\n", NOINIT, funcname);
220
221#ifdef __NVCC__
222 int device=-1;
223 cudaGetDevice(&device);
224 cudaMemPrefetchAsync(ut[0], ndim*kvol*sizeof(Complex),device,streams[0]);
225 cudaMemPrefetchAsync(ut[1], ndim*kvol*sizeof(Complex),device,streams[1]);
226#endif
227 //Send trials to accelerator for reunitarisation
228 Reunitarise(ut);
229 //Get trials back
230#ifdef __NVCC__
231 cudaMemcpyAsync(u[0],ut[0],ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[0]);
232 cudaMemPrefetchAsync(u[0], ndim*kvol*sizeof(Complex),device,streams[0]);
233 cudaMemcpyAsync(u[1],ut[1],ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[1]);
234 cudaMemPrefetchAsync(u[1], ndim*kvol*sizeof(Complex),device,streams[1]);
235#else
236 memcpy(u[0], ut[0], ndim*kvol*sizeof(Complex));
237 memcpy(u[1], ut[1], ndim*kvol*sizeof(Complex));
238#endif
239 }
240#ifdef _DEBUG
241 printf("Initialisation Complete\n");
242#endif
243 return 0;
244}
245int Hamilton(double *h,double *s,double res2,double *pp,Complex *X0,Complex *X1,Complex *Phi,
246 Complex_f *ut[2],unsigned int *iu,unsigned int *id, Complex_f *gamval_f,int *gamin,
247 float *dk[2],Complex_f jqq,float akappa,float beta,double *ancgh,int traj){
248 /*
249 * @brief Calculate the Hamiltonian
250 *
251 *
252 * @param h: Hamiltonian
253 * @param s: Action
254 * @param res2: Limit for conjugate gradient
255 * @param pp: Momentum fields
256 * @param X0: Up/down partitioned pseudofermion fields, one block per flavour
257 * @param X1: Working vector for the conjugate gradient solve
258 * @param Phi: Pseudofermion fields
259 * @param ut[0],ut[1]: Gauge fields
260 * @param iu,id: Lattice indices
261 * @param gamval_f: Gamma matrices
262 * @param gamin: Gamma indices
263 * @param dk[0]: $exp(-\mu)$ float
264 * @param dk[1]: $exp(\mu)$ float
265 * @param jqq: Diquark source
266 * @param akappa: Hopping parameter
267 * @param beta: Inverse gauge coupling
268 * @param ancgh: Conjugate gradient iterations counter
269 * @param traj: Trajectory number, used when reporting CG failures
270 * @return Zero on success. Integer Error code otherwise.
271 */
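 //The Hamiltonian assembled below is H = 0.5*sum(pp^2) + S_gauge + S_fermion, accumulated in
 //hp, hg and hf respectively, with S_fermion summed over flavours as Re(Phi^dagger (M^dagger M)^-1 Phi).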
272 const char *funcname = "Hamilton";
273 //Iterate over momentum terms.
274#ifdef __NVCC__
275 double hp;
276 int device=-1;
277 cudaGetDevice(&device);
278 cudaMemPrefetchAsync(pp,kmom*sizeof(double),device,NULL);
279 cublasDnrm2(cublas_handle, kmom, pp, 1,&hp);
280 hp*=hp;
281#elif defined USE_BLAS
282 double hp = cblas_dnrm2(kmom, pp, 1);
283 hp*=hp;
284#else
285 double hp=0;
286 for(int i = 0; i<kmom; i++)
287 hp+=pp[i]*pp[i];
288#endif
289 hp*=0.5;
290 double avplaqs, avplaqt;
291 double hg = 0;
292 //avplaq? isn't seen again here.
293 Average_Plaquette(&hg,&avplaqs,&avplaqt,ut,iu,beta);
294
295 double hf = 0; int itercg = 0;
296#ifdef __NVCC__
297 Complex *smallPhi;
298 cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),NULL);
299#else
300 Complex *smallPhi = aligned_alloc(AVX,kferm2*sizeof(Complex));
301#endif
302 //Iterating over flavours
303 for(int na=0;na<nf;na++){
304#ifdef __NVCC__
305#ifdef _DEBUG
306 cudaDeviceSynchronise();
307#endif
308 cudaMemcpyAsync(X1,X0+na*kferm2,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,streams[0]);
309#ifdef _DEBUG
310 cudaDeviceSynchronise();
311#endif
312#else
313 memcpy(X1,X0+na*kferm2,kferm2*sizeof(Complex));
314#endif
315 Fill_Small_Phi(na, smallPhi, Phi);
316 if(Congradq(na,res2,X1,smallPhi,ut,iu,id,gamval_f,gamin,dk,jqq,akappa,&itercg))
317 fprintf(stderr,"Trajectory %d\n", traj);
318
319 *ancgh+=itercg;
320#ifdef __NVCC__
321 cudaMemcpyAsync(X0+na*kferm2,X1,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,streams[0]);
322#else
323 memcpy(X0+na*kferm2,X1,kferm2*sizeof(Complex));
324#endif
325 Fill_Small_Phi(na, smallPhi,Phi);
326#ifdef __NVCC__
327 Complex dot;
328 cublasZdotc(cublas_handle,kferm2,(cuDoubleComplex *)smallPhi,1,(cuDoubleComplex *) X1,1,(cuDoubleComplex *) &dot);
329 hf+=creal(dot);
330#elif defined USE_BLAS
331 Complex dot;
332 cblas_zdotc_sub(kferm2, smallPhi, 1, X1, 1, &dot);
333 hf+=creal(dot);
334#else
335 //It is a dot product of the flattened arrays, could use
336 //a module to convert index to coordinate array...
337 for(int j=0;j<kferm2;j++)
338 hf+=creal(conj(smallPhi[j])*X1[j]);
339#endif
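 //X1 now holds the Congradq solution (M^dagger M)^-1 smallPhi, so the dot product above adds
 //this flavour's Phi^dagger (M^dagger M)^-1 Phi contribution to hf.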
340 }
341#ifdef __NVCC__
342 cudaFreeAsync(smallPhi,NULL);
343#else
344 free(smallPhi);
345#endif
346 //hg was summed over inside of Average_Plaquette.
347#if(nproc>1)
348 Par_dsum(&hp); Par_dsum(&hf);
349#endif
350 *s=hg+hf; *h=(*s)+hp;
351#ifdef _DEBUG
352 if(!rank)
353 printf("hg=%.5e; hf=%.5e; hp=%.5e; h=%.5e\n", hg, hf, hp, *h);
354#endif
355 return 0;
356}
357inline int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
358{
359 const char *funcname = "C_gather";
360 //FORTRAN had a second parameter m giving the size of y (kvol+halo) normally
361 //Pointers mean that's not an issue for us so I'm leaving it out
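 //For each site i this pulls the mu-direction link stored at i's neighbour in direction mu,
 //i.e. x[i] = y[table[i*ndim+mu]*ndim+mu].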
362#ifdef __NVCC__
363 cuC_gather(x,y,n,table,mu,dimBlock,dimGrid);
364#else
365#pragma omp parallel for simd aligned (x,y,table:AVX)
366 for(int i=0; i<n; i++)
367 x[i]=y[table[i*ndim+mu]*ndim+mu];
368#endif
369 return 0;
370}
371inline int Z_gather(Complex *x, Complex *y, int n, unsigned int *table, unsigned int mu)
372{
373 const char *funcname = "Z_gather";
374 //FORTRAN had a second parameter m giving the size of y (kvol+halo) normally
375 //Pointers mean that's not an issue for us so I'm leaving it out
376#ifdef __NVCC__
377 cuZ_gather(x,y,n,table,mu,dimBlock,dimGrid);
378#else
379#pragma omp parallel for simd aligned (x,y,table:AVX)
380 for(int i=0; i<n; i++)
381 x[i]=y[table[i*ndim+mu]*ndim+mu];
382#endif
383 return 0;
384}
385inline int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
386{
387 /*Copies necessary (2*4*kvol) elements of Phi into a vector variable
388 *
389 * Globals:
390 * =======
391 * Phi: The source array
392 *
393 * Parameters:
394 * ==========
395 * int na: flavour index
396 * Complex *smallPhi: The target array
397 *
398 * Returns:
399 * =======
400 * Zero on success, integer error code otherwise
401 */
402 const char *funcname = "Fill_Small_Phi";
403 //BIG and small phi index
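 //Copies the upper ndirac (particle) components of the ngorkov Gor'kov index, both colours,
 //for flavour na into the kferm2-long vector smallPhi.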
404#ifdef __NVCC__
405 cuFill_Small_Phi(na,smallPhi,Phi,dimBlock,dimGrid);
406#else
407#pragma omp parallel for simd aligned(smallPhi,Phi:AVX) collapse(3)
408 for(int i = 0; i<kvol;i++)
409 for(int idirac = 0; idirac<ndirac; idirac++)
410 for(int ic= 0; ic<nc; ic++)
411 // PHI_index=i*16+j*2+k;
412 smallPhi[(i*ndirac+idirac)*nc+ic]=Phi[((na*kvol+i)*ngorkov+idirac)*nc+ic];
413#endif
414 return 0;
415}
416inline int UpDownPart(const int na, Complex *X0, Complex *R1){
417#ifdef __NVCC__
418 cuUpDownPart(na,X0,R1,dimBlock,dimGrid);
419 cudaDeviceSynchronise();
420#else
421 //The only reason this was removed from the original function is for diagnostics
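 //Copies both colour components of the first ndirac (upper Gor'kov) Dirac components of R1
 //into the flavour-na block of X0.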
422#pragma omp parallel for simd collapse(2) aligned(X0,R1:AVX)
423 for(int i=0; i<kvol; i++)
424 for(int idirac = 0; idirac < ndirac; idirac++){
425 X0[((na*kvol+i)*ndirac+idirac)*nc]=R1[(i*ngorkov+idirac)*nc];
426 X0[((na*kvol+i)*ndirac+idirac)*nc+1]=R1[(i*ngorkov+idirac)*nc+1];
427 }
428#endif
429 return 0;
430}
Header for routines related to lattice sites.
int Check_addr(unsigned int *table, int lns, int lnt, int imin, int imax)
Definition coord.c:334
int Addrc(unsigned int *iu, unsigned int *id)
Loads the addresses required during the update.
Definition coord.c:16
Matrix multiplication and related declarations.
MPI headers.
#define UP
Flag for send up.
Definition par_mpi.h:37
int Par_sread(const int iread, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t)
Reads and assigns the gauges from file.
Definition par_mpi.c:127
int rank
The MPI rank.
Definition par_mpi.c:22
int DHalo_swap_dir(double *d, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
int * pcoord
The processor grid.
Definition par_mpi.c:19
Header for random number configuration.
int Par_ranset(long *seed, int iread)
Uses the rank to get a new seed. Copying from the FORTRAN description here c create new seeds in rang...
Definition random.c:135
double ran2(long *idum)
Generates uniformly distributed random double between zero and one as described in numerical recipes....
Definition random.c:440
long seed
RAN2 seed.
Definition random.c:27
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define kmom
sublattice momentum sites
Definition sizes.h:184
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define ksizet
Sublattice t extent.
Definition sizes.h:149
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define nthreads
Number of threads for OpenMP, which can be overwritten at runtime.
Definition sizes.h:135
#define ksize
Sublattice spatial extent for a cubic lattice.
Definition sizes.h:146
#define nf
Fermion flavours (double it)
Definition sizes.h:151
#define ndirac
Dirac indices.
Definition sizes.h:177
#define kvol3
Sublattice spatial volume.
Definition sizes.h:156
#define halo
Total Halo size.
Definition sizes.h:222
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188
#define npt
Processor grid t extent.
Definition sizes.h:124
int Z_gather(Complex *x, Complex *y, int n, unsigned int *table, unsigned int mu)
Extracts all the double precision gauge links in the μ direction only.
Definition su2hmc.c:371
int Init(int istart, int ibound, int iread, float beta, float fmu, float akappa, Complex_f ajq, Complex *u[2], Complex *ut[2], Complex_f *ut_f[2], Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2], float *dk_f[2], unsigned int *iu, unsigned int *id)
Initialises the system.
Definition su2hmc.c:19
int Hamilton(double *h, double *s, double res2, double *pp, Complex *X0, Complex *X1, Complex *Phi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, float beta, double *ancgh, int traj)
Calculate the Hamiltonian.
Definition su2hmc.c:245
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
Definition su2hmc.c:385
int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
Extracts all the single precision gauge links in the μ direction only.
Definition su2hmc.c:357
Function declarations for most of the routines.
int Average_Plaquette(double *hg, double *avplaqs, double *avplaqt, Complex_f *ut[2], unsigned int *iu, float beta)
Calculates the gauge action using new (how new?) lookup table.
Definition bosonic.c:20
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves (M†M)x=Φ. Implements up/down pa...
Definition congrad.c:7
int Reunitarise(Complex *ut[2])
Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
Definition matrices.c:1012