su2hmc
Loading...
Searching...
No Matches
su2hmc.c
Go to the documentation of this file.
1
6#include <assert.h>
7#include <coord.h>
8#ifdef __NVCC__
9#include <cuda.h>
10#include <cuda_runtime.h>
11//Fix this later
12#endif
13#include <matrices.h>
14#include <par_mpi.h>
15#include <random.h>
16#include <string.h>
17#include <su2hmc.h>
18
19int Init(int istart, int ibound, int iread, float beta, float fmu, float akappa, Complex_f ajq,\
20 Complex *u11, Complex *u12, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f,\
21 Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f,\
22 unsigned int *iu, unsigned int *id){
23 /*
24 * Initialises the system
25 *
26 * Calls:
27 * ======
28 * Addrc, Check_addr, ran2, DHalo_swap_dir, Par_sread, Par_ranset, Reunitarise
29 *
30 * Globals:
31 * =======
32 * Complex gamval: Gamma Matrices
33 * Complex_f gamval_f: Float Gamma matrices:
34 *
35 * Parameters:
36 * ==========
37 * int istart: Zero for cold, >1 for hot, <1 for none
38 * int ibound: Periodic boundary conditions
39 * int iread: Read configuration from file
40 * float beta: beta
41 * float fmu: Chemical potential
42 * float akappa: Hopping parameter
43 * Complex_f ajq: Diquark source
44 * Complex *u11: First colour field
45 * Complex *u12: Second colour field
46 * Complex *u11t: First colour trial field
47 * Complex *u12t: Second colour trial field
48 * Complex_f *u11t_f: First float trial field
49 * Complex_f *u12t_f: Second float trial field
50 * double *dk4m $exp(-\mu)$
51 * double *dk4p: $exp(\mu)$
52 * float *dk4m_f: $exp(-\mu)$ float
53 * float *dk4p_f: $exp(\mu)$ float
54 * unsigned int *iu: Up halo indices
55 * unsigned int *id: Down halo indices
56 *
57 * Returns:
58 * =======
59 * Zero on success, integer error code otherwise
60 */
61 const char *funcname = "Init";
62
63#ifdef _OPENMP
64 omp_set_num_threads(nthreads);
65#ifdef __INTEL_MKL__
66 mkl_set_num_threads(nthreads);
67#endif
68#endif
69 //First things first, calculate a few constants for coordinates
70 Addrc(iu, id);
71 //And confirm they're legit
74#ifdef _DEBUG
75 printf("Checked addresses\n");
76#endif
77 double chem1=exp(fmu); double chem2 = 1/chem1;
78 //CUDA this. Only limit will be the bus speed
79#pragma omp parallel for simd aligned(dk4m,dk4p:AVX)
80 for(int i = 0; i<kvol; i++){
81 dk4p[i]=akappa*chem1;
82 dk4m[i]=akappa*chem2;
83 }
84 //Anti periodic Boundary Conditions. Flip the terms at the edge of the time
85 //direction
86 if(ibound == -1 && pcoord[3+ndim*rank]==npt-1){
87#ifdef _DEBUG
88 printf("Implementing antiperiodic boundary conditions on rank %i\n", rank);
89#endif
90#pragma omp parallel for simd aligned(dk4m,dk4p:AVX)
91 for(int k= kvol-1; k>=kvol-kvol3; k--){
92 //int k = kvol - kvol3 + i;
93 dk4p[k]*=-1;
94 dk4m[k]*=-1;
95 }
96 }
97 //These are constant so swap the halos when initialising and be done with it
98 //May need to add a synchronisation statement here first
99#if(npt>1)
100 DHalo_swap_dir(dk4p, 1, 3, UP);
101 DHalo_swap_dir(dk4m, 1, 3, UP);
102#endif
103 //Float versions
104#ifdef __NVCC__
105 cuReal_convert(dk4p_f,dk4p,kvol+halo,true,dimBlock,dimGrid);
106 cuReal_convert(dk4m_f,dk4m,kvol+halo,true,dimBlock,dimGrid);
107#else
108#pragma omp parallel for simd aligned(dk4m,dk4p,dk4m_f,dk4p_f:AVX)
109 for(int i=0;i<kvol+halo;i++){
110 dk4p_f[i]=(float)dk4p[i];
111 dk4m_f[i]=(float)dk4m[i];
112 }
113#endif
114 int __attribute__((aligned(AVX))) gamin_t[4][4] = {{3,2,1,0},{3,2,1,0},{2,3,0,1},{2,3,0,1}};
115 //Gamma Matrices in Chiral Representation
116 //Gattringer and Lang have a nice crash course in appendix A.2 of
117 //Quantum Chromodynamics on the Lattice (530.14 GAT)
118 //_t is for temp. We copy these into the real gamvals later
119#ifdef __NVCC__
120 cudaMemcpy(gamin,gamin_t,4*4*sizeof(int),cudaMemcpyDefault);
121#else
122 memcpy(gamin,gamin_t,4*4*sizeof(int));
123#endif
124 Complex __attribute__((aligned(AVX))) gamval_t[5][4] = {{-I,-I,I,I},{-1,1,1,-1},{-I,I,I,-I},{1,1,1,1},{1,1,-1,-1}};
125 //Each gamma matrix is rescaled by akappa by flattening the gamval array
126#if defined USE_BLAS
127 //Don't cuBLAS this. It is small and won't saturate the GPU. Let the CPU handle
128 //it and just copy it later
129 cblas_zdscal(5*4, akappa, gamval_t, 1);
130#else
131#pragma omp parallel for simd collapse(2) aligned(gamval,gamval_f:AVX)
132 for(int i=0;i<5;i++)
133 for(int j=0;j<4;j++)
134 gamval_t[i][j]*=akappa;
135#endif
136
137#ifdef __NVCC__
138 cudaMemcpy(gamval,gamval_t,5*4*sizeof(Complex),cudaMemcpyDefault);
139 cuComplex_convert(gamval_f,gamval,20,true,dimBlockOne,dimGridOne);
140#else
141 memcpy(gamval,gamval_t,5*4*sizeof(Complex));
142 for(int i=0;i<5*4;i++)
143 gamval_f[i]=(Complex_f)gamval[i];
144#endif
145 if(iread){
146 if(!rank) printf("Calling Par_sread() for configuration: %i\n", iread);
147 Par_sread(iread, beta, fmu, akappa, ajq,u11,u12,u11t,u12t);
148 Par_ranset(&seed,iread);
149 }
150 else{
151 Par_ranset(&seed,iread);
152 if(istart==0){
153 //Initialise a cold start to zero
154 //memset is safe to use here because zero is zero
155#pragma omp parallel for simd aligned(u11t:AVX)
156 //Leave it to the GPU?
157 for(int i=0; i<kvol*ndim;i++){
158 u11t[i]=1; u12t[i]=0;
159 }
160 }
161 else if(istart>0){
162 //Ideally, we can use gsl_ranlux as the PRNG
163#ifdef __RANLUX__
164 for(int i=0; i<kvol*ndim;i++){
165 u11t[i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
166 u12t[i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
167 }
168 //If not, the Intel Vectorise Mersenne Twister
169#elif (defined __INTEL_MKL__&&!defined USE_RAN2)
170 //Good news, casting works for using a double to create random complex numbers
171 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*ndim*kvol, u11t, -1, 1);
172 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*ndim*kvol, u12t, -1, 1);
173 //Last resort, Numerical Recipes' Ran2
174#else
175 for(int i=0; i<kvol*ndim;i++){
176 u11t[i]=2*(ran2(&seed)-0.5+I*(ran2(&seed)-0.5));
177 u12t[i]=2*(ran2(&seed)-0.5+I*(ran2(&seed)-0.5));
178 }
179#endif
180 }
181 else
182 fprintf(stderr,"Warning %i in %s: Gauge fields are not initialised.\n", NOINIT, funcname);
183
184#ifdef __NVCC__
185 int device=-1;
186 cudaGetDevice(&device);
187 cudaMemPrefetchAsync(u11t, ndim*kvol*sizeof(Complex),device,streams[0]);
188 cudaMemPrefetchAsync(u12t, ndim*kvol*sizeof(Complex),device,streams[1]);
189#endif
190 //Send trials to accelerator for reunitarisation
191 Reunitarise(u11t,u12t);
192 //Get trials back
193#ifdef __NVCC__
194 cudaMemcpyAsync(u11,u11t,ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[0]);
195 cudaMemPrefetchAsync(u11, ndim*kvol*sizeof(Complex),device,streams[0]);
196 cudaMemcpyAsync(u12,u12t,ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[1]);
197 cudaMemPrefetchAsync(u12, ndim*kvol*sizeof(Complex),device,streams[1]);
198#else
199 memcpy(u11, u11t, ndim*kvol*sizeof(Complex));
200 memcpy(u12, u12t, ndim*kvol*sizeof(Complex));
201#endif
202 }
203#ifdef _DEBUG
204 printf("Initialisation Complete\n");
205#endif
206 return 0;
207}
208int Hamilton(double *h, double *s, double res2, double *pp, Complex *X0, Complex *X1, Complex *Phi,\
209 Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int * iu, unsigned int *id,\
210 Complex_f *gamval_f, int *gamin, float *dk4m_f, float * dk4p_f, Complex_f jqq,\
211 float akappa, float beta, double *ancgh,int traj){
212 /*
213 * @brief Calculate the Hamiltonian
214 *
215 *
216 * @param h: Hamiltonian
217 * @param s: Action
218 * @param res2: Limit for conjugate gradient
219 * @param X0:
220 * @param X1:
221 * @param Phi:
222 * @param u11t,u12t: Gauge fields
223 * @param u11t_f,u12t_f: Gauge fields
224 * @param iu,id: Lattice indices
225 * @param gamval_f: Gamma matrices
226 * @param gamin: Gamma indices
227 * @param dk4m_f: $exp(-\mu)$ float
228 * @param dk4p_f: $exp(\mu)$ float
229 * @param jqq: Diquark source
230 * @param akappa: Hopping parameter
231 * @param beta: Inverse gauge coupling
232 * @param ancgh: Conjugate gradient iterations counter
233 *
234 * @return Zero on success. Integer Error code otherwise.
235 */
236 const char *funcname = "Hamilton";
237 //Iterate over momentum terms.
238#ifdef __NVCC__
239 double hp;
240 int device=-1;
241 cudaGetDevice(&device);
242 cudaMemPrefetchAsync(pp,kmom*sizeof(double),device,NULL);
243 cublasDnrm2(cublas_handle, kmom, pp, 1,&hp);
244 hp*=hp;
245#elif defined USE_BLAS
246 double hp = cblas_dnrm2(kmom, pp, 1);
247 hp*=hp;
248#else
249 double hp=0;
250 for(int i = 0; i<kmom; i++)
251 hp+=pp[i]*pp[i];
252#endif
253 hp*=0.5;
254 double avplaqs, avplaqt;
255 double hg = 0;
256 //avplaq? isn't seen again here.
257 Average_Plaquette(&hg,&avplaqs,&avplaqt,u11t_f,u12t_f,iu,beta);
258
259 double hf = 0; int itercg = 0;
260#ifdef __NVCC__
261 Complex *smallPhi;
262 cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),NULL);
263#else
264 Complex *smallPhi = aligned_alloc(AVX,kferm2*sizeof(Complex));
265#endif
266 //Iterating over flavours
267 for(int na=0;na<nf;na++){
268#ifdef __NVCC__
269#ifdef _DEBUG
270 cudaDeviceSynchronise();
271#endif
272 cudaMemcpyAsync(X1,X0+na*kferm2,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,streams[0]);
273#ifdef _DEBUG
274 cudaDeviceSynchronise();
275#endif
276#else
277 memcpy(X1,X0+na*kferm2,kferm2*sizeof(Complex));
278#endif
279 Fill_Small_Phi(na, smallPhi, Phi);
280 if(Congradq(na,res2,X1,smallPhi,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,&itercg))
281 fprintf(stderr,"Trajectory %d\n", traj);
282
283 *ancgh+=itercg;
284#ifdef __NVCC__
285 cudaMemcpyAsync(X0+na*kferm2,X1,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,streams[0]);
286#else
287 memcpy(X0+na*kferm2,X1,kferm2*sizeof(Complex));
288#endif
289 Fill_Small_Phi(na, smallPhi,Phi);
290#ifdef __NVCC__
291 Complex dot;
292 cublasZdotc(cublas_handle,kferm2,(cuDoubleComplex *)smallPhi,1,(cuDoubleComplex *) X1,1,(cuDoubleComplex *) &dot);
293 hf+=creal(dot);
294#elif defined USE_BLAS
295 Complex dot;
296 cblas_zdotc_sub(kferm2, smallPhi, 1, X1, 1, &dot);
297 hf+=creal(dot);
298#else
299 //It is a dot product of the flattened arrays, could use
300 //a module to convert index to coordinate array...
301 for(int j=0;j<kferm2;j++)
302 hf+=creal(conj(smallPhi[j])*X1[j]);
303#endif
304 }
305#ifdef __NVCC__
306 cudaFreeAsync(smallPhi,NULL);
307#else
308 free(smallPhi);
309#endif
310 //hg was summed over inside of Average_Plaquette.
311#if(nproc>1)
312 Par_dsum(&hp); Par_dsum(&hf);
313#endif
314 *s=hg+hf; *h=(*s)+hp;
315#ifdef _DEBUG
316 if(!rank)
317 printf("hg=%.5e; hf=%.5e; hp=%.5e; h=%.5e\n", hg, hf, hp, *h);
318#endif
319 return 0;
320}
321inline int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
322{
323 const char *funcname = "C_gather";
324 //FORTRAN had a second parameter m giving the size of y (kvol+halo) normally
325 //Pointers mean that's not an issue for us so I'm leaving it out
326#ifdef __NVCC__
327 cuC_gather(x,y,n,table,mu,dimBlock,dimGrid);
328#else
329#pragma omp parallel for simd aligned (x,y,table:AVX)
330 for(int i=0; i<n; i++)
331 x[i]=y[table[i*ndim+mu]*ndim+mu];
332#endif
333 return 0;
334}
335inline int Z_gather(Complex *x, Complex *y, int n, unsigned int *table, unsigned int mu)
336{
337 const char *funcname = "Z_gather";
338 //FORTRAN had a second parameter m giving the size of y (kvol+halo) normally
339 //Pointers mean that's not an issue for us so I'm leaving it out
340#ifdef __NVCC__
341 cuZ_gather(x,y,n,table,mu,dimBlock,dimGrid);
342#else
343#pragma omp parallel for simd aligned (x,y,table:AVX)
344 for(int i=0; i<n; i++)
345 x[i]=y[table[i*ndim+mu]*ndim+mu];
346#endif
347 return 0;
348}
349inline int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
350{
351 /*Copies necessary (2*4*kvol) elements of Phi into a vector variable
352 *
353 * Globals:
354 * =======
355 * Phi: The source array
356 *
357 * Parameters:
358 * ==========
359 * int na: flavour index
360 * Complex *smallPhi: The target array
361 *
362 * Returns:
363 * =======
364 * Zero on success, integer error code otherwise
365 */
366 const char *funcname = "Fill_Small_Phi";
367 //BIG and small phi index
368#ifdef __NVCC__
369 cuFill_Small_Phi(na,smallPhi,Phi,dimBlock,dimGrid);
370#else
371#pragma omp parallel for simd aligned(smallPhi,Phi:AVX) collapse(3)
372 for(int i = 0; i<kvol;i++)
373 for(int idirac = 0; idirac<ndirac; idirac++)
374 for(int ic= 0; ic<nc; ic++)
375 // PHI_index=i*16+j*2+k;
376 smallPhi[(i*ndirac+idirac)*nc+ic]=Phi[((na*kvol+i)*ngorkov+idirac)*nc+ic];
377#endif
378 return 0;
379}
380inline int UpDownPart(const int na, Complex *X0, Complex *R1){
381#ifdef __NVCC__
382 cuUpDownPart(na,X0,R1,dimBlock,dimGrid);
383 cudaDeviceSynchronise();
384#else
385 //The only reason this was removed from the original function is for diagnostics
386#pragma omp parallel for simd collapse(2) aligned(X0,R1:AVX)
387 for(int i=0; i<kvol; i++)
388 for(int idirac = 0; idirac < ndirac; idirac++){
389 X0[((na*kvol+i)*ndirac+idirac)*nc]=R1[(i*ngorkov+idirac)*nc];
390 X0[((na*kvol+i)*ndirac+idirac)*nc+1]=R1[(i*ngorkov+idirac)*nc+1];
391 }
392#endif
393 return 0;
394}
Header for routines related to lattice sites.
int Check_addr(unsigned int *table, int lns, int lnt, int imin, int imax)
Definition coord.c:334
int Addrc(unsigned int *iu, unsigned int *id)
Loads the addresses required during the update.
Definition coord.c:16
Matrix multiplication and related declarations.
MPI headers.
#define UP
Flag for send up.
Definition par_mpi.h:37
int Par_sread(const int iread, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t)
Reads and assigns the gauges from file.
Definition par_mpi.c:127
int rank
The MPI rank.
Definition par_mpi.c:22
int DHalo_swap_dir(double *d, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
int * pcoord
The processor grid.
Definition par_mpi.c:19
Header for random number configuration.
int Par_ranset(long *seed, int iread)
Uses the rank to get a new seed. Copying from the FORTRAN description here c create new seeds in rang...
Definition random.c:135
double ran2(long *idum)
Generates uniformly distributed random double between zero and one as described in numerical recipes....
Definition random.c:440
long seed
RAN2 seed.
Definition random.c:27
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define kmom
sublattice momentum sites
Definition sizes.h:184
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define ksizet
Sublattice t extent.
Definition sizes.h:149
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define nthreads
Number of threads for OpenMP, which can be overwritten at runtime.
Definition sizes.h:135
#define ksize
Sublattice spatial extent for a cubic lattice.
Definition sizes.h:146
#define nf
Fermion flavours (double it)
Definition sizes.h:151
#define ndirac
Dirac indices.
Definition sizes.h:177
#define kvol3
Sublattice spatial volume.
Definition sizes.h:156
#define halo
Total Halo size.
Definition sizes.h:222
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188
#define npt
Processor grid t extent.
Definition sizes.h:124
int Init(int istart, int ibound, int iread, float beta, float fmu, float akappa, Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, unsigned int *iu, unsigned int *id)
Initialises the system.
Definition su2hmc.c:19
int Z_gather(Complex *x, Complex *y, int n, unsigned int *table, unsigned int mu)
Extracts all the double precision gauge links in the direction only.
Definition su2hmc.c:335
int Hamilton(double *h, double *s, double res2, double *pp, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancgh, int traj)
Calculate the Hamiltonian.
Definition su2hmc.c:208
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
Definition su2hmc.c:349
int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
Extracts all the single precision gauge links in the direction only.
Definition su2hmc.c:321
Function declarations for most of the routines.
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves Implements up/down pa...
Definition congrad.c:7
int Average_Plaquette(double *hg, double *avplaqs, double *avplaqt, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, float beta)
Calculates the gauge action using new (how new?) lookup table.
Definition bosonic.c:8
int Reunitarise(Complex *u11t, Complex *u12t)
Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
Definition matrices.c:904