su2hmc
Loading...
Searching...
No Matches
fermionic.c
Go to the documentation of this file.
1
5#include <matrices.h>
6#include <random.h>
7#include <su2hmc.h>
8int Measure(double *pbp, double *endenf, double *denf, Complex *qq, Complex *qbqb, double res, int *itercg,\
9 Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id,\
10 Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p,\
11 float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, Complex *Phi, Complex *R1){
12 /*
13 * @brief Calculate fermion expectation values via a noisy estimator
14 *
15 * Matrix inversion via conjugate gradient algorithm
16 * Solves @f(Mx=x_1@f)
17 * (Numerical Recipes section 2.10 pp.70-73)
18 * uses NEW lookup tables **
19 * Implemented in Congradq
20 *
21 * @param pbp: @f(\langle\bar{\Psi}\Psi\rangle@f)
22 * @param endenf: Energy density
23 * @param denf: Number Density
24 * @param qq: Diquark condensate
25 * @param qbqb: Antidiquark condensate
26 * @param res: Conjugate Gradient Residue
27 * @param itercg: Iterations of Conjugate Gradient
28 * @param u11t,u12t Double precisiongauge field
29 * @param u11t_f,u12t_f: Single precision gauge fields
30 * @param iu,id Lattice indices
31 * @param gamval_f: Gamma matrices
32 * @param gamin: Indices for Dirac terms
33 * @param dk4m_f: $exp(-\mu)$ float
34 * @param dk4p_f: $exp(\mu)$ float
35 * @param jqq: Diquark source
36 * @param akappa: Hopping parameter
37 * @param Phi: Pseudofermion field
38 * @param R1: A useful array for holding things that was already assigned in main.
39 * In particular, we'll be using it to catch the output of
40 * @f$ M^\dagger\Xi@f$ before the inversion, then used to store the
41 * output of the inversion
42 *
43 * @return Zero on success, integer error code otherwise
44 */
45 const char *funcname = "Measure";
46 //This x is just a storage container
47
48#ifdef __NVCC__
49 int device=-1;
50 cudaGetDevice(&device);
51 Complex *x, *xi; Complex_f *xi_f, *R1_f;
52 #ifdef _DEBUG
53 cudaMallocManaged((void **)&R1_f,kfermHalo*sizeof(Complex_f), cudaMemAttachGlobal);
54 #else
55 cudaMallocAsync((void **)&R1_f,kfermHalo*sizeof(Complex_f),streams[1]);
56 #endif
57 cudaMallocManaged((void **)&x,kfermHalo*sizeof(Complex), cudaMemAttachGlobal);
58 cudaMallocManaged((void **)&xi,kferm*sizeof(Complex), cudaMemAttachGlobal);
59 cudaMallocManaged((void **)&xi_f,kfermHalo*sizeof(Complex_f), cudaMemAttachGlobal);
60#else
61 Complex *x =(Complex *)aligned_alloc(AVX,kfermHalo*sizeof(Complex));
62 Complex *xi =(Complex *)aligned_alloc(AVX,kferm*sizeof(Complex));
63 Complex_f *xi_f =(Complex_f *)aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
64 Complex_f *R1_f = (Complex_f *)aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
65#endif
66 //Setting up noise.
67#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
68 Gauss_c(xi_f, kferm, 0, (float)(1/sqrt(2)));
69#else
70 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, xi_f, 0, 1/sqrt(2));
71#endif
72#ifdef __NVCC__
73 cudaMemPrefetchAsync(xi_f,kferm*sizeof(Complex_f),device,streams[0]);
74 cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
75 //Transpose needed here for Dslashd
76 Transpose_c(xi_f,ngorkov*nc,kvol,dimGrid,dimBlock);
77 //Flip all the gauge fields around so memory is coalesced
78 Transpose_c(u11t_f,ndim,kvol,dimGrid,dimBlock);
79 Transpose_c(u12t_f,ndim,kvol,dimGrid,dimBlock);
80 cudaMemcpyAsync(x, xi, kferm*sizeof(Complex),cudaMemcpyDefault,0);
81#else
82#pragma omp parallel for simd aligned(xi,xi_f:AVX)
83 for(int i=0;i<kferm;i++)
84 xi[i]=(Complex)xi_f[i];
85 memcpy(x, xi, kferm*sizeof(Complex));
86#endif
87 //R_1= @f$M^\dagger\Xi@f$
88 //R1 is local in FORTRAN but since its going to be reset anyway I'm going to recycle the
89 //global
90 Dslashd_f(R1_f,xi_f,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa);
91#ifdef __NVCC__
92 cudaDeviceSynchronise();
93 cudaFree(xi_f);
94 Transpose_c(R1_f,kvol,ngorkov*nc,dimGrid,dimBlock);
95 Transpose_c(u11t_f,kvol,ndim,dimGrid,dimBlock);
96 Transpose_c(u12t_f,kvol,ndim,dimGrid,dimBlock);
97 cuComplex_convert(R1_f,R1,kferm,false,dimBlock,dimGrid);
98 cudaMemcpy(Phi, R1, kferm*sizeof(Complex),cudaMemcpyDefault);
99#else
100#pragma omp parallel for simd aligned(R1,R1_f:AVX)
101 for(int i=0;i<kferm;i++)
102 R1[i]=(Complex)R1_f[i];
103 //Copying R1 to the first (zeroth) flavour index of Phi
104 //This should be safe with memcpy since the pointer name
105 //references the first block of memory for that pointer
106 memcpy(Phi, R1, kferm*sizeof(Complex));
107#endif
108 //Evaluate xi = (M^† M)^-1 R_1
109 // Congradp(0, res, R1_f, itercg);
110 //If the conjugate gradient fails to converge for some reason, restart it.
111 //That's causing issues with NaN's. Plan B is to not record the measurements.
112 if(Congradp(0, res, Phi, R1,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,itercg)==ITERLIM)
113 return ITERLIM;
114 //itercg=0;
115 //if(!rank) fprintf(stderr, "Restarting conjugate gradient from %s\n", funcname);
116 //Congradp(0, res, Phi, R1_f,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,itercg);
117 //itercg+=niterc;
118 /*
119#pragma omp parallel for simd aligned(R1,R1_f:AVX)
120for(int i=0;i<kferm;i++)
121xi[i]=(Complex)R1_f[i];
122*/
123#ifdef __NVCC__
124 cudaMemcpyAsync(xi,R1,kferm*sizeof(Complex),cudaMemcpyDefault,streams[0]);
125 #ifdef _DEBUG
126 cudaFree(R1_f);
127 #else
128 cudaFreeAsync(R1_f,streams[1]);
129 #endif
130#else
131 memcpy(xi,R1,kferm*sizeof(Complex));
132 free(xi_f); free(R1_f);
133#endif
134#ifdef USE_BLAS
135 Complex buff;
136#ifdef __NVCC__
137 cublasZdotc(cublas_handle,kferm,(cuDoubleComplex *)x,1,(cuDoubleComplex *)xi,1,(cuDoubleComplex *)&buff);
138 cudaDeviceSynchronise();
139#elif defined USE_BLAS
140 cblas_zdotc_sub(kferm, x, 1, xi, 1, &buff);
141#endif
142 *pbp=creal(buff);
143#else
144 *pbp = 0;
145#pragma unroll
146 for(int i=0;i<kferm;i++)
147 *pbp+=creal(conj(x[i])*xi[i]);
148#endif
149#if(nproc>1)
150 Par_dsum(pbp);
151#endif
152 *pbp/=4*gvol;
153
154 *qbqb=*qq=0;
155#if defined USE_BLAS
156 for(int idirac = 0; idirac<ndirac; idirac++){
157 int igork=idirac+4;
158 //Unrolling the colour indices, Then its just (γ_5*x)*Ξ or (γ_5*Ξ)*x
159#pragma unroll
160 for(int ic = 0; ic<nc; ic++){
161 Complex dot;
162 //Because we have kvol on the outer index and are summing over it, we set the
163 //step for BLAS to be ngorkov*nc=16.
164 //Does this make sense to do on the GPU?
165#ifdef __NVCC__
166 cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+idirac*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+igork*nc+ic), ngorkov*nc,(cuDoubleComplex *)&dot);
167#else
168 cblas_zdotc_sub(kvol, &x[idirac*nc+ic], ngorkov*nc, &xi[igork*nc+ic], ngorkov*nc, &dot);
169#endif
170 *qbqb+=gamval[4*ndirac+idirac]*dot;
171#ifdef __NVCC__
172 cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+igork*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+idirac*nc+ic), ngorkov*nc,(cuDoubleComplex *)&dot);
173#else
174 cblas_zdotc_sub(kvol, &x[igork*nc+ic], ngorkov*nc, &xi[idirac*nc+ic], ngorkov*nc, &dot);
175#endif
176 *qq-=gamval[4*ndirac+idirac]*dot;
177 }
178 }
179#else
180#pragma unroll(2)
181 for(int i=0; i<kvol; i++)
182 //What is the optimal order to evaluate these in?
183 for(int idirac = 0; idirac<ndirac; idirac++){
184 int igork=idirac+4;
185 *qbqb+=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+idirac)*nc])*xi[(i*ngorkov+igork)*nc];
186 *qq-=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+igork)*nc])*xi[(i*ngorkov+idirac)*nc];
187 *qbqb+=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+idirac)*nc+1])*xi[(i*ngorkov+igork)*nc+1];
188 *qq-=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+igork)*nc+1])*xi[(i*ngorkov+idirac)*nc+1];
189 }
190#endif
191 //In the FORTRAN Code dsum was used instead despite qq and qbqb being complex
192 //Since we only care about the real part this shouldn't cause (m)any serious issues
193#if(nproc>1)
194 Par_dsum((double *)qq); Par_dsum((double *)qbqb);
195#endif
196 *qq=(*qq+*qbqb)/(2*gvol);
197 Complex xu, xd, xuu, xdd;
198 xu=xd=xuu=xdd=0;
199
200 //Halos
201#if(npt>1)
202 ZHalo_swap_dir(x,16,3,DOWN); ZHalo_swap_dir(x,16,3,UP);
203#endif
204 //Pesky halo exchange indices again
205 //The halo exchange for the trial fields was done already at the end of the trajectory
206 //No point doing it again
207
208 //Instead of typing id[i*ndim+3] a lot, we'll just assign them to variables.
209 //Idea. One loop instead of two loops but for xuu and xdd just use ngorkov-(igorkov+1) instead
210 //Dirty CUDA work around since it won't convert thrust<complex> to double
211 //TODO: get a reduction routine ready for CUDA
212#ifndef __NVCC__
213#pragma omp parallel for reduction(+:xd,xu,xdd,xuu)
214#endif
215 for(int i = 0; i<kvol; i++){
216 int did=id[3+ndim*i];
217 int uid=iu[3+ndim*i];
218#ifndef __NVCC__
219#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xu)
220#endif
221 for(int igorkov=0; igorkov<4; igorkov++){
222 int igork1=gamin[3*ndirac+igorkov];
223 //For the C Version I'll try and factorise where possible
224 xu+=dk4p[did]*(conj(x[(did*ngorkov+igorkov)*nc])*(\
225 u11t[did*ndim+3]*(xi[(i*ngorkov+igork1)*nc]-xi[(i*ngorkov+igorkov)*nc])+\
226 u12t[did*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]-xi[(i*ngorkov+igorkov)*nc+1]) )+\
227 conj(x[(did*ngorkov+igorkov)*nc+1])*(\
228 conj(u11t[did*ndim+3])*(xi[(i*ngorkov+igork1)*nc+1]-xi[(i*ngorkov+igorkov)*nc+1])+\
229 conj(u12t[did*ndim+3])*(xi[(i*ngorkov+igorkov)*nc]-xi[(i*ngorkov+igork1)*nc])));
230 }
231#ifndef __NVCC__
232#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xd)
233#endif
234 for(int igorkov=0; igorkov<4; igorkov++){
235 int igork1=gamin[3*ndirac+igorkov];
236 xd+=dk4m[i]*(conj(x[(uid*ngorkov+igorkov)*nc])*(\
237 conj(u11t[i*ndim+3])*(xi[(i*ngorkov+igork1)*nc]+xi[(i*ngorkov+igorkov)*nc])-\
238 u12t[i*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]+xi[(i*ngorkov+igorkov)*nc+1]) )+\
239 conj(x[(uid*ngorkov+igorkov)*nc+1])*(\
240 u11t[i*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]+xi[(i*ngorkov+igorkov)*nc+1])+\
241 conj(u12t[i*ndim+3])*(xi[(i*ngorkov+igorkov)*nc]+xi[(i*ngorkov+igork1)*nc]) ) );
242 }
243#ifndef __NVCC__
244#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xuu)
245#endif
246 for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
247 int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
248 xuu-=dk4m[did]*(conj(x[(did*ngorkov+igorkovPP)*nc])*(\
249 u11t[did*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc]-xi[(i*ngorkov+igorkovPP)*nc])+\
250 u12t[did*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]-xi[(i*ngorkov+igorkovPP)*nc+1]) )+\
251 conj(x[(did*ngorkov+igorkovPP)*nc+1])*(\
252 conj(u11t[did*ndim+3])*(xi[(i*ngorkov+igork1PP)*nc+1]-xi[(i*ngorkov+igorkovPP)*nc+1])+\
253 conj(u12t[did*ndim+3])*(xi[(i*ngorkov+igorkovPP)*nc]-xi[(i*ngorkov+igork1PP)*nc]) ) );
254 }
255#ifndef __NVCC__
256#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xdd)
257#endif
258 for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
259 int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
260 xdd-=dk4p[i]*(conj(x[(uid*ngorkov+igorkovPP)*nc])*(\
261 conj(u11t[i*ndim+3])*(xi[(i*ngorkov+igork1PP)*nc]+xi[(i*ngorkov+igorkovPP)*nc])-\
262 u12t[i*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]+xi[(i*ngorkov+igorkovPP)*nc+1]) )+\
263 conj(x[(uid*ngorkov+igorkovPP)*nc+1])*(\
264 u11t[i*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]+xi[(i*ngorkov+igorkovPP)*nc+1])+\
265 conj(u12t[i*ndim+3])*(xi[(i*ngorkov+igorkovPP)*nc]+xi[(i*ngorkov+igork1PP)*nc]) ) );
266 }
267 }
268 *endenf=creal(xu-xd-xuu+xdd);
269 *denf=creal(xu+xd+xuu+xdd);
270
271#if(nproc>1)
272 Par_dsum(endenf); Par_dsum(denf);
273#endif
274 *endenf/=2*gvol; *denf/=2*gvol;
275 //Future task. Chiral susceptibility measurements
276#ifdef __NVCC__
277 cudaFree(x); cudaFree(xi);
278#else
279 free(x); free(xi);
280#endif
281 return 0;
282}
int Measure(double *pbp, double *endenf, double *denf, Complex *qq, Complex *qbqb, double res, int *itercg, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, Complex *Phi, Complex *R1)
Calculate fermion expectation values via a noisy estimator.
Definition fermionic.c:8
Matrix multiplication and related declarations.
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:584
#define UP
Flag for send up.
Definition par_mpi.h:37
#define DOWN
Flag for send down.
Definition par_mpi.h:35
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
Header for random number configuration.
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single precision complex numbers using the Box-Mull...
Definition random.c:260
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define ndirac
Dirac indices.
Definition sizes.h:177
#define gvol
Lattice volume.
Definition sizes.h:92
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225
Function declarations for most of the routines.
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves The matrix multipl...
Definition congrad.c:262