su2hmc
fermionic.c
#include <matrices.h>
#include <random.h>
#include <su2hmc.h>
int Measure(double *pbp, double *endenf, double *denf, Complex *qq, Complex *qbqb, double res, int *itercg,\
		Complex *ut[2], Complex_f *ut_f[2], unsigned int *iu, unsigned int *id,\
		Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2],\
		float *dk_f[2], Complex_f jqq, float akappa, Complex *Phi, Complex *R1){
	/*
	 * @brief Calculate fermion expectation values via a noisy estimator
	 *
	 * Matrix inversion via the conjugate gradient algorithm
	 * Solves @f$Mx=x_1@f$
	 * (Numerical Recipes section 2.10 pp.70-73)
	 * Uses the new lookup tables
	 * Implemented in Congradp
	 *
	 * @param pbp: @f$\langle\bar{\Psi}\Psi\rangle@f$
	 * @param endenf: Energy density
	 * @param denf: Number density
	 * @param qq: Diquark condensate
	 * @param qbqb: Antidiquark condensate
	 * @param res: Conjugate gradient residue
	 * @param itercg: Iterations of conjugate gradient
	 * @param ut: Double precision gauge fields
	 * @param ut_f: Single precision gauge fields
	 * @param iu,id: Lattice indices
	 * @param gamval: Gamma matrices, double precision
	 * @param gamval_f: Gamma matrices, single precision
	 * @param gamin: Indices for Dirac terms
	 * @param dk: @f$e^{-\mu}@f$ and @f$e^{\mu}@f$, double precision
	 * @param dk_f: @f$e^{-\mu}@f$ and @f$e^{\mu}@f$, single precision
	 * @param jqq: Diquark source
	 * @param akappa: Hopping parameter
	 * @param Phi: Pseudofermion field
	 * @param R1: Storage array assigned in main. We use it to catch the output of
	 * @f$M^\dagger\Xi@f$ before the inversion, then to hold the output of the
	 * inversion itself
	 *
	 * @return Zero on success, integer error code otherwise
	 */
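	//Measurement strategy: draw Gaussian noise @f$\Xi@f$, form @f$R_1=M^\dagger\Xi@f$,
	//solve @f$(M^\dagger M)\xi=R_1@f$ so that @f$\xi=M^{-1}\Xi@f$, then contract
	//@f$\Xi@f$ with @f$\xi@f$ to estimate @f$\textrm{Tr}\,M^{-1}@f$ and related observables.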
	const char *funcname = "Measure";
	//This x is just a storage container

#ifdef __NVCC__
	int device=-1;
	cudaGetDevice(&device);
	Complex *x, *xi; Complex_f *xi_f, *R1_f;
#ifdef _DEBUG
	cudaMallocManaged((void **)&R1_f,kfermHalo*sizeof(Complex_f), cudaMemAttachGlobal);
#else
	cudaMallocAsync((void **)&R1_f,kfermHalo*sizeof(Complex_f),streams[1]);
#endif
	cudaMallocManaged((void **)&x,kfermHalo*sizeof(Complex), cudaMemAttachGlobal);
	cudaMallocManaged((void **)&xi,kferm*sizeof(Complex), cudaMemAttachGlobal);
	cudaMallocManaged((void **)&xi_f,kfermHalo*sizeof(Complex_f), cudaMemAttachGlobal);
#else
	Complex *x =(Complex *)aligned_alloc(AVX,kfermHalo*sizeof(Complex));
	Complex *xi =(Complex *)aligned_alloc(AVX,kferm*sizeof(Complex));
	Complex_f *xi_f =(Complex_f *)aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
	Complex_f *R1_f = (Complex_f *)aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
#endif
	//Setting up noise.
#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
	Gauss_c(xi_f, kferm, 0, (float)(1/sqrt(2)));
#else
	vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, xi_f, 0, 1/sqrt(2));
#endif
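	//With sigma=1/sqrt(2) the real and imaginary parts of each component have variance
	//1/2, so the complex noise satisfies <xi_i* xi_j>=delta_ij, which is exactly the
	//normalisation the noisy estimator below relies on.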
#ifdef __NVCC__
	cudaMemPrefetchAsync(xi_f,kferm*sizeof(Complex_f),device,streams[0]);
	cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
	//Transpose needed here for Dslashd
	Transpose_c(xi_f,ngorkov*nc,kvol);
	//Flip all the gauge fields around so memory is coalesced
	cudaMemcpyAsync(x, xi, kferm*sizeof(Complex),cudaMemcpyDefault,0);
#else
#pragma omp parallel for simd aligned(xi,xi_f:AVX)
	for(int i=0;i<kferm;i++)
		xi[i]=(Complex)xi_f[i];
	memcpy(x, xi, kferm*sizeof(Complex));
#endif
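	//x keeps an untouched copy of the noise: xi is about to be overwritten by the
	//inversion, but we still need the original @f$\Xi@f$ for the contractions further down.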
	//R_1 = @f$M^\dagger\Xi@f$
	//R1 is local in FORTRAN, but since it's going to be reset anyway we recycle the global
	Dslashd_f(R1_f,xi_f,ut_f[0],ut_f[1],iu,id,gamval_f,gamin,dk_f,jqq,akappa);
#ifdef __NVCC__
	cudaDeviceSynchronise();
	cudaFree(xi_f);
	Transpose_c(R1_f,kvol,ngorkov*nc);
	cuComplex_convert(R1_f,R1,kferm,false,dimBlock,dimGrid);
	cudaMemcpy(Phi, R1, kferm*sizeof(Complex),cudaMemcpyDefault);
#else
#pragma omp parallel for simd aligned(R1,R1_f:AVX)
	for(int i=0;i<kferm;i++)
		R1[i]=(Complex)R1_f[i];
	//Copying R1 to the first (zeroth) flavour index of Phi
	//This should be safe with memcpy since the pointer name
	//references the first block of memory for that pointer
	memcpy(Phi, R1, kferm*sizeof(Complex));
#endif
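	//Phi now holds the source @f$M^\dagger\Xi@f$ for the inversion below, while R1
	//doubles as the output buffer for the solution.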
	//Evaluate @f$\xi=(M^\dagger M)^{-1}R_1@f$
	//	Congradp(0, res, R1_f, itercg);
	//Restarting the conjugate gradient on failure was causing NaN issues,
	//so instead we skip the measurement entirely by returning the error code.
	if(Congradp(0, res, Phi, R1,ut_f,iu,id,gamval_f,gamin,dk_f,jqq,akappa,itercg)==ITERLIM)
		return ITERLIM;
	//itercg=0;
	//if(!rank) fprintf(stderr, "Restarting conjugate gradient from %s\n", funcname);
	//Congradp(0, res, Phi, R1_f,ut_f[0],ut_f[1],iu,id,gamval_f,gamin,dk_f[0],dk_f[1],jqq,akappa,itercg);
	//itercg+=niterc;
	/*
#pragma omp parallel for simd aligned(R1,R1_f:AVX)
	for(int i=0;i<kferm;i++)
		xi[i]=(Complex)R1_f[i];
	 */
#ifdef __NVCC__
	cudaMemcpyAsync(xi,R1,kferm*sizeof(Complex),cudaMemcpyDefault,streams[0]);
#ifdef _DEBUG
	cudaFree(R1_f);
#else
	cudaFreeAsync(R1_f,streams[1]);
#endif
#else
	memcpy(xi,R1,kferm*sizeof(Complex));
	free(xi_f); free(R1_f);
#endif
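	//At this point x holds the original noise @f$\Xi@f$ and xi holds @f$M^{-1}\Xi@f$,
	//so the dot product below is a stochastic estimate of @f$\textrm{Tr}\,M^{-1}@f$:
	//its expectation value is exact because <xi_i* xi_j>=delta_ij.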
#ifdef USE_BLAS
	Complex buff;
#ifdef __NVCC__
	cublasZdotc(cublas_handle,kferm,(cuDoubleComplex *)x,1,(cuDoubleComplex *)xi,1,(cuDoubleComplex *)&buff);
	cudaDeviceSynchronise();
#else
	cblas_zdotc_sub(kferm, x, 1, xi, 1, &buff);
#endif
	*pbp=creal(buff);
#else
	*pbp = 0;
#pragma unroll
	for(int i=0;i<kferm;i++)
		*pbp+=creal(conj(x[i])*xi[i]);
#endif
#if(nproc>1)
	Par_dsum(pbp);
#endif
	*pbp/=4*gvol;

	*qbqb=*qq=0;
#if defined USE_BLAS
	for(int idirac = 0; idirac<ndirac; idirac++){
		int igork=idirac+4;
		//Unrolling the colour indices; then it's just (γ_5*x)*Ξ or (γ_5*Ξ)*x
#pragma unroll
		for(int ic = 0; ic<nc; ic++){
			Complex dot;
			//Because we have kvol on the outer index and are summing over it, we set the
			//step for BLAS to be ngorkov*nc=16.
			//Does this make sense to do on the GPU?
#ifdef __NVCC__
			cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+idirac*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+igork*nc+ic), ngorkov*nc,(cuDoubleComplex *)&dot);
#else
			cblas_zdotc_sub(kvol, &x[idirac*nc+ic], ngorkov*nc, &xi[igork*nc+ic], ngorkov*nc, &dot);
#endif
			*qbqb+=gamval[4*ndirac+idirac]*dot;
#ifdef __NVCC__
			cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+igork*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+idirac*nc+ic), ngorkov*nc,(cuDoubleComplex *)&dot);
#else
			cblas_zdotc_sub(kvol, &x[igork*nc+ic], ngorkov*nc, &xi[idirac*nc+ic], ngorkov*nc, &dot);
#endif
			*qq-=gamval[4*ndirac+idirac]*dot;
		}
	}
#else
#pragma unroll(2)
	for(int i=0; i<kvol; i++)
		//What is the optimal order to evaluate these in?
		for(int idirac = 0; idirac<ndirac; idirac++){
			int igork=idirac+4;
			*qbqb+=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+idirac)*nc])*xi[(i*ngorkov+igork)*nc];
			*qq-=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+igork)*nc])*xi[(i*ngorkov+idirac)*nc];
			*qbqb+=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+idirac)*nc+1])*xi[(i*ngorkov+igork)*nc+1];
			*qq-=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+igork)*nc+1])*xi[(i*ngorkov+idirac)*nc+1];
		}
#endif
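	//gamval[4*ndirac+idirac] is the gamma_5 entry for Dirac index idirac; pairing
	//index idirac with igork=idirac+4 couples the particle and hole (Gor'kov) sectors,
	//which is what makes these diquark rather than mesonic operators.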
	//In the FORTRAN code dsum was used instead, despite qq and qbqb being complex.
	//Since we only care about the real part this shouldn't cause (m)any serious issues.
#if(nproc>1)
	Par_dsum((double *)qq); Par_dsum((double *)qbqb);
#endif
	*qq=(*qq+*qbqb)/(2*gvol);
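	//qq and qbqb estimate the same condensate (they differ only in the direction of the
	//Gor'kov pairing), so averaging them reduces the stochastic noise at no extra cost.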
	Complex xu, xd, xuu, xdd;
	xu=xd=xuu=xdd=0;

	//Halos
#if(npt>1)
	ZHalo_swap_dir(x,16,3,DOWN); ZHalo_swap_dir(x,16,3,UP);
#endif
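	//Only the time direction (idir=3) of x needs swapping: the loop below reads x at
	//the temporal neighbours did and uid, and each site carries ngorkov*nc=16 components.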
	//Pesky halo exchange indices again
	//The halo exchange for the trial fields was done already at the end of the trajectory,
	//so there is no point doing it again

	//Instead of typing id[i*ndim+3] a lot, we'll just assign it to a variable.
	//Idea: use one loop instead of two; for xuu and xdd just use ngorkov-(igorkov+1) instead
	//Dirty CUDA workaround since it won't convert thrust<complex> to double
	//TODO: get a reduction routine ready for CUDA
#ifdef __NVCC__
	//Swapping back the gauge fields to SoA since the rest of the code is running on CPU and hasn't been ported
	Transpose_z(ut[0],kvol,ndim);
	Transpose_z(ut[1],kvol,ndim);
	//Set up index arrays for CPU
	Transpose_U(iu,kvol,ndim);
	Transpose_U(id,kvol,ndim);
	cudaDeviceSynchronise();
#else
#pragma omp parallel for reduction(+:xd,xu,xdd,xuu)
#endif
	for(int i = 0; i<kvol; i++){
		int did=id[3+ndim*i];
		int uid=iu[3+ndim*i];
		for(int igorkov=0; igorkov<4; igorkov++){
			int igork1=gamin[3*ndirac+igorkov];
			//For the C version I'll try and factorise where possible
			xu+=dk[1][did]*(conj(x[(did*ngorkov+igorkov)*nc])*(\
						ut[0][did*ndim+3]*(xi[(i*ngorkov+igork1)*nc]-xi[(i*ngorkov+igorkov)*nc])+\
						ut[1][did*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]-xi[(i*ngorkov+igorkov)*nc+1]) )+\
					conj(x[(did*ngorkov+igorkov)*nc+1])*(\
						conj(ut[0][did*ndim+3])*(xi[(i*ngorkov+igork1)*nc+1]-xi[(i*ngorkov+igorkov)*nc+1])+\
						conj(ut[1][did*ndim+3])*(xi[(i*ngorkov+igorkov)*nc]-xi[(i*ngorkov+igork1)*nc])));
		}
		for(int igorkov=0; igorkov<4; igorkov++){
			int igork1=gamin[3*ndirac+igorkov];
			xd+=dk[0][i]*(conj(x[(uid*ngorkov+igorkov)*nc])*(\
						conj(ut[0][i*ndim+3])*(xi[(i*ngorkov+igork1)*nc]+xi[(i*ngorkov+igorkov)*nc])-\
						ut[1][i*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]+xi[(i*ngorkov+igorkov)*nc+1]) )+\
					conj(x[(uid*ngorkov+igorkov)*nc+1])*(\
						ut[0][i*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]+xi[(i*ngorkov+igorkov)*nc+1])+\
						conj(ut[1][i*ndim+3])*(xi[(i*ngorkov+igorkov)*nc]+xi[(i*ngorkov+igork1)*nc]) ) );
		}
		for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
			int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
			xuu-=dk[0][did]*(conj(x[(did*ngorkov+igorkovPP)*nc])*(\
						ut[0][did*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc]-xi[(i*ngorkov+igorkovPP)*nc])+\
						ut[1][did*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]-xi[(i*ngorkov+igorkovPP)*nc+1]) )+\
					conj(x[(did*ngorkov+igorkovPP)*nc+1])*(\
						conj(ut[0][did*ndim+3])*(xi[(i*ngorkov+igork1PP)*nc+1]-xi[(i*ngorkov+igorkovPP)*nc+1])+\
						conj(ut[1][did*ndim+3])*(xi[(i*ngorkov+igorkovPP)*nc]-xi[(i*ngorkov+igork1PP)*nc]) ) );
		}
		for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
			int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
			xdd-=dk[1][i]*(conj(x[(uid*ngorkov+igorkovPP)*nc])*(\
						conj(ut[0][i*ndim+3])*(xi[(i*ngorkov+igork1PP)*nc]+xi[(i*ngorkov+igorkovPP)*nc])-\
						ut[1][i*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]+xi[(i*ngorkov+igorkovPP)*nc+1]) )+\
					conj(x[(uid*ngorkov+igorkovPP)*nc+1])*(\
						ut[0][i*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]+xi[(i*ngorkov+igorkovPP)*nc+1])+\
						conj(ut[1][i*ndim+3])*(xi[(i*ngorkov+igorkovPP)*nc]+xi[(i*ngorkov+igork1PP)*nc]) ) );
		}
	}
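	//xu and xd collect the two temporal hopping directions for the particle components
	//of the Gor'kov spinor; xuu and xdd do the same for the hole components. The
	//difference combination below gives the energy density, the sum the number density.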
	*endenf=creal(xu-xd-xuu+xdd);
	*denf=creal(xu+xd+xuu+xdd);

#if(nproc>1)
	Par_dsum(endenf); Par_dsum(denf);
#endif
	*endenf/=2*gvol; *denf/=2*gvol;
	//Future task: chiral susceptibility measurements
#ifdef __NVCC__
	cudaFree(x); cudaFree(xi);
	//Revert the index and gauge arrays
	Transpose_z(ut[0],ndim,kvol);
	Transpose_z(ut[1],ndim,kvol);
	Transpose_U(iu,ndim,kvol);
	Transpose_U(id,ndim,kvol);
#else
	free(x); free(xi);
#endif
	return 0;
}
int Measure(double *pbp, double *endenf, double *denf, Complex *qq, Complex *qbqb, double res, int *itercg, Complex *ut[2], Complex_f *ut_f[2], unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2], float *dk_f[2], Complex_f jqq, float akappa, Complex *Phi, Complex *R1)
Calculate fermion expectation values via a noisy estimator.
Definition fermionic.c:8
Matrix multiplication and related declarations.
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates @f$\Phi=M^\dagger r@f$ in single precision.
Definition matrices.c:585
#define UP
Flag for send up.
Definition par_mpi.h:37
#define DOWN
Flag for send down.
Definition par_mpi.h:35
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
Header for random number configuration.
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single precision complex numbers using the Box-Muller method.
Definition random.c:260
#define AVX
Alignment of arrays: 64 for AVX-512, 32 for AVX/AVX2, 16 for SSE.
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
Sublattice size including Gor'kov indices.
Definition sizes.h:186
#define ndirac
Dirac indices.
Definition sizes.h:177
#define gvol
Lattice volume.
Definition sizes.h:92
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225
Function declarations for most of the routines.
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix inversion via Conjugate Gradient (no up/down flavour partitioning). Solves @f$(M^\dagger M)x=\Phi@f$; the matrix multiplication is done in single precision.
Definition congrad.c:250