su2hmc
fermionic.c File Reference

Code for fermionic observables.

#include <matrices.h>
#include <random.h>
#include <su2hmc.h>


Functions

int Measure (double *pbp, double *endenf, double *denf, Complex *qq, Complex *qbqb, double res, int *itercg, Complex *ut[2], Complex_f *ut_f[2], unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2], float *dk_f[2], Complex_f jqq, float akappa, Complex *Phi, Complex *R1)
 Calculate fermion expectation values via a noisy estimator.
 

Detailed Description

Code for fermionic observables.

Definition in file fermionic.c.

Function Documentation

◆ Measure()

int Measure ( double * pbp,
double * endenf,
double * denf,
Complex * qq,
Complex * qbqb,
double res,
int * itercg,
Complex * ut[2],
Complex_f * ut_f[2],
unsigned int * iu,
unsigned int * id,
Complex * gamval,
Complex_f * gamval_f,
int * gamin,
double * dk[2],
float * dk_f[2],
Complex_f jqq,
float akappa,
Complex * Phi,
Complex * R1 )

Calculate fermion expectation values via a noisy estimator.

Matrix inversion is performed via the conjugate gradient algorithm, which solves \(MX=X_1\) (Numerical Recipes, section 2.10, pp. 70-73) using the new lookup tables. Implemented in Congradp().
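
For context, the estimator can be sketched as follows (a schematic of what the code does, with the \(1/(4V)\) normalisation matching the pbp/=4*gvol step in the listing below). Draw complex Gaussian noise \(\xi\) with \(\langle\xi\xi^\dagger\rangle=1\); then

\[\langle\bar{\Psi}\Psi\rangle\approx\frac{1}{4V}\,\Re\left(\xi^\dagger M^{-1}\xi\right),\qquad M^{-1}\xi=\left(M^\dagger M\right)^{-1}M^\dagger\xi,\]

so a single conjugate gradient solve of the hermitian positive definite system \(\left(M^\dagger M\right)x=M^\dagger\xi\) yields the estimate.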

Parameters
pbp	\(\langle\bar{\Psi}\Psi\rangle\)
endenf	Energy density
denf	Number density
qq	Diquark condensate
qbqb	Antidiquark condensate
res	Conjugate gradient residue
itercg	Iterations of the conjugate gradient
ut	Double precision gauge fields
ut_f	Single precision gauge fields
iu,id	Lattice indices
gamval	Double precision gamma matrices, rescaled by kappa
gamval_f	Single precision gamma matrices, rescaled by kappa
gamin	Indices for Dirac terms
dk	\(\left(1+\gamma_0\right)e^{-\mu}\) and \(\left(1-\gamma_0\right)e^\mu\), double precision
dk_f	\(\left(1+\gamma_0\right)e^{-\mu}\) and \(\left(1-\gamma_0\right)e^\mu\), single precision
jqq	Diquark source
akappa	Hopping parameter
Phi	Pseudofermion field
R1	A utility array already allocated in main. Here it first catches the output of \(M^\dagger\Xi\) before the inversion, then stores the output of the inversion.
Returns
Zero on success, integer error code otherwise
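
A minimal sketch of a call site follows (illustrative only: the variable names and setup are assumed to follow main(), and are not taken verbatim from the source):

double pbp, endenf, denf;
Complex qq, qbqb;
int itercg = 0;
/* ut, ut_f, iu, id, gamval, gamval_f, gamin, dk, dk_f, jqq, akappa,
 * Phi and R1 are assumed to have been allocated and initialised earlier */
if(Measure(&pbp, &endenf, &denf, &qq, &qbqb, res, &itercg,
           ut, ut_f, iu, id, gamval, gamval_f, gamin,
           dk, dk_f, jqq, akappa, Phi, R1) == ITERLIM){
	/* The inversion hit the iteration limit: skip recording this measurement */
	fprintf(stderr, "Measure failed to converge\n");
}
else
	printf("pbp=%e\tdenf=%e\tendenf=%e\n", pbp, denf, endenf);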

Definition at line 8 of file fermionic.c.

11 {
12 /*
13 * @brief Calculate fermion expectation values via a noisy estimator
14 *
15 * Matrix inversion via conjugate gradient algorithm
16 * Solves @f(Mx=x_1@f)
17 * (Numerical Recipes section 2.10 pp.70-73)
18 * uses NEW lookup tables **
19 * Implemented in Congradp
20 *
21 * @param pbp: @f(\langle\bar{\Psi}\Psi\rangle@f)
22 * @param endenf: Energy density
23 * @param denf: Number Density
24 * @param qq: Diquark condensate
25 * @param qbqb: Antidiquark condensate
26 * @param res: Conjugate Gradient Residue
27 * @param itercg: Iterations of Conjugate Gradient
28 * @param ut: Double precision gauge fields
29 * @param ut_f: Single precision gauge fields
30 * @param iu,id: Lattice indices
31 * @param gamval,gamval_f: Gamma matrices rescaled by kappa
32 * @param gamin: Indices for Dirac terms
33 * @param dk: @f$\left(1+\gamma_0\right)e^{-\mu}@f$ and @f$\left(1-\gamma_0\right)e^\mu@f$, double
34 * @param dk_f: Single precision version of dk
35 * @param jqq: Diquark source
36 * @param akappa: Hopping parameter
37 * @param Phi: Pseudofermion field
38 * @param R1: A useful array for holding things that was already assigned in main.
39 * In particular, we'll be using it to catch the output of
40 * @f$ M^\dagger\Xi@f$ before the inversion, then used to store the
41 * output of the inversion
42 *
43 * @return Zero on success, integer error code otherwise
44 */
45 const char *funcname = "Measure";
46 //This x is just a storage container
47
48#ifdef __NVCC__
49 int device=-1;
50 cudaGetDevice(&device);
51 Complex *x, *xi; Complex_f *xi_f, *R1_f;
52#ifdef _DEBUG
53 cudaMallocManaged((void **)&R1_f,kfermHalo*sizeof(Complex_f), cudaMemAttachGlobal);
54#else
55 cudaMallocAsync((void **)&R1_f,kfermHalo*sizeof(Complex_f),streams[1]);
56#endif
57 cudaMallocManaged((void **)&x,kfermHalo*sizeof(Complex), cudaMemAttachGlobal);
58 cudaMallocManaged((void **)&xi,kferm*sizeof(Complex), cudaMemAttachGlobal);
59 cudaMallocManaged((void **)&xi_f,kfermHalo*sizeof(Complex_f), cudaMemAttachGlobal);
60#else
61 Complex *x =(Complex *)aligned_alloc(AVX,kfermHalo*sizeof(Complex));
62 Complex *xi =(Complex *)aligned_alloc(AVX,kferm*sizeof(Complex));
63 Complex_f *xi_f =(Complex_f *)aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
64 Complex_f *R1_f = (Complex_f *)aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
65#endif
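 // Roles of the work buffers, as used below: xi_f holds the single precision
 // noise vector; R1_f catches M^dagger.Xi from Dslashd_f; x keeps a double
 // precision copy of the noise for the final dot product; xi receives the
 // result of the inversion.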
66 //Setting up noise.
67#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
68 Gauss_c(xi_f, kferm, 0, (float)(1/sqrt(2)));
69#else
70 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, xi_f, 0, 1/sqrt(2));
71#endif
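 // Either branch draws kferm complex numbers with sigma=1/sqrt(2) per real
 // component, so each complex entry has unit variance.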
72#ifdef __NVCC__
73 cudaMemPrefetchAsync(xi_f,kferm*sizeof(Complex_f),device,streams[0]);
74 cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
75 //Transpose needed here for Dslashd
76 Transpose_c(xi_f,ngorkov*nc,kvol);
77 //Keep a double precision copy of the noise vector; x is needed for the dot product later
78 cudaMemcpyAsync(x, xi, kferm*sizeof(Complex),cudaMemcpyDefault,0);
79#else
80#pragma omp parallel for simd aligned(xi,xi_f:AVX)
81 for(int i=0;i<kferm;i++)
82 xi[i]=(Complex)xi_f[i];
83 memcpy(x, xi, kferm*sizeof(Complex));
84#endif
85 //R_1= @f$M^\dagger\Xi@f$
86 //R1 is local in FORTRAN but since its going to be reset anyway I'm going to recycle the
87 //global
88 Dslashd_f(R1_f,xi_f,ut_f[0],ut_f[1],iu,id,gamval_f,gamin,dk_f,jqq,akappa);
89#ifdef __NVCC__
90 cudaDeviceSynchronise();
91 cudaFree(xi_f);
92 Transpose_c(R1_f,kvol,ngorkov*nc);
93 cuComplex_convert(R1_f,R1,kferm,false,dimBlock,dimGrid);
94 cudaMemcpy(Phi, R1, kferm*sizeof(Complex),cudaMemcpyDefault);
95#else
96#pragma omp parallel for simd aligned(R1,R1_f:AVX)
97 for(int i=0;i<kferm;i++)
98 R1[i]=(Complex)R1_f[i];
99 //Copying R1 to the first (zeroth) flavour index of Phi
100 //This should be safe with memcpy since the pointer name
101 //references the first block of memory for that pointer
102 memcpy(Phi, R1, kferm*sizeof(Complex));
103#endif
104 //Evaluate xi = (M^† M)^-1 R_1
105 // Congradp(0, res, R1_f, itercg);
106 //If the conjugate gradient fails to converge for some reason, restart it.
107 //That's causing issues with NaN's. Plan B is to not record the measurements.
108 if(Congradp(0, res, Phi, R1,ut_f,iu,id,gamval_f,gamin,dk_f,jqq,akappa,itercg)==ITERLIM)
109 return ITERLIM;
110 //itercg=0;
111 //if(!rank) fprintf(stderr, "Restarting conjugate gradient from %s\n", funcname);
112 //Congradp(0, res, Phi, R1_f,ut_f[0],ut_f[1],iu,id,gamval_f,gamin,dk_f[0],dk_f[1],jqq,akappa,itercg);
113 //itercg+=niterc;
114 /*
115#pragma omp parallel for simd aligned(R1,R1_f:AVX)
116for(int i=0;i<kferm;i++)
117xi[i]=(Complex)R1_f[i];
118*/
119#ifdef __NVCC__
120 cudaMemcpyAsync(xi,R1,kferm*sizeof(Complex),cudaMemcpyDefault,streams[0]);
121#ifdef _DEBUG
122 cudaFree(R1_f);
123#else
124 cudaFreeAsync(R1_f,streams[1]);
125#endif
126#else
127 memcpy(xi,R1,kferm*sizeof(Complex));
128 free(xi_f); free(R1_f);
129#endif
130#ifdef USE_BLAS
131 Complex buff;
132#ifdef __NVCC__
133 cublasZdotc(cublas_handle,kferm,(cuDoubleComplex *)x,1,(cuDoubleComplex *)xi,1,(cuDoubleComplex *)&buff);
134 cudaDeviceSynchronise();
135#elif defined USE_BLAS
136 cblas_zdotc_sub(kferm, x, 1, xi, 1, &buff);
137#endif
138 *pbp=creal(buff);
139#else
140 *pbp = 0;
141#pragma unroll
142 for(int i=0;i<kferm;i++)
143 *pbp+=creal(conj(x[i])*xi[i]);
144#endif
145#if(nproc>1)
146 Par_dsum(pbp);
147#endif
148 *pbp/=4*gvol;
149
150 *qbqb=*qq=0;
151#if defined USE_BLAS
152 for(int idirac = 0; idirac<ndirac; idirac++){
153 int igork=idirac+4;
154 //Unrolling the colour indices, Then its just (γ_5*x)*Ξ or (γ_5*Ξ)*x
155#pragma unroll
156 for(int ic = 0; ic<nc; ic++){
157 Complex dot;
158 //Because we have kvol on the outer index and are summing over it, we set the
159 //step for BLAS to be ngorkov*nc=16.
160 //Does this make sense to do on the GPU?
161#ifdef __NVCC__
162 cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+idirac*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+igork*nc+ic), ngorkov*nc,(cuDoubleComplex *)&dot);
163#else
164 cblas_zdotc_sub(kvol, &x[idirac*nc+ic], ngorkov*nc, &xi[igork*nc+ic], ngorkov*nc, &dot);
165#endif
166 *qbqb+=gamval[4*ndirac+idirac]*dot;
167#ifdef __NVCC__
168 cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+igork*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+idirac*nc+ic), ngorkov*nc,(cuDoubleComplex *)&dot);
169#else
170 cblas_zdotc_sub(kvol, &x[igork*nc+ic], ngorkov*nc, &xi[idirac*nc+ic], ngorkov*nc, &dot);
171#endif
172 *qq-=gamval[4*ndirac+idirac]*dot;
173 }
174 }
175#else
176#pragma unroll(2)
177 for(int i=0; i<kvol; i++)
178 //What is the optimal order to evaluate these in?
179 for(int idirac = 0; idirac<ndirac; idirac++){
180 int igork=idirac+4;
181 *qbqb+=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+idirac)*nc])*xi[(i*ngorkov+igork)*nc];
182 *qq-=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+igork)*nc])*xi[(i*ngorkov+idirac)*nc];
183 *qbqb+=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+idirac)*nc+1])*xi[(i*ngorkov+igork)*nc+1];
184 *qq-=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+igork)*nc+1])*xi[(i*ngorkov+idirac)*nc+1];
185 }
186#endif
187 //In the FORTRAN Code dsum was used instead despite qq and qbqb being complex
188 //Since we only care about the real part this shouldn't cause (m)any serious issues
189#if(nproc>1)
190 Par_dsum((double *)qq); Par_dsum((double *)qbqb);
191#endif
192 *qq=(*qq+*qbqb)/(2*gvol);
193 Complex xu, xd, xuu, xdd;
194 xu=xd=xuu=xdd=0;
195
196 //Halos
197#if(npt>1)
198 ZHalo_swap_dir(x,16,3,DOWN); ZHalo_swap_dir(x,16,3,UP);
199#endif
200 //Pesky halo exchange indices again
201 //The halo exchange for the trial fields was done already at the end of the trajectory
202 //No point doing it again
203
204 //Instead of typing id[i*ndim+3] a lot, we'll just assign them to variables.
205 //Idea. One loop instead of two loops but for xuu and xdd just use ngorkov-(igorkov+1) instead
206 //Dirty CUDA work around since it won't convert thrust<complex> to double
207 //TODO: get a reduction routine ready for CUDA
208#ifdef __NVCC__
209 //Swapping back the gauge fields to SoA since the rest of the code is running on CPU and hasn't been ported
210 Transpose_z(ut[0],kvol,ndim);
211 Transpose_z(ut[1],kvol,ndim);
212 //Set up index arrays for CPU
213 Transpose_U(iu,kvol,ndim);
214 Transpose_U(id,kvol,ndim);
215 cudaDeviceSynchronise();
216#else
217#pragma omp parallel for reduction(+:xd,xu,xdd,xuu)
218#endif
219 for(int i = 0; i<kvol; i++){
220 int did=id[3+ndim*i];
221 int uid=iu[3+ndim*i];
222 for(int igorkov=0; igorkov<4; igorkov++){
223 int igork1=gamin[3*ndirac+igorkov];
224 //For the C Version I'll try and factorise where possible
225 xu+=dk[1][did]*(conj(x[(did*ngorkov+igorkov)*nc])*(\
226 ut[0][did*ndim+3]*(xi[(i*ngorkov+igork1)*nc]-xi[(i*ngorkov+igorkov)*nc])+\
227 ut[1][did*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]-xi[(i*ngorkov+igorkov)*nc+1]) )+\
228 conj(x[(did*ngorkov+igorkov)*nc+1])*(\
229 conj(ut[0][did*ndim+3])*(xi[(i*ngorkov+igork1)*nc+1]-xi[(i*ngorkov+igorkov)*nc+1])+\
230 conj(ut[1][did*ndim+3])*(xi[(i*ngorkov+igorkov)*nc]-xi[(i*ngorkov+igork1)*nc])));
231 }
232 for(int igorkov=0; igorkov<4; igorkov++){
233 int igork1=gamin[3*ndirac+igorkov];
234 xd+=dk[0][i]*(conj(x[(uid*ngorkov+igorkov)*nc])*(\
235 conj(ut[0][i*ndim+3])*(xi[(i*ngorkov+igork1)*nc]+xi[(i*ngorkov+igorkov)*nc])-\
236 ut[1][i*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]+xi[(i*ngorkov+igorkov)*nc+1]) )+\
237 conj(x[(uid*ngorkov+igorkov)*nc+1])*(\
238 ut[0][i*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]+xi[(i*ngorkov+igorkov)*nc+1])+\
239 conj(ut[1][i*ndim+3])*(xi[(i*ngorkov+igorkov)*nc]+xi[(i*ngorkov+igork1)*nc]) ) );
240 }
241 for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
242 int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
243 xuu-=dk[0][did]*(conj(x[(did*ngorkov+igorkovPP)*nc])*(\
244 ut[0][did*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc]-xi[(i*ngorkov+igorkovPP)*nc])+\
245 ut[1][did*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]-xi[(i*ngorkov+igorkovPP)*nc+1]) )+\
246 conj(x[(did*ngorkov+igorkovPP)*nc+1])*(\
247 conj(ut[0][did*ndim+3])*(xi[(i*ngorkov+igork1PP)*nc+1]-xi[(i*ngorkov+igorkovPP)*nc+1])+\
248 conj(ut[1][did*ndim+3])*(xi[(i*ngorkov+igorkovPP)*nc]-xi[(i*ngorkov+igork1PP)*nc]) ) );
249 }
250 for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
251 int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
252 xdd-=dk[1][i]*(conj(x[(uid*ngorkov+igorkovPP)*nc])*(\
253 conj(ut[0][i*ndim+3])*(xi[(i*ngorkov+igork1PP)*nc]+xi[(i*ngorkov+igorkovPP)*nc])-\
254 ut[1][i*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]+xi[(i*ngorkov+igorkovPP)*nc+1]) )+\
255 conj(x[(uid*ngorkov+igorkovPP)*nc+1])*(\
256 ut[0][i*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]+xi[(i*ngorkov+igorkovPP)*nc+1])+\
257 conj(ut[1][i*ndim+3])*(xi[(i*ngorkov+igorkovPP)*nc]+xi[(i*ngorkov+igork1PP)*nc]) ) );
258 }
259 }
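 // xu and xd accumulate the forward and backward temporal hopping terms in
 // the particle sector; xuu and xdd do the same for the conjugate (Gor'kov)
 // components. Their sums and differences below give the point-split number
 // density and energy density.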
260 *endenf=creal(xu-xd-xuu+xdd);
261 *denf=creal(xu+xd+xuu+xdd);
262
263#if(nproc>1)
264 Par_dsum(endenf); Par_dsum(denf);
265#endif
266 *endenf/=2*gvol; *denf/=2*gvol;
267 //Future task. Chiral susceptibility measurements
268#ifdef __NVCC__
269 cudaFree(x); cudaFree(xi);
270 //Revert index and gauge arrays
271 Transpose_z(ut[0],ndim,kvol);
272 Transpose_z(ut[1],ndim,kvol);
273 Transpose_U(iu,ndim,kvol);
274 Transpose_U(id,ndim,kvol);
275#else
276 free(x); free(xi);
277#endif
278 return 0;
279}
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates \(\Phi=M^\dagger r\) in single precision.
Definition matrices.c:585
#define UP
Flag for send up.
Definition par_mpi.h:37
#define DOWN
Flag for send down.
Definition par_mpi.h:35
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single precision complex numbers using the Box-Mull...
Definition random.c:260
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define ndirac
Dirac indices.
Definition sizes.h:177
#define gvol
Lattice volume.
Definition sizes.h:92
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \(\left(M^\dagger M\right)\xi=\Phi\). The matrix multipl...
Definition congrad.c:250

References AVX, Complex, Complex_f, Congradp(), DOWN, Dslashd_f(), Gauss_c(), gvol, kferm, kfermHalo, kvol, nc, ndim, ndirac, ngorkov, Par_dsum(), UP, and ZHalo_swap_dir().
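
One implementation detail worth highlighting: the condensate loops sum a single fixed (Dirac, colour) component over all lattice sites by calling the BLAS conjugated dot product with stride ngorkov*nc = 16. A self-contained sketch of that idiom (illustrative names and sizes, not the production code):

#include <complex.h>
#include <cblas.h>

/* Sum conj(x[s*ncpt + c])*y[s*ncpt + c] over all sites s for one fixed
 * component c, via a strided BLAS dot product as in the condensate loops. */
static double complex strided_site_dot(const double complex *x,
		const double complex *y, const int nsites, const int ncpt, const int c){
	double complex dot;
	cblas_zdotc_sub(nsites, &x[c], ncpt, &y[c], ncpt, &dot);
	return dot;
}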
