Calculate fermion expectation values via a noisy estimator.

Matrix inversion is performed with the conjugate gradient algorithm, which solves \(MX=X_1\) (Numerical Recipes, section 2.10, pp. 70-73). It uses the new lookup tables and is implemented in Congradp().
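For orientation, the noisy estimator rests on the standard identity (the normalisation shown is our assumption rather than something stated in the source): for Gaussian noise vectors \(\eta\) with \(\langle\eta_i^*\eta_j\rangle=\delta_{ij}\),

\[ \langle\bar\psi\psi\rangle = \frac{1}{V}\operatorname{Tr} M^{-1} \approx \frac{1}{NV}\sum_{n=1}^{N}\eta_n^\dagger M^{-1}\eta_n , \]

and a single call to this routine generates one noise vector, i.e. the \(N=1\) estimate.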
{
	/* ... (variable declarations elided in this listing) ... */
	const char *funcname = "Measure";

#ifdef __NVCC__
	int device=-1;
	cudaGetDevice(&device);
#ifdef _DEBUG
	/* ... */
#else
	/* ... */
#endif
	cudaMallocManaged((void **)&x, kfermHalo*sizeof(Complex), cudaMemAttachGlobal);
	cudaMallocManaged((void **)&xi, kferm*sizeof(Complex), cudaMemAttachGlobal);
	/* ... */
#else
	/* ... */
#endif

	//Fill the single-precision noise vector xi_f, sigma = 1/sqrt(2) per component
#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
	Gauss_c(xi_f, kferm, 0, 1/sqrt(2));
#else
	vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, xi_f, 0, 1/sqrt(2));
#endif
#ifdef __NVCC__
	cudaMemPrefetchAsync(xi_f, kferm*sizeof(Complex_f),device,streams[0]);
	cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);

	cudaMemcpyAsync(x, xi, kferm*sizeof(Complex),cudaMemcpyDefault,0);
#else
	//Promote the noise to double precision and keep a copy in x
#pragma omp parallel for simd aligned(xi,xi_f:AVX)
	for(int i=0;i<kferm;i++)
		xi[i]=(Complex)xi_f[i];
	memcpy(x, xi, kferm*sizeof(Complex));
#endif

	//Apply the daggered Dslash to the noise: R1_f = M^dagger xi_f
	Dslashd_f(R1_f,xi_f,ut_f[0],ut_f[1],iu,id,gamval_f,gamin,dk_f,jqq,akappa);
#ifdef __NVCC__
	cudaDeviceSynchronise();
	cudaFree(xi_f);
	cuComplex_convert(R1_f,R1,kferm,false,dimBlock,dimGrid);
	cudaMemcpy(Phi, R1, kferm*sizeof(Complex),cudaMemcpyDefault);
#else
#pragma omp parallel for simd aligned(R1,R1_f:AVX)
	for(int i=0;i<kferm;i++)
		R1[i]=(Complex)R1_f[i];
	memcpy(Phi, R1, kferm*sizeof(Complex));
#endif

	//Invert via conjugate gradient; bail out if the iteration limit is hit
	if(Congradp(0, res, Phi, R1,ut_f,iu,id,gamval_f,gamin,dk_f,jqq,akappa,itercg)==ITERLIM)
		return ITERLIM;

	//R1 now holds the CG solution; copy it into xi
#ifdef __NVCC__
	cudaMemcpyAsync(xi,R1,kferm*sizeof(Complex),cudaMemcpyDefault,streams[0]);
#ifdef _DEBUG
	cudaFree(R1_f);
#else
	cudaFreeAsync(R1_f,streams[1]);
#endif
#else
	memcpy(xi, R1, kferm*sizeof(Complex));
	free(xi_f); free(R1_f);
#endif

	//Fermion condensate: pbp = Re(x^dagger xi)
#ifdef USE_BLAS
	Complex buff;
#ifdef __NVCC__
	cublasZdotc(cublas_handle,kferm,(cuDoubleComplex *)x,1,(cuDoubleComplex *)xi,1,(cuDoubleComplex *)&buff);
	cudaDeviceSynchronise();
#elif defined USE_BLAS
	cblas_zdotc_sub(kferm, x, 1, xi, 1, &buff);
#endif
	*pbp=creal(buff);
#else
	*pbp = 0;
#pragma unroll
	for(int i=0;i<kferm;i++)
		*pbp+=creal(conj(x[i])*xi[i]);
#endif
#if(nproc>1)
	Par_dsum(pbp);
#endif

	//Diquark condensates qq and qbqb
	*qbqb=*qq=0;
#if defined USE_BLAS
	for(int idirac = 0; idirac<ndirac; idirac++){
		int igork=idirac+4;
		//Strided dot products over the volume, weighted by gamma_5
#pragma unroll
		for(int ic = 0; ic<nc; ic++){
			Complex dot;
#ifdef __NVCC__
			cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+idirac*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+igork*nc+ic),ngorkov*nc,(cuDoubleComplex *)&dot);
#else
			cblas_zdotc_sub(kvol, x+idirac*nc+ic, ngorkov*nc, xi+igork*nc+ic, ngorkov*nc, &dot);
#endif
			*qbqb+=gamval[4*ndirac+idirac]*dot;
#ifdef __NVCC__
			cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+igork*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+idirac*nc+ic),ngorkov*nc,(cuDoubleComplex *)&dot);
#else
			cblas_zdotc_sub(kvol, x+igork*nc+ic, ngorkov*nc, xi+idirac*nc+ic, ngorkov*nc, &dot);
#endif
			*qq-=gamval[4*ndirac+idirac]*dot;
		}
	}
#else
#pragma unroll(2)
	for(int i=0; i<kvol; i++)
		for(int idirac = 0; idirac<ndirac; idirac++){
			int igork=idirac+4;
			/* ... (scalar accumulation of qq and qbqb elided) ... */
		}
#endif

#if(nproc>1)
	/* ... (reduce qq and qbqb over all ranks) ... */
#endif
	*qq=(*qq+*qbqb)/(2*gvol);

	//Fermion energy density and number density from the temporal hopping terms
	xu=xd=xuu=xdd=0;

#if(npt>1)
	/* ... (halo exchange along the time direction) ... */
#endif

#ifdef __NVCC__
	/* ... */
	cudaDeviceSynchronise();
#else
#pragma omp parallel for reduction(+:xd,xu,xdd,xuu)
#endif
	for(int i = 0; i<kvol; i++){
		int did=id[3+ndim*i];
		int uid=iu[3+ndim*i];
		//Particle sector (igorkov<4)
		for(int igorkov=0; igorkov<4; igorkov++){
			int igork1=gamin[3*ndirac+igorkov];
			xu+=dk[1][did]*(conj(x[(did*ngorkov+igorkov)*nc])*( /* ... */ ));
		}
		for(int igorkov=0; igorkov<4; igorkov++){
			int igork1=gamin[3*ndirac+igorkov];
			xd+=dk[0][i]*(conj(x[(uid*ngorkov+igorkov)*nc])*( /* ... */ ));
		}
		//Gor'kov sector (igorkov>=4)
		for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
			int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
			xuu-=dk[0][did]*(conj(x[(did*ngorkov+igorkovPP)*nc])*( /* ... */ ));
		}
		for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
			int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
			xdd-=dk[1][i]*(conj(x[(uid*ngorkov+igorkovPP)*nc])*( /* ... */ ));
		}
	}
	*endenf=creal(xu-xd-xuu+xdd);
	*denf=creal(xu+xd+xuu+xdd);

#if(nproc>1)
	Par_dsum(endenf); Par_dsum(denf);
#endif

#ifdef __NVCC__
	cudaFree(x); cudaFree(xi);
	/* ... */
#else
	free(x); free(xi);
#endif
	return 0;
}
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates \(\Phi=M^\dagger r\) in single precision.
#define UP
Flag for send up.
#define DOWN
Flag for send down.
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
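As an illustration of the pattern (a minimal sketch assuming an MPI build; the argument names, buffer layout, and neighbour ranks are hypothetical, not the library's actual interface):

	#include <complex.h>
	#include <mpi.h>

	/* Hypothetical one-direction halo swap: send our boundary layer to one
	 * neighbour while receiving that neighbour's layer into our halo. */
	int halo_swap_sketch(double complex *boundary, double complex *halo,
			int ncpt, int nsites, int neigh_send, int neigh_recv, MPI_Comm comm)
	{
		/* ncpt components per site, nsites sites in the layer being swapped */
		return MPI_Sendrecv(boundary, ncpt*nsites, MPI_C_DOUBLE_COMPLEX, neigh_send, 0,
				halo, ncpt*nsites, MPI_C_DOUBLE_COMPLEX, neigh_recv, 0,
				comm, MPI_STATUS_IGNORE);
	}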
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
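This sum-then-redistribute pattern is exactly what MPI_Allreduce provides; a minimal sketch of an equivalent routine (assumed, not the actual implementation):

	#include <mpi.h>

	int par_dsum_sketch(double *dval)
	{
		double sum;
		/* Sum dval over every rank and leave the total on all of them. */
		int err = MPI_Allreduce(dval, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
		*dval = sum;
		return err;
	}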
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single precision complex numbers using the Box–Muller method.
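For reference, the Box–Muller construction looks like this (an illustrative sketch; rand() stands in for whichever uniform generator the build actually uses):

	#include <complex.h>
	#include <math.h>
	#include <stdlib.h>

	void gauss_c_sketch(float complex *ps, unsigned int n,
			const float complex mu, const float sigma)
	{
		const float twopi = 6.2831853f;
		for(unsigned int i=0; i<n; i++){
			/* Two uniform deviates; shift u1 away from zero so logf is safe. */
			float u1 = (rand()+1.0f)/((float)RAND_MAX+1.0f);
			float u2 = (float)rand()/(float)RAND_MAX;
			/* One Box-Muller pair supplies the real and imaginary parts. */
			float r = sigma*sqrtf(-2.0f*logf(u1));
			ps[i] = mu + r*(cosf(twopi*u2) + I*sinf(twopi*u2));
		}
	}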
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2, 16 for SSE. Since AVX is standard on modern x86 machines, 32 is the default.
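For example, a buffer suitable for the aligned OpenMP clauses in the listing above could be allocated like this (illustrative only; the AVX value and sizes here are stand-ins):

	#include <complex.h>
	#include <stdlib.h>

	#define AVX 32	/* 64 for AVX-512, 16 for SSE */

	int main(void)
	{
		/* C11 aligned_alloc: the size must be a multiple of the alignment. */
		double complex *xi = aligned_alloc(AVX, 1024*sizeof(double complex));
		free(xi);
		return 0;
	}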
#define ngorkov
Gor'kov indices.
#define kvol
Sublattice volume.
#define Complex
Double precision complex number.
#define kferm
Sublattice size including Gor'kov indices.
#define ndirac
Dirac indices.
#define gvol
Lattice volume.
#define Complex_f
Single precision complex number.
#define kfermHalo
Gor'kov lattice and halo.
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \((M^\dagger M)X=\Phi\); the matrix multiplication step is done in single precision.
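For orientation, textbook conjugate gradient for a Hermitian positive-definite system, in the spirit of the Numerical Recipes routine cited above (a sketch only: apply_M, the flat vector length n, and the residual convention are assumptions, not this code's interface):

	#include <complex.h>
	#include <math.h>

	/* y = M x for a Hermitian positive-definite M; stand-in for the real operator. */
	extern void apply_M(double complex *y, const double complex *x, int n);

	/* Solve M x = b to residual norm res; returns 0 on convergence,
	 * -1 at the iteration limit (cf. the ITERLIM return in the listing). */
	int cg_sketch(double complex *x, const double complex *b, int n,
			double res, int maxiter, int *itercg)
	{
		double complex r[n], p[n], Mp[n];
		apply_M(Mp, x, n);
		double rnorm = 0;
		for(int i=0; i<n; i++){
			r[i] = b[i] - Mp[i];	/* initial residual */
			p[i] = r[i];		/* first search direction */
			rnorm += creal(conj(r[i])*r[i]);
		}
		for(*itercg = 0; *itercg < maxiter; (*itercg)++){
			if(sqrt(rnorm) <= res) return 0;
			apply_M(Mp, p, n);
			double complex pMp = 0;
			for(int i=0; i<n; i++) pMp += conj(p[i])*Mp[i];
			double alpha = rnorm/creal(pMp);	/* step length */
			double rnew = 0;
			for(int i=0; i<n; i++){
				x[i] += alpha*p[i];
				r[i] -= alpha*Mp[i];
				rnew += creal(conj(r[i])*r[i]);
			}
			double beta = rnew/rnorm;		/* direction update */
			for(int i=0; i<n; i++) p[i] = r[i] + beta*p[i];
			rnorm = rnew;
		}
		return -1;
	}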