Calculate fermion expectation values via a noisy estimator.
Matrix inversion is performed via the conjugate gradient algorithm, solving \(MX=X_1\) (Numerical Recipes, section 2.10, pp. 70–73).
Uses the new lookup tables; implemented in Congradp().
11 {
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45 const char *funcname = "Measure";
46
47
48#ifdef __NVCC__
49 int device=-1;
50 cudaGetDevice(&device);
52 #ifdef _DEBUG
54 #else
56 #endif
57 cudaMallocManaged((
void **)&x,
kfermHalo*
sizeof(
Complex), cudaMemAttachGlobal);
58 cudaMallocManaged((
void **)&xi,
kferm*
sizeof(
Complex), cudaMemAttachGlobal);
60#else
65#endif
66
67#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
69#else
70 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm, xi_f, 0, 1/sqrt(2));
71#endif
72#ifdef __NVCC__
73 cudaMemPrefetchAsync(xi_f,
kferm*
sizeof(
Complex_f),device,streams[0]);
74 cuComplex_convert(xi_f,xi,
kferm,
false,dimBlock,dimGrid);
75
77
78 Transpose_c(u11t_f,
ndim,
kvol,dimGrid,dimBlock);
79 Transpose_c(u12t_f,
ndim,
kvol,dimGrid,dimBlock);
80 cudaMemcpyAsync(x, xi,
kferm*
sizeof(
Complex),cudaMemcpyDefault,0);
81#else
82#pragma omp parallel for simd aligned(xi,xi_f:AVX)
83 for(
int i=0;i<
kferm;i++)
86#endif
87
88
89
90 Dslashd_f(R1_f,xi_f,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa);
91#ifdef __NVCC__
92 cudaDeviceSynchronise();
93 cudaFree(xi_f);
95 Transpose_c(u11t_f,
kvol,
ndim,dimGrid,dimBlock);
96 Transpose_c(u12t_f,
kvol,
ndim,dimGrid,dimBlock);
97 cuComplex_convert(R1_f,R1,
kferm,
false,dimBlock,dimGrid);
98 cudaMemcpy(Phi, R1,
kferm*
sizeof(
Complex),cudaMemcpyDefault);
99#else
100#pragma omp parallel for simd aligned(R1,R1_f:AVX)
101 for(
int i=0;i<
kferm;i++)
103
104
105
107#endif
108
109
110
111
112 if(
Congradp(0, res, Phi, R1,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,itercg)==ITERLIM)
113 return ITERLIM;
114
115
116
117
118
119
120
121
122
123#ifdef __NVCC__
124 cudaMemcpyAsync(xi,R1,
kferm*
sizeof(
Complex),cudaMemcpyDefault,streams[0]);
125 #ifdef _DEBUG
126 cudaFree(R1_f);
127 #else
128 cudaFreeAsync(R1_f,streams[1]);
129 #endif
130#else
132 free(xi_f); free(R1_f);
133#endif
134#ifdef USE_BLAS
136#ifdef __NVCC__
137 cublasZdotc(cublas_handle,
kferm,(cuDoubleComplex *)x,1,(cuDoubleComplex *)xi,1,(cuDoubleComplex *)&buff);
138 cudaDeviceSynchronise();
139#elif defined USE_BLAS
140 cblas_zdotc_sub(
kferm, x, 1, xi, 1, &buff);
141#endif
142 *pbp=creal(buff);
143#else
144 *pbp = 0;
145#pragma unroll
146 for(
int i=0;i<
kferm;i++)
147 *pbp+=creal(conj(x[i])*xi[i]);
148#endif
149#if(nproc>1)
151#endif
153
154 *qbqb=*qq=0;
155#if defined USE_BLAS
156 for(
int idirac = 0; idirac<
ndirac; idirac++){
157 int igork=idirac+4;
158
159#pragma unroll
160 for(
int ic = 0; ic<
nc; ic++){
162
163
164
165#ifdef __NVCC__
166 cublasZdotc(cublas_handle,
kvol,(cuDoubleComplex *)(x+idirac*
nc+ic),
ngorkov*
nc,(cuDoubleComplex *)(xi+igork*
nc+ic),
ngorkov*
nc,(cuDoubleComplex *)&dot);
167#else
169#endif
170 *qbqb+=gamval[4*
ndirac+idirac]*dot;
171#ifdef __NVCC__
172 cublasZdotc(cublas_handle,
kvol,(cuDoubleComplex *)(x+igork*
nc+ic),
ngorkov*
nc,(cuDoubleComplex *)(xi+idirac*
nc+ic),
ngorkov*
nc,(cuDoubleComplex *)&dot);
173#else
175#endif
176 *qq-=gamval[4*
ndirac+idirac]*dot;
177 }
178 }
179#else
180#pragma unroll(2)
181 for(
int i=0; i<
kvol; i++)
182
183 for(
int idirac = 0; idirac<
ndirac; idirac++){
184 int igork=idirac+4;
189 }
190#endif
191
192
193#if(nproc>1)
195#endif
196 *qq=(*qq+*qbqb)/(2*
gvol);
198 xu=xd=xuu=xdd=0;
199
200
201#if(npt>1)
203#endif
204
205
206
207
208
209
210
211
212#ifndef __NVCC__
213#pragma omp parallel for reduction(+:xd,xu,xdd,xuu)
214#endif
215 for(
int i = 0; i<
kvol; i++){
216 int did=
id[3+
ndim*i];
217 int uid=iu[3+
ndim*i];
218#ifndef __NVCC__
219#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xu)
220#endif
221 for(int igorkov=0; igorkov<4; igorkov++){
222 int igork1=gamin[3*
ndirac+igorkov];
223
224 xu+=dk4p[did]*(conj(x[(did*
ngorkov+igorkov)*
nc])*(\
230 }
231#ifndef __NVCC__
232#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xd)
233#endif
234 for(int igorkov=0; igorkov<4; igorkov++){
235 int igork1=gamin[3*
ndirac+igorkov];
236 xd+=dk4m[i]*(conj(x[(uid*
ngorkov+igorkov)*
nc])*(\
242 }
243#ifndef __NVCC__
244#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xuu)
245#endif
246 for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
247 int igork1PP=4+gamin[3*
ndirac+igorkovPP-4];
248 xuu-=dk4m[did]*(conj(x[(did*
ngorkov+igorkovPP)*
nc])*(\
254 }
255#ifndef __NVCC__
256#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xdd)
257#endif
258 for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
259 int igork1PP=4+gamin[3*
ndirac+igorkovPP-4];
260 xdd-=dk4p[i]*(conj(x[(uid*
ngorkov+igorkovPP)*
nc])*(\
266 }
267 }
268 *endenf=creal(xu-xd-xuu+xdd);
269 *denf=creal(xu+xd+xuu+xdd);
270
271#if(nproc>1)
273#endif
275
276#ifdef __NVCC__
277 cudaFree(x); cudaFree(xi);
278#else
279 free(x); free(xi);
280#endif
281 return 0;
282}
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates in single precision.
#define UP
Flag for send up.
#define DOWN
Flag for send down.
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single-precision complex numbers using the Box-Muller method.
#define AVX
Alignment of arrays: 64 for AVX-512, 32 for AVX/AVX2, 16 for SSE. AVX alignment is the default, since AVX is standard on modern x86 hardware.
#define ngorkov
Gor'kov indices.
#define kvol
Sublattice volume.
#define Complex
Double precision complex number.
#define kferm
Sublattice size including Gor'kov indices.
#define ndirac
Dirac indices.
#define gvol
Lattice volume.
#define Complex_f
Single precision complex number.
#define kfermHalo
Gor'kov lattice and halo.
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
Matrix inversion via conjugate gradient (no up/down flavour partitioning). Solves \(MX=X_1\); the matrix multiplication is performed in single precision.