45 const char *funcname =
"Measure";
50 cudaGetDevice(&device);
57 cudaMallocManaged((
void **)&x,
kfermHalo*
sizeof(
Complex), cudaMemAttachGlobal);
58 cudaMallocManaged((
void **)&xi,
kferm*
sizeof(
Complex), cudaMemAttachGlobal);
67#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
70 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm, xi_f, 0, 1/sqrt(2));
73 cudaMemPrefetchAsync(xi_f,
kferm*
sizeof(
Complex_f),device,streams[0]);
74 cuComplex_convert(xi_f,xi,
kferm,
false,dimBlock,dimGrid);
78 cudaMemcpyAsync(x, xi,
kferm*
sizeof(
Complex),cudaMemcpyDefault,0);
80#pragma omp parallel for simd aligned(xi,xi_f:AVX)
81 for(
int i=0;i<
kferm;i++)
88 Dslashd_f(R1_f,xi_f,ut_f[0],ut_f[1],iu,
id,gamval_f,gamin,dk_f,jqq,akappa);
90 cudaDeviceSynchronise();
93 cuComplex_convert(R1_f,R1,
kferm,
false,dimBlock,dimGrid);
94 cudaMemcpy(Phi, R1,
kferm*
sizeof(
Complex),cudaMemcpyDefault);
96#pragma omp parallel for simd aligned(R1,R1_f:AVX)
97 for(
int i=0;i<
kferm;i++)
108 if(
Congradp(0, res, Phi, R1,ut_f,iu,
id,gamval_f,gamin,dk_f,jqq,akappa,itercg)==ITERLIM)
120 cudaMemcpyAsync(xi,R1,
kferm*
sizeof(
Complex),cudaMemcpyDefault,streams[0]);
124 cudaFreeAsync(R1_f,streams[1]);
128 free(xi_f); free(R1_f);
133 cublasZdotc(cublas_handle,
kferm,(cuDoubleComplex *)x,1,(cuDoubleComplex *)xi,1,(cuDoubleComplex *)&buff);
134 cudaDeviceSynchronise();
135#elif defined USE_BLAS
136 cblas_zdotc_sub(
kferm, x, 1, xi, 1, &buff);
142 for(
int i=0;i<
kferm;i++)
143 *pbp+=creal(conj(x[i])*xi[i]);
152 for(
int idirac = 0; idirac<
ndirac; idirac++){
156 for(
int ic = 0; ic<
nc; ic++){
162 cublasZdotc(cublas_handle,
kvol,(cuDoubleComplex *)(x+idirac*
nc+ic),
ngorkov*
nc,(cuDoubleComplex *)(xi+igork*
nc+ic),
ngorkov*
nc,(cuDoubleComplex *)&dot);
166 *qbqb+=gamval[4*
ndirac+idirac]*dot;
168 cublasZdotc(cublas_handle,
kvol,(cuDoubleComplex *)(x+igork*
nc+ic),
ngorkov*
nc,(cuDoubleComplex *)(xi+idirac*
nc+ic),
ngorkov*
nc,(cuDoubleComplex *)&dot);
172 *qq-=gamval[4*
ndirac+idirac]*dot;
177 for(
int i=0; i<
kvol; i++)
179 for(
int idirac = 0; idirac<
ndirac; idirac++){
192 *qq=(*qq+*qbqb)/(2*
gvol);
215 cudaDeviceSynchronise();
217#pragma omp parallel for reduction(+:xd,xu,xdd,xuu)
219 for(
int i = 0; i<
kvol; i++){
220 int did=
id[3+
ndim*i];
221 int uid=iu[3+
ndim*i];
222 for(
int igorkov=0; igorkov<4; igorkov++){
223 int igork1=gamin[3*
ndirac+igorkov];
225 xu+=dk[1][did]*(conj(x[(did*
ngorkov+igorkov)*
nc])*(\
232 for(
int igorkov=0; igorkov<4; igorkov++){
233 int igork1=gamin[3*
ndirac+igorkov];
234 xd+=dk[0][i]*(conj(x[(uid*
ngorkov+igorkov)*
nc])*(\
241 for(
int igorkovPP=4; igorkovPP<8; igorkovPP++){
242 int igork1PP=4+gamin[3*
ndirac+igorkovPP-4];
243 xuu-=dk[0][did]*(conj(x[(did*
ngorkov+igorkovPP)*
nc])*(\
250 for(
int igorkovPP=4; igorkovPP<8; igorkovPP++){
251 int igork1PP=4+gamin[3*
ndirac+igorkovPP-4];
252 xdd-=dk[1][i]*(conj(x[(uid*
ngorkov+igorkovPP)*
nc])*(\
260 *endenf=creal(xu-xd-xuu+xdd);
261 *denf=creal(xu+xd+xuu+xdd);
269 cudaFree(x); cudaFree(xi);
int Measure(double *pbp, double *endenf, double *denf, Complex *qq, Complex *qbqb, double res, int *itercg, Complex *ut[2], Complex_f *ut_f[2], unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2], float *dk_f[2], Complex_f jqq, float akappa, Complex *Phi, Complex *R1)
Calculate fermion expectation values via a noisy estimator.
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates in single precision.
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single precision complex numbers using the Box-Muller method.
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix inversion via Conjugate Gradient (no up/down flavour partitioning). Iteratively solves the matrix multiplication equation for the fermion field.