10 Complex *gamval,
Complex_f *gamval_f,
int *gamin,
double *dk4m,
double *dk4p,\
45 const char *funcname =
"Measure";
50 cudaGetDevice(&device);
57 cudaMallocManaged((
void **)&x,
kfermHalo*
sizeof(
Complex), cudaMemAttachGlobal);
58 cudaMallocManaged((
void **)&xi,
kferm*
sizeof(
Complex), cudaMemAttachGlobal);
67#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
70 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm, xi_f, 0, 1/sqrt(2));
73 cudaMemPrefetchAsync(xi_f,
kferm*
sizeof(
Complex_f),device,streams[0]);
74 cuComplex_convert(xi_f,xi,
kferm,
false,dimBlock,dimGrid);
78 Transpose_c(u11t_f,
ndim,
kvol,dimGrid,dimBlock);
79 Transpose_c(u12t_f,
ndim,
kvol,dimGrid,dimBlock);
80 cudaMemcpyAsync(x, xi,
kferm*
sizeof(
Complex),cudaMemcpyDefault,0);
82#pragma omp parallel for simd aligned(xi,xi_f:AVX)
83 for(
int i=0;i<
kferm;i++)
90 Dslashd_f(R1_f,xi_f,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa);
92 cudaDeviceSynchronise();
95 Transpose_c(u11t_f,
kvol,
ndim,dimGrid,dimBlock);
96 Transpose_c(u12t_f,
kvol,
ndim,dimGrid,dimBlock);
97 cuComplex_convert(R1_f,R1,
kferm,
false,dimBlock,dimGrid);
98 cudaMemcpy(Phi, R1,
kferm*
sizeof(
Complex),cudaMemcpyDefault);
100#pragma omp parallel for simd aligned(R1,R1_f:AVX)
101 for(
int i=0;i<
kferm;i++)
112 if(
Congradp(0, res, Phi, R1,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,itercg)==ITERLIM)
124 cudaMemcpyAsync(xi,R1,
kferm*
sizeof(
Complex),cudaMemcpyDefault,streams[0]);
128 cudaFreeAsync(R1_f,streams[1]);
132 free(xi_f); free(R1_f);
137 cublasZdotc(cublas_handle,
kferm,(cuDoubleComplex *)x,1,(cuDoubleComplex *)xi,1,(cuDoubleComplex *)&buff);
138 cudaDeviceSynchronise();
139#elif defined USE_BLAS
140 cblas_zdotc_sub(
kferm, x, 1, xi, 1, &buff);
146 for(
int i=0;i<
kferm;i++)
147 *pbp+=creal(conj(x[i])*xi[i]);
156 for(
int idirac = 0; idirac<
ndirac; idirac++){
160 for(
int ic = 0; ic<
nc; ic++){
166 cublasZdotc(cublas_handle,
kvol,(cuDoubleComplex *)(x+idirac*
nc+ic),
ngorkov*
nc,(cuDoubleComplex *)(xi+igork*
nc+ic),
ngorkov*
nc,(cuDoubleComplex *)&dot);
170 *qbqb+=gamval[4*
ndirac+idirac]*dot;
172 cublasZdotc(cublas_handle,
kvol,(cuDoubleComplex *)(x+igork*
nc+ic),
ngorkov*
nc,(cuDoubleComplex *)(xi+idirac*
nc+ic),
ngorkov*
nc,(cuDoubleComplex *)&dot);
176 *qq-=gamval[4*
ndirac+idirac]*dot;
181 for(
int i=0; i<
kvol; i++)
183 for(
int idirac = 0; idirac<
ndirac; idirac++){
196 *qq=(*qq+*qbqb)/(2*
gvol);
213#pragma omp parallel for reduction(+:xd,xu,xdd,xuu)
215 for(
int i = 0; i<
kvol; i++){
216 int did=
id[3+
ndim*i];
217 int uid=iu[3+
ndim*i];
219#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xu)
221 for(
int igorkov=0; igorkov<4; igorkov++){
222 int igork1=gamin[3*
ndirac+igorkov];
224 xu+=dk4p[did]*(conj(x[(did*
ngorkov+igorkov)*
nc])*(\
232#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xd)
234 for(
int igorkov=0; igorkov<4; igorkov++){
235 int igork1=gamin[3*
ndirac+igorkov];
236 xd+=dk4m[i]*(conj(x[(uid*
ngorkov+igorkov)*
nc])*(\
244#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xuu)
246 for(
int igorkovPP=4; igorkovPP<8; igorkovPP++){
247 int igork1PP=4+gamin[3*
ndirac+igorkovPP-4];
248 xuu-=dk4m[did]*(conj(x[(did*
ngorkov+igorkovPP)*
nc])*(\
256#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xdd)
258 for(
int igorkovPP=4; igorkovPP<8; igorkovPP++){
259 int igork1PP=4+gamin[3*
ndirac+igorkovPP-4];
260 xdd-=dk4p[i]*(conj(x[(uid*
ngorkov+igorkovPP)*
nc])*(\
268 *endenf=creal(xu-xd-xuu+xdd);
269 *denf=creal(xu+xd+xuu+xdd);
277 cudaFree(x); cudaFree(xi);
int Measure(double *pbp, double *endenf, double *denf, Complex *qq, Complex *qbqb, double res, int *itercg, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, Complex *Phi, Complex *R1)
Calculate fermion expectation values via a noisy estimator.
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates in single precision.
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single precision complex numbers using the Box-Mull...
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves The matrix multipl...