8 Complex_f *gamval_f,
int *gamin,
float *dk4m,
float *dk4p,
Complex_f jqq,
float akappa,
int *itercg){
35 const char *funcname =
"Congradq";
37 const double resid = res*res;
40 const Complex_f fac_f = conj(jqq)*jqq*akappa*akappa;
50 Complex_f *p_f, *x1_f, *x2_f, *r_f, *X1_f;
51 int device=-1; cudaGetDevice(&device);
56 cudaMallocManaged((
void **)&x2_f,
kferm2*
sizeof(
Complex_f),cudaMemAttachGlobal);
57 cudaMallocManaged((
void **)&r_f,
kferm2*
sizeof(
Complex_f),cudaMemAttachGlobal);
58 cudaMallocManaged((
void **)&X1_f,
kferm2*
sizeof(
Complex_f),cudaMemAttachGlobal);
77 cuComplex_convert(X1_f,X1,
kferm2,
true,dimBlock,dimGrid);
81 cuComplex_convert(r_f,r,
kferm2,
true,dimBlock,dimGrid);
85 cudaMemcpyAsync(p_f, X1_f,
kferm2*
sizeof(
Complex_f),cudaMemcpyDeviceToDevice,NULL);
87 Transpose_c(u11t,
ndim,
kvol,dimGrid,dimBlock);
88 Transpose_c(u12t,
ndim,
kvol,dimGrid,dimBlock);
89 Transpose_I(iu,
ndim,
kvol,dimGrid,dimBlock);
90 Transpose_I(
id,
ndim,
kvol,dimGrid,dimBlock);
92#pragma omp parallel for simd
102 double betan;
bool pf=
true;
103 for(*itercg=0; *itercg<
niterc; (*itercg)++){
106 Hdslash_f(x1_f,p_f,u11t,u12t,iu,
id,gamval_f,gamin,dk4m,dk4p,akappa);
107 Hdslashd_f(x2_f,x1_f,u11t,u12t,iu,
id,gamval_f,gamin,dk4m,dk4p,akappa);
109 cudaDeviceSynchronise();
115 cublasCaxpy(cublas_handle,
kferm2,(cuComplex *)&fac_f,(cuComplex *)p_f,1,(cuComplex *)x2_f,1);
116#elif defined USE_BLAS
117 cblas_caxpy(
kferm2, &fac_f, p_f, 1, x2_f, 1);
119#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
120 for(
int i=0; i<
kferm2; i++)
121 x2_f[i]+=fac_f*p_f[i];
128 cublasCdotc(cublas_handle,
kferm2,(cuComplex *)p_f,1,(cuComplex *)x2_f,1,(cuComplex *)&alphad);
129#elif defined USE_BLAS
130 cblas_cdotc_sub(
kferm2, p_f, 1, x2_f, 1, &alphad);
133#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
134 for(
int i=0; i<
kferm2; i++)
135 alphad+=conj(p_f[i])*x2_f[i];
143 alpha=alphan/creal(alphad);
147 cublasCaxpy(cublas_handle,
kferm2,(cuComplex *)&alpha_f,(cuComplex *)p_f,1,(cuComplex *)X1_f,1);
148#elif defined USE_BLAS
150 cblas_caxpy(
kferm2, &alpha_f, p_f, 1, X1_f, 1);
152 for(
int i=0; i<
kferm2; i++)
153 X1_f[i]+=alpha*p_f[i];
159 cublasCaxpy(cublas_handle,
kferm2,(cuComplex *)&alpha_m,(cuComplex *)x2_f,1,(cuComplex *)r_f,1);
161 cublasScnrm2(cublas_handle,
kferm2,(cuComplex *)r_f,1,&betan_f);
162 betan = betan_f*betan_f;
163#elif defined USE_BLAS
165 cblas_caxpy(
kferm2, &alpha_m, x2_f, 1, r_f, 1);
167 float betan_f = cblas_scnrm2(
kferm2, r_f,1);
169 betan = betan_f*betan_f;
172#pragma omp parallel for simd aligned(r_f,x2_f:AVX) reduction(+:betan)
173 for(
int i=0; i<
kferm2; i++){
174 r_f[i]-=alpha*x2_f[i];
175 betan += conj(r_f[i])*r_f[i];
183#warning "CG Debugging"
184 char *endline =
"\n";
186 char *endline =
"\r";
189 if(!
rank) printf(
"Iter(CG)=%i\tβ_n=%e\tα=%e%s", *itercg, betan, alpha,endline);
194 if(!
rank) printf(
"\nIter(CG)=%i\tResidue: %e\tTolerance: %e\n", *itercg, betan, resid);
198 else if(*itercg==
niterc-1){
199 if(!
rank) fprintf(stderr,
"Warning %i in %s: Exceeded iteration limit %i β_n=%e\n", ITERLIM, funcname, *itercg, betan);
200 ret_val=ITERLIM;
break;
205 Complex beta = (*itercg) ? betan/betad : 0;
206 betad=betan; alphan=betan;
212 cublasCscal(cublas_handle,
kferm2,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
213 cublasCaxpy(cublas_handle,
kferm2,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
214#elif (defined __INTEL_MKL__)
219 cblas_caxpby(
kferm2, &a, r_f, 1, &beta_f, p_f, 1);
220#elif defined USE_BLAS
222 cblas_cscal(
kferm2,&beta_f,p_f,1);
224 cblas_caxpy(
kferm2,&a,r_f,1,p_f,1);
226 for(
int i=0; i<
kferm2; i++)
227 p_f[i]=r_f[i]+beta*p_f[i];
233 cuComplex_convert(X1_f,X1,
kferm2,
false,dimBlock,dimGrid);
235 cuComplex_convert(r_f,r,
kferm2,
false,dimBlock,dimGrid);
236 Transpose_c(u11t,
kvol,
ndim,dimGrid,dimBlock);
237 Transpose_c(u12t,
kvol,
ndim,dimGrid,dimBlock);
238 Transpose_I(iu,
kvol,
ndim,dimGrid,dimBlock);
239 Transpose_I(
id,
kvol,
ndim,dimGrid,dimBlock);
241 for(
int i=0;i<
kferm2;i++){
248 cudaDeviceSynchronise();
249 cudaFree(x1_f);cudaFree(x2_f); cudaFree(p_f);
250 cudaFree(r_f);cudaFree(X1_f);
253 cudaFreeAsync(p_f,streams[0]);cudaFreeAsync(x1_f,streams[1]);cudaFreeAsync(x2_f,streams[2]);
254 cudaDeviceSynchronise();
255 cudaFreeAsync(r_f,streams[3]);cudaFreeAsync(X1_f,streams[4]);
258 free(x1_f);free(x2_f); free(p_f); free(r_f); free(X1_f);
263 Complex_f *gamval,
int *gamin,
float *dk4m,
float *dk4p,
Complex_f jqq,
float akappa,
int *itercg){
288 const char *funcname =
"Congradp";
291 const double resid = res*res;
297 Complex_f *p_f, *r_f, *xi_f, *x1_f, *x2_f;
298 int device; cudaGetDevice(&device);
301 cudaMallocManaged((
void **)&r_f,
kferm*
sizeof(
Complex_f),cudaMemAttachGlobal);
303 cudaMallocManaged((
void **)&x2_f,
kferm*
sizeof(
Complex_f),cudaMemAttachGlobal);
304 cudaMallocManaged((
void **)&xi_f,
kferm*
sizeof(
Complex_f),cudaMemAttachGlobal);
324 cuComplex_convert(p_f,xi,
kferm,
true,dimBlock,dimGrid);
329 cuComplex_convert(r_f,Phi+na*
kferm,
kferm,
true,dimBlock,dimGrid);
333 Transpose_c(u11t,
ndim,
kvol,dimGrid,dimBlock);
334 Transpose_c(u12t,
ndim,
kvol,dimGrid,dimBlock);
336#pragma omp parallel for simd aligned(p_f,xi_f,xi,r_f,Phi:AVX)
337 for(
int i =0;i<
kferm;i++){
351 cudaDeviceSynchronise();
353 for((*itercg)=0; (*itercg)<=
niterc; (*itercg)++){
356 Dslash_f(x1_f,p_f,u11t,u12t,iu,
id,gamval,gamin,dk4m,dk4p,jqq,akappa);
357 Dslashd_f(x2_f,x1_f,u11t,u12t,iu,
id,gamval,gamin,dk4m,dk4p,jqq,akappa);
359 cudaDeviceSynchronise();
367 cublasScnrm2(cublas_handle,
kferm,(cuComplex*) x1_f, 1,(
float *)&alphad_f);
368 alphad = alphad_f*alphad_f;
370 alphad_f = cblas_scnrm2(
kferm, x1_f, 1);
372 alphad = alphad_f*alphad_f;
375 for(
int i = 0; i<
kferm; i++)
376 alphad+=conj(x1_f[i])*x1_f[i];
382 alpha=alphan/creal(alphad);
388 cublasCaxpy(cublas_handle,
kferm,(cuComplex*) &alpha_f,(cuComplex*) p_f,1,(cuComplex*) xi_f,1);
393#pragma omp parallel for simd aligned(xi_f,p_f:AVX)
394 for(
int i = 0; i<
kferm; i++)
395 xi_f[i]+=alpha*p_f[i];
404 cublasCaxpy(cublas_handle,
kferm, (cuComplex *)&alpha_m,(cuComplex *) x2_f, 1,(cuComplex *) r_f, 1);
407 cublasScnrm2(cublas_handle,
kferm,(cuComplex *) r_f,1,(
float *)&betan_f);
414 betan=betan_f*betan_f;
420#pragma omp parallel for simd aligned(x2_f,r_f:AVX) reduction(+:betan)
421 for(
int i = 0; i<
kferm;i++){
422 r_f[i]-=alpha*x2_f[i];
423 betan+=conj(r_f[i])*r_f[i];
432 char *endline =
"\n";
434 char *endline =
"\r";
436 if(!
rank) printf(
"Iter (CG) = %i β_n= %e α= %e%s", *itercg, betan, alpha,endline);
442 if(!
rank) printf(
"\nIter (CG) = %i resid = %e toler = %e\n", *itercg, betan, resid);
446 else if(*itercg==
niterc-1){
447 if(!
rank) fprintf(stderr,
"Warning %i in %s: Exceeded iteration limit %i β_n=%e\n",
448 ITERLIM, funcname,
niterc, betan);
449 ret_val=ITERLIM;
break;
452 Complex beta = (*itercg) ? betan/betad : 0;
453 betad=betan; alphan=betan;
461 cublasCscal(cublas_handle,
kferm,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
462 cublasCaxpy(cublas_handle,
kferm,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
463 cudaDeviceSynchronise();
464#elif (defined __INTEL_MKL__ || defined AMD_BLAS)
465 cblas_caxpby(
kferm, &a, r_f, 1, &beta_f, p_f, 1);
467 cblas_cscal(
kferm,&beta_f,p_f,1);
468 cblas_caxpy(
kferm,&a,r_f,1,p_f,1);
471#pragma omp parallel for simd aligned(r_f,p_f:AVX)
472 for(
int i=0; i<
kferm; i++)
473 p_f[i]=r_f[i]+beta*p_f[i];
480 Transpose_c(u11t,
kvol,
ndim,dimGrid,dimBlock);
481 Transpose_c(u12t,
kvol,
ndim,dimGrid,dimBlock);
482 cudaDeviceSynchronise();
483 cuComplex_convert(xi_f,xi,
kferm,
false,dimBlock,dimGrid);
486 for(
int i = 0; i <
kferm;i++){
491 cudaFree(p_f); cudaFree(r_f);cudaFree(x1_f); cudaFree(x2_f); cudaFree(xi_f);
493 free(p_f); free(r_f); free(x1_f); free(x2_f); free(xi_f);
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \f$(M^\dagger M)x=\Phi\f$. Implements up/down partitioning, with the matrix multiplication done at single precision.
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \f$(M^\dagger M)x=\Phi\f$. The matrix multiplication step is done at single precision.
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates in single precision.
int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates in single precision.
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates in single precision.
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates in single precision.