35 const char *funcname =
"Congradq";
37 const double resid = res*res;
40 const Complex_f fac_f = conj(jqq)*jqq*akappa*akappa;
50 Complex_f *p_f, *x1_f, *x2_f, *r_f, *X1_f;
51 int device=-1; cudaGetDevice(&device);
56 cudaMallocManaged((
void **)&x2_f,
kferm2*
sizeof(
Complex_f),cudaMemAttachGlobal);
57 cudaMallocManaged((
void **)&r_f,
kferm2*
sizeof(
Complex_f),cudaMemAttachGlobal);
58 cudaMallocManaged((
void **)&X1_f,
kferm2*
sizeof(
Complex_f),cudaMemAttachGlobal);
77 cuComplex_convert(X1_f,X1,
kferm2,
true,dimBlock,dimGrid);
80 cuComplex_convert(r_f,r,
kferm2,
true,dimBlock,dimGrid);
83 cudaMemcpyAsync(p_f, X1_f,
kferm2*
sizeof(
Complex_f),cudaMemcpyDeviceToDevice,NULL);
86#pragma omp parallel for simd
96 double betan;
bool pf=
true;
97 for(*itercg=0; *itercg<
niterc; (*itercg)++){
100 Hdslash_f(x1_f,p_f,ut,iu,
id,gamval_f,gamin,dk,akappa);
101 Hdslashd_f(x2_f,x1_f,ut,iu,
id,gamval_f,gamin,dk,akappa);
103 cudaDeviceSynchronise();
109 cublasCaxpy(cublas_handle,
kferm2,(cuComplex *)&fac_f,(cuComplex *)p_f,1,(cuComplex *)x2_f,1);
110#elif defined USE_BLAS
111 cblas_caxpy(
kferm2, &fac_f, p_f, 1, x2_f, 1);
113#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
114 for(
int i=0; i<
kferm2; i++)
115 x2_f[i]+=fac_f*p_f[i];
122 cublasCdotc(cublas_handle,
kferm2,(cuComplex *)p_f,1,(cuComplex *)x2_f,1,(cuComplex *)&alphad);
123#elif defined USE_BLAS
124 cblas_cdotc_sub(
kferm2, p_f, 1, x2_f, 1, &alphad);
127#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
128 for(
int i=0; i<
kferm2; i++)
129 alphad+=conj(p_f[i])*x2_f[i];
137 alpha=alphan/creal(alphad);
141 cublasCaxpy(cublas_handle,
kferm2,(cuComplex *)&alpha_f,(cuComplex *)p_f,1,(cuComplex *)X1_f,1);
142#elif defined USE_BLAS
144 cblas_caxpy(
kferm2, &alpha_f, p_f, 1, X1_f, 1);
146 for(
int i=0; i<
kferm2; i++)
147 X1_f[i]+=alpha*p_f[i];
153 cublasCaxpy(cublas_handle,
kferm2,(cuComplex *)&alpha_m,(cuComplex *)x2_f,1,(cuComplex *)r_f,1);
155 cublasScnrm2(cublas_handle,
kferm2,(cuComplex *)r_f,1,&betan_f);
156 betan = betan_f*betan_f;
157#elif defined USE_BLAS
159 cblas_caxpy(
kferm2, &alpha_m, x2_f, 1, r_f, 1);
161 float betan_f = cblas_scnrm2(
kferm2, r_f,1);
163 betan = betan_f*betan_f;
166#pragma omp parallel for simd aligned(r_f,x2_f:AVX) reduction(+:betan)
167 for(
int i=0; i<
kferm2; i++){
168 r_f[i]-=alpha*x2_f[i];
169 betan += conj(r_f[i])*r_f[i];
177#warning "CG Debugging"
178 char *endline =
"\n";
180 char *endline =
"\r";
183 if(!
rank) printf(
"Iter(CG)=%i\tβ_n=%e\tα=%e%s", *itercg, betan, alpha,endline);
188 if(!
rank) printf(
"\nIter(CG)=%i\tResidue: %e\tTolerance: %e\n", *itercg, betan, resid);
192 else if(*itercg==
niterc-1){
193 if(!
rank) fprintf(stderr,
"Warning %i in %s: Exceeded iteration limit %i β_n=%e\n", ITERLIM, funcname, *itercg, betan);
194 ret_val=ITERLIM;
break;
199 Complex beta = (*itercg) ? betan/betad : 0;
200 betad=betan; alphan=betan;
206 cublasCscal(cublas_handle,
kferm2,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
207 cublasCaxpy(cublas_handle,
kferm2,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
208#elif (defined __INTEL_MKL__)
213 cblas_caxpby(
kferm2, &a, r_f, 1, &beta_f, p_f, 1);
214#elif defined USE_BLAS
216 cblas_cscal(
kferm2,&beta_f,p_f,1);
218 cblas_caxpy(
kferm2,&a,r_f,1,p_f,1);
220 for(
int i=0; i<
kferm2; i++)
221 p_f[i]=r_f[i]+beta*p_f[i];
226 cuComplex_convert(X1_f,X1,
kferm2,
false,dimBlock,dimGrid);
227 cuComplex_convert(r_f,r,
kferm2,
false,dimBlock,dimGrid);
229 for(
int i=0;i<
kferm2;i++){
236 cudaDeviceSynchronise();
237 cudaFree(x1_f);cudaFree(x2_f); cudaFree(p_f);
238 cudaFree(r_f);cudaFree(X1_f);
241 cudaFreeAsync(p_f,streams[0]);cudaFreeAsync(x1_f,streams[1]);cudaFreeAsync(x2_f,streams[2]);
242 cudaDeviceSynchronise();
243 cudaFreeAsync(r_f,streams[3]);cudaFreeAsync(X1_f,streams[4]);
246 free(x1_f);free(x2_f); free(p_f); free(r_f); free(X1_f);
276 const char *funcname =
"Congradp";
279 const double resid = res*res;
285 Complex_f *p_f, *r_f, *xi_f, *x1_f, *x2_f;
286 int device; cudaGetDevice(&device);
289 cudaMallocManaged((
void **)&r_f,
kferm*
sizeof(
Complex_f),cudaMemAttachGlobal);
291 cudaMallocManaged((
void **)&x2_f,
kferm*
sizeof(
Complex_f),cudaMemAttachGlobal);
292 cudaMallocManaged((
void **)&xi_f,
kferm*
sizeof(
Complex_f),cudaMemAttachGlobal);
312 cuComplex_convert(p_f,xi,
kferm,
true,dimBlock,dimGrid);
317 cuComplex_convert(r_f,Phi+na*
kferm,
kferm,
true,dimBlock,dimGrid);
320#pragma omp parallel for simd aligned(p_f,xi_f,xi,r_f,Phi:AVX)
321 for(
int i =0;i<
kferm;i++){
335 cudaDeviceSynchronise();
337 for((*itercg)=0; (*itercg)<=
niterc; (*itercg)++){
340 Dslash_f(x1_f,p_f,ut[0],ut[1],iu,
id,gamval,gamin,dk,jqq,akappa);
341 Dslashd_f(x2_f,x1_f,ut[0],ut[1],iu,
id,gamval,gamin,dk,jqq,akappa);
343 cudaDeviceSynchronise();
351 cublasScnrm2(cublas_handle,
kferm,(cuComplex*) x1_f, 1,(
float *)&alphad_f);
352 alphad = alphad_f*alphad_f;
354 alphad_f = cblas_scnrm2(
kferm, x1_f, 1);
356 alphad = alphad_f*alphad_f;
359 for(
int i = 0; i<
kferm; i++)
360 alphad+=conj(x1_f[i])*x1_f[i];
366 alpha=alphan/creal(alphad);
372 cublasCaxpy(cublas_handle,
kferm,(cuComplex*) &alpha_f,(cuComplex*) p_f,1,(cuComplex*) xi_f,1);
377#pragma omp parallel for simd aligned(xi_f,p_f:AVX)
378 for(
int i = 0; i<
kferm; i++)
379 xi_f[i]+=alpha*p_f[i];
388 cublasCaxpy(cublas_handle,
kferm, (cuComplex *)&alpha_m,(cuComplex *) x2_f, 1,(cuComplex *) r_f, 1);
391 cublasScnrm2(cublas_handle,
kferm,(cuComplex *) r_f,1,(
float *)&betan_f);
398 betan=betan_f*betan_f;
404#pragma omp parallel for simd aligned(x2_f,r_f:AVX) reduction(+:betan)
405 for(
int i = 0; i<
kferm;i++){
406 r_f[i]-=alpha*x2_f[i];
407 betan+=conj(r_f[i])*r_f[i];
416 char *endline =
"\n";
418 char *endline =
"\r";
420 if(!
rank) printf(
"Iter (CG) = %i β_n= %e α= %e%s", *itercg, betan, alpha,endline);
426 if(!
rank) printf(
"\nIter (CG) = %i resid = %e toler = %e\n", *itercg, betan, resid);
430 else if(*itercg==
niterc-1){
431 if(!
rank) fprintf(stderr,
"Warning %i in %s: Exceeded iteration limit %i β_n=%e\n",
432 ITERLIM, funcname,
niterc, betan);
433 ret_val=ITERLIM;
break;
436 Complex beta = (*itercg) ? betan/betad : 0;
437 betad=betan; alphan=betan;
445 cublasCscal(cublas_handle,
kferm,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
446 cublasCaxpy(cublas_handle,
kferm,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
447 cudaDeviceSynchronise();
448#elif (defined __INTEL_MKL__ || defined AMD_BLAS)
449 cblas_caxpby(
kferm, &a, r_f, 1, &beta_f, p_f, 1);
451 cblas_cscal(
kferm,&beta_f,p_f,1);
452 cblas_caxpy(
kferm,&a,r_f,1,p_f,1);
455#pragma omp parallel for simd aligned(r_f,p_f:AVX)
456 for(
int i=0; i<
kferm; i++)
457 p_f[i]=r_f[i]+beta*p_f[i];
464 cudaDeviceSynchronise();
465 cuComplex_convert(xi_f,xi,
kferm,
false,dimBlock,dimGrid);
468 for(
int i = 0; i <
kferm;i++){
473 cudaFree(p_f); cudaFree(r_f);cudaFree(x1_f); cudaFree(x2_f); cudaFree(xi_f);
475 free(p_f); free(r_f); free(x1_f); free(x2_f); free(xi_f);
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \f$(M^\dagger M)x = \Phi\f$ via CG on the normal equations; implements up/down flavour partitioning so only half of the fermion degrees of freedom need to be inverted at a time.
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \f$(M^\dagger M)x = \Phi\f$ via CG on the normal equations. The matrix multiplication steps (Dslash_f and Dslashd_f) are performed in single precision.
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates \f$\phi = M r\f$ in single precision.
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates \f$\phi = M^\dagger r\f$ in single precision.