su2hmc
congrad.c File Reference

Conjugate gradients. Congradq for the solver and Congradp for the inversion. More...

#include <matrices.h>


Functions

int Congradq (int na, double res, Complex *X1, Complex *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
 Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \(M^\dagger M x=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double precision.
 
int Congradp (int na, double res, Complex *Phi, Complex *xi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
 Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \(M^\dagger M x=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double precision.
 

Detailed Description

Conjugate gradients. Congradq for the solver and Congradp for the inversion.

Definition in file congrad.c.
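Both routines iterate the standard conjugate-gradient recurrences for the normal equations. For orientation, the update rules (matching the \(\alpha\) and \(\beta\) expressions commented in the listings below, with \(r\) the residual and \(p\) the search direction) are

\[ \alpha_k=\frac{r_k^\dagger r_k}{p_k^\dagger M^\dagger M\,p_k},\qquad x_{k+1}=x_k+\alpha_k p_k,\qquad r_{k+1}=r_k-\alpha_k M^\dagger M\,p_k, \]
\[ \beta_k=\frac{r_{k+1}^\dagger r_{k+1}}{r_k^\dagger r_k},\qquad p_{k+1}=r_{k+1}+\beta_k p_k. \]

The matrix-vector products are evaluated in single precision (Dslash_f()/Dslashd_f() in Congradp, Hdslash_f()/Hdslashd_f() in Congradq), while the scalar updates are accumulated in double precision. Congradq additionally adds the diquark term \(|j_{qq}\kappa|^2 p\) to the matrix product when the diquark source is non-zero.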

Function Documentation

◆ Congradp()

int Congradp (int na,
              double res,
              Complex *Phi,
              Complex *xi,
              Complex_f *u11t,
              Complex_f *u12t,
              unsigned int *iu,
              unsigned int *id,
              Complex_f *gamval,
              int *gamin,
              float *dk4m,
              float *dk4p,
              Complex_f jqq,
              float akappa,
              int *itercg)

Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \(M^\dagger M x=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double precision.

Parameters
    na        Flavour index
    res       Limit for conjugate gradient
    Phi       Pseudofermion field
    xi        Returned as \((M^\dagger M)^{-1} \Phi\)
    u11t      First colour's trial field
    u12t      Second colour's trial field
    iu        Upper halo indices
    id        Lower halo indices
    gamval    Single precision gamma matrices rescaled by kappa
    gamin     Dirac indices
    dk4m      \(\left(1+\gamma_0\right)e^{-\mu}\)
    dk4p      \(\left(1-\gamma_0\right)e^\mu\)
    jqq       Diquark source
    akappa    Hopping Parameter
    itercg    Counts the iterations of the conjugate gradient
Returns
    0 on success, integer error code otherwise
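A minimal calling sketch (hypothetical driver code, not part of congrad.c; the pseudofermion array Phi, the solution vector xi, the single-precision trial fields and lookup tables are assumed to have been allocated and initialised elsewhere):

    int itercg = 0;
    const double res = 1e-9;  /* CG tolerance; squared internally to give the residual target */
    /* Invert M^†M on the flavour-0 partition of Phi; the result is returned in xi */
    if(Congradp(0, res, Phi, xi, u11t, u12t, iu, id, gamval, gamin,
                dk4m, dk4p, jqq, akappa, &itercg))
        fprintf(stderr, "Congradp did not converge within %d iterations\n", itercg);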

Definition at line 262 of file congrad.c.

263 {
264 /*
265 * @brief Matrix Inversion via Conjugate Gradient
266 * Solves @f$(M^\dagger)Mx=\Phi@f$
267 * No even/odd partitioning.
268 * The matrix multiplication step is done at single precision, while the update is done at double
269 *
270 * @param na: Flavour index
271 * @param res: Limit for conjugate gradient
272 * @param Phi: Pseudofermion field @f$\Phi@f$
273 * @param xi: Returned as @f$(M^\dagger M)^{-1} \Phi@f$
274 * @param u11t: First colour's trial field
275 * @param u12t: Second colour's trial field
276 * @param iu: Upper halo indices
277 * @param id: Lower halo indices
278 * @param gamval: Gamma matrices
279 * @param gamin: Dirac indices
280 * @param dk4m: @f$\left(1+\gamma_0\right)e^{-\mu}@f$
281 * @param dk4p: @f$\left(1-\gamma_0\right)e^{\mu}@f$
282 * @param jqq: Diquark source
283 * @param akappa: Hopping Parameter
284 * @param itercg: Counts the iterations of the conjugate gradient
285 *
286 * @return 0 on success, integer error code otherwise
287 */
288 const char *funcname = "Congradp";
289 //Return value
290 int ret_val=0;
291 const double resid = res*res;
292 //These were evaluated only in the first loop of niterx so we'll just do it outside of the loop.
293 //These alpha and beta terms should be double, but that causes issues with BLAS. Instead we declare
294 //them Complex and work with the real part (especially for α_d)
295 //Give initial values Will be overwritten if niterx>0
296#ifdef __NVCC__
297 Complex_f *p_f, *r_f, *xi_f, *x1_f, *x2_f;
298 int device; cudaGetDevice(&device);
299#ifdef _DEBUG
300 cudaMallocManaged((void **)&p_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
301 cudaMallocManaged((void **)&r_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
302 cudaMallocManaged((void **)&x1_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
303 cudaMallocManaged((void **)&x2_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
304 cudaMallocManaged((void **)&xi_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
305#else
306 cudaMalloc((void **)&p_f, kfermHalo*sizeof(Complex_f));
307 cudaMalloc((void **)&r_f, kferm*sizeof(Complex_f));
308 cudaMalloc((void **)&x1_f, kfermHalo*sizeof(Complex_f));
309 cudaMalloc((void **)&x2_f, kferm*sizeof(Complex_f));
310 cudaMalloc((void **)&xi_f, kferm*sizeof(Complex_f));
311#endif
312#else
313 Complex_f *p_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
314 Complex_f *r_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
315 Complex_f *x1_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
316 Complex_f *x2_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
317 Complex_f *xi_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
318#endif
319 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
320 double alphan=0.0;
321 //Instead of copying element-wise in a loop, use memcpy.
322#ifdef __NVCC__
323 //Get xi in single precision, then swap to AoS format
324 cuComplex_convert(p_f,xi,kferm,true,dimBlock,dimGrid);
325 Transpose_c(p_f,ngorkov*nc,kvol,dimGrid,dimBlock);
326 cudaMemcpy(xi_f,p_f,kferm*sizeof(Complex_f),cudaMemcpyDefault);
327
328 //And repeat for r
329 cuComplex_convert(r_f,Phi+na*kferm,kferm,true,dimBlock,dimGrid);
330 Transpose_c(r_f,ngorkov*nc,kvol,dimGrid,dimBlock);
331
332 //Flip all the gauge fields around so memory is coalesced
333 Transpose_c(u11t,ndim,kvol,dimGrid,dimBlock);
334 Transpose_c(u12t,ndim,kvol,dimGrid,dimBlock);
335#else
336#pragma omp parallel for simd aligned(p_f,xi_f,xi,r_f,Phi:AVX)
337 for(int i =0;i<kferm;i++){
338 p_f[i]=xi_f[i]=(Complex_f)xi[i];
339 r_f[i]=(Complex_f)Phi[na*kferm+i];
340 }
341#endif
342
343 // Declaring placeholder arrays
344 // This x1 is NOT related to the /common/vectorp/X1 in the FORTRAN code and should not
345 // be confused with X1 the global variable
346
347 //niterx isn't called as an index but we'll start from zero with the C code to make the
348 //if statements quicker to type
349 double betan;
350#ifdef __NVCC__
351 cudaDeviceSynchronise();
352#endif
353 for((*itercg)=0; (*itercg)<=niterc; (*itercg)++){
354 //Don't overwrite on first run.
355 //x2=(M^†)x1=(M^†)Mp
356 Dslash_f(x1_f,p_f,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa);
357 Dslashd_f(x2_f,x1_f,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa);
358#ifdef __NVCC__
359 cudaDeviceSynchronise();
360#endif
361 //We can't evaluate α on the first niterx because we need to get β_n.
362 if(*itercg){
363 //x*.x
364#ifdef USE_BLAS
365 float alphad_f;
366#ifdef __NVCC__
367 cublasScnrm2(cublas_handle,kferm,(cuComplex*) x1_f, 1,(float *)&alphad_f);
368 alphad = alphad_f*alphad_f;
369#else
370 alphad_f = cblas_scnrm2(kferm, x1_f, 1);
371#endif
372 alphad = alphad_f*alphad_f;
373#else
374 alphad=0;
375 for(int i = 0; i<kferm; i++)
376 alphad+=conj(x1_f[i])*x1_f[i];
377#endif
378#if(nproc>1)
379 Par_fsum((float *)&alphad);
380#endif
381 //α=(r.r)/p(M^†)Mp
382 alpha=alphan/creal(alphad);
383 // Complex_f alpha_f = (Complex_f)alpha;
384 //x+αp
385#ifdef USE_BLAS
386 Complex_f alpha_f=(float)alpha;
387#ifdef __NVCC__
388 cublasCaxpy(cublas_handle,kferm,(cuComplex*) &alpha_f,(cuComplex*) p_f,1,(cuComplex*) xi_f,1);
389#else
390 cblas_caxpy(kferm, (Complex_f*)&alpha_f,(Complex_f*)p_f, 1, (Complex_f*)xi_f, 1);
391#endif
392#else
393#pragma omp parallel for simd aligned(xi_f,p_f:AVX)
394 for(int i = 0; i<kferm; i++)
395 xi_f[i]+=alpha*p_f[i];
396#endif
397 }
398
399 //r=α(M^†)Mp and β_n=r*.r
400#if defined USE_BLAS
401 Complex_f alpha_m=(Complex_f)(-alpha);
402 float betan_f=0;
403#ifdef __NVCC__
404 cublasCaxpy(cublas_handle,kferm, (cuComplex *)&alpha_m,(cuComplex *) x2_f, 1,(cuComplex *) r_f, 1);
405 //cudaDeviceSynchronise();
406 //r*.r
407 cublasScnrm2(cublas_handle,kferm,(cuComplex *) r_f,1,(float *)&betan_f);
408#else
409 cblas_caxpy(kferm,(Complex_f*) &alpha_m,(Complex_f*) x2_f, 1,(Complex_f*) r_f, 1);
410 //r*.r
411 betan_f = cblas_scnrm2(kferm, (Complex_f*)r_f,1);
412#endif
413 //Gotta square it to "undo" the norm
414 betan=betan_f*betan_f;
415#else
416 //Just like Congradq, this loop could be unrolled but will need a reduction to deal with the betan
417 //addition.
418 betan = 0;
419 //If we get a small enough β_n before hitting the iteration cap we break
420#pragma omp parallel for simd aligned(x2_f,r_f:AVX) reduction(+:betan)
421 for(int i = 0; i<kferm;i++){
422 r_f[i]-=alpha*x2_f[i];
423 betan+=conj(r_f[i])*r_f[i];
424 }
425#endif
426 //This is basically just congradq at the end. Check there for comments
427#if(nproc>1)
428 Par_dsum(&betan);
429#endif
430#ifdef _DEBUG
431#ifdef _DEBUGCG
432 char *endline = "\n";
433#else
434 char *endline = "\r";
435#endif
436 if(!rank) printf("Iter (CG) = %i β_n= %e α= %e%s", *itercg, betan, creal(alpha), endline);
437#endif
438 if(betan<resid){
439 //Started counting from zero so add one to make it accurate
440 (*itercg)++;
441#ifdef _DEBUG
442 if(!rank) printf("\nIter (CG) = %i resid = %e toler = %e\n", *itercg, betan, resid);
443#endif
444 ret_val=0; break;
445 }
446 else if(*itercg==niterc-1){
447 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n",
448 ITERLIM, funcname, niterc, betan);
449 ret_val=ITERLIM; break;
450 }
451 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
452 Complex beta = (*itercg) ? betan/betad : 0;
453 betad=betan; alphan=betan;
454 //BLAS for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
455 //β instead of x.
456 //There is cblas_zaxpby in the MKL though, set a = 1 and b = β.
457#ifdef USE_BLAS
458 Complex_f beta_f = (Complex_f)beta;
459 Complex_f a = 1.0;
460#ifdef __NVCC__
461 cublasCscal(cublas_handle,kferm,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
462 cublasCaxpy(cublas_handle,kferm,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
463 cudaDeviceSynchronise();
464#elif (defined __INTEL_MKL__ || defined AMD_BLAS)
465 cblas_caxpby(kferm, &a, r_f, 1, &beta_f, p_f, 1);
466#else
467 cblas_cscal(kferm,&beta_f,p_f,1);
468 cblas_caxpy(kferm,&a,r_f,1,p_f,1);
469#endif
470#else
471#pragma omp parallel for simd aligned(r_f,p_f:AVX)
472 for(int i=0; i<kferm; i++)
473 p_f[i]=r_f[i]+beta*p_f[i];
474#endif
475 }
476#ifdef __NVCC__
477 Transpose_c(xi_f,kvol,ngorkov*nc,dimGrid,dimBlock);
478 Transpose_c(r_f,kvol,ngorkov*nc,dimGrid,dimBlock);
479
480 Transpose_c(u11t,kvol,ndim,dimGrid,dimBlock);
481 Transpose_c(u12t,kvol,ndim,dimGrid,dimBlock);
482 cudaDeviceSynchronise();
483 cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
484#else
485#pragma omp simd
486 for(int i = 0; i <kferm;i++){
487 xi[i]=(Complex)xi_f[i];
488 }
489#endif
490#ifdef __NVCC__
491 cudaFree(p_f); cudaFree(r_f);cudaFree(x1_f); cudaFree(x2_f); cudaFree(xi_f);
492#else
493 free(p_f); free(r_f); free(x1_f); free(x2_f); free(xi_f);
494#endif
495 return ret_val;
496}
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:462
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:584
int Par_fsum(float *dval)
Performs a reduction on a float dval to get a sum which is then distributed to all ranks.
int rank
The MPI rank.
Definition par_mpi.c:22
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define niterc
Hard limit for runaway trajectories in Conjugate gradient.
Definition sizes.h:163
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225

References AVX, Complex, Complex_f, Dslash_f(), Dslashd_f(), kferm, kfermHalo, kvol, nc, ndim, ngorkov, niterc, Par_dsum(), Par_fsum(), and rank.


◆ Congradq()

int Congradq (int na,
              double res,
              Complex *X1,
              Complex *r,
              Complex_f *u11t_f,
              Complex_f *u12t_f,
              unsigned int *iu,
              unsigned int *id,
              Complex_f *gamval_f,
              int *gamin,
              float *dk4m_f,
              float *dk4p_f,
              Complex_f jqq,
              float akappa,
              int *itercg)

Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \(M^\dagger M x=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double precision.

Parameters
    na         Flavour index
    res        Limit for conjugate gradient
    X1         Pseudofermion field \(\Phi\) initially, returned as \((M^\dagger M)^{-1} \Phi\)
    r          Partition of \(\Phi\) being used. Gets recycled as the residual vector
    u11t_f     First colour's trial field
    u12t_f     Second colour's trial field
    iu         Upper halo indices
    id         Lower halo indices
    gamval_f   Single precision gamma matrices rescaled by kappa
    gamin      Dirac indices
    dk4m_f     \(\left(1+\gamma_0\right)e^{-\mu}\)
    dk4p_f     \(\left(1-\gamma_0\right)e^\mu\)
    jqq        Diquark source
    akappa     Hopping Parameter
    itercg     Counts the iterations of the conjugate gradient
Returns
    0 on success, integer error code otherwise
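A minimal calling sketch (hypothetical caller, not part of congrad.c; all arrays are assumed to have been allocated and initialised elsewhere, with r already holding the relevant flavour partition of \(\Phi\)):

    int itercg = 0;
    const double res = 1e-6;  /* CG tolerance; squared internally to give the residual target */
    /* X1 holds the source on entry and (M^†M)^{-1}Φ on exit; r is recycled as the residual */
    if(Congradq(na, res, X1, r, u11t_f, u12t_f, iu, id, gamval_f, gamin,
                dk4m_f, dk4p_f, jqq, akappa, &itercg))
        fprintf(stderr, "Congradq hit the iteration limit after %d iterations\n", itercg);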

Definition at line 7 of file congrad.c.

8 {
9 /*
10 * @brief Matrix Inversion via Mixed Precision Conjugate Gradient
11 * Solves @f$(M^\dagger)Mx=\Phi@f$
12 * Implements up/down partitioning
13 * The matrix multiplication step is done at single precision, while the update is done at double
14 *
15 * @param na: Flavour index
16 * @param res: Limit for conjugate gradient
17 * @param X1: @f$\Phi@f$ initially, returned as @f$(M^\dagger M)^{-1} \Phi@f$
18 * @param r: Partition of @f$\Phi@f$ being used. Gets recycled as the residual vector
19 * @param u11t: First colour's trial field
20 * @param u12t: Second colour's trial field
21 * @param iu: Upper halo indices
22 * @param id: Lower halo indices
23 * @param gamval_f: Gamma matrices
24 * @param gamin: Dirac indices
25 * @param dk4m: @f$\left(1+\gamma_0\right)e^{-\mu}@f$
26 * @param dk4p: @f$\left(1-\gamma_0\right)e^{\mu}@f$
27 * @param jqq: Diquark source
28 * @param akappa: Hopping Parameter
29 * @param itercg: Counts the iterations of the conjugate gradient
30 *
31 * @see Hdslash_f(), Hdslashd_f(), Par_fsum(), Par_dsum()
32 *
33 * @return 0 on success, integer error code otherwise
34 */
35 const char *funcname = "Congradq";
36 int ret_val=0;
37 const double resid = res*res;
38 //The κ^2 factor is needed to normalise the fields correctly
39 //jqq is the diquark condensate and is global scope.
40 const Complex_f fac_f = conj(jqq)*jqq*akappa*akappa;
41 //These were evaluated only in the first loop of niterx so we'll just do it outside of the loop.
42 //n suffix is numerator, d is denominator
43 double alphan=1;
44 //The alpha and beta terms should be double, but that causes issues with BLAS pointers. Instead we declare
45 //them complex and work with the real part (especially for α_d)
46 //Give initial values Will be overwritten if niterx>0
47 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
48 //Because we're dealing with flattened arrays here we can call cblas safely without the halo
49#ifdef __NVCC__
50 Complex_f *p_f, *x1_f, *x2_f, *r_f, *X1_f;
51 int device=-1; cudaGetDevice(&device);
52
53#ifdef _DEBUG
54 cudaMallocManaged((void **)&p_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
55 cudaMallocManaged((void **)&x1_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
56 cudaMallocManaged((void **)&x2_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
57 cudaMallocManaged((void **)&r_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
58 cudaMallocManaged((void **)&X1_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
59#else
60 //First two have halo exchanges, so getting NCCL working is important
61 cudaMallocAsync((void **)&p_f, kferm2Halo*sizeof(Complex_f),streams[0]);
62 cudaMallocAsync((void **)&x1_f, kferm2Halo*sizeof(Complex_f),streams[1]);
63 cudaMallocAsync((void **)&x2_f, kferm2*sizeof(Complex_f),streams[2]);
64 cudaMallocAsync((void **)&r_f, kferm2*sizeof(Complex_f),streams[3]);
65 cudaMallocAsync((void **)&X1_f, kferm2*sizeof(Complex_f),streams[4]);
66#endif
67#else
68 Complex_f *p_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
69 Complex_f *x1_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
70 Complex_f *x2_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
71 Complex_f *X1_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
72 Complex_f *r_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
73#endif
74 //Instead of copying element-wise in a loop, use memcpy.
75#ifdef __NVCC__
76 //Get X1 in single precision, then swap to AoS format
77 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
78 Transpose_c(X1_f,ndirac*nc,kvol,dimGrid,dimBlock);
79
80 //And repeat for r
81 cuComplex_convert(r_f,r,kferm2,true,dimBlock,dimGrid);
82 Transpose_c(r_f,ndirac*nc,kvol,dimGrid,dimBlock);
83
84 //cudaMemcpy is blocking, so use async instead
85 cudaMemcpyAsync(p_f, X1_f, kferm2*sizeof(Complex_f),cudaMemcpyDeviceToDevice,NULL);
86 //Flip all the gauge fields around so memory is coalesced
87 Transpose_c(u11t,ndim,kvol,dimGrid,dimBlock);
88 Transpose_c(u12t,ndim,kvol,dimGrid,dimBlock);
89 Transpose_I(iu,ndim,kvol,dimGrid,dimBlock);
90 Transpose_I(id,ndim,kvol,dimGrid,dimBlock);
91#else
92#pragma omp parallel for simd
93 for(int i=0;i<kferm2;i++){
94 r_f[i]=(Complex_f)r[i];
95 X1_f[i]=(Complex_f)X1[i];
96 }
97 memcpy(p_f, X1_f, kferm2*sizeof(Complex_f));
98#endif
99
100 //niterx isn't called as an index but we'll start from zero with the C code to make the
101 //if statements quicker to type
102 double betan; bool pf=true;
103 for(*itercg=0; *itercg<niterc; (*itercg)++){
104 //x2 = (M^†M)p
105 //No need to synchronise here. The memcpy in Hdslash is blocking
106 Hdslash_f(x1_f,p_f,u11t,u12t,iu,id,gamval_f,gamin,dk4m,dk4p,akappa);
107 Hdslashd_f(x2_f,x1_f,u11t,u12t,iu,id,gamval_f,gamin,dk4m,dk4p,akappa);
108#ifdef __NVCC__
109 cudaDeviceSynchronise();
110#endif
111 //x2 = (M^†M+J^2)p
112 //No point adding zero a couple of hundred times if the diquark source is zero
113 if(fac_f!=0){
114#ifdef __NVCC__
115 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&fac_f,(cuComplex *)p_f,1,(cuComplex *)x2_f,1);
116#elif defined USE_BLAS
117 cblas_caxpy(kferm2, &fac_f, p_f, 1, x2_f, 1);
118#else
119#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
120 for(int i=0; i<kferm2; i++)
121 x2_f[i]+=fac_f*p_f[i];
122#endif
123 }
124 //We can't evaluate α on the first *itercg because we need to get β_n.
125 if(*itercg){
126 //α_d= p* (M^†M+J^2)p
127#ifdef __NVCC__
128 cublasCdotc(cublas_handle,kferm2,(cuComplex *)p_f,1,(cuComplex *)x2_f,1,(cuComplex *)&alphad);
129#elif defined USE_BLAS
130 cblas_cdotc_sub(kferm2, p_f, 1, x2_f, 1, &alphad);
131#else
132 alphad=0;
133#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
134 for(int i=0; i<kferm2; i++)
135 alphad+=conj(p_f[i])*x2_f[i];
136#endif
137 //For now I'll cast it into a float for the reduction. Each rank only sends and writes
138 //to the real part so this is fine
139#if(nproc>1)
140 Par_fsum((float *)&alphad);
141#endif
142 //α=α_n/α_d = (r.r)/p(M^†M)p
143 alpha=alphan/creal(alphad);
144 //x+αp
145#ifdef __NVCC__
146 Complex_f alpha_f = (Complex_f)alpha;
147 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&alpha_f,(cuComplex *)p_f,1,(cuComplex *)X1_f,1);
148#elif defined USE_BLAS
149 Complex_f alpha_f = (Complex_f)alpha;
150 cblas_caxpy(kferm2, &alpha_f, p_f, 1, X1_f, 1);
151#else
152 for(int i=0; i<kferm2; i++)
153 X1_f[i]+=alpha*p_f[i];
154#endif
155 }
156 // r_n+1 = r_n-α(M^† M)p_n and β_n=r*.r
157#ifdef __NVCC__
158 Complex_f alpha_m=(Complex_f)(-alpha);
159 cublasCaxpy(cublas_handle, kferm2,(cuComplex *)&alpha_m,(cuComplex *)x2_f,1,(cuComplex *)r_f,1);
160 float betan_f;
161 cublasScnrm2(cublas_handle,kferm2,(cuComplex *)r_f,1,&betan_f);
162 betan = betan_f*betan_f;
163#elif defined USE_BLAS
164 Complex_f alpha_m = (Complex_f)(-alpha);
165 cblas_caxpy(kferm2, &alpha_m, x2_f, 1, r_f, 1);
166 //Undo the negation for the BLAS routine
167 float betan_f = cblas_scnrm2(kferm2, r_f,1);
168 //Gotta square it to "undo" the norm
169 betan = betan_f*betan_f;
170#else
171 betan=0;
172#pragma omp parallel for simd aligned(r_f,x2_f:AVX) reduction(+:betan)
173 for(int i=0; i<kferm2; i++){
174 r_f[i]-=alpha*x2_f[i];
175 betan += conj(r_f[i])*r_f[i];
176 }
177#endif
178 //And... reduce.
179#if(nproc>1)
180 Par_dsum(&betan);
181#endif
182#ifdef _DEBUGCG
183#warning "CG Debugging"
184 char *endline = "\n";
185#else
186 char *endline = "\r";
187#endif
188#ifdef _DEBUG
189 if(!rank) printf("Iter(CG)=%i\tβ_n=%e\tα=%e%s", *itercg, betan, creal(alpha), endline);
190#endif
191 if(betan<resid){
192 (*itercg)++;
193#ifdef _DEBUG
194 if(!rank) printf("\nIter(CG)=%i\tResidue: %e\tTolerance: %e\n", *itercg, betan, resid);
195#endif
196 ret_val=0; break;
197 }
198 else if(*itercg==niterc-1){
199 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n", ITERLIM, funcname, *itercg, betan);
200 ret_val=ITERLIM; break;
201 }
202 //Here we evaluate β=(r_{k+1}.r_{k+1})/(r_k.r_k) and then shuffle our indices down the line.
203 //On the first iteration we define beta to be zero.
204 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
205 Complex beta = (*itercg) ? betan/betad : 0;
206 betad=betan; alphan=betan;
207 //BLAS for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
208 //β instead of x.
209#ifdef __NVCC__
210 Complex_f beta_f=(Complex_f)beta;
211 __managed__ Complex_f a = 1.0;
212 cublasCscal(cublas_handle,kferm2,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
213 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
214#elif (defined __INTEL_MKL__)
215 Complex_f a = 1.0;
216 Complex_f beta_f=(Complex_f)beta;
217 //There is cblas_?axpby in the MKL and AMD though, set a = 1 and b = β.
218 //If we get a small enough β_n before hitting the iteration cap we break
219 cblas_caxpby(kferm2, &a, r_f, 1, &beta_f, p_f, 1);
220#elif defined USE_BLAS
221 Complex_f beta_f=(Complex_f)beta;
222 cblas_cscal(kferm2,&beta_f,p_f,1);
223 Complex_f a = 1.0;
224 cblas_caxpy(kferm2,&a,r_f,1,p_f,1);
225#else
226 for(int i=0; i<kferm2; i++)
227 p_f[i]=r_f[i]+beta*p_f[i];
228#endif
229 }
230#ifdef __NVCC__
231//Restore arrays back to their previous layout
232 Transpose_c(X1_f,kvol,ndirac*nc,dimGrid,dimBlock);
233 cuComplex_convert(X1_f,X1,kferm2,false,dimBlock,dimGrid);
234 Transpose_c(r_f,kvol,ndirac*nc,dimGrid,dimBlock);
235 cuComplex_convert(r_f,r,kferm2,false,dimBlock,dimGrid);
236 Transpose_c(u11t,kvol,ndim,dimGrid,dimBlock);
237 Transpose_c(u12t,kvol,ndim,dimGrid,dimBlock);
238 Transpose_I(iu,kvol,ndim,dimGrid,dimBlock);
239 Transpose_I(id,kvol,ndim,dimGrid,dimBlock);
240#else
241 for(int i=0;i<kferm2;i++){
242 X1[i]=(Complex)X1_f[i];
243 r[i]=(Complex)r_f[i];
244 }
245#endif
246#ifdef __NVCC__
247#ifdef _DEBUG
248 cudaDeviceSynchronise();
249 cudaFree(x1_f);cudaFree(x2_f); cudaFree(p_f);
250 cudaFree(r_f);cudaFree(X1_f);
251#else
252 //streams match the ones that allocated them.
253 cudaFreeAsync(p_f,streams[0]);cudaFreeAsync(x1_f,streams[1]);cudaFreeAsync(x2_f,streams[2]);
254 cudaDeviceSynchronise();
255 cudaFreeAsync(r_f,streams[3]);cudaFreeAsync(X1_f,streams[4]);
256#endif
257#else
258 free(x1_f);free(x2_f); free(p_f); free(r_f); free(X1_f);
259#endif
260 return ret_val;
261}
int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates in single precision.
Definition matrices.c:802
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates in single precision.
Definition matrices.c:711
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define ndirac
Dirac indices.
Definition sizes.h:177
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188

References AVX, Complex, Complex_f, Hdslash_f(), Hdslashd_f(), kferm2, kferm2Halo, kvol, nc, ndim, ndirac, niterc, Par_dsum(), Par_fsum(), and rank.
