su2hmc
congrad.c File Reference

Conjugate gradient routines: Congradq for the flavour-partitioned solver and Congradp for the unpartitioned (Gor'kov-index) inversion. More...

#include <matrices.h>
Include dependency graph for congrad.c:


Functions

int Congradq (int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
 Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \(M^\dagger M x=\Phi\). Implements up/down partitioning. The matrix multiplication step is done at single precision, while the update is done at double.
 
int Congradp (int na, double res, Complex *Phi, Complex *xi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
 Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \(M^\dagger M x=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double.
 

Detailed Description

Conjugate gradient routines: Congradq for the flavour-partitioned solver and Congradp for the unpartitioned (Gor'kov-index) inversion.

Definition in file congrad.c.
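
Both routines share the mixed-precision structure noted above: the Dslash applications and the field vectors are held in single precision (Complex_f), while the CG scalars α and β_n are accumulated in double. The following is a minimal sketch of that accumulation step in plain C99 complex types; it does not use the Complex_f/Complex macros from sizes.h.

#include <complex.h>

/* Sketch: accumulate the squared residual norm of a single-precision vector
 * in double, mirroring how betan is built from r_f in Congradq/Congradp.   */
static double residual_norm_sq(const float complex *r_f, int n)
{
    double betan = 0.0;                                  /* double accumulator */
    for(int i = 0; i < n; i++)
        betan += (double)crealf(conjf(r_f[i]) * r_f[i]); /* |r_i|^2 in single  */
    return betan;                                        /* compare to res*res */
}

The convergence test then compares this double-precision sum against resid = res*res, exactly as both listings below do.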

Function Documentation

◆ Congradp()

int Congradp (int na, double res, Complex *Phi, Complex *xi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)

Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \(M^\dagger M x=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double.

Parameters
na      Flavour index
res     Limit for conjugate gradient
Phi     Pseudofermion field.
xi      Returned as \((M^\dagger M)^{-1} \Phi\)
ut      Trial colour fields
iu      Upper halo indices
id      Lower halo indices
gamval  Single precision gamma matrices rescaled by kappa
gamin   Dirac indices
dk      \(\left(1+\gamma_0\right)e^{-\mu}\) and \(\left(1-\gamma_0\right)e^\mu\)
jqq     Diquark source
akappa  Hopping Parameter
itercg  Counts the iterations of the conjugate gradient
Returns
0 on success, integer error code otherwise
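
For orientation, the recurrence Congradp implements is plain conjugate gradient on the normal equations. The sketch below is a double-precision schematic only, not the library routine: apply_MdagM() is a hypothetical stand-in for the Dslash_f()/Dslashd_f() pair, n stands in for kferm, and the real code interleaves the x update slightly differently and mixes precisions as described above.

#include <complex.h>
#include <stdlib.h>

/* Schematic CG for (M^dagger M) x = Phi. On entry r holds the initial residual
 * (just Phi when x = 0). Returns 0 on convergence, 1 on hitting the cap.      */
int cg_sketch(complex double *x, complex double *r, int n, double resid, int niterc,
              void (*apply_MdagM)(complex double *out, const complex double *in))
{
    complex double *p  = malloc(n * sizeof(complex double));
    complex double *Ap = malloc(n * sizeof(complex double));
    double rr = 0.0;
    for(int i = 0; i < n; i++){ p[i] = r[i]; rr += creal(conj(r[i]) * r[i]); }
    int ret = 1;                                          /* pessimistic default */
    for(int iter = 0; iter < niterc; iter++){
        apply_MdagM(Ap, p);                               /* Ap = (M^dagger M)p  */
        double pAp = 0.0;
        for(int i = 0; i < n; i++) pAp += creal(conj(p[i]) * Ap[i]);
        double alpha = rr / pAp;                          /* alpha = r.r/p.Ap    */
        double rr_new = 0.0;
        for(int i = 0; i < n; i++){
            x[i] += alpha * p[i];                         /* x <- x + alpha p    */
            r[i] -= alpha * Ap[i];                        /* r <- r - alpha Ap   */
            rr_new += creal(conj(r[i]) * r[i]);
        }
        if(rr_new < resid){ ret = 0; break; }             /* converged           */
        double beta = rr_new / rr;                        /* beta = r'.r'/r.r    */
        for(int i = 0; i < n; i++) p[i] = r[i] + beta * p[i];
        rr = rr_new;
    }
    free(p); free(Ap);
    return ret;
}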

Definition at line 250 of file congrad.c.

251 {
252 /*
253 * @brief Matrix Inversion via Conjugate Gradient
254 * Solves @f$(M^\dagger)Mx=\Phi@f$
255 * No even/odd partitioning.
256 * The matrix multiplication step is done at single precision, while the update is done at double
257 *
258 * @param na: Flavour index
259 * @param res: Limit for conjugate gradient
260 * @param Phi: Pseudofermion field @f$\Phi@f$
261 * @param xi: Returned as @f$(M^\dagger M)^{-1} \Phi@f$
262 * @param ut[0]: First colour's trial field
263 * @param ut[1]: Second colour's trial field
264 * @param iu: Upper halo indices
265 * @param id: Lower halo indices
266 * @param gamval: Gamma matrices
267 * @param gamin: Dirac indices
268 * @param dk[0]:
269 * @param dk[1]:
270 * @param jqq: Diquark source
271 * @param akappa: Hopping Parameter
272 * @param itercg: Counts the iterations of the conjugate gradient
273 *
274 * @return 0 on success, integer error code otherwise
275 */
276 const char *funcname = "Congradp";
277 //Return value
278 int ret_val=0;
279 const double resid = res*res;
280 //These were evaluated only in the first loop of niterx so we'll just do it outside of the loop.
281 //These alpha and beta terms should be double, but that causes issues with BLAS. Instead we declare
282 //them Complex and work with the real part (especially for α_d)
283 //Give initial values. Will be overwritten if niterx>0
284#ifdef __NVCC__
285 Complex_f *p_f, *r_f, *xi_f, *x1_f, *x2_f;
286 int device; cudaGetDevice(&device);
287#ifdef _DEBUG
288 cudaMallocManaged((void **)&p_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
289 cudaMallocManaged((void **)&r_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
290 cudaMallocManaged((void **)&x1_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
291 cudaMallocManaged((void **)&x2_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
292 cudaMallocManaged((void **)&xi_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
293#else
294 cudaMalloc((void **)&p_f, kfermHalo*sizeof(Complex_f));
295 cudaMalloc((void **)&r_f, kferm*sizeof(Complex_f));
296 cudaMalloc((void **)&x1_f, kfermHalo*sizeof(Complex_f));
297 cudaMalloc((void **)&x2_f, kferm*sizeof(Complex_f));
298 cudaMalloc((void **)&xi_f, kferm*sizeof(Complex_f));
299#endif
300#else
301 Complex_f *p_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
302 Complex_f *r_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
303 Complex_f *x1_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
304 Complex_f *x2_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
305 Complex_f *xi_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
306#endif
307 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
308 double alphan=0.0;
309 //Instead of copying element-wise in a loop, use memcpy.
310#ifdef __NVCC__
311 //Get xi in single precision, then swap to AoS format
312 cuComplex_convert(p_f,xi,kferm,true,dimBlock,dimGrid);
313 Transpose_c(p_f,ngorkov*nc,kvol);
314 cudaMemcpy(xi_f,p_f,kferm*sizeof(Complex_f),cudaMemcpyDefault);
315
316 //And repeat for r
317 cuComplex_convert(r_f,Phi+na*kferm,kferm,true,dimBlock,dimGrid);
318 Transpose_c(r_f,ngorkov*nc,kvol);
319#else
320#pragma omp parallel for simd aligned(p_f,xi_f,xi,r_f,Phi:AVX)
321 for(int i =0;i<kferm;i++){
322 p_f[i]=xi_f[i]=(Complex_f)xi[i];
323 r_f[i]=(Complex_f)Phi[na*kferm+i];
324 }
325#endif
326
327 // Declaring placeholder arrays
328 // This x1 is NOT related to the /common/vectorp/X1 in the FORTRAN code and should not
329 // be confused with X1 the global variable
330
331 //niterx isn't called as an index but we'll start from zero with the C code to make the
332 //if statements quicker to type
333 double betan;
334#ifdef __NVCC__
335 cudaDeviceSynchronise();
336#endif
337 for((*itercg)=0; (*itercg)<=niterc; (*itercg)++){
338 //Don't overwrite on first run.
339 //x2=(M^†)x1=(M^†)Mp
340 Dslash_f(x1_f,p_f,ut[0],ut[1],iu,id,gamval,gamin,dk,jqq,akappa);
341 Dslashd_f(x2_f,x1_f,ut[0],ut[1],iu,id,gamval,gamin,dk,jqq,akappa);
342#ifdef __NVCC__
343 cudaDeviceSynchronise();
344#endif
345 //We can't evaluate α on the first niterx because we need to get β_n.
346 if(*itercg){
347 //x*.x
348#ifdef USE_BLAS
349 float alphad_f;
350#ifdef __NVCC__
351 cublasScnrm2(cublas_handle,kferm,(cuComplex*) x1_f, 1,(float *)&alphad_f);
352 alphad = alphad_f*alphad_f;
353#else
354 alphad_f = cblas_scnrm2(kferm, x1_f, 1);
355#endif
356 alphad = alphad_f*alphad_f;
357#else
358 alphad=0;
359 for(int i = 0; i<kferm; i++)
360 alphad+=conj(x1_f[i])*x1_f[i];
361#endif
362#if(nproc>1)
363 Par_fsum((float *)&alphad);
364#endif
365 //α=(r.r)/p(M^†)Mp
366 alpha=alphan/creal(alphad);
367 // Complex_f alpha_f = (Complex_f)alpha;
368 //x+αp
369#ifdef USE_BLAS
370 Complex_f alpha_f=(float)alpha;
371#ifdef __NVCC__
372 cublasCaxpy(cublas_handle,kferm,(cuComplex*) &alpha_f,(cuComplex*) p_f,1,(cuComplex*) xi_f,1);
373#else
374 cblas_caxpy(kferm, (Complex_f*)&alpha_f,(Complex_f*)p_f, 1, (Complex_f*)xi_f, 1);
375#endif
376#else
377#pragma omp parallel for simd aligned(xi_f,p_f:AVX)
378 for(int i = 0; i<kferm; i++)
379 xi_f[i]+=alpha*p_f[i];
380#endif
381 }
382
383 //r=α(M^†)Mp and β_n=r*.r
384#if defined USE_BLAS
385 Complex_f alpha_m=(Complex_f)(-alpha);
386 float betan_f=0;
387#ifdef __NVCC__
388 cublasCaxpy(cublas_handle,kferm, (cuComplex *)&alpha_m,(cuComplex *) x2_f, 1,(cuComplex *) r_f, 1);
389 //cudaDeviceSynchronise();
390 //r*.r
391 cublasScnrm2(cublas_handle,kferm,(cuComplex *) r_f,1,(float *)&betan_f);
392#else
393 cblas_caxpy(kferm,(Complex_f*) &alpha_m,(Complex_f*) x2_f, 1,(Complex_f*) r_f, 1);
394 //r*.r
395 betan_f = cblas_scnrm2(kferm, (Complex_f*)r_f,1);
396#endif
397 //Gotta square it to "undo" the norm
398 betan=betan_f*betan_f;
399#else
400 //Just like Congradq, this loop could be unrolled but will need a reduction to deal with the betan
401 //addition.
402 betan = 0;
403 //If we get a small enough β_n before hitting the iteration cap we break
404#pragma omp parallel for simd aligned(x2_f,r_f:AVX) reduction(+:betan)
405 for(int i = 0; i<kferm;i++){
406 r_f[i]-=alpha*x2_f[i];
407 betan+=conj(r_f[i])*r_f[i];
408 }
409#endif
410 //This is basically just congradq at the end. Check there for comments
411#if(nproc>1)
412 Par_dsum(&betan);
413#endif
414#ifdef _DEBUG
415#ifdef _DEBUGCG
416 char *endline = "\n";
417#else
418 char *endline = "\r";
419#endif
420 if(!rank) printf("Iter (CG) = %i β_n= %e α= %e%s", *itercg, betan, creal(alpha),endline);
421#endif
422 if(betan<resid){
423 //Started counting from zero so add one to make it accurate
424 (*itercg)++;
425#ifdef _DEBUG
426 if(!rank) printf("\nIter (CG) = %i resid = %e toler = %e\n", *itercg, betan, resid);
427#endif
428 ret_val=0; break;
429 }
430 else if(*itercg==niterc-1){
431 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n",
432 ITERLIM, funcname, niterc, betan);
433 ret_val=ITERLIM; break;
434 }
435 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
436 Complex beta = (*itercg) ? betan/betad : 0;
437 betad=betan; alphan=betan;
438 //BLAS for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
439 //β instead of x.
440 //There is cblas_zaxpby in the MKL though, set a = 1 and b = β.
441#ifdef USE_BLAS
442 Complex_f beta_f = (Complex_f)beta;
443 Complex_f a = 1.0;
444#ifdef __NVCC__
445 cublasCscal(cublas_handle,kferm,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
446 cublasCaxpy(cublas_handle,kferm,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
447 cudaDeviceSynchronise();
448#elif (defined __INTEL_MKL__ || defined AMD_BLAS)
449 cblas_caxpby(kferm, &a, r_f, 1, &beta_f, p_f, 1);
450#else
451 cblas_cscal(kferm,&beta_f,p_f,1);
452 cblas_caxpy(kferm,&a,r_f,1,p_f,1);
453#endif
454#else
455#pragma omp parallel for simd aligned(r_f,p_f:AVX)
456 for(int i=0; i<kferm; i++)
457 p_f[i]=r_f[i]+beta*p_f[i];
458#endif
459 }
460#ifdef __NVCC__
461 Transpose_c(xi_f,kvol,ngorkov*nc);
462 Transpose_c(r_f,kvol,ngorkov*nc);
463
464 cudaDeviceSynchronise();
465 cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
466#else
467#pragma omp simd
468 for(int i = 0; i <kferm;i++){
469 xi[i]=(Complex)xi_f[i];
470 }
471#endif
472#ifdef __NVCC__
473 cudaFree(p_f); cudaFree(r_f);cudaFree(x1_f); cudaFree(x2_f); cudaFree(xi_f);
474#else
475 free(p_f); free(r_f); free(x1_f); free(x2_f); free(xi_f);
476#endif
477 return ret_val;
478}
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:463
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:585
int Par_fsum(float *dval)
Performs a reduction on a float dval to get a sum which is then distributed to all ranks.
int rank
The MPI rank.
Definition par_mpi.c:22
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define niterc
Hard limit for runaway trajectories in Conjugate gradient.
Definition sizes.h:163
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225

References AVX, Complex, Complex_f, Dslash_f(), Dslashd_f(), kferm, kfermHalo, kvol, nc, ngorkov, niterc, Par_dsum(), Par_fsum(), and rank.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Congradq()

int Congradq (int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)

Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \(M^\dagger M x=\Phi\). Implements up/down partitioning. The matrix multiplication step is done at single precision, while the update is done at double.

Parameters
na        Flavour index
res       Limit for conjugate gradient
X1        Pseudofermion field \(\Phi\) initially, returned as \((M^\dagger M)^{-1} \Phi\)
r         Partition of \(\Phi\) being used. Gets recycled as the residual vector
ut        Trial colour fields
iu        Upper halo indices
id        Lower halo indices
gamval_f  Single precision gamma matrices rescaled by kappa
gamin     Dirac indices
dk        \(\left(1+\gamma_0\right)e^{-\mu}\) and \(\left(1-\gamma_0\right)e^\mu\)
jqq       Diquark source
akappa    Hopping Parameter
itercg    Counts the iterations of the conjugate gradient
Returns
0 on success, integer error code otherwise
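
Beyond the Hdslash_f()/Hdslashd_f() pair, the operator Congradq inverts carries the diquark source as a constant shift, \(x_2 = (M^\dagger M + \kappa^2|j_{qq}|^2)p\); this is what the fac_f constant in the listing below implements. A minimal single-precision sketch of that shift follows, with n standing in for kferm2 and the assumption that the Hdslash applications have already filled x2_f.

#include <complex.h>

/* Sketch of the diquark shift applied after the Hdslash pair in Congradq:
 * x2 <- x2 + kappa^2 |jqq|^2 p, skipped entirely when the source is zero. */
static void add_diquark_shift(float complex *x2_f, const float complex *p_f,
                              int n, float complex jqq, float akappa)
{
    const float complex fac_f = conjf(jqq) * jqq * akappa * akappa;
    if(fac_f != 0)                      /* no point adding zero each iteration */
        for(int i = 0; i < n; i++)
            x2_f[i] += fac_f * p_f[i];
}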

Definition at line 7 of file congrad.c.

8 {
9 /*
10 * @brief Matrix Inversion via Mixed Precision Conjugate Gradient
11 * Solves @f$(M^\dagger)Mx=\Phi@f$
12 * Implements up/down partitioning
13 * The matrix multiplication step is done at single precision, while the update is done at double
14 *
15 * @param na: Flavour index
16 * @param res: Limit for conjugate gradient
17 * @param X1: @f$\Phi@f$ initially, returned as @f$(M^\dagger M)^{-1} \Phi@f$
18 * @param r: Partition of @f$\Phi@f$ being used. Gets recycled as the residual vector
19 * @param ut[0]: First colour's trial field
20 * @param ut[1]: Second colour's trial field
21 * @param iu: Upper halo indices
22 * @param id: Lower halo indices
23 * @param gamval_f: Gamma matrices
24 * @param gamin: Dirac indices
25 * @param dk[0]:
26 * @param dk[1]:
27 * @param jqq: Diquark source
28 * @param akappa: Hopping Parameter
29 * @param itercg: Counts the iterations of the conjugate gradient
30 *
31 * @see Hdslash_f(), Hdslashd_f(), Par_fsum(), Par_dsum()
32 *
33 * @return 0 on success, integer error code otherwise
34 */
35 const char *funcname = "Congradq";
36 int ret_val=0;
37 const double resid = res*res;
38 //The κ^2 factor is needed to normalise the fields correctly
39 //jqq is the diquark condensate and is global scope.
40 const Complex_f fac_f = conj(jqq)*jqq*akappa*akappa;
41 //These were evaluated only in the first loop of niterx so we'll just do it outside of the loop.
42 //n suffix is numerator, d is denominator
43 double alphan=1;
44 //The alpha and beta terms should be double, but that causes issues with BLAS pointers. Instead we declare
45 //them complex and work with the real part (especially for α_d)
46 //Give initial values. Will be overwritten if niterx>0
47 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
48 //Because we're dealing with flattened arrays here we can call cblas safely without the halo
49#ifdef __NVCC__
50 Complex_f *p_f, *x1_f, *x2_f, *r_f, *X1_f;
51 int device=-1; cudaGetDevice(&device);
52
53#ifdef _DEBUG
54 cudaMallocManaged((void **)&p_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
55 cudaMallocManaged((void **)&x1_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
56 cudaMallocManaged((void **)&x2_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
57 cudaMallocManaged((void **)&r_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
58 cudaMallocManaged((void **)&X1_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
59#else
60 //First two have halo exchanges, so getting NCCL working is important
61 cudaMallocAsync((void **)&p_f, kferm2Halo*sizeof(Complex_f),streams[0]);
62 cudaMallocAsync((void **)&x1_f, kferm2Halo*sizeof(Complex_f),streams[1]);
63 cudaMallocAsync((void **)&x2_f, kferm2*sizeof(Complex_f),streams[2]);
64 cudaMallocAsync((void **)&r_f, kferm2*sizeof(Complex_f),streams[3]);
65 cudaMallocAsync((void **)&X1_f, kferm2*sizeof(Complex_f),streams[4]);
66#endif
67#else
68 Complex_f *p_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
69 Complex_f *x1_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
70 Complex_f *x2_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
71 Complex_f *X1_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
72 Complex_f *r_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
73#endif
74 //Instead of copying element-wise in a loop, use memcpy.
75#ifdef __NVCC__
76 //Get X1 in single precision, then swap to AoS format
77 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
78
79 //And repeat for r
80 cuComplex_convert(r_f,r,kferm2,true,dimBlock,dimGrid);
81
82 //cudaMemcpy is blocking, so use async instead
83 cudaMemcpyAsync(p_f, X1_f, kferm2*sizeof(Complex_f),cudaMemcpyDeviceToDevice,NULL);
84 //Flip all the gauge fields around so memory is coalesced
85#else
86#pragma omp parallel for simd
87 for(int i=0;i<kferm2;i++){
88 r_f[i]=(Complex_f)r[i];
89 X1_f[i]=(Complex_f)X1[i];
90 }
91 memcpy(p_f, X1_f, kferm2*sizeof(Complex_f));
92#endif
93
94 //niterx isn't called as an index but we'll start from zero with the C code to make the
95 //if statements quicker to type
96 double betan; bool pf=true;
97 for(*itercg=0; *itercg<niterc; (*itercg)++){
98 //x2 = (M^†M)p
99 //No need to synchronise here. The memcpy in Hdslash is blocking
100 Hdslash_f(x1_f,p_f,ut,iu,id,gamval_f,gamin,dk,akappa);
101 Hdslashd_f(x2_f,x1_f,ut,iu,id,gamval_f,gamin,dk,akappa);
102#ifdef __NVCC__
103 cudaDeviceSynchronise();
104#endif
105 //x2 = (M^†M+J^2)p
106 //No point adding zero a couple of hundred times if the diquark source is zero
107 if(fac_f!=0){
108#ifdef __NVCC__
109 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&fac_f,(cuComplex *)p_f,1,(cuComplex *)x2_f,1);
110#elif defined USE_BLAS
111 cblas_caxpy(kferm2, &fac_f, p_f, 1, x2_f, 1);
112#else
113#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
114 for(int i=0; i<kferm2; i++)
115 x2_f[i]+=fac_f*p_f[i];
116#endif
117 }
118 //We can't evaluate α on the first *itercg because we need to get β_n.
119 if(*itercg){
120 //α_d= p* (M^†M+J^2)p
121#ifdef __NVCC__
122 cublasCdotc(cublas_handle,kferm2,(cuComplex *)p_f,1,(cuComplex *)x2_f,1,(cuComplex *)&alphad);
123#elif defined USE_BLAS
124 cblas_cdotc_sub(kferm2, p_f, 1, x2_f, 1, &alphad);
125#else
126 alphad=0;
127#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
128 for(int i=0; i<kferm2; i++)
129 alphad+=conj(p_f[i])*x2_f[i];
130#endif
131 //For now I'll cast it into a float for the reduction. Each rank only sends and writes
132 //to the real part so this is fine
133#if(nproc>1)
134 Par_fsum((float *)&alphad);
135#endif
136 //α=α_n/α_d = (r.r)/p(M^†M)p
137 alpha=alphan/creal(alphad);
138 //x-αp,
139#ifdef __NVCC__
140 Complex_f alpha_f = (Complex_f)alpha;
141 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&alpha_f,(cuComplex *)p_f,1,(cuComplex *)X1_f,1);
142#elif defined USE_BLAS
143 Complex_f alpha_f = (Complex_f)alpha;
144 cblas_caxpy(kferm2, &alpha_f, p_f, 1, X1_f, 1);
145#else
146 for(int i=0; i<kferm2; i++)
147 X1_f[i]+=alpha*p_f[i];
148#endif
149 }
150 // r_n+1 = r_n-α(M^† M)p_n and β_n=r*.r
151#ifdef __NVCC__
152 Complex_f alpha_m=(Complex_f)(-alpha);
153 cublasCaxpy(cublas_handle, kferm2,(cuComplex *)&alpha_m,(cuComplex *)x2_f,1,(cuComplex *)r_f,1);
154 float betan_f;
155 cublasScnrm2(cublas_handle,kferm2,(cuComplex *)r_f,1,&betan_f);
156 betan = betan_f*betan_f;
157#elif defined USE_BLAS
158 Complex_f alpha_m = (Complex_f)(-alpha);
159 cblas_caxpy(kferm2, &alpha_m, x2_f, 1, r_f, 1);
160 //Undo the negation for the BLAS routine
161 float betan_f = cblas_scnrm2(kferm2, r_f,1);
162 //Gotta square it to "undo" the norm
163 betan = betan_f*betan_f;
164#else
165 betan=0;
166#pragma omp parallel for simd aligned(r_f,x2_f:AVX) reduction(+:betan)
167 for(int i=0; i<kferm2; i++){
168 r_f[i]-=alpha*x2_f[i];
169 betan += conj(r_f[i])*r_f[i];
170 }
171#endif
172 //And... reduce.
173#if(nproc>1)
174 Par_dsum(&betan);
175#endif
176#ifdef _DEBUGCG
177#warning "CG Debugging"
178 char *endline = "\n";
179#else
180 char *endline = "\r";
181#endif
182#ifdef _DEBUG
183 if(!rank) printf("Iter(CG)=%i\tβ_n=%e\tα=%e%s", *itercg, betan, creal(alpha),endline);
184#endif
185 if(betan<resid){
186 (*itercg)++;
187#ifdef _DEBUG
188 if(!rank) printf("\nIter(CG)=%i\tResidue: %e\tTolerance: %e\n", *itercg, betan, resid);
189#endif
190 ret_val=0; break;
191 }
192 else if(*itercg==niterc-1){
193 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n", ITERLIM, funcname, *itercg, betan);
194 ret_val=ITERLIM; break;
195 }
196 //Here we evaluate β=(r_{k+1}.r_{k+1})/(r_k.r_k) and then shuffle our indices down the line.
197 //On the first iteration we define beta to be zero.
198 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
199 Complex beta = (*itercg) ? betan/betad : 0;
200 betad=betan; alphan=betan;
201 //BLAS for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
202 //β instead of x.
203#ifdef __NVCC__
204 Complex_f beta_f=(Complex_f)beta;
205 __managed__ Complex_f a = 1.0;
206 cublasCscal(cublas_handle,kferm2,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
207 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
208#elif (defined __INTEL_MKL__)
209 Complex_f a = 1.0;
210 Complex_f beta_f=(Complex_f)beta;
211 //There is cblas_?axpby in the MKL and AMD though, set a = 1 and b = β.
212 //If we get a small enough β_n before hitting the iteration cap we break
213 cblas_caxpby(kferm2, &a, r_f, 1, &beta_f, p_f, 1);
214#elif defined USE_BLAS
215 Complex_f beta_f=(Complex_f)beta;
216 cblas_cscal(kferm2,&beta_f,p_f,1);
217 Complex_f a = 1.0;
218 cblas_caxpy(kferm2,&a,r_f,1,p_f,1);
219#else
220 for(int i=0; i<kferm2; i++)
221 p_f[i]=r_f[i]+beta*p_f[i];
222#endif
223 }
224#ifdef __NVCC__
225 //Restore arrays back to their previous layout
226 cuComplex_convert(X1_f,X1,kferm2,false,dimBlock,dimGrid);
227 cuComplex_convert(r_f,r,kferm2,false,dimBlock,dimGrid);
228#else
229 for(int i=0;i<kferm2;i++){
230 X1[i]=(Complex)X1_f[i];
231 r[i]=(Complex)r_f[i];
232 }
233#endif
234#ifdef __NVCC__
235#ifdef _DEBUG
236 cudaDeviceSynchronise();
237 cudaFree(x1_f);cudaFree(x2_f); cudaFree(p_f);
238 cudaFree(r_f);cudaFree(X1_f);
239#else
240 //streams match the ones that allocated them.
241 cudaFreeAsync(p_f,streams[0]);cudaFreeAsync(x1_f,streams[1]);cudaFreeAsync(x2_f,streams[2]);
242 cudaDeviceSynchronise();
243 cudaFreeAsync(r_f,streams[3]);cudaFreeAsync(X1_f,streams[4]);
244#endif
245#else
246 free(x1_f);free(x2_f); free(p_f); free(r_f); free(X1_f);
247#endif
248 return ret_val;
249}
int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates in single precision.
Definition matrices.c:855
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates in single precision.
Definition matrices.c:712
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188

References AVX, Complex, Complex_f, Hdslash_f(), Hdslashd_f(), kferm2, kferm2Halo, niterc, Par_dsum(), Par_fsum(), and rank.

Here is the call graph for this function:
Here is the caller graph for this function: