su2hmc
congrad.c
1
6#include <matrices.h>
7int Congradq(int na,double res,Complex *X1,Complex *r,Complex_f *u11t,Complex_f *u12t,unsigned int *iu,unsigned int *id,\
8 Complex_f *gamval_f,int *gamin,float *dk4m,float *dk4p,Complex_f jqq,float akappa,int *itercg){
9 /*
10 * @brief Matrix Inversion via Mixed Precision Conjugate Gradient
11 * Solves @f$(M^\dagger)Mx=\Phi@f$
12 * Implements up/down partitioning
13 * The matrix multiplication step is done at single precision, while the update is done at double
14 *
15 * @param na: Flavour index
16 * @param res: Limit for conjugate gradient
17 * @param X1: @f$\Phi@f$ initially, returned as @f$(M^\dagger M)^{-1} \Phi@f$
18 * @param r: Partition of @f$\Phi@f$ being used. Gets recycled as the residual vector
19 * @param u11t: First colour's trial field
20 * @param u12t: Second colour's trial field
21 * @param iu: Upper halo indices
22 * @param id: Lower halo indices
23 * @param gamval_f: Gamma matrices
24 * @param gamin: Dirac indices
25 * @param dk4m:
26 * @param dk4p:
27 * @param jqq: Diquark source
28 * @param akappa: Hopping Parameter
29 * @param itercg: Counts the iterations of the conjugate gradient
30 *
31 * @see Hdslash_f(), Hdslashd_f(), Par_fsum(), Par_dsum()
32 *
33 * @return 0 on success, integer error code otherwise
34 */
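	/*
	 * The loop below implements the standard CG recurrences for the operator A=M^†M+|jqq|²κ² (fac_f):
	 *   α = (r.r)/(p.Ap)
	 *   x ← x+αp
	 *   r ← r-αAp
	 *   β = (r_new.r_new)/(r_old.r_old)
	 *   p ← r+βp
	 * with A applied in single precision and the scalars accumulated in double.
	 */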
35 const char *funcname = "Congradq";
36 int ret_val=0;
37 const double resid = res*res;
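	//Convergence is declared once β_n=r*.r falls below resid, i.e. when the residual norm drops below res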
38 //The κ^2 factor is needed to normalise the fields correctly
39 //jqq is the diquark source (passed in as a parameter); conj(jqq)*jqq=|jqq|², so fac_f is real and non-negative
40 const Complex_f fac_f = conj(jqq)*jqq*akappa*akappa;
41 //These were only evaluated on the first niterx iteration, so we evaluate them once outside the loop instead.
42 //n suffix is numerator, d is denominator
43 double alphan=1;
44 //The alpha and beta terms should be double, but that causes issues with BLAS pointers. Instead we declare
45 //them complex and work with the real part (especially for α_d)
46 //Give initial values. These will be overwritten if niterx>0
47 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
48 //Because we're dealing with flattened arrays here we can call cblas safely without the halo
49#ifdef __NVCC__
50 Complex_f *p_f, *x1_f, *x2_f, *r_f, *X1_f;
51 int device=-1; cudaGetDevice(&device);
52
53#ifdef _DEBUG
54 cudaMallocManaged((void **)&p_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
55 cudaMallocManaged((void **)&x1_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
56 cudaMallocManaged((void **)&x2_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
57 cudaMallocManaged((void **)&r_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
58 cudaMallocManaged((void **)&X1_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
59#else
60 //First two have halo exchanges, so getting NCCL working is important
61 cudaMallocAsync((void **)&p_f, kferm2Halo*sizeof(Complex_f),streams[0]);
62 cudaMallocAsync((void **)&x1_f, kferm2Halo*sizeof(Complex_f),streams[1]);
63 cudaMallocAsync((void **)&x2_f, kferm2*sizeof(Complex_f),streams[2]);
64 cudaMallocAsync((void **)&r_f, kferm2*sizeof(Complex_f),streams[3]);
65 cudaMallocAsync((void **)&X1_f, kferm2*sizeof(Complex_f),streams[4]);
66#endif
67#else
68 Complex_f *p_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
69 Complex_f *x1_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
70 Complex_f *x2_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
71 Complex_f *X1_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
72 Complex_f *r_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
73#endif
74 //Instead of copying element-wise in a loop, use memcpy.
75#ifdef __NVCC__
76 //Get X1 in single precision, then swap to AoS format
77 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
78 Transpose_c(X1_f,ndirac*nc,kvol,dimGrid,dimBlock);
79
80 //And repeat for r
81 cuComplex_convert(r_f,r,kferm2,true,dimBlock,dimGrid);
82 Transpose_c(r_f,ndirac*nc,kvol,dimGrid,dimBlock);
83
84 //cudaMemcpy is blocking, so use async instead
85 cudaMemcpyAsync(p_f, X1_f, kferm2*sizeof(Complex_f),cudaMemcpyDeviceToDevice,NULL);
86 //Flip all the gauge fields around so memory is coalesced
87 Transpose_c(u11t,ndim,kvol,dimGrid,dimBlock);
88 Transpose_c(u12t,ndim,kvol,dimGrid,dimBlock);
89 Transpose_I(iu,ndim,kvol,dimGrid,dimBlock);
90 Transpose_I(id,ndim,kvol,dimGrid,dimBlock);
91#else
92#pragma omp parallel for simd
93 for(int i=0;i<kferm2;i++){
94 r_f[i]=(Complex_f)r[i];
95 X1_f[i]=(Complex_f)X1[i];
96 }
97 memcpy(p_f, X1_f, kferm2*sizeof(Complex_f));
98#endif
99
100 //niterx isn't used as an array index, so in the C code we count from zero,
101 //which keeps the if statements simple
102 double betan; bool pf=true;
103 for(*itercg=0; *itercg<niterc; (*itercg)++){
104 //x2 = (M^†M)p
105 //No need to synchronise here. The memcpy in Hdslash is blocking
106 Hdslash_f(x1_f,p_f,u11t,u12t,iu,id,gamval_f,gamin,dk4m,dk4p,akappa);
107 Hdslashd_f(x2_f,x1_f,u11t,u12t,iu,id,gamval_f,gamin,dk4m,dk4p,akappa);
108#ifdef __NVCC__
109 cudaDeviceSynchronise();
110#endif
111 //x2 = (M^†M+J^2)p
112 //No point adding zero a couple of hundred times if the diquark source is zero
113 if(fac_f!=0){
114#ifdef __NVCC__
115 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&fac_f,(cuComplex *)p_f,1,(cuComplex *)x2_f,1);
116#elif defined USE_BLAS
117 cblas_caxpy(kferm2, &fac_f, p_f, 1, x2_f, 1);
118#else
119#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
120 for(int i=0; i<kferm2; i++)
121 x2_f[i]+=fac_f*p_f[i];
122#endif
123 }
124 //We can't evaluate α on the first iteration because we need β_n first.
125 if(*itercg){
126 //α_d= p* (M^†M+J^2)p
127#ifdef __NVCC__
128 cublasCdotc(cublas_handle,kferm2,(cuComplex *)p_f,1,(cuComplex *)x2_f,1,(cuComplex *)&alphad);
129#elif defined USE_BLAS
130 cblas_cdotc_sub(kferm2, p_f, 1, x2_f, 1, &alphad);
131#else
132 alphad=0;
133#pragma omp parallel for simd aligned(p_f,x2_f:AVX) reduction(+:alphad)
134 for(int i=0; i<kferm2; i++)
135 alphad+=conj(p_f[i])*x2_f[i];
136#endif
137 //Cast to float for the reduction: each rank only sends and writes the real part,
138 //which is fine since α_d=p.(M^†M+J²)p is real for a Hermitian operator
139#if(nproc>1)
140 Par_fsum((float *)&alphad);
141#endif
142 //α=α_n/α_d = (r.r)/p(M^†M)p
143 alpha=alphan/creal(alphad);
144 //x+αp
145#ifdef __NVCC__
146 Complex_f alpha_f = (Complex_f)alpha;
147 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&alpha_f,(cuComplex *)p_f,1,(cuComplex *)X1_f,1);
148#elif defined USE_BLAS
149 Complex_f alpha_f = (Complex_f)alpha;
150 cblas_caxpy(kferm2, &alpha_f, p_f, 1, X1_f, 1);
151#else
152 for(int i=0; i<kferm2; i++)
153 X1_f[i]+=alpha*p_f[i];
154#endif
155 }
156 // r_n+1 = r_n-α(M^† M)p_n and β_n=r*.r
157#ifdef __NVCC__
158 Complex_f alpha_m=(Complex_f)(-alpha);
159 cublasCaxpy(cublas_handle, kferm2,(cuComplex *)&alpha_m,(cuComplex *)x2_f,1,(cuComplex *)r_f,1);
160 float betan_f;
161 cublasScnrm2(cublas_handle,kferm2,(cuComplex *)r_f,1,&betan_f);
162 betan = betan_f*betan_f;
163#elif defined USE_BLAS
164 Complex_f alpha_m = (Complex_f)(-alpha);
165 cblas_caxpy(kferm2, &alpha_m, x2_f, 1, r_f, 1);
166 //Compute ‖r‖ with scnrm2
167 float betan_f = cblas_scnrm2(kferm2, r_f,1);
168 //Gotta square it to "undo" the norm
169 betan = betan_f*betan_f;
170#else
171 betan=0;
172#pragma omp parallel for simd aligned(r_f,x2_f:AVX) reduction(+:betan)
173 for(int i=0; i<kferm2; i++){
174 r_f[i]-=alpha*x2_f[i];
175 betan += conj(r_f[i])*r_f[i];
176 }
177#endif
178 //And... reduce.
179#if(nproc>1)
180 Par_dsum(&betan);
181#endif
182#ifdef _DEBUGCG
183#warning "CG Debugging"
184 char *endline = "\n";
185#else
186 char *endline = "\r";
187#endif
188#ifdef _DEBUG
189 if(!rank) printf("Iter(CG)=%i\tβ_n=%e\tα=%e%s", *itercg, betan, creal(alpha),endline);
190#endif
191 if(betan<resid){
192 (*itercg)++;
193#ifdef _DEBUG
194 if(!rank) printf("\nIter(CG)=%i\tResidue: %e\tTolerance: %e\n", *itercg, betan, resid);
195#endif
196 ret_val=0; break;
197 }
198 else if(*itercg==niterc-1){
199 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n", ITERLIM, funcname, *itercg, betan);
200 ret_val=ITERLIM; break;
201 }
202 //Here we evaluate β=(r_{k+1}.r_{k+1})/(r_k.r_k) and then shuffle our indices down the line.
203 //On the first iteration we define beta to be zero.
204 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
205 Complex beta = (*itercg) ? betan/betad : 0;
206 betad=betan; alphan=betan;
207 //p=r+βp doesn't exist as a single call in standard BLAS. This is NOT an axpy case as we're
208 //multiplying y by β instead of x.
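	//Below it's split into p ← βp (scal) followed by p ← p+r (axpy), which together give p=r+βp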
209#ifdef __NVCC__
210 Complex_f beta_f=(Complex_f)beta;
211 __managed__ Complex_f a = 1.0;
212 cublasCscal(cublas_handle,kferm2,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
213 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
214#elif (defined __INTEL_MKL__)
215 Complex_f a = 1.0;
216 Complex_f beta_f=(Complex_f)beta;
217 //There is cblas_?axpby in the MKL and AMD though, set a = 1 and b = β.
218 //If we get a small enough β_n before hitting the iteration cap we break
219 cblas_caxpby(kferm2, &a, r_f, 1, &beta_f, p_f, 1);
220#elif defined USE_BLAS
221 Complex_f beta_f=(Complex_f)beta;
222 cblas_cscal(kferm2,&beta_f,p_f,1);
223 Complex_f a = 1.0;
224 cblas_caxpy(kferm2,&a,r_f,1,p_f,1);
225#else
226 for(int i=0; i<kferm2; i++)
227 p_f[i]=r_f[i]+beta*p_f[i];
228#endif
229 }
230#ifdef __NVCC__
231 //Restore arrays back to their previous layout
232 Transpose_c(X1_f,kvol,ndirac*nc,dimGrid,dimBlock);
233 cuComplex_convert(X1_f,X1,kferm2,false,dimBlock,dimGrid);
234 Transpose_c(r_f,kvol,ndirac*nc,dimGrid,dimBlock);
235 cuComplex_convert(r_f,r,kferm2,false,dimBlock,dimGrid);
236 Transpose_c(u11t,kvol,ndim,dimGrid,dimBlock);
237 Transpose_c(u12t,kvol,ndim,dimGrid,dimBlock);
238 Transpose_I(iu,kvol,ndim,dimGrid,dimBlock);
239 Transpose_I(id,kvol,ndim,dimGrid,dimBlock);
240#else
241 for(int i=0;i<kferm2;i++){
242 X1[i]=(Complex)X1_f[i];
243 r[i]=(Complex)r_f[i];
244 }
245#endif
246#ifdef __NVCC__
247#ifdef _DEBUG
248 cudaDeviceSynchronise();
249 cudaFree(x1_f);cudaFree(x2_f); cudaFree(p_f);
250 cudaFree(r_f);cudaFree(X1_f);
251#else
252 //streams match the ones that allocated them.
253 cudaFreeAsync(p_f,streams[0]);cudaFreeAsync(x1_f,streams[1]);cudaFreeAsync(x2_f,streams[2]);
254 cudaDeviceSynchronise();
255 cudaFreeAsync(r_f,streams[3]);cudaFreeAsync(X1_f,streams[4]);
256#endif
257#else
258 free(x1_f);free(x2_f); free(p_f); free(r_f); free(X1_f);
259#endif
260 return ret_val;
261}
262int Congradp(int na,double res,Complex *Phi,Complex *xi,Complex_f *u11t,Complex_f *u12t,unsigned int *iu,unsigned int *id,\
263 Complex_f *gamval,int *gamin, float *dk4m,float *dk4p,Complex_f jqq,float akappa,int *itercg){
264 /*
265 * @brief Matrix Inversion via Conjugate Gradient
266 * Solves @f$(M^\dagger)Mx=\Phi@f$
267 * No even/odd partitioning.
268 * The matrix multiplication step is done at single precision, while the update is done at double
269 *
270 * @param na: Flavour index
271 * @param res: Limit for conjugate gradient
272 * @param Phi: @f$\Phi@f$, the source vector
273 * @param xi: Solution vector, returned as @f$(M^\dagger M)^{-1} \Phi@f$
274 * @param u11t: First colour's trial field
275 * @param u12t: Second colour's trial field
276 * @param iu: Upper halo indices
277 * @param id: Lower halo indices
278 * @param gamval: Gamma matrices
279 * @param gamin: Dirac indices
280 * @param dk4m:
281 * @param dk4p:
282 * @param jqq: Diquark source
283 * @param akappa: Hopping Parameter
284 * @param itercg: Counts the iterations of the conjugate gradient
285 *
286 * @return 0 on success, integer error code otherwise
287 */
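	/*
	 * Unlike Congradq, this routine works on the full Gor'kov structure (kferm rather than kferm2
	 * elements per field) and applies the operator through Dslash_f/Dslashd_f instead of the
	 * up/down-partitioned Hdslash variants.
	 */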
288 const char *funcname = "Congradp";
289 //Return value
290 int ret_val=0;
291 const double resid = res*res;
292 //These were only evaluated on the first niterx iteration, so we evaluate them once outside the loop instead.
293 //These alpha and beta terms should be double, but that causes issues with BLAS. Instead we declare
294 //them Complex and work with the real part (especially for α_d)
295 //Give initial values. These will be overwritten if niterx>0
296#ifdef __NVCC__
297 Complex_f *p_f, *r_f, *xi_f, *x1_f, *x2_f;
298 int device; cudaGetDevice(&device);
299#ifdef _DEBUG
300 cudaMallocManaged((void **)&p_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
301 cudaMallocManaged((void **)&r_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
302 cudaMallocManaged((void **)&x1_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
303 cudaMallocManaged((void **)&x2_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
304 cudaMallocManaged((void **)&xi_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
305#else
306 cudaMalloc((void **)&p_f, kfermHalo*sizeof(Complex_f));
307 cudaMalloc((void **)&r_f, kferm*sizeof(Complex_f));
308 cudaMalloc((void **)&x1_f, kfermHalo*sizeof(Complex_f));
309 cudaMalloc((void **)&x2_f, kferm*sizeof(Complex_f));
310 cudaMalloc((void **)&xi_f, kferm*sizeof(Complex_f));
311#endif
312#else
313 Complex_f *p_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
314 Complex_f *r_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
315 Complex_f *x1_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
316 Complex_f *x2_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
317 Complex_f *xi_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
318#endif
319 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
320 double alphan=0.0;
321 //Instead of copying element-wise in a loop, use memcpy.
322#ifdef __NVCC__
323 //Get xi in single precision, then swap to AoS format
324 cuComplex_convert(p_f,xi,kferm,true,dimBlock,dimGrid);
325 Transpose_c(p_f,ngorkov*nc,kvol,dimGrid,dimBlock);
326 cudaMemcpy(xi_f,p_f,kferm*sizeof(Complex_f),cudaMemcpyDefault);
327
328 //And repeat for r
329 cuComplex_convert(r_f,Phi+na*kferm,kferm,true,dimBlock,dimGrid);
330 Transpose_c(r_f,ngorkov*nc,kvol,dimGrid,dimBlock);
331
332 //Flip all the gauge fields around so memory is coalesced
333 Transpose_c(u11t,ndim,kvol,dimGrid,dimBlock);
334 Transpose_c(u12t,ndim,kvol,dimGrid,dimBlock);
335#else
336#pragma omp parallel for simd aligned(p_f,xi_f,xi,r_f,Phi:AVX)
337 for(int i =0;i<kferm;i++){
338 p_f[i]=xi_f[i]=(Complex_f)xi[i];
339 r_f[i]=(Complex_f)Phi[na*kferm+i];
340 }
341#endif
342
343 // Declaring placeholder arrays
344 // This x1 is NOT related to the /common/vectorp/X1 in the FORTRAN code and should not
345 // be confused with X1 the global variable
346
347 //niterx isn't called as an index but we'll start from zero with the C code to make the
348 //if statements quicker to type
349 double betan;
350#ifdef __NVCC__
351 cudaDeviceSynchronise();
352#endif
353 for((*itercg)=0; (*itercg)<=niterc; (*itercg)++){
354 //Don't overwrite on first run.
355 //x2=(M^†)x1=(M^†)Mp
356 Dslash_f(x1_f,p_f,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa);
357 Dslashd_f(x2_f,x1_f,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa);
358#ifdef __NVCC__
359 cudaDeviceSynchronise();
360#endif
361 //We can't evaluate α on the first iteration because we need β_n first.
362 if(*itercg){
363 //x*.x
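			//α_d = x1*.x1 = (Mp)^†(Mp) = p.(M^†M)p, so only the squared norm of x1 is needed here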
364#ifdef USE_BLAS
365 float alphad_f;
366#ifdef __NVCC__
367 cublasScnrm2(cublas_handle,kferm,(cuComplex*) x1_f, 1,(float *)&alphad_f);
369#else
370 alphad_f = cblas_scnrm2(kferm, x1_f, 1);
371#endif
372 alphad = alphad_f*alphad_f;
373#else
374 alphad=0;
375 for(int i = 0; i<kferm; i++)
376 alphad+=conj(x1_f[i])*x1_f[i];
377#endif
378#if(nproc>1)
379 Par_fsum((float *)&alphad);
380#endif
381 //α=(r.r)/p(M^†)Mp
382 alpha=alphan/creal(alphad);
383 // Complex_f alpha_f = (Complex_f)alpha;
384 //x+αp
385#ifdef USE_BLAS
386 Complex_f alpha_f=(Complex_f)alpha;
387#ifdef __NVCC__
388 cublasCaxpy(cublas_handle,kferm,(cuComplex*) &alpha_f,(cuComplex*) p_f,1,(cuComplex*) xi_f,1);
389#else
390 cblas_caxpy(kferm, (Complex_f*)&alpha_f,(Complex_f*)p_f, 1, (Complex_f*)xi_f, 1);
391#endif
392#else
393#pragma omp parallel for simd aligned(xi_f,p_f:AVX)
394 for(int i = 0; i<kferm; i++)
395 xi_f[i]+=alpha*p_f[i];
396#endif
397 }
398
399 //r-=α(M^†M)p and β_n=r*.r
400#if defined USE_BLAS
401 Complex_f alpha_m=(Complex_f)(-alpha);
402 float betan_f=0;
403#ifdef __NVCC__
404 cublasCaxpy(cublas_handle,kferm, (cuComplex *)&alpha_m,(cuComplex *) x2_f, 1,(cuComplex *) r_f, 1);
405 //cudaDeviceSynchronise();
406 //r*.r
407 cublasScnrm2(cublas_handle,kferm,(cuComplex *) r_f,1,(float *)&betan_f);
408#else
409 cblas_caxpy(kferm,(Complex_f*) &alpha_m,(Complex_f*) x2_f, 1,(Complex_f*) r_f, 1);
410 //r*.r
411 betan_f = cblas_scnrm2(kferm, (Complex_f*)r_f,1);
412#endif
413 //Gotta square it to "undo" the norm
414 betan=betan_f*betan_f;
415#else
416 //Just like Congradq, this loop could be unrolled but will need a reduction to deal with the betan
417 //addition.
418 betan = 0;
419 //If we get a small enough β_n before hitting the iteration cap we break
420#pragma omp parallel for simd aligned(x2_f,r_f:AVX) reduction(+:betan)
421 for(int i = 0; i<kferm;i++){
422 r_f[i]-=alpha*x2_f[i];
423 betan+=conj(r_f[i])*r_f[i];
424 }
425#endif
426 //This is basically just congradq at the end. Check there for comments
427#if(nproc>1)
428 Par_dsum(&betan);
429#endif
430#ifdef _DEBUG
431#ifdef _DEBUGCG
432 char *endline = "\n";
433#else
434 char *endline = "\r";
435#endif
436 if(!rank) printf("Iter (CG) = %i β_n= %e α= %e%s", *itercg, betan, creal(alpha),endline);
437#endif
438 if(betan<resid){
439 //Started counting from zero so add one to make it accurate
440 (*itercg)++;
441#ifdef _DEBUG
442 if(!rank) printf("\nIter (CG) = %i resid = %e toler = %e\n", *itercg, betan, resid);
443#endif
444 ret_val=0; break;
445 }
446 else if(*itercg==niterc-1){
447 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n",
448 ITERLIM, funcname, niterc, betan);
449 ret_val=ITERLIM; break;
450 }
451 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
452 Complex beta = (*itercg) ? betan/betad : 0;
453 betad=betan; alphan=betan;
454 //p=r+βp doesn't exist as a single call in standard BLAS. This is NOT an axpy case as we're
455 //multiplying y by β instead of x.
456 //There is cblas_caxpby in the MKL (and AMD BLAS) though: set a = 1 and b = β.
457#ifdef USE_BLAS
458 Complex_f beta_f = (Complex_f)beta;
459 Complex_f a = 1.0;
460#ifdef __NVCC__
461 cublasCscal(cublas_handle,kferm,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
462 cublasCaxpy(cublas_handle,kferm,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
463 cudaDeviceSynchronise();
464#elif (defined __INTEL_MKL__ || defined AMD_BLAS)
465 cblas_caxpby(kferm, &a, r_f, 1, &beta_f, p_f, 1);
466#else
467 cblas_cscal(kferm,&beta_f,p_f,1);
468 cblas_caxpy(kferm,&a,r_f,1,p_f,1);
469#endif
470#else
471#pragma omp parallel for simd aligned(r_f,p_f:AVX)
472 for(int i=0; i<kferm; i++)
473 p_f[i]=r_f[i]+beta*p_f[i];
474#endif
475 }
476#ifdef __NVCC__
477 Transpose_c(xi_f,kvol,ngorkov*nc,dimGrid,dimBlock);
478 Transpose_c(r_f,kvol,ngorkov*nc,dimGrid,dimBlock);
479
480 Transpose_c(u11t,kvol,ndim,dimGrid,dimBlock);
481 Transpose_c(u12t,kvol,ndim,dimGrid,dimBlock);
482 cudaDeviceSynchronise();
483 cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
484#else
485#pragma omp simd
486 for(int i = 0; i <kferm;i++){
487 xi[i]=(Complex)xi_f[i];
488 }
489#endif
490#ifdef __NVCC__
491 cudaFree(p_f); cudaFree(r_f);cudaFree(x1_f); cudaFree(x2_f); cudaFree(xi_f);
492#else
493 free(p_f); free(r_f); free(x1_f); free(x2_f); free(xi_f);
494#endif
495 return ret_val;
496}
497/* Old clutter for debugging CG
498 * Pre mult
499#ifdef _DEBUGCG
500memset(x1_f,0,kferm2Halo*sizeof(Complex_f));
501#ifdef __NVCC__
502cudaMemPrefetchAsync(x1_f,kferm2*sizeof(Complex_f),device,NULL);
503cudaDeviceSynchronise();
504#endif
505printf("\nPre mult:\tp_f[kferm2-1]=%.5e+%.5ei\tx1_f[kferm2-1]=%.5e+%.5ei\tx2_f[kferm2-1]=%.5e+%.5ei\t",\
506creal(p_f[kferm2-1]),cimag(p_f[kferm2-1]),creal(x1_f[kferm2-1]),cimag(x1_f[kferm2-1]),creal(x2_f[kferm2-1]),cimag(x2_f[kferm2-1]));
507#endif
508
509First mult
510#ifdef _DEBUGCG
511printf("\nHdslash_f:\tp_f[kferm2-1]=%.5e+%.5ei\tx1_f[kferm2-1]=%.5e+%.5ei\tx2_f[kferm2-1]=%.5e+%.5ei",\
512creal(p_f[kferm2-1]),cimag(p_f[kferm2-1]),creal(x1_f[kferm2-1]),cimag(x1_f[kferm2-1]),creal(x2_f[kferm2-1]),cimag(x2_f[kferm2-1]));
513#endif
514Post mult
515#ifdef _DEBUGCG
516printf("\nHdslashd_f:\tp_f[kferm2-1]=%.5e+%.5ei\tx1_f[kferm2-1]=%.5e+%.5ei\tx2_f[kferm2-1]=%.5e+%.5ei\n",\
517creal(p_f[kferm2-1]),cimag(p_f[kferm2-1]),creal(x1_f[kferm2-1]),cimag(x1_f[kferm2-1]),creal(x2_f[kferm2-1]),cimag(x2_f[kferm2-1]));
518#endif
519
520 Gammas
521#ifdef _DEBUGCG
522printf("Gammas:\n");
523for(int i=0;i<5;i++){
524for(int j=0;j<4;j++)
525printf("%.5e+%.5ei\t",creal(gamval_f[i*4+j]),cimag(gamval_f[i*4+j]));
526printf("\n");
527}
528printf("\nConstants (index %d):\nu11t[kvol-1]=%e+%.5ei\tu12t[kvol-1]=%e+%.5ei\tdk4m=%.5e\tdk4p=%.5e\tjqq=%.5e+I%.5f\tkappa=%.5f\n",\
529kvol-1,creal(u11t[kvol-1]),cimag(u11t[kvol-1]),creal(u12t[kvol-1]),cimag(u12t[kvol-1]),dk4m[kvol-1],dk4p[kvol-1],creal(jqq),cimag(jqq),akappa);
530#endif
531*/
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Mixed Precision Conjugate Gradient (up/down flavour partitioning). Solves (M†M)x=Φ. The matrix multiplication step is done at single precision, while the update is done at double.
Definition congrad.c:7
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves (M†M)x=Φ. The matrix multiplication step is done at single precision, while the update is done at double.
Definition congrad.c:262
Matrix multiplication and related declarations.
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates phi = M r (the full Dirac operator) in single precision.
Definition matrices.c:462
int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates phi = M† r for the up/down-partitioned operator in single precision.
Definition matrices.c:802
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates phi = M† r in single precision.
Definition matrices.c:584
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates phi = M r for the up/down-partitioned operator in single precision.
Definition matrices.c:711
int Par_fsum(float *dval)
Performs a reduction on a float dval to get a sum which is then distributed to all ranks.
int rank
The MPI rank.
Definition par_mpi.c:22
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define niterc
Hard limit for runaway trajectories in Conjugate gradient.
Definition sizes.h:163
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define ndirac
Dirac indices.
Definition sizes.h:177
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225