su2hmc
congrad.c
1
6#include <matrices.h>
7int Congradq(int na,double res,Complex *X1,Complex *r,Complex_f *ut[2],unsigned int *iu,unsigned int *id,\
8 Complex_f *gamval_f,int *gamin,float *dk[2],Complex_f jqq,float akappa,int *itercg){
9 /*
10 * @brief Matrix Inversion via Mixed Precision Conjugate Gradient
11 * Solves @f$(M^\dagger)Mx=\Phi@f$
12 * Implements up/down partitioning
13 * The matrix multiplication step is done at single precision, while the update is done at double
14 *
15 * @param na: Flavour index
16 * @param res: Limit for conjugate gradient
17 * @param X1: @f$\Phi@f$ initially, returned as @f$(M^\dagger M)^{-1} \Phi@f$
18 * @param r: Partition of @f$\Phi@f$ being used. Gets recycled as the residual vector
19 * @param ut[0]: First colour's trial field
20 * @param ut[1]: Second colour's trial field
21 * @param iu: Upper halo indices
22 * @param id: Lower halo indices
23 * @param gamval_f: Gamma matrices
24 * @param gamin: Dirac indices
25 * @param dk[0]:
26 * @param dk[1]:
27 * @param jqq: Diquark source
28 * @param akappa: Hopping Parameter
29 * @param itercg: Counts the iterations of the conjugate gradient
30 *
31 * @see Hdslash_f(), Hdslashd_f(), Par_fsum(), Par_dsum()
32 *
33 * @return 0 on success, integer error code otherwise
34 */
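	/* For orientation, the loop below is standard conjugate gradient on the normal equations,
	 * with the operator applied in single precision and the scalars α, β accumulated in double:
	 *	α_k = (r_k.r_k)/(p_k.(M^†M+|J|²κ²)p_k)
	 *	x_{k+1} = x_k + α_k p_k
	 *	r_{k+1} = r_k - α_k (M^†M+|J|²κ²)p_k
	 *	β_k = (r_{k+1}.r_{k+1})/(r_k.r_k)
	 *	p_{k+1} = r_{k+1} + β_k p_k
	 */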
35 const char *funcname = "Congradq";
36 int ret_val=0;
37 const double resid = res*res;
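	//Working with the squared tolerance lets the loop compare β_n=|r|² against resid directly,
	//so no square root is needed each iteration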
38 //The κ^2 factor is needed to normalise the fields correctly
39 //jqq is the diquark condensate and is global scope.
40 const Complex_f fac_f = conj(jqq)*jqq*akappa*akappa;
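	//fac_f = |jqq|²κ² is the constant shift added to M^†M below, so the operator actually
	//inverted is (M^†M+|jqq|²κ²)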
41 //These were evaluated only in the first loop of niterx so we'll just do them once outside of the loop.
42 //n suffix is numerator, d is denominator
43 double alphan=1;
44 //The alpha and beta terms should be double, but that causes issues with BLAS pointers. Instead we declare
45 //them complex and work with the real part (especially for α_d)
46 //Give initial values; these will be overwritten if niterx>0
47 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
48 //Because we're dealing with flattened arrays here we can call cblas safely without the halo
49#ifdef __NVCC__
50 Complex_f *p_f, *x1_f, *x2_f, *r_f, *X1_f;
51 int device=-1; cudaGetDevice(&device);
52
53#ifdef _DEBUG
54 cudaMallocManaged((void **)&p_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
55 cudaMallocManaged((void **)&x1_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
56 cudaMallocManaged((void **)&x2_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
57 cudaMallocManaged((void **)&r_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
58 cudaMallocManaged((void **)&X1_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
59#else
60 //First two have halo exchanges, so getting NCCL working is important
61 cudaMallocAsync((void **)&p_f, kferm2Halo*sizeof(Complex_f),streams[0]);
62 cudaMallocAsync((void **)&x1_f, kferm2Halo*sizeof(Complex_f),streams[1]);
63 cudaMallocAsync((void **)&x2_f, kferm2*sizeof(Complex_f),streams[2]);
64 cudaMallocAsync((void **)&r_f, kferm2*sizeof(Complex_f),streams[3]);
65 cudaMallocAsync((void **)&X1_f, kferm2*sizeof(Complex_f),streams[4]);
66#endif
67#else
68 Complex_f *p_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
69 Complex_f *x1_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
70 Complex_f *x2_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
71 Complex_f *X1_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
72 Complex_f *r_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
73#endif
74 //Instead of copying element-wise in a loop, use memcpy.
75#ifdef __NVCC__
76 //Get X1 in single precision, then swap to AoS format
77 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
78
79 //And repeat for r
80 cuComplex_convert(r_f,r,kferm2,true,dimBlock,dimGrid);
81
82 //cudaMemcpy is blocking, so use async instead
83 cudaMemcpyAsync(p_f, X1_f, kferm2*sizeof(Complex_f),cudaMemcpyDeviceToDevice,NULL);
84 //Flip all the gauge fields around so memory is coalesced
85#else
86#pragma omp parallel for simd
87 for(int i=0;i<kferm2;i++){
88 r_f[i]=(Complex_f)r[i];
89 X1_f[i]=(Complex_f)X1[i];
90 }
91 memcpy(p_f, X1_f, kferm2*sizeof(Complex_f));
92#endif
93
94 //niterx isn't used as an index, but we start counting from zero in the C code to make the
95 //if statements quicker to type
96 double betan; bool pf=true;
97 for(*itercg=0; *itercg<niterc; (*itercg)++){
98 //x2 = (M^†M)p
99 //No need to synchronise here. The memcpy in Hdslash is blocking
100 Hdslash_f(x1_f,p_f,ut,iu,id,gamval_f,gamin,dk,akappa);
101 Hdslashd_f(x2_f,x1_f,ut,iu,id,gamval_f,gamin,dk,akappa);
102#ifdef __NVCC__
103 cudaDeviceSynchronise();
104#endif
105 //x2 = (M^†M+J^2)p
106 //No point adding zero a couple of hundred times if the diquark source is zero
107 if(fac_f!=0){
108#ifdef __NVCC__
109 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&fac_f,(cuComplex *)p_f,1,(cuComplex *)x2_f,1);
110#elif defined USE_BLAS
111 cblas_caxpy(kferm2, &fac_f, p_f, 1, x2_f, 1);
112#else
113#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
114 for(int i=0; i<kferm2; i++)
115 x2_f[i]+=fac_f*p_f[i];
116#endif
117 }
118 //We can't evaluate α on the first *itercg because we need to get β_n.
119 if(*itercg){
120 //α_d= p* (M^†M+J^2)p
121#ifdef __NVCC__
122 cublasCdotc(cublas_handle,kferm2,(cuComplex *)p_f,1,(cuComplex *)x2_f,1,(cuComplex *)&alphad);
123#elif defined USE_BLAS
124 cblas_cdotc_sub(kferm2, p_f, 1, x2_f, 1, &alphad);
125#else
126 alphad=0;
127#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
128 for(int i=0; i<kferm2; i++)
129 alphad+=conj(p_f[i])*x2_f[i];
130#endif
131 //For now I'll cast it into a float for the reduction. Each rank only sends and writes
132 //to the real part so this is fine
133#if(nproc>1)
134 Par_fsum((float *)&alphad);
135#endif
136 //α=α_n/α_d = (r.r)/p(M^†M)p
137 alpha=alphan/creal(alphad);
138 //x+αp
139#ifdef __NVCC__
140 Complex_f alpha_f = (Complex_f)alpha;
141 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&alpha_f,(cuComplex *)p_f,1,(cuComplex *)X1_f,1);
142#elif defined USE_BLAS
143 Complex_f alpha_f = (Complex_f)alpha;
144 cblas_caxpy(kferm2, &alpha_f, p_f, 1, X1_f, 1);
145#else
146 for(int i=0; i<kferm2; i++)
147 X1_f[i]+=alpha*p_f[i];
148#endif
149 }
150 // r_n+1 = r_n-α(M^† M)p_n and β_n=r*.r
151#ifdef __NVCC__
152 Complex_f alpha_m=(Complex_f)(-alpha);
153 cublasCaxpy(cublas_handle, kferm2,(cuComplex *)&alpha_m,(cuComplex *)x2_f,1,(cuComplex *)r_f,1);
154 float betan_f;
155 cublasScnrm2(cublas_handle,kferm2,(cuComplex *)r_f,1,&betan_f);
156 betan = betan_f*betan_f;
157#elif defined USE_BLAS
158 Complex_f alpha_m = (Complex_f)(-alpha);
159 cblas_caxpy(kferm2, &alpha_m, x2_f, 1, r_f, 1);
160 //Undo the negation for the BLAS routine
161 float betan_f = cblas_scnrm2(kferm2, r_f,1);
162 //Gotta square it to "undo" the norm
163 betan = betan_f*betan_f;
164#else
165 betan=0;
166#pragma omp parallel for simd aligned(r_f,x2_f:AVX) reduction(+:betan)
167 for(int i=0; i<kferm2; i++){
168 r_f[i]-=alpha*x2_f[i];
169 betan += conj(r_f[i])*r_f[i];
170 }
171#endif
172 //And... reduce.
173#if(nproc>1)
174 Par_dsum(&betan);
175#endif
176#ifdef _DEBUGCG
177#warning "CG Debugging"
178 char *endline = "\n";
179#else
180 char *endline = "\r";
181#endif
182#ifdef _DEBUG
183 if(!rank) printf("Iter(CG)=%i\tβ_n=%e\tα=%e%s", *itercg, betan, alpha,endline);
184#endif
185 if(betan<resid){
186 (*itercg)++;
187#ifdef _DEBUG
188 if(!rank) printf("\nIter(CG)=%i\tResidue: %e\tTolerance: %e\n", *itercg, betan, resid);
189#endif
190 ret_val=0; break;
191 }
192 else if(*itercg==niterc-1){
193 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n", ITERLIM, funcname, *itercg, betan);
194 ret_val=ITERLIM; break;
195 }
196 //Here we evaluate β=(r_{k+1}.r_{k+1})/(r_k.r_k) and then shuffle our indices down the line.
197 //On the first iteration we define beta to be zero.
198 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
199 Complex beta = (*itercg) ? betan/betad : 0;
200 betad=betan; alphan=betan;
201 //A routine for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
202 //β instead of x.
203#ifdef __NVCC__
204 Complex_f beta_f=(Complex_f)beta;
205 __managed__ Complex_f a = 1.0;
206 cublasCscal(cublas_handle,kferm2,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
207 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
208#elif (defined __INTEL_MKL__)
209 Complex_f a = 1.0;
210 Complex_f beta_f=(Complex_f)beta;
211 //There is cblas_?axpby in the MKL and AMD though, set a = 1 and b = β.
212 //If we get a small enough β_n before hitting the iteration cap we break
213 cblas_caxpby(kferm2, &a, r_f, 1, &beta_f, p_f, 1);
214#elif defined USE_BLAS
215 Complex_f beta_f=(Complex_f)beta;
216 cblas_cscal(kferm2,&beta_f,p_f,1);
217 Complex_f a = 1.0;
218 cblas_caxpy(kferm2,&a,r_f,1,p_f,1);
219#else
220 for(int i=0; i<kferm2; i++)
221 p_f[i]=r_f[i]+beta*p_f[i];
222#endif
223 }
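	//The CG loop is done: X1_f now holds the single-precision solution and r_f the final residual;
	//both are converted back to double precision for the caller below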
224#ifdef __NVCC__
225//Restore arrays back to their previous layout
226 cuComplex_convert(X1_f,X1,kferm2,false,dimBlock,dimGrid);
227 cuComplex_convert(r_f,r,kferm2,false,dimBlock,dimGrid);
228#else
229 for(int i=0;i<kferm2;i++){
230 X1[i]=(Complex)X1_f[i];
231 r[i]=(Complex)r_f[i];
232 }
233#endif
234#ifdef __NVCC__
235#ifdef _DEBUG
236 cudaDeviceSynchronise();
237 cudaFree(x1_f);cudaFree(x2_f); cudaFree(p_f);
238 cudaFree(r_f);cudaFree(X1_f);
239#else
240 //streams match the ones that allocated them.
241 cudaFreeAsync(p_f,streams[0]);cudaFreeAsync(x1_f,streams[1]);cudaFreeAsync(x2_f,streams[2]);
242 cudaDeviceSynchronise();
243 cudaFreeAsync(r_f,streams[3]);cudaFreeAsync(X1_f,streams[4]);
244#endif
245#else
246 free(x1_f);free(x2_f); free(p_f); free(r_f); free(X1_f);
247#endif
248 return ret_val;
249}
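/* A minimal sketch of how Congradq might be driven by a caller. The names R1, rescgg, ut_f and
 * dk_f are illustrative assumptions rather than identifiers defined in this file; the trial
 * fields, index tables and pseudofermion partition live with the caller:
 *	int itercg = 0;
 *	int status = Congradq(na, rescgg, X1, R1, ut_f, iu, id, gamval_f, gamin, dk_f, jqq, akappa, &itercg);
 *	if(status==ITERLIM)
 *		fprintf(stderr, "CG hit the iteration cap after %d iterations\n", itercg);
 */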
250int Congradp(int na,double res,Complex *Phi,Complex *xi,Complex_f *ut[2],unsigned int *iu,unsigned int *id,\
251 Complex_f *gamval,int *gamin, float *dk[2],Complex_f jqq,float akappa,int *itercg){
252 /*
253 * @brief Matrix Inversion via Conjugate Gradient
254 * Solves @f$(M^\dagger)Mx=\Phi@f$
255 * No even/odd partitioning.
256 * The matrix multiplication step is done at single precision, while the update is done at double
257 *
258 * @param na: Flavour index
259 * @param res: Limit for conjugate gradient
260 * @param Phi: @f$\Phi@f$, the source vector
261 * @param xi: Initial guess, returned as @f$(M^\dagger M)^{-1} \Phi@f$
262 * @param ut[0]: First colour's trial field
263 * @param ut[1]: Second colour's trial field
264 * @param iu: Upper halo indices
265 * @param id: Lower halo indices
266 * @param gamval: Gamma matrices
267 * @param gamin: Dirac indices
268 * @param dk[0]:
269 * @param dk[1]:
270 * @param jqq: Diquark source
271 * @param akappa: Hopping Parameter
272 * @param itercg: Counts the iterations of the conjugate gradient
273 *
274 * @return 0 on success, integer error code otherwise
275 */
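	/* Note on the α_d step inside the loop: since x1=Mp, the denominator p.(M^†M)p equals
	 * (Mp)^†(Mp)=‖x1‖², which is why a single-precision norm of x1_f is taken instead of an
	 * explicit dot product with x2_f. */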
276 const char *funcname = "Congradp";
277 //Return value
278 int ret_val=0;
279 const double resid = res*res;
280 //These were evaluated only in the first loop of niterx so we'll just do them once outside of the loop.
281 //These alpha and beta terms should be double, but that causes issues with BLAS. Instead we declare
282 //them Complex and work with the real part (especially for α_d)
283 //Give initial values; these will be overwritten if niterx>0
284#ifdef __NVCC__
285 Complex_f *p_f, *r_f, *xi_f, *x1_f, *x2_f;
286 int device; cudaGetDevice(&device);
287#ifdef _DEBUG
288 cudaMallocManaged((void **)&p_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
289 cudaMallocManaged((void **)&r_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
290 cudaMallocManaged((void **)&x1_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
291 cudaMallocManaged((void **)&x2_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
292 cudaMallocManaged((void **)&xi_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
293#else
294 cudaMalloc((void **)&p_f, kfermHalo*sizeof(Complex_f));
295 cudaMalloc((void **)&r_f, kferm*sizeof(Complex_f));
296 cudaMalloc((void **)&x1_f, kfermHalo*sizeof(Complex_f));
297 cudaMalloc((void **)&x2_f, kferm*sizeof(Complex_f));
298 cudaMalloc((void **)&xi_f, kferm*sizeof(Complex_f));
299#endif
300#else
301 Complex_f *p_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
302 Complex_f *r_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
303 Complex_f *x1_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
304 Complex_f *x2_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
305 Complex_f *xi_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
306#endif
307 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
308 double alphan=0.0;
309 //Instead of copying element-wise in a loop, use memcpy.
310#ifdef __NVCC__
311 //Get xi in single precision, then swap to AoS format
312 cuComplex_convert(p_f,xi,kferm,true,dimBlock,dimGrid);
313 Transpose_c(p_f,ngorkov*nc,kvol);
314 cudaMemcpy(xi_f,p_f,kferm*sizeof(Complex_f),cudaMemcpyDefault);
315
316 //And repeat for r
317 cuComplex_convert(r_f,Phi+na*kferm,kferm,true,dimBlock,dimGrid);
318 Transpose_c(r_f,ngorkov*nc,kvol);
319#else
320#pragma omp parallel for simd aligned(p_f,xi_f,xi,r_f,Phi:AVX)
321 for(int i =0;i<kferm;i++){
322 p_f[i]=xi_f[i]=(Complex_f)xi[i];
323 r_f[i]=(Complex_f)Phi[na*kferm+i];
324 }
325#endif
326
327 // Declaring placeholder arrays
328 // This x1 is NOT related to the /common/vectorp/X1 in the FORTRAN code and should not
329 // be confused with X1 the global variable
330
331 //niterx isn't used as an index, but we start counting from zero in the C code to make the
332 //if statements quicker to type
333 double betan;
334#ifdef __NVCC__
335 cudaDeviceSynchronise();
336#endif
337 for((*itercg)=0; (*itercg)<=niterc; (*itercg)++){
338 //Don't overwrite on first run.
339 //x2=(M^†)x1=(M^†)Mp
340 Dslash_f(x1_f,p_f,ut[0],ut[1],iu,id,gamval,gamin,dk,jqq,akappa);
341 Dslashd_f(x2_f,x1_f,ut[0],ut[1],iu,id,gamval,gamin,dk,jqq,akappa);
342#ifdef __NVCC__
343 cudaDeviceSynchronise();
344#endif
345 //We can't evaluate α on the first niterx because we need to get β_n.
346 if(*itercg){
347 //x*.x
348#ifdef USE_BLAS
349 float alphad_f;
350#ifdef __NVCC__
351 cublasScnrm2(cublas_handle,kferm,(cuComplex*) x1_f, 1,(float *)&alphad_f);
352 alphad = alphad_f*alphad_f;
353#else
354 alphad_f = cblas_scnrm2(kferm, x1_f, 1);
355#endif
356 alphad = alphad_f*alphad_f;
357#else
358 alphad=0;
359 for(int i = 0; i<kferm; i++)
360 alphad+=conj(x1_f[i])*x1_f[i];
361#endif
362#if(nproc>1)
363 Par_fsum((float *)&alphad);
364#endif
365 //α=(r.r)/p(M^†)Mp
366 alpha=alphan/creal(alphad);
367 // Complex_f alpha_f = (Complex_f)alpha;
368 //x+αp
369#ifdef USE_BLAS
370 Complex_f alpha_f=(float)alpha;
371#ifdef __NVCC__
372 cublasCaxpy(cublas_handle,kferm,(cuComplex*) &alpha_f,(cuComplex*) p_f,1,(cuComplex*) xi_f,1);
373#else
374 cblas_caxpy(kferm, (Complex_f*)&alpha_f,(Complex_f*)p_f, 1, (Complex_f*)xi_f, 1);
375#endif
376#else
377#pragma omp parallel for simd aligned(xi_f,p_f:AVX)
378 for(int i = 0; i<kferm; i++)
379 xi_f[i]+=alpha*p_f[i];
380#endif
381 }
382
383 //r_{n+1}=r_n-α(M^†)Mp_n and β_n=r*.r
384#if defined USE_BLAS
385 Complex_f alpha_m=(Complex_f)(-alpha);
386 float betan_f=0;
387#ifdef __NVCC__
388 cublasCaxpy(cublas_handle,kferm, (cuComplex *)&alpha_m,(cuComplex *) x2_f, 1,(cuComplex *) r_f, 1);
389 //cudaDeviceSynchronise();
390 //r*.r
391 cublasScnrm2(cublas_handle,kferm,(cuComplex *) r_f,1,(float *)&betan_f);
392#else
393 cblas_caxpy(kferm,(Complex_f*) &alpha_m,(Complex_f*) x2_f, 1,(Complex_f*) r_f, 1);
394 //r*.r
395 betan_f = cblas_scnrm2(kferm, (Complex_f*)r_f,1);
396#endif
397 //Gotta square it to "undo" the norm
398 betan=betan_f*betan_f;
399#else
400 //Just like Congradq, this loop could be unrolled but will need a reduction to deal with the betan
401 //addition.
402 betan = 0;
403 //If we get a small enough β_n before hitting the iteration cap we break
404#pragma omp parallel for simd aligned(x2_f,r_f:AVX) reduction(+:betan)
405 for(int i = 0; i<kferm;i++){
406 r_f[i]-=alpha*x2_f[i];
407 betan+=conj(r_f[i])*r_f[i];
408 }
409#endif
410 //This is basically just congradq at the end. Check there for comments
411#if(nproc>1)
412 Par_dsum(&betan);
413#endif
414#ifdef _DEBUG
415#ifdef _DEBUGCG
416 char *endline = "\n";
417#else
418 char *endline = "\r";
419#endif
420 if(!rank) printf("Iter (CG) = %i β_n= %e α= %e%s", *itercg, betan, alpha,endline);
421#endif
422 if(betan<resid){
423 //Started counting from zero so add one to make it accurate
424 (*itercg)++;
425#ifdef _DEBUG
426 if(!rank) printf("\nIter (CG) = %i resid = %e toler = %e\n", *itercg, betan, resid);
427#endif
428 ret_val=0; break;
429 }
430 else if(*itercg==niterc-1){
431 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n",
432 ITERLIM, funcname, niterc, betan);
433 ret_val=ITERLIM; break;
434 }
435 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
436 Complex beta = (*itercg) ? betan/betad : 0;
437 betad=betan; alphan=betan;
438 //A routine for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
439 //β instead of x.
440 //There is cblas_caxpby in the MKL though, set a = 1 and b = β.
441#ifdef USE_BLAS
442 Complex_f beta_f = (Complex_f)beta;
443 Complex_f a = 1.0;
444#ifdef __NVCC__
445 cublasCscal(cublas_handle,kferm,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
446 cublasCaxpy(cublas_handle,kferm,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
447 cudaDeviceSynchronise();
448#elif (defined __INTEL_MKL__ || defined AMD_BLAS)
449 cblas_caxpby(kferm, &a, r_f, 1, &beta_f, p_f, 1);
450#else
451 cblas_cscal(kferm,&beta_f,p_f,1);
452 cblas_caxpy(kferm,&a,r_f,1,p_f,1);
453#endif
454#else
455#pragma omp parallel for simd aligned(r_f,p_f:AVX)
456 for(int i=0; i<kferm; i++)
457 p_f[i]=r_f[i]+beta*p_f[i];
458#endif
459 }
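	//Undo the AoS transpose and convert the solution back to double precision for the caller;
	//note that, unlike Congradq, the local residual r_f is not copied back out here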
460#ifdef __NVCC__
461 Transpose_c(xi_f,kvol,ngorkov*nc);
462 Transpose_c(r_f,kvol,ngorkov*nc);
463
464 cudaDeviceSynchronise();
465 cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
466#else
467#pragma omp simd
468 for(int i = 0; i <kferm;i++){
469 xi[i]=(Complex)xi_f[i];
470 }
471#endif
472#ifdef __NVCC__
473 cudaFree(p_f); cudaFree(r_f);cudaFree(x1_f); cudaFree(x2_f); cudaFree(xi_f);
474#else
475 free(p_f); free(r_f); free(x1_f); free(x2_f); free(xi_f);
476#endif
477 return ret_val;
478}
479/* Old clutter for debugging CG
480 * Pre mult
481#ifdef _DEBUGCG
482memset(x1_f,0,kferm2Halo*sizeof(Complex_f));
483#ifdef __NVCC__
484cudaMemPrefetchAsync(x1_f,kferm2*sizeof(Complex_f),device,NULL);
485cudaDeviceSynchronise();
486#endif
487printf("\nPre mult:\tp_f[kferm2-1]=%.5e+%.5ei\tx1_f[kferm2-1]=%.5e+%.5ei\tx2_f[kferm2-1]=%.5e+%.5ei\t",\
488creal(p_f[kferm2-1]),cimag(p_f[kferm2-1]),creal(x1_f[kferm2-1]),cimag(x1_f[kferm2-1]),creal(x2_f[kferm2-1]),cimag(x2_f[kferm2-1]));
489#endif
490
491First mult
492#ifdef _DEBUGCG
493printf("\nHdslash_f:\tp_f[kferm2-1]=%.5e+%.5ei\tx1_f[kferm2-1]=%.5e+%.5ei\tx2_f[kferm2-1]=%.5e+%.5ei",\
494creal(p_f[kferm2-1]),cimag(p_f[kferm2-1]),creal(x1_f[kferm2-1]),cimag(x1_f[kferm2-1]),creal(x2_f[kferm2-1]),cimag(x2_f[kferm2-1]));
495#endif
496Post mult
497#ifdef _DEBUGCG
498printf("\nHdslashd_f:\tp_f[kferm2-1]=%.5e+%.5ei\tx1_f[kferm2-1]=%.5e+%.5ei\tx2_f[kferm2-1]=%.5e+%.5ei\n",\
499creal(p_f[kferm2-1]),cimag(p_f[kferm2-1]),creal(x1_f[kferm2-1]),cimag(x1_f[kferm2-1]),creal(x2_f[kferm2-1]),cimag(x2_f[kferm2-1]));
500#endif
501
502 Gammas
503#ifdef _DEBUGCG
504printf("Gammas:\n");
505for(int i=0;i<5;i++){
506for(int j=0;j<4;j++)
507printf("%.5e+%.5ei\t",creal(gamval_f[i*4+j]),cimag(gamval_f[i*4+j]));
508printf("\n");
509}
510printf("\nConstants (index %d):\nu11t[kvol-1]=%e+%.5ei\tu12t[kvol-1]=%e+%.5ei\tdk4m=%.5e\tdk4p=%.5e\tjqq=%.5e+I%.5f\tkappa=%.5f\n",\
511kvol-1,creal(u11t[kvol-1]),cimag(u11t[kvol-1]),creal(u12t[kvol-1]),cimag(u12t[kvol-1]),dk4m[kvol-1],dk4p[kvol-1],creal(jqq),cimag(jqq),akappa);
512#endif
513*/
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves (M^†M)x=Φ. Implements up/down pa...
Definition congrad.c:7
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves (M^†M)x=Φ. The matrix multipl...
Definition congrad.c:250
Matrix multiplication and related declarations.
int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates in single precision.
Definition matrices.c:855
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:463
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates in single precision.
Definition matrices.c:712
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:585
int Par_fsum(float *dval)
Performs a reduction on a float dval to get a sum which is then distributed to all ranks.
int rank
The MPI rank.
Definition par_mpi.c:22
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define niterc
Hard limit for runaway trajectories in Conjugate gradient.
Definition sizes.h:163
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225