su2hmc
su2hmc.h File Reference

Function declarations for most of the routines. More...

#include <errorcodes.h>
#include <integrate.h>
#include <sizes.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>


Functions

int Force (double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi, Complex *ut[2], Complex_f *ut_f[2], unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2], float *dk_f[2], Complex_f jqq, float akappa, float beta, double *ancg)
 Calculates the force \(\frac{dS}{d\pi}\) at each intermediate time.
 
int Gauge_force (double *dSdpi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, float beta)
 Calculates the gauge force due to the Wilson Action at each intermediate time.
 
int Init (int istart, int ibound, int iread, float beta, float fmu, float akappa, Complex_f ajq, Complex *u[2], Complex *ut[2], Complex_f *ut_f[2], Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2], float *dk_f[2], unsigned int *iu, unsigned int *id)
 Initialises the system.
 
int Hamilton (double *h, double *s, double res2, double *pp, Complex *X0, Complex *X1, Complex *Phi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, float beta, double *ancgh, int traj)
 Calculate the Hamiltonian.
 
int Congradq (int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
 Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \((M^\dagger M)x=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double precision.
 
int Congradp (int na, double res, Complex *Phi, Complex *xi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
 Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \((M^\dagger M)x=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double precision.
 
int Measure (double *pbp, double *endenf, double *denf, Complex *qq, Complex *qbqb, double res, int *itercg, Complex *ut[2], Complex_f *ut_f[2], unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2], float *dk_f[2], Complex_f jqq, float akappa, Complex *Phi, Complex *R1)
 Calculate fermion expectation values via a noisy estimator.
 
int Average_Plaquette (double *hg, double *avplaqs, double *avplaqt, Complex_f *ut[2], unsigned int *iu, float beta)
 Calculates the gauge action using a new (how new?) lookup table.
 
int SU2plaq (Complex_f *ut[2], Complex_f Sigma[2], unsigned int *iu, int i, int mu, int nu)
 Calculates the plaquette at site i in the \(\mu\)-\(\nu\) plane.
 
double Polyakov (Complex_f *ut[2])
 Calculate the Polyakov loop (no prizes for guessing that one...)
 
int C_gather (Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
 Extracts all the single precision gauge links in the \(\mu\) direction only.
 
int Z_gather (Complex *x, Complex *y, int n, unsigned int *table, unsigned int mu)
 Extracts all the double precision gauge links in the \(\mu\) direction only.
 
int Fill_Small_Phi (int na, Complex *smallPhi, Complex *Phi)
 
int UpDownPart (const int na, Complex *X0, Complex *R1)
 
int Reunitarise (Complex *ut[2])
 Reunitarises u11t and u12t so that conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
 

Detailed Description

Function declarations for most of the routines.

Definition in file su2hmc.h.

Function Documentation

◆ Average_Plaquette()

int Average_Plaquette ( double * hg,
double * avplaqs,
double * avplaqt,
Complex_f * ut[2],
unsigned int * iu,
float beta )

Calculates the gauge action using a new (how new?) lookup table.

Follows a routine called qedplaq in some QED3 code

Parameters
hg       Gauge component of the Hamiltonian
avplaqs  Average spatial plaquette
avplaqt  Average temporal plaquette
ut       The trial fields
iu       Upper halo indices
beta     Inverse gauge coupling
See also
Par_dsum
Returns
Zero on success, integer error code otherwise

Definition at line 20 of file bosonic.c.

20 {
21 /*
22 * Calculates the gauge action using new (how new?) lookup table
23 * Follows a routine called qedplaq in some QED3 code
24 *
25 * Parameters:
26 * =========
27 * hg Gauge component of Hamilton
28 * avplaqs Average spatial Plaquette
29 * avplaqt Average Temporal Plaquette
30 * u11t,u12t The trial fields
31 * iu Upper halo indices
32 * beta Inverse gauge coupling
33 *
34 * Calls:
35 * ======
36 * Par_dsum()
37 *
38 * Return:
39 * ======
40 * Zero on success, integer error code otherwise
41 */
42 const char *funcname = "Average_Plaquette";
43 /*There was a halo exchange here but moved it outside
44 The FORTRAN code used several consecutive loops to get the plaquette
45 Instead we'll just make the arrays variables and do everything in one loop
46 Should work since in the FORTRAN Sigma11[i] only depends on i components for example
47 Since the ν loop doesn't get called for μ=0 we'll start at μ=1
48 */
49#ifdef __NVCC__
50 __managed__ double hgs = 0; __managed__ double hgt = 0;
51 cuAverage_Plaquette(&hgs, &hgt, ut[0], ut[1], iu,dimGrid,dimBlock);
52#else
53 double hgs = 0; double hgt = 0;
54 for(int mu=1;mu<ndim;mu++)
55 for(int nu=0;nu<mu;nu++)
56 //Don't merge into a single loop. Makes vectorisation easier?
57 //Or merge into a single loop and dispense with the a arrays?
58#pragma omp parallel for simd reduction(+:hgs,hgt)
59 for(int i=0;i<kvol;i++){
60 Complex_f Sigma[2];
61 SU2plaq(ut,Sigma,iu,i,mu,nu);
62 switch(mu){
63 //Time component
64 case(ndim-1): hgt -= creal(Sigma[0]);
65 break;
66 //Space component
67 default: hgs -= creal(Sigma[0]);
68 break;
69 }
70 }
71#endif
72#if(nproc>1)
73 Par_dsum(&hgs); Par_dsum(&hgt);
74#endif
75 *avplaqs=-hgs/(3.0*gvol); *avplaqt=-hgt/(gvol*3.0);
76 *hg=(hgs+hgt)*beta;
77#ifdef _DEBUG
78 if(!rank)
79 printf("hgs=%e hgt=%e hg=%e\n", hgs, hgt, *hg);
80#endif
81 return 0;
82}
int SU2plaq(Complex_f *ut[2], Complex_f Sigma[2], unsigned int *iu, int i, int mu, int nu)
Calculates the plaquette at site i in the \(\mu\)-\(\nu\) plane.
Definition bosonic.c:85
int rank
The MPI rank.
Definition par_mpi.c:22
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
#define kvol
Sublattice volume.
Definition sizes.h:154
#define gvol
Lattice volume.
Definition sizes.h:92
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179

References Complex_f, gvol, kvol, ndim, Par_dsum(), rank, and SU2plaq().

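A minimal usage sketch (not part of the library: the wrapper name report_plaquette is hypothetical, and ut_f and iu are assumed to have been prepared by Init()):

#include <stdio.h>
#include <su2hmc.h>

/* Hypothetical wrapper: compute and print the average plaquettes and the
 * gauge part of the Hamiltonian once the trial fields have been set up. */
void report_plaquette(Complex_f *ut_f[2], unsigned int *iu, float beta){
	double hg, avplaqs, avplaqt;
	if(!Average_Plaquette(&hg, &avplaqs, &avplaqt, ut_f, iu, beta))
		printf("avplaqs=%e avplaqt=%e hg=%e\n", avplaqs, avplaqt, hg);
}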

◆ C_gather()

int C_gather ( Complex_f * x,
Complex_f * y,
int n,
unsigned int * table,
unsigned int mu )
inline

Extracts all the single precision gauge links in the \(\mu\) direction only.

Parameters
x      The output
y      The gauge field for a particular colour
n      Number of sites in the gauge field. This is typically kvol
table  Table containing information on nearest neighbours. Usually id or iu
mu     Direction we're interested in extracting
Returns
Zero on success, integer error code otherwise

Definition at line 357 of file su2hmc.c.

358{
359 const char *funcname = "C_gather";
360 //FORTRAN had a second parameter m giving the size of y (kvol+halo) normally
361 //Pointers mean that's not an issue for us so I'm leaving it out
362#ifdef __NVCC__
363 cuC_gather(x,y,n,table,mu,dimBlock,dimGrid);
364#else
365#pragma omp parallel for simd aligned (x,y,table:AVX)
366 for(int i=0; i<n; i++)
367 x[i]=y[table[i*ndim+mu]*ndim+mu];
368#endif
369 return 0;
370}

References Complex_f, and ndim.

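For context, this is the gather-then-swap pattern Gauge_force() uses before building the \(-\nu\) staple; the calls below are taken from the force.c listing further down this page:

//Pull the nu-direction links of each site's down neighbour into a
//contiguous buffer, then swap the halo along mu so that boundary sites
//can be read through iu[] afterwards.
C_gather(ush[0], ut[0], kvol, id, nu);
C_gather(ush[1], ut[1], kvol, id, nu);
#if(nproc>1)
CHalo_swap_dir(ush[0], 1, mu, DOWN); CHalo_swap_dir(ush[1], 1, mu, DOWN);
#endif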

◆ Congradp()

int Congradp ( int na,
double res,
Complex * Phi,
Complex * xi,
Complex_f * ut[2],
unsigned int * iu,
unsigned int * id,
Complex_f * gamval,
int * gamin,
float * dk[2],
Complex_f jqq,
float akappa,
int * itercg )

Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \((M^\dagger M)x=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double precision.

Parameters
na      Flavour index
res     Limit for conjugate gradient
Phi     Pseudofermion field
xi      Returned as \((M^\dagger M)^{-1} \Phi\)
ut      Trial colour fields
iu      Upper halo indices
id      Lower halo indices
gamval  Single precision gamma matrices rescaled by kappa
gamin   Dirac indices
dk      \(\left(1+\gamma_0\right)e^{-\mu}\) and \(\left(1-\gamma_0\right)e^\mu\)
jqq     Diquark source
akappa  Hopping parameter
itercg  Counts the iterations of the conjugate gradient
Returns
0 on success, integer error code otherwise

Definition at line 250 of file congrad.c.

251 {
252 /*
253 * @brief Matrix Inversion via Conjugate Gradient
254 * Solves @f$(M^\dagger)Mx=\Phi@f$
255 * No even/odd partitioning.
256 * The matrix multiplication step is done at single precision, while the update is done at double
257 *
258 * @param na: Flavour index
259 * @param res: Limit for conjugate gradient
260 * @param Phi: @f(\Phi@f) initially,
261 * @param xi: Returned as @f((M^\dagger M)^{-1} \Phi@f)
262 * @param ut[0]: First colour's trial field
263 * @param ut[1]: Second colour's trial field
264 * @param iu: Upper halo indices
265 * @param id: Lower halo indices
266 * @param gamval: Gamma matrices
267 * @param gamin: Dirac indices
268 * @param dk[0]:
269 * @param dk[1]:
270 * @param jqq: Diquark source
271 * @param akappa: Hopping Parameter
272 * @param itercg: Counts the iterations of the conjugate gradient
273 *
274 * @return 0 on success, integer error code otherwise
275 */
276 const char *funcname = "Congradp";
277 //Return value
278 int ret_val=0;
279 const double resid = res*res;
280 //These were evaluated only in the first loop of niterx so we'll just do it outside of the loop.
281 //These alpha and beta terms should be double, but that causes issues with BLAS. Instead we declare
282 //them Complex and work with the real part (especially for α_d)
283 //Give initial values Will be overwritten if niterx>0
284#ifdef __NVCC__
285 Complex_f *p_f, *r_f, *xi_f, *x1_f, *x2_f;
286 int device; cudaGetDevice(&device);
287#ifdef _DEBUG
288 cudaMallocManaged((void **)&p_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
289 cudaMallocManaged((void **)&r_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
290 cudaMallocManaged((void **)&x1_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
291 cudaMallocManaged((void **)&x2_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
292 cudaMallocManaged((void **)&xi_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
293#else
294 cudaMalloc((void **)&p_f, kfermHalo*sizeof(Complex_f));
295 cudaMalloc((void **)&r_f, kferm*sizeof(Complex_f));
296 cudaMalloc((void **)&x1_f, kfermHalo*sizeof(Complex_f));
297 cudaMalloc((void **)&x2_f, kferm*sizeof(Complex_f));
298 cudaMalloc((void **)&xi_f, kferm*sizeof(Complex_f));
299#endif
300#else
301 Complex_f *p_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
302 Complex_f *r_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
303 Complex_f *x1_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
304 Complex_f *x2_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
305 Complex_f *xi_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
306#endif
307 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
308 double alphan=0.0;
309 //Instead of copying element-wise in a loop, use memcpy.
310#ifdef __NVCC__
311 //Get xi in single precision, then swap to AoS format
312 cuComplex_convert(p_f,xi,kferm,true,dimBlock,dimGrid);
313 Transpose_c(p_f,ngorkov*nc,kvol);
314 cudaMemcpy(xi_f,p_f,kferm*sizeof(Complex_f),cudaMemcpyDefault);
315
316 //And repeat for r
317 cuComplex_convert(r_f,Phi+na*kferm,kferm,true,dimBlock,dimGrid);
318 Transpose_c(r_f,ngorkov*nc,kvol);
319#else
320#pragma omp parallel for simd aligned(p_f,xi_f,xi,r_f,Phi:AVX)
321 for(int i =0;i<kferm;i++){
322 p_f[i]=xi_f[i]=(Complex_f)xi[i];
323 r_f[i]=(Complex_f)Phi[na*kferm+i];
324 }
325#endif
326
327 // Declaring placeholder arrays
328 // This x1 is NOT related to the /common/vectorp/X1 in the FORTRAN code and should not
329 // be confused with X1 the global variable
330
331 //niterx isn't called as an index but we'll start from zero with the C code to make the
332 //if statements quicker to type
333 double betan;
334#ifdef __NVCC__
335 cudaDeviceSynchronise();
336#endif
337 for((*itercg)=0; (*itercg)<=niterc; (*itercg)++){
338 //Don't overwrite on first run.
339 //x2=(M^†)x1=(M^†)Mp
340 Dslash_f(x1_f,p_f,ut[0],ut[1],iu,id,gamval,gamin,dk,jqq,akappa);
341 Dslashd_f(x2_f,x1_f,ut[0],ut[1],iu,id,gamval,gamin,dk,jqq,akappa);
342#ifdef __NVCC__
343 cudaDeviceSynchronise();
344#endif
345 //We can't evaluate α on the first niterx because we need to get β_n.
346 if(*itercg){
347 //x*.x
348#ifdef USE_BLAS
349 float alphad_f;
350#ifdef __NVCC__
351 cublasScnrm2(cublas_handle,kferm,(cuComplex*) x1_f, 1,(float *)&alphad_f);
352 alphad = alphad_f*alphad_f;
353#else
354 alphad_f = cblas_scnrm2(kferm, x1_f, 1);
355#endif
356 alphad = alphad_f*alphad_f;
357#else
358 alphad=0;
359 for(int i = 0; i<kferm; i++)
360 alphad+=conj(x1_f[i])*x1_f[i];
361#endif
362#if(nproc>1)
363 Par_fsum((float *)&alphad);
364#endif
365 //α=(r.r)/p(M^†)Mp
366 alpha=alphan/creal(alphad);
367 // Complex_f alpha_f = (Complex_f)alpha;
368 //x+αp
369#ifdef USE_BLAS
370 Complex_f alpha_f=(float)alpha;
371#ifdef __NVCC__
372 cublasCaxpy(cublas_handle,kferm,(cuComplex*) &alpha_f,(cuComplex*) p_f,1,(cuComplex*) xi_f,1);
373#else
374 cblas_caxpy(kferm, (Complex_f*)&alpha_f,(Complex_f*)p_f, 1, (Complex_f*)xi_f, 1);
375#endif
376#else
377#pragma omp parallel for simd aligned(xi_f,p_f:AVX)
378 for(int i = 0; i<kferm; i++)
379 xi_f[i]+=alpha*p_f[i];
380#endif
381 }
382
383 //r=α(M^†)Mp and β_n=r*.r
384#if defined USE_BLAS
385 Complex_f alpha_m=(Complex_f)(-alpha);
386 float betan_f=0;
387#ifdef __NVCC__
388 cublasCaxpy(cublas_handle,kferm, (cuComplex *)&alpha_m,(cuComplex *) x2_f, 1,(cuComplex *) r_f, 1);
389 //cudaDeviceSynchronise();
390 //r*.r
391 cublasScnrm2(cublas_handle,kferm,(cuComplex *) r_f,1,(float *)&betan_f);
392#else
393 cblas_caxpy(kferm,(Complex_f*) &alpha_m,(Complex_f*) x2_f, 1,(Complex_f*) r_f, 1);
394 //r*.r
395 betan_f = cblas_scnrm2(kferm, (Complex_f*)r_f,1);
396#endif
397 //Gotta square it to "undo" the norm
398 betan=betan_f*betan_f;
399#else
400 //Just like Congradq, this loop could be unrolled but will need a reduction to deal with the betan
401 //addition.
402 betan = 0;
403 //If we get a small enough β_n before hitting the iteration cap we break
404#pragma omp parallel for simd aligned(x2_f,r_f:AVX) reduction(+:betan)
405 for(int i = 0; i<kferm;i++){
406 r_f[i]-=alpha*x2_f[i];
407 betan+=conj(r_f[i])*r_f[i];
408 }
409#endif
410 //This is basically just congradq at the end. Check there for comments
411#if(nproc>1)
412 Par_dsum(&betan);
413#endif
414#ifdef _DEBUG
415#ifdef _DEBUGCG
416 char *endline = "\n";
417#else
418 char *endline = "\r";
419#endif
420 if(!rank) printf("Iter (CG) = %i β_n= %e α= %e%s", *itercg, betan, alpha,endline);
421#endif
422 if(betan<resid){
423 //Started counting from zero so add one to make it accurate
424 (*itercg)++;
425#ifdef _DEBUG
426 if(!rank) printf("\nIter (CG) = %i resid = %e toler = %e\n", *itercg, betan, resid);
427#endif
428 ret_val=0; break;
429 }
430 else if(*itercg==niterc-1){
431 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n",
432 ITERLIM, funcname, niterc, betan);
433 ret_val=ITERLIM; break;
434 }
435 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
436 Complex beta = (*itercg) ? betan/betad : 0;
437 betad=betan; alphan=betan;
438 //BLAS for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
439 //β instead of x.
440 //There is cblas_zaxpby in the MKL though, set a = 1 and b = β.
441#ifdef USE_BLAS
442 Complex_f beta_f = (Complex_f)beta;
443 Complex_f a = 1.0;
444#ifdef __NVCC__
445 cublasCscal(cublas_handle,kferm,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
446 cublasCaxpy(cublas_handle,kferm,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
447 cudaDeviceSynchronise();
448#elif (defined __INTEL_MKL__ || defined AMD_BLAS)
449 cblas_caxpby(kferm, &a, r_f, 1, &beta_f, p_f, 1);
450#else
451 cblas_cscal(kferm,&beta_f,p_f,1);
452 cblas_caxpy(kferm,&a,r_f,1,p_f,1);
453#endif
454#else
455#pragma omp parallel for simd aligned(r_f,p_f:AVX)
456 for(int i=0; i<kferm; i++)
457 p_f[i]=r_f[i]+beta*p_f[i];
458#endif
459 }
460#ifdef __NVCC__
461 Transpose_c(xi_f,kvol,ngorkov*nc);
462 Transpose_c(r_f,kvol,ngorkov*nc);
463
464 cudaDeviceSynchronise();
465 cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
466#else
467#pragma omp simd
468 for(int i = 0; i <kferm;i++){
469 xi[i]=(Complex)xi_f[i];
470 }
471#endif
472#ifdef __NVCC__
473 cudaFree(p_f); cudaFree(r_f);cudaFree(x1_f); cudaFree(x2_f); cudaFree(xi_f);
474#else
475 free(p_f); free(r_f); free(x1_f); free(x2_f); free(xi_f);
476#endif
477 return ret_val;
478}
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:463
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:585
int Par_fsum(float *dval)
Performs a reduction on a float dval to get a sum which is then distributed to all ranks.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define niterc
Hard limit for runaway trajectories in Conjugate gradient.
Definition sizes.h:163
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225

References AVX, Complex, Complex_f, Dslash_f(), Dslashd_f(), kferm, kfermHalo, kvol, nc, ngorkov, niterc, Par_dsum(), Par_fsum(), and rank.

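For reference, the loop above implements the standard conjugate gradient recurrences for \(A=M^\dagger M\); in the code \(p^\dagger A p\) is obtained as \(\|Mp\|^2\) via scnrm2, which is why the returned norm is squared. In LaTeX:

\alpha_k=\frac{r_k^\dagger r_k}{p_k^\dagger A p_k},\quad
x_{k+1}=x_k+\alpha_k p_k,\quad
r_{k+1}=r_k-\alpha_k A p_k,\quad
\beta_k=\frac{r_{k+1}^\dagger r_{k+1}}{r_k^\dagger r_k},\quad
p_{k+1}=r_{k+1}+\beta_k p_k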

◆ Congradq()

int Congradq ( int na,
double res,
Complex * X1,
Complex * r,
Complex_f * ut[2],
unsigned int * iu,
unsigned int * id,
Complex_f * gamval_f,
int * gamin,
float * dk[2],
Complex_f jqq,
float akappa,
int * itercg )

Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \((M^\dagger M)x=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double precision.

Parameters
na        Flavour index
res       Limit for conjugate gradient
X1        Pseudofermion field \(\Phi\) initially, returned as \((M^\dagger M)^{-1} \Phi\)
r         Partition of \(\Phi\) being used. Gets recycled as the residual vector
ut        Trial colour fields
iu        Upper halo indices
id        Lower halo indices
gamval_f  Single precision gamma matrices rescaled by kappa
gamin     Dirac indices
dk        \(\left(1+\gamma_0\right)e^{-\mu}\) and \(\left(1-\gamma_0\right)e^\mu\)
jqq       Diquark source
akappa    Hopping parameter
itercg    Counts the iterations of the conjugate gradient
Returns
0 on success, integer error code otherwise

Definition at line 7 of file congrad.c.

8 {
9 /*
10 * @brief Matrix Inversion via Mixed Precision Conjugate Gradient
11 * Solves @f$(M^\dagger)Mx=\Phi@f$
12 * Implements up/down partitioning
13 * The matrix multiplication step is done at single precision, while the update is done at double
14 *
15 * @param na: Flavour index
16 * @param res: Limit for conjugate gradient
17 * @param X1: @f(\Phi@f) initially, returned as @f((M^\dagger M)^{-1} \Phi@f)
18 * @param r: Partition of @f(\Phi@f) being used. Gets recycled as the residual vector
19 * @param ut[0]: First colour's trial field
20 * @param ut[1]: Second colour's trial field
21 * @param iu: Upper halo indices
22 * @param id: Lower halo indices
23 * @param gamval_f: Gamma matrices
24 * @param gamin: Dirac indices
25 * @param dk[0]:
26 * @param dk[1]:
27 * @param jqq: Diquark source
28 * @param akappa: Hopping Parameter
29 * @param itercg: Counts the iterations of the conjugate gradient
30 *
31 * @see Hdslash_f(), Hdslashd_f(), Par_fsum(), Par_dsum()
32 *
33 * @return 0 on success, integer error code otherwise
34 */
35 const char *funcname = "Congradq";
36 int ret_val=0;
37 const double resid = res*res;
38 //The κ^2 factor is needed to normalise the fields correctly
39 //jqq is the diquark condensate and is global scope.
40 const Complex_f fac_f = conj(jqq)*jqq*akappa*akappa;
41 //These were evaluated only in the first loop of niterx so we'll just do it outside of the loop.
42 //n suffix is numerator, d is denominator
43 double alphan=1;
44 //The alpha and beta terms should be double, but that causes issues with BLAS pointers. Instead we declare
45 //them complex and work with the real part (especially for α_d)
46 //Give initial values Will be overwritten if niterx>0
47 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
48 //Because we're dealing with flattened arrays here we can call cblas safely without the halo
49#ifdef __NVCC__
50 Complex_f *p_f, *x1_f, *x2_f, *r_f, *X1_f;
51 int device=-1; cudaGetDevice(&device);
52
53#ifdef _DEBUG
54 cudaMallocManaged((void **)&p_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
55 cudaMallocManaged((void **)&x1_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
56 cudaMallocManaged((void **)&x2_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
57 cudaMallocManaged((void **)&r_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
58 cudaMallocManaged((void **)&X1_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
59#else
60 //First two have halo exchanges, so getting NCCL working is important
61 cudaMallocAsync((void **)&p_f, kferm2Halo*sizeof(Complex_f),streams[0]);
62 cudaMallocAsync((void **)&x1_f, kferm2Halo*sizeof(Complex_f),streams[1]);
63 cudaMallocAsync((void **)&x2_f, kferm2*sizeof(Complex_f),streams[2]);
64 cudaMallocAsync((void **)&r_f, kferm2*sizeof(Complex_f),streams[3]);
65 cudaMallocAsync((void **)&X1_f, kferm2*sizeof(Complex_f),streams[4]);
66#endif
67#else
68 Complex_f *p_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
69 Complex_f *x1_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
70 Complex_f *x2_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
71 Complex_f *X1_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
72 Complex_f *r_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
73#endif
74 //Instead of copying element-wise in a loop, use memcpy.
75#ifdef __NVCC__
76 //Get X1 in single precision, then swap to AoS format
77 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
78
79 //And repeat for r
80 cuComplex_convert(r_f,r,kferm2,true,dimBlock,dimGrid);
81
82 //cudaMemcpy is blocking, so use async instead
83 cudaMemcpyAsync(p_f, X1_f, kferm2*sizeof(Complex_f),cudaMemcpyDeviceToDevice,NULL);
84 //Flip all the gauge fields around so memory is coalesced
85#else
86#pragma omp parallel for simd
87 for(int i=0;i<kferm2;i++){
88 r_f[i]=(Complex_f)r[i];
89 X1_f[i]=(Complex_f)X1[i];
90 }
91 memcpy(p_f, X1_f, kferm2*sizeof(Complex_f));
92#endif
93
94 //niterx isn't called as an index but we'll start from zero with the C code to make the
95 //if statements quicker to type
96 double betan; bool pf=true;
97 for(*itercg=0; *itercg<niterc; (*itercg)++){
98 //x2 = (M^†M)p
99 //No need to synchronise here. The memcpy in Hdslash is blocking
100 Hdslash_f(x1_f,p_f,ut,iu,id,gamval_f,gamin,dk,akappa);
101 Hdslashd_f(x2_f,x1_f,ut,iu,id,gamval_f,gamin,dk,akappa);
102#ifdef __NVCC__
103 cudaDeviceSynchronise();
104#endif
105 //x2 = (M^†M+J^2)p
106 //No point adding zero a couple of hundred times if the diquark source is zero
107 if(fac_f!=0){
108#ifdef __NVCC__
109 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&fac_f,(cuComplex *)p_f,1,(cuComplex *)x2_f,1);
110#elif defined USE_BLAS
111 cblas_caxpy(kferm2, &fac_f, p_f, 1, x2_f, 1);
112#else
113#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
114 for(int i=0; i<kferm2; i++)
115 x2_f[i]+=fac_f*p_f[i];
116#endif
117 }
118 //We can't evaluate α on the first *itercg because we need to get β_n.
119 if(*itercg){
120 //α_d= p* (M^†M+J^2)p
121#ifdef __NVCC__
122 cublasCdotc(cublas_handle,kferm2,(cuComplex *)p_f,1,(cuComplex *)x2_f,1,(cuComplex *)&alphad);
123#elif defined USE_BLAS
124 cblas_cdotc_sub(kferm2, p_f, 1, x2_f, 1, &alphad);
125#else
126 alphad=0;
127#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
128 for(int i=0; i<kferm2; i++)
129 alphad+=conj(p_f[i])*x2_f[i];
130#endif
131 //For now I'll cast it into a float for the reduction. Each rank only sends and writes
132 //to the real part so this is fine
133#if(nproc>1)
134 Par_fsum((float *)&alphad);
135#endif
136 //α=α_n/α_d = (r.r)/p(M^†M)p
137 alpha=alphan/creal(alphad);
138 //x-αp,
139#ifdef __NVCC__
140 Complex_f alpha_f = (Complex_f)alpha;
141 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&alpha_f,(cuComplex *)p_f,1,(cuComplex *)X1_f,1);
142#elif defined USE_BLAS
143 Complex_f alpha_f = (Complex_f)alpha;
144 cblas_caxpy(kferm2, &alpha_f, p_f, 1, X1_f, 1);
145#else
146 for(int i=0; i<kferm2; i++)
147 X1_f[i]+=alpha*p_f[i];
148#endif
149 }
150 // r_n+1 = r_n-α(M^† M)p_n and β_n=r*.r
151#ifdef __NVCC__
152 Complex_f alpha_m=(Complex_f)(-alpha);
153 cublasCaxpy(cublas_handle, kferm2,(cuComplex *)&alpha_m,(cuComplex *)x2_f,1,(cuComplex *)r_f,1);
154 float betan_f;
155 cublasScnrm2(cublas_handle,kferm2,(cuComplex *)r_f,1,&betan_f);
156 betan = betan_f*betan_f;
157#elif defined USE_BLAS
158 Complex_f alpha_m = (Complex_f)(-alpha);
159 cblas_caxpy(kferm2, &alpha_m, x2_f, 1, r_f, 1);
160 //Undo the negation for the BLAS routine
161 float betan_f = cblas_scnrm2(kferm2, r_f,1);
162 //Gotta square it to "undo" the norm
163 betan = betan_f*betan_f;
164#else
165 betan=0;
166#pragma omp parallel for simd aligned(r_f,x2_f:AVX) reduction(+:betan)
167 for(int i=0; i<kferm2; i++){
168 r_f[i]-=alpha*x2_f[i];
169 betan += conj(r_f[i])*r_f[i];
170 }
171#endif
172 //And... reduce.
173#if(nproc>1)
174 Par_dsum(&betan);
175#endif
176#ifdef _DEBUGCG
177#warning "CG Debugging"
178 char *endline = "\n";
179#else
180 char *endline = "\r";
181#endif
182#ifdef _DEBUG
183 if(!rank) printf("Iter(CG)=%i\tβ_n=%e\tα=%e%s", *itercg, betan, alpha,endline);
184#endif
185 if(betan<resid){
186 (*itercg)++;
187#ifdef _DEBUG
188 if(!rank) printf("\nIter(CG)=%i\tResidue: %e\tTolerance: %e\n", *itercg, betan, resid);
189#endif
190 ret_val=0; break;
191 }
192 else if(*itercg==niterc-1){
193 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n", ITERLIM, funcname, *itercg, betan);
194 ret_val=ITERLIM; break;
195 }
196 //Here we evaluate β=(r_{k+1}.r_{k+1})/(r_k.r_k) and then shuffle our indices down the line.
197 //On the first iteration we define beta to be zero.
198 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
199 Complex beta = (*itercg) ? betan/betad : 0;
200 betad=betan; alphan=betan;
201 //BLAS for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
202 //β instead of x.
203#ifdef __NVCC__
204 Complex_f beta_f=(Complex_f)beta;
205 __managed__ Complex_f a = 1.0;
206 cublasCscal(cublas_handle,kferm2,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
207 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
208#elif (defined __INTEL_MKL__)
209 Complex_f a = 1.0;
210 Complex_f beta_f=(Complex_f)beta;
211 //There is cblas_?axpby in the MKL and AMD though, set a = 1 and b = β.
212 //If we get a small enough β_n before hitting the iteration cap we break
213 cblas_caxpby(kferm2, &a, r_f, 1, &beta_f, p_f, 1);
214#elif defined USE_BLAS
215 Complex_f beta_f=(Complex_f)beta;
216 cblas_cscal(kferm2,&beta_f,p_f,1);
217 Complex_f a = 1.0;
218 cblas_caxpy(kferm2,&a,r_f,1,p_f,1);
219#else
220 for(int i=0; i<kferm2; i++)
221 p_f[i]=r_f[i]+beta*p_f[i];
222#endif
223 }
224#ifdef __NVCC__
225//Restore arrays back to their previous layout
226 cuComplex_convert(X1_f,X1,kferm2,false,dimBlock,dimGrid);
227 cuComplex_convert(r_f,r,kferm2,false,dimBlock,dimGrid);
228#else
229 for(int i=0;i<kferm2;i++){
230 X1[i]=(Complex)X1_f[i];
231 r[i]=(Complex)r_f[i];
232 }
233#endif
234#ifdef __NVCC__
235#ifdef _DEBUG
236 cudaDeviceSynchronise();
237 cudaFree(x1_f);cudaFree(x2_f); cudaFree(p_f);
238 cudaFree(r_f);cudaFree(X1_f);
239#else
240 //streams match the ones that allocated them.
241 cudaFreeAsync(p_f,streams[0]);cudaFreeAsync(x1_f,streams[1]);cudaFreeAsync(x2_f,streams[2]);
242 cudaDeviceSynchronise();
243 cudaFreeAsync(r_f,streams[3]);cudaFreeAsync(X1_f,streams[4]);
244#endif
245#else
246 free(x1_f);free(x2_f); free(p_f); free(r_f); free(X1_f);
247#endif
248 return ret_val;
249}
int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates in single precision.
Definition matrices.c:855
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates in single precision.
Definition matrices.c:712
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188

References AVX, Complex, Complex_f, Hdslash_f(), Hdslashd_f(), kferm2, kferm2Halo, niterc, Par_dsum(), Par_fsum(), and rank.

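To separate the mixed-precision idea from the lattice machinery, here is a self-contained toy version: CG on a small dense symmetric positive-definite system with the matrix applied in single precision and the \(\alpha\), \(\beta\) scalars accumulated in double, mirroring the scheme above. Everything in it (including the 4x4 matrix) is invented for illustration; it is not the library routine.

#include <stdio.h>

#define N 4

/* Matrix application in single precision, as Hdslash_f/Hdslashd_f are above */
static void matvec_f(const float A[N][N], const float *x, float *y){
	for(int i=0;i<N;i++){
		y[i]=0.0f;
		for(int j=0;j<N;j++) y[i]+=A[i][j]*x[j];
	}
}

int main(void){
	const float A[N][N]={{4,1,0,0},{1,3,1,0},{0,1,2,1},{0,0,1,2}};
	const float b[N]={1,2,0,1};
	float x[N]={0}, r[N], p[N], Ap[N];
	double alphan=0;	//α numerator r.r, kept in double like the code above
	for(int i=0;i<N;i++){ r[i]=b[i]; p[i]=r[i]; alphan+=(double)r[i]*r[i]; }
	for(int k=0;k<100;k++){
		matvec_f(A,p,Ap);	//single precision matrix step
		double alphad=0;	//α denominator p.Ap
		for(int i=0;i<N;i++) alphad+=(double)p[i]*Ap[i];
		const double alpha=alphan/alphad;
		double betan=0;
		for(int i=0;i<N;i++){
			x[i]+=(float)(alpha*p[i]);	//update solution
			r[i]-=(float)(alpha*Ap[i]);	//update residual
			betan+=(double)r[i]*r[i];
		}
		if(betan<1e-12) break;	//converged
		const double beta=betan/alphan;	//β=r_{k+1}.r_{k+1}/r_k.r_k
		alphan=betan;
		for(int i=0;i<N;i++) p[i]=r[i]+(float)(beta*p[i]);	//new direction
	}
	printf("x = %f %f %f %f\n", x[0], x[1], x[2], x[3]);
	return 0;
}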

◆ Fill_Small_Phi()

int Fill_Small_Phi ( int na,
Complex * smallPhi,
Complex * Phi )
inline

Copies necessary (2*4*kvol) elements of Phi into a vector variable

Parameters
na        Flavour index
smallPhi  The partitioned output
Phi       The pseudofermion field
Returns
Zero on success, integer error code otherwise

Definition at line 385 of file su2hmc.c.

386{
387 /*Copies necessary (2*4*kvol) elements of Phi into a vector variable
388 *
389 * Globals:
390 * =======
391 * Phi: The source array
392 *
393 * Parameters:
394 * ==========
395 * int na: flavour index
396 * Complex *smallPhi: The target array
397 *
398 * Returns:
399 * =======
400 * Zero on success, integer error code otherwise
401 */
402 const char *funcname = "Fill_Small_Phi";
403 //BIG and small phi index
404#ifdef __NVCC__
405 cuFill_Small_Phi(na,smallPhi,Phi,dimBlock,dimGrid);
406#else
407#pragma omp parallel for simd aligned(smallPhi,Phi:AVX) collapse(3)
408 for(int i = 0; i<kvol;i++)
409 for(int idirac = 0; idirac<ndirac; idirac++)
410 for(int ic= 0; ic<nc; ic++)
411 // PHI_index=i*16+j*2+k;
412 smallPhi[(i*ndirac+idirac)*nc+ic]=Phi[((na*kvol+i)*ngorkov+idirac)*nc+ic];
413#endif
414 return 0;
415}
#define ndirac
Dirac indices.
Definition sizes.h:177

References Complex, kvol, nc, ndirac, and ngorkov.

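The flattened indexing in the copy above is easier to read with helpers. These two functions are hypothetical (written for this page, not in the library); kvol, ngorkov, ndirac and nc are the macros from sizes.h:

/* Phi is indexed by (flavour na, site i, Gor'kov index, colour); smallPhi
 * drops the flavour index and keeps only the first ndirac of the ngorkov
 * components, which is exactly the copy Fill_Small_Phi performs. */
static inline int Phi_idx(int na, int i, int igorkov, int ic){
	return ((na*kvol+i)*ngorkov+igorkov)*nc+ic;
}
static inline int smallPhi_idx(int i, int idirac, int ic){
	return (i*ndirac+idirac)*nc+ic;
}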

◆ Force()

int Force ( double * dSdpi,
int iflag,
double res1,
Complex * X0,
Complex * X1,
Complex * Phi,
Complex * ut[2],
Complex_f * ut_f[2],
unsigned int * iu,
unsigned int * id,
Complex * gamval,
Complex_f * gamval_f,
int * gamin,
double * dk[2],
float * dk_f[2],
Complex_f jqq,
float akappa,
float beta,
double * ancg )

Calculates the force \(\frac{dS}{d\pi}\) at each intermediate time.

Parameters
dSdpi     The force
iflag     Invert before evaluating the force. 0 to invert, 1 not to. Blame FORTRAN...
res1      Conjugate gradient residual
X0        Up/down partitioned pseudofermion field
X1        Holder for the partitioned fermion field, then the conjugate gradient output
Phi       Pseudofermion field
ut        Double precision colour fields
ut_f      Single precision colour fields
iu,id     Lattice indices
gamin     Gamma indices
gamval    Double precision gamma matrices rescaled by kappa
gamval_f  Single precision gamma matrices rescaled by kappa
dk        \(e^{-\mu}\) and \(e^\mu\)
dk_f      \(e^{-\mu}\) and \(e^\mu\) (single precision)
jqq       Diquark source
akappa    Hopping parameter
beta      Inverse gauge coupling
ancg      Counter for conjugate gradient iterations
Returns
Zero on success, integer error code otherwise

Definition at line 94 of file force.c.

96 {
97 /*
98 * @brief Calculates the force @f$\frac{dS}{d\pi}@f$ at each intermediate time
99 *
100 * @param dSdpi: The force
101 * @param iflag: Invert before evaluating the force?
102 * @param res1: Conjugate gradient residual
103 * @param X0: Up/down partitioned pseudofermion field
104 * @param X1: Holder for the partitioned fermion field, then the conjugate gradient output
105 * @param Phi: Pseudofermion field
106 * @param ut[0],ut[1] Double precision colour fields
107 * @param ut_f[0],ut_f[1]: Single precision colour fields
108 * @param iu,id: Lattice indices
109 * @param gamin: Gamma indices
110 * @param gamval: Double precision gamma matrices
111 * @param gamval_f: Single precision gamma matrices
112 * @param dk[0]: @f$\left(1+\gamma_0\right)e^{-\mu}@f$
113 * @param dk[1]: @f$\left(1-\gamma_0\right)e^\mu@f$
114 * @param dk_f[0]: @f$\left(1+\gamma_0\right)e^{-\mu}@f$ float
115 * @param dk_f[1]: @f$\left(1-\gamma_0\right)e^\mu@f$ float
116 * @param jqq: Diquark source
117 * @param akappa: Hopping parameter
118 * @param beta: Inverse gauge coupling
119 * @param ancg: Counter for conjugate gradient iterations
120 *
121 * @return Zero on success, integer error code otherwise
122 */
123 const char *funcname = "Force";
124#ifdef __NVCC__
125 int device=-1;
126 cudaGetDevice(&device);
127#endif
128#ifndef NO_GAUGE
129 Gauge_force(dSdpi,ut_f,iu,id,beta);
130#endif
131 if(!akappa)
132 return 0;
133 //X1=(M†M)^{-1} Phi
134 int itercg=1;
135#ifdef __NVCC__
136 Complex_f *X1_f, *X2_f;
137 cudaMallocAsync((void **)&X2_f,kferm2*sizeof(Complex_f),streams[0]);
138 cudaMallocAsync((void **)&X1_f,kferm2*sizeof(Complex_f),NULL);
139 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
140#else
141 Complex *X2= (Complex *)aligned_alloc(AVX,kferm2Halo*sizeof(Complex));
142#endif
143 for(int na = 0; na<nf; na++){
144#ifdef __NVCC__
145 cudaMemcpyAsync(X1,X0+na*kferm2,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,NULL);
146#else
147 memcpy(X1,X0+na*kferm2,kferm2*sizeof(Complex));
148#endif
149 if(!iflag){
150#ifdef __NVCC__
151 Complex *smallPhi;
152 cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),streams[0]);
153#else
154 Complex *smallPhi = (Complex *)aligned_alloc(AVX,kferm2*sizeof(Complex));
155#endif
156 Fill_Small_Phi(na, smallPhi, Phi);
157 // Congradq(na, res1,smallPhi, &itercg );
158 Congradq(na,res1,X1,smallPhi,ut_f,iu,id,gamval_f,gamin,dk_f,jqq,akappa,&itercg);
159#ifdef __NVCC__
160 cudaFreeAsync(smallPhi,streams[0]);
161#else
162 free(smallPhi);
163#endif
164 *ancg+=itercg;
165#ifdef __NVCC__
166 Complex blasa=2.0; double blasb=-1.0;
167 cublasZdscal(cublas_handle,kferm2,&blasb,(cuDoubleComplex *)(X0+na*kferm2),1);
168 cublasZaxpy(cublas_handle,kferm2,(cuDoubleComplex *)&blasa,(cuDoubleComplex *)X1,1,(cuDoubleComplex *)(X0+na*kferm2),1);
169 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
170 //HDslash launches a different stream so we need a barrier
171 cudaDeviceSynchronise();
172#elif (defined __INTEL_MKL__)
173 Complex blasa=2.0; Complex blasb=-1.0;
174 //This is not a general BLAS routine. BLIS and MKL support it
175 //CUDA and GSL do not support it
176 cblas_zaxpby(kferm2, &blasa, X1, 1, &blasb, X0+na*kferm2, 1);
177#elif defined USE_BLAS
178 Complex blasa=2.0; double blasb=-1.0;
179 cblas_zdscal(kferm2,blasb,X0+na*kferm2,1);
180 cblas_zaxpy(kferm2,&blasa,X1,1,X0+na*kferm2,1);
181#else
182#pragma omp parallel for simd collapse(2)
183 for(int i=0;i<kvol;i++)
184 for(int idirac=0;idirac<ndirac;idirac++){
185 X0[((na*kvol+i)*ndirac+idirac)*nc]=
186 2*X1[(i*ndirac+idirac)*nc]-X0[((na*kvol+i)*ndirac+idirac)*nc];
187 X0[((na*kvol+i)*ndirac+idirac)*nc+1]=
188 2*X1[(i*ndirac+idirac)*nc+1]-X0[((na*kvol+i)*ndirac+idirac)*nc+1];
189 }
190#endif
191 }
192 #ifdef __NVCC__
193 Hdslash_f(X2_f,X1_f,ut_f,iu,id,gamval_f,gamin,dk_f,akappa);
194 #else
195 Hdslash(X2,X1,ut,iu,id,gamval,gamin,dk,akappa);
196 #endif
197#ifdef __NVCC__
198 float blasd=2.0;
199 cudaDeviceSynchronise();
200 cublasCsscal(cublas_handle,kferm2, &blasd, (cuComplex *)X2_f, 1);
201#elif defined USE_BLAS
202 double blasd=2.0;
203 cblas_zdscal(kferm2, blasd, X2, 1);
204#else
205#pragma unroll
206 for(int i=0;i<kferm2;i++)
207 X2[i]*=2;
208#endif
209#if(npx>1)
210 ZHalo_swap_dir(X1,8,0,DOWN);
211 ZHalo_swap_dir(X2,8,0,DOWN);
212#endif
213#if(npy>1)
214 ZHalo_swap_dir(X1,8,1,DOWN);
215 ZHalo_swap_dir(X2,8,1,DOWN);
216#endif
217#if(npz>1)
218 ZHalo_swap_dir(X1,8,2,DOWN);
219 ZHalo_swap_dir(X2,8,2,DOWN);
220#endif
221#if(npt>1)
222 ZHalo_swap_dir(X1,8,3,DOWN);
223 ZHalo_swap_dir(X2,8,3,DOWN);
224#endif
225
226 // The original FORTRAN Comment:
227 // dSdpi=dSdpi-Re(X1*(d(Mdagger)dp)*X2) -- Yikes!
228 // we're gonna need drugs for this one......
229 //
230 // Makes references to X1(.,.,iu(i,mu)) AND X2(.,.,iu(i,mu))
231 // as a result, need to swap the DOWN halos in all dirs for
232 // both these arrays, each of which has 8 cpts
233 //
234#ifdef __NVCC__
235 cuForce(dSdpi,ut_f,X1_f,X2_f,gamval_f,dk_f,iu,gamin,akappa,dimGrid,dimBlock);
236 cudaDeviceSynchronise();
237#else
238#pragma omp parallel for
239 for(int i=0;i<kvol;i++)
240 for(int idirac=0;idirac<ndirac;idirac++){
241 int mu, uid, igork1;
242#ifndef NO_SPACE
243#pragma omp simd //aligned(dSdpi,X1,X2,ut[0],ut[1],iu:AVX)
244 for(mu=0; mu<3; mu++){
245 //Long term ambition. I used the diff command on the different
246 //spacial components of dSdpi and saw a lot of the values required
247 //for them are duplicates (u11(i,mu)*X2(1,idirac,i) is used again with
248 //a minus in front for example. Why not evaluate them first /and then plug
249 //them into the equation? Reduce the number of evaluations needed and look
250 //a bit neater (although harder to follow as a consequence).
251
252 //Up indices
253 uid = iu[mu+ndim*i];
254 igork1 = gamin[mu*ndirac+idirac];
255
256 //REMINDER. Gamma is already scaled by kappa when we defined them. So if yer trying to rederive this from
257 //Montvay and Munster and notice a missing kappa in the code, that is why.
258 dSdpi[(i*nadj)*ndim+mu]+=akappa*creal(I*
259 (conj(X1[(i*ndirac+idirac)*nc])*
260 (-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
261 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
262 +conj(X1[(uid*ndirac+idirac)*nc])*
263 ( ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
264 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
265 +conj(X1[(i*ndirac+idirac)*nc+1])*
266 (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
267 +ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
268 +conj(X1[(uid*ndirac+idirac)*nc+1])*
269 (-ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
270 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
271 dSdpi[(i*nadj)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
272 (conj(X1[(i*ndirac+idirac)*nc])*
273 (-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
274 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
275 +conj(X1[(uid*ndirac+idirac)*nc])*
276 (-ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
277 +conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
278 +conj(X1[(i*ndirac+idirac)*nc+1])*
279 (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
280 +ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
281 +conj(X1[(uid*ndirac+idirac)*nc+1])*
282 (ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
283 +conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));
284
285 dSdpi[(i*nadj+1)*ndim+mu]+=akappa*creal(
286 (conj(X1[(i*ndirac+idirac)*nc])*
287 (-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
288 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
289 +conj(X1[(uid*ndirac+idirac)*nc])*
290 (-ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
291 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
292 +conj(X1[(i*ndirac+idirac)*nc+1])*
293 (-ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
294 -ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
295 +conj(X1[(uid*ndirac+idirac)*nc+1])*
296 (ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
297 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
298 dSdpi[(i*nadj+1)*ndim+mu]+=creal(gamval[mu*ndirac+idirac]*
299 (conj(X1[(i*ndirac+idirac)*nc])*
300 (-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
301 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
302 +conj(X1[(uid*ndirac+idirac)*nc])*
303 (ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
304 +conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
305 +conj(X1[(i*ndirac+idirac)*nc+1])*
306 (-ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
307 -ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
308 +conj(X1[(uid*ndirac+idirac)*nc+1])*
309 (-ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
310 +conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));
311
312 dSdpi[(i*nadj+2)*ndim+mu]+=akappa*creal(I*
313 (conj(X1[(i*ndirac+idirac)*nc])*
314 (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
315 +ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
316 +conj(X1[(uid*ndirac+idirac)*nc])*
317 (-conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
318 -ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])
319 +conj(X1[(i*ndirac+idirac)*nc+1])*
320 (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
321 -conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
322 +conj(X1[(uid*ndirac+idirac)*nc+1])*
323 (-conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
324 +ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])));
325 dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
326 (conj(X1[(i*ndirac+idirac)*nc])*
327 (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
328 +ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
329 +conj(X1[(uid*ndirac+idirac)*nc])*
330 (conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
331 +ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])
332 +conj(X1[(i*ndirac+idirac)*nc+1])*
333 (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
334 -conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
335 +conj(X1[(uid*ndirac+idirac)*nc+1])*
336 (conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
337 -ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])));
338
339 }
340#endif
341 //We're not done tripping yet!! Time like term is different. dk4? shows up
342 //For consistency we'll leave mu in instead of hard coding.
343 mu=3;
344 uid = iu[mu+ndim*i];
345 igork1 = gamin[mu*ndirac+idirac];
346#ifndef NO_TIME
347 dSdpi[(i*nadj)*ndim+mu]+=creal(I*
348 (conj(X1[(i*ndirac+idirac)*nc])*
349 (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
350 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
351 +conj(X1[(uid*ndirac+idirac)*nc])*
352 (dk[1][i]* (+ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
353 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
354 +conj(X1[(i*ndirac+idirac)*nc+1])*
355 (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
356 +ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
357 +conj(X1[(uid*ndirac+idirac)*nc+1])*
358 (dk[1][i]* (-ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
359 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))))
360 +creal(I*
361 (conj(X1[(i*ndirac+idirac)*nc])*
362 (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
363 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
364 +conj(X1[(uid*ndirac+idirac)*nc])*
365 (-dk[1][i]* (ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
366 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
367 +conj(X1[(i*ndirac+idirac)*nc+1])*
368 (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
369 +ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
370 +conj(X1[(uid*ndirac+idirac)*nc+1])*
371 (-dk[1][i]* (-ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
372 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));
373
374 dSdpi[(i*nadj+1)*ndim+mu]+=creal(
375 conj(X1[(i*ndirac+idirac)*nc])*
376 (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
377 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
378 +conj(X1[(uid*ndirac+idirac)*nc])*
379 (dk[1][i]* (-ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
380 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
381 +conj(X1[(i*ndirac+idirac)*nc+1])*
382 (dk[0][i]* (-ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
383 -ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
384 +conj(X1[(uid*ndirac+idirac)*nc+1])*
385 (dk[1][i]* ( ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
386 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])))
387 +creal(
388 (conj(X1[(i*ndirac+idirac)*nc])*
389 (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
390 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
391 +conj(X1[(uid*ndirac+idirac)*nc])*
392 (-dk[1][i]* (-ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
393 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
394 +conj(X1[(i*ndirac+idirac)*nc+1])*
395 (dk[0][i]* (-ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
396 -ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
397 +conj(X1[(uid*ndirac+idirac)*nc+1])*
398 (-dk[1][i]* (ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
399 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));
400
401 dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*
402 (conj(X1[(i*ndirac+idirac)*nc])*
403 (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
404 +ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
405 +conj(X1[(uid*ndirac+idirac)*nc])*
406 (dk[1][i]*(-conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
407 -ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))
408 +conj(X1[(i*ndirac+idirac)*nc+1])*
409 (dk[0][i]* (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
410 -conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
411 +conj(X1[(uid*ndirac+idirac)*nc+1])*
412 (dk[1][i]*(-conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
413 +ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))))
414 +creal(I*
415 (conj(X1[(i*ndirac+idirac)*nc])*
416 (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
417 +ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
418 +conj(X1[(uid*ndirac+idirac)*nc])*
419 (-dk[1][i]*(-conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
420 -ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))
421 +conj(X1[(i*ndirac+idirac)*nc+1])*
422 (dk[0][i]* (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
423 -conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
424 +conj(X1[(uid*ndirac+idirac)*nc+1])*
425 (-dk[1][i]*(-conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
426 +ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))));
427
428#endif
429 }
430#endif
431 }
432#ifdef __NVCC__
433 cudaFreeAsync(X1_f,streams[0]); cudaFreeAsync(X2_f,streams[1]);
434#else
435 free(X2);
436#endif
437 return 0;
438}
int Gauge_force(double *dSdpi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
Definition force.c:6
int Hdslash(Complex *phi, Complex *r, Complex *u11t[2], unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk[2], float akappa)
Evaluates in double precision.
Definition matrices.c:268
#define DOWN
Flag for send down.
Definition par_mpi.h:35
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define nadj
adjacent spatial indices
Definition sizes.h:175
#define nf
Fermion flavours (double it)
Definition sizes.h:151
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \((M^\dagger M)x=\Phi\) with up/down partitioning.
Definition congrad.c:7
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
Definition su2hmc.c:385

References AVX, Complex, Complex_f, Congradq(), DOWN, Fill_Small_Phi(), Gauge_force(), Hdslash(), Hdslash_f(), kferm2, kferm2Halo, kvol, nadj, nc, ndim, ndirac, nf, and ZHalo_swap_dir().

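For orientation, Force() feeds the momentum update of the molecular-dynamics integrator (declared in integrate.h). A schematic half-step is sketched below; dt, pp and the field arguments are assumed to be set up as elsewhere on this page, and the integrator's actual call sequence may differ:

//Schematic leapfrog momentum half-step using the force computed above.
//kmom is the number of momentum components on the sublattice.
Force(dSdpi, 0, res1, X0, X1, Phi, ut, ut_f, iu, id,
		gamval, gamval_f, gamin, dk, dk_f, jqq, akappa, beta, &ancg);
for(int i=0; i<kmom; i++)
	pp[i] -= 0.5*dt*dSdpi[i];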

◆ Gauge_force()

int Gauge_force ( double * dSdpi,
Complex_f * ut[2],
unsigned int * iu,
unsigned int * id,
float beta )

Calculates the gauge force due to the Wilson Action at each intermediate time.

Parameters
dSdpi  The force
ut     Gauge fields
iu,id  Lattice indices
beta   Inverse gauge coupling
Returns
Zero on success, integer error code otherwise

Definition at line 6 of file force.c.

6 {
7 /*
8 * Calculates dSdpi due to the Wilson Action at each intermediate time
9 *
10 * Calls:
11 * =====
12 * C_Halo_swap_all, C_gather, C_Halo_swap_dir
13 *
14 * Parameters:
15 * =======
16 * double *dSdpi
17 * Complex_f *ut[0]
18 * Complex_f *ut[1]
19 * unsigned int *iu
20 * unsigned int *id
21 * float beta
22 *
23 * Returns:
24 * =======
25 * Zero on success, integer error code otherwise
26 */
27 const char *funcname = "Gauge_force";
28
29 //We define zero halos for debugging
30 // #ifdef _DEBUG
31 // memset(ut[0][kvol], 0, ndim*halo*sizeof(Complex_f));
32 // memset(ut[1][kvol], 0, ndim*halo*sizeof(Complex_f));
33 // #endif
34 //Was a trial field halo exchange here at one point.
35#ifdef __NVCC__
36 cuGauge_force(ut,dSdpi,beta,iu,id,dimGrid,dimBlock);
37#else
38 Complex_f *Sigma[2], *ush[2];
39 Sigma[0] = (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
40 Sigma[1]= (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
41 ush[0] = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
42 ush[1] = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
43 //Holders for directions
44 for(int mu=0; mu<ndim; mu++){
45 memset(Sigma[0],0, kvol*sizeof(Complex_f));
46 memset(Sigma[1],0, kvol*sizeof(Complex_f));
47 for(int nu=0; nu<ndim; nu++)
48 if(nu!=mu){
49 //The +ν Staple
50#pragma omp parallel for simd //aligned(ut[0],ut[1],Sigma[0],Sigma[1],iu:AVX)
51 for(int i=0;i<kvol;i++){
52 int uidm = iu[mu+ndim*i];
53 int uidn = iu[nu+ndim*i];
54 Complex_f a11=ut[0][uidm*ndim+nu]*conj(ut[0][uidn*ndim+mu])+\
55 ut[1][uidm*ndim+nu]*conj(ut[1][uidn*ndim+mu]);
56 Complex_f a12=-ut[0][uidm*ndim+nu]*ut[1][uidn*ndim+mu]+\
57 ut[1][uidm*ndim+nu]*ut[0][uidn*ndim+mu];
58 Sigma[0][i]+=a11*conj(ut[0][i*ndim+nu])+a12*conj(ut[1][i*ndim+nu]);
59 Sigma[1][i]+=-a11*ut[1][i*ndim+nu]+a12*ut[0][i*ndim+nu];
60 }
61 C_gather(ush[0], ut[0], kvol, id, nu);
62 C_gather(ush[1], ut[1], kvol, id, nu);
63#if(nproc>1)
64 CHalo_swap_dir(ush[0], 1, mu, DOWN); CHalo_swap_dir(ush[1], 1, mu, DOWN);
65#endif
66 //Next up, the -ν staple
67#pragma omp parallel for simd //aligned(ut[0],ut[1],ush[0],ush[1],Sigma[0],Sigma[1],iu,id:AVX)
68 for(int i=0;i<kvol;i++){
69 int uidm = iu[mu+ndim*i];
70 int didn = id[nu+ndim*i];
71 //uidm is correct here
72 Complex_f a11=conj(ush[0][uidm])*conj(ut[0][didn*ndim+mu])-\
73 ush[1][uidm]*conj(ut[1][didn*ndim+mu]);
74 Complex_f a12=-conj(ush[0][uidm])*ut[1][didn*ndim+mu]-\
75 ush[1][uidm]*ut[0][didn*ndim+mu];
76 Sigma[0][i]+=a11*ut[0][didn*ndim+nu]-a12*conj(ut[1][didn*ndim+nu]);
77 Sigma[1][i]+=a11*ut[1][didn*ndim+nu]+a12*conj(ut[0][didn*ndim+nu]);
78 }
79 }
80#pragma omp parallel for simd //aligned(ut[0],ut[1],Sigma[0],Sigma[1],dSdpi:AVX)
81 for(int i=0;i<kvol;i++){
82 Complex_f a11 = ut[0][i*ndim+mu]*Sigma[1][i]+ut[1][i*ndim+mu]*conj(Sigma[0][i]);
83 Complex_f a12 = ut[0][i*ndim+mu]*Sigma[0][i]+conj(ut[1][i*ndim+mu])*Sigma[1][i];
84
85 dSdpi[(i*nadj)*ndim+mu]=(double)(beta*cimag(a11));
86 dSdpi[(i*nadj+1)*ndim+mu]=(double)(beta*creal(a11));
87 dSdpi[(i*nadj+2)*ndim+mu]=(double)(beta*cimag(a12));
88 }
89 }
90 free(ush[0]); free(ush[1]); free(Sigma[0]); free(Sigma[1]);
91#endif
92 return 0;
93}
int CHalo_swap_dir(Complex_f *c, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define halo
Total Halo size.
Definition sizes.h:222
int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
Extracts all the single precision gauge links in the \(\mu\) direction only.
Definition su2hmc.c:357

References AVX, C_gather(), CHalo_swap_dir(), Complex_f, DOWN, halo, kvol, nadj, and ndim.

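The a11/a12 algebra above relies on the SU(2) storage convention used throughout: each link stores only the pair (u11, u12), the full matrix being [[u11, u12], [-conj(u12), conj(u11)]] (this is also why Reunitarise() enforces conj(u11)*u11+conj(u12)*u12=1). A hypothetical helper making one such product explicit:

#include <complex.h>

/* Multiply two SU(2) matrices stored as (u11,u12) pairs and return the
 * (1,1) and (1,2) entries of the product; the second row follows from
 * the SU(2) form, so it never needs to be stored. */
static inline void su2_mul(float complex a11, float complex a12,
		float complex b11, float complex b12,
		float complex *c11, float complex *c12){
	*c11 = a11*b11 - a12*conjf(b12);
	*c12 = a11*b12 + a12*conjf(b11);
}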

◆ Hamilton()

int Hamilton ( double * h,
double * s,
double res2,
double * pp,
Complex * X0,
Complex * X1,
Complex * Phi,
Complex_f * ut[2],
unsigned int * iu,
unsigned int * id,
Complex_f * gamval_f,
int * gamin,
float * dk[2],
Complex_f jqq,
float akappa,
float beta,
double * ancgh,
int traj )

Calculate the Hamiltonian.

Parameters
h         Hamiltonian
s         Action
res2      Limit for conjugate gradient
pp        Momentum field
X0        Up/down partitioned pseudofermion field
X1        Holder for the partitioned fermion field, then the conjugate gradient output
Phi       Pseudofermion field
ut        Gauge fields (single precision)
iu,id     Lattice indices
gamval_f  Single precision gamma matrices rescaled by kappa
gamin     Gamma indices
dk        \(\left(1+\gamma_0\right)e^{-\mu}\) and \(\left(1-\gamma_0\right)e^\mu\) (single precision)
jqq       Diquark source
akappa    Hopping parameter
beta      Inverse gauge coupling
ancgh     Conjugate gradient iterations counter
traj      Calling trajectory for error reporting
Returns
Zero on success. Integer Error code otherwise.

Definition at line 245 of file su2hmc.c.

247 {
248 /*
249 * @brief Calculate the Hamiltonian
250 *
251 *
252 * @param h: Hamiltonian
253 * @param s: Action
254 * @param res2: Limit for conjugate gradient
255 * @param X0:
256 * @param X1:
257 * @param Phi:
258 * @param ut[0],ut[1]: Gauge fields
259 * @param ut[0],ut[1]: Gauge fields
260 * @param iu,id: Lattice indices
261 * @param gamval_f: Gamma matrices
262 * @param gamin: Gamma indices
263 * @param dk_f[0]: $exp(-\mu)$ float
264 * @param dk_f[1]: $exp(\mu)$ float
265 * @param jqq: Diquark source
266 * @param akappa: Hopping parameter
267 * @param beta: Inverse gauge coupling
268 * @param ancgh: Conjugate gradient iterations counter
269 *
270 * @return Zero on success. Integer Error code otherwise.
271 */
272 const char *funcname = "Hamilton";
273 //Iterate over momentum terms.
274#ifdef __NVCC__
275 double hp;
276 int device=-1;
277 cudaGetDevice(&device);
278 cudaMemPrefetchAsync(pp,kmom*sizeof(double),device,NULL);
279 cublasDnrm2(cublas_handle, kmom, pp, 1,&hp);
280 hp*=hp;
281#elif defined USE_BLAS
282 double hp = cblas_dnrm2(kmom, pp, 1);
283 hp*=hp;
284#else
285 double hp=0;
286 for(int i = 0; i<kmom; i++)
287 hp+=pp[i]*pp[i];
288#endif
289 hp*=0.5;
290 double avplaqs, avplaqt;
291 double hg = 0;
292 //avplaq? isn't seen again here.
293 Average_Plaquette(&hg,&avplaqs,&avplaqt,ut,iu,beta);
294
295 double hf = 0; int itercg = 0;
296#ifdef __NVCC__
297 Complex *smallPhi;
298 cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),NULL);
299#else
300 Complex *smallPhi = aligned_alloc(AVX,kferm2*sizeof(Complex));
301#endif
302 //Iterating over flavours
303 for(int na=0;na<nf;na++){
304#ifdef __NVCC__
305#ifdef _DEBUG
306 cudaDeviceSynchronise();
307#endif
308 cudaMemcpyAsync(X1,X0+na*kferm2,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,streams[0]);
309#ifdef _DEBUG
310 cudaDeviceSynchronise();
311#endif
312#else
313 memcpy(X1,X0+na*kferm2,kferm2*sizeof(Complex));
314#endif
315 Fill_Small_Phi(na, smallPhi, Phi);
316 if(Congradq(na,res2,X1,smallPhi,ut,iu,id,gamval_f,gamin,dk,jqq,akappa,&itercg))
317 fprintf(stderr,"Trajectory %d\n", traj);
318
319 *ancgh+=itercg;
320#ifdef __NVCC__
321 cudaMemcpyAsync(X0+na*kferm2,X1,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,streams[0]);
322#else
323 memcpy(X0+na*kferm2,X1,kferm2*sizeof(Complex));
324#endif
325 Fill_Small_Phi(na, smallPhi,Phi);
326#ifdef __NVCC__
327 Complex dot;
328 cublasZdotc(cublas_handle,kferm2,(cuDoubleComplex *)smallPhi,1,(cuDoubleComplex *) X1,1,(cuDoubleComplex *) &dot);
329 hf+=creal(dot);
330#elif defined USE_BLAS
331 Complex dot;
332 cblas_zdotc_sub(kferm2, smallPhi, 1, X1, 1, &dot);
333 hf+=creal(dot);
334#else
335 //It is a dot product of the flattened arrays, could use
336 //a module to convert index to coordinate array...
337 for(int j=0;j<kferm2;j++)
338 hf+=creal(conj(smallPhi[j])*X1[j]);
339#endif
340 }
341#ifdef __NVCC__
342 cudaFreeAsync(smallPhi,NULL);
343#else
344 free(smallPhi);
345#endif
346 //hg was summed over inside of Average_Plaquette.
347#if(nproc>1)
348 Par_dsum(&hp); Par_dsum(&hf);
349#endif
350 *s=hg+hf; *h=(*s)+hp;
351#ifdef _DEBUG
352 if(!rank)
353 printf("hg=%.5e; hf=%.5e; hp=%.5e; h=%.5e\n", hg, hf, hp, *h);
354#endif
355 return 0;
356}
#define kmom
sublattice momentum sites
Definition sizes.h:184
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
Definition su2hmc.c:385
int Average_Plaquette(double *hg, double *avplaqs, double *avplaqt, Complex_f *ut[2], unsigned int *iu, float beta)
Calculates the gauge action using new (how new?) lookup table.
Definition bosonic.c:20

References Average_Plaquette(), AVX, Complex, Complex_f, Congradq(), Fill_Small_Phi(), kferm2, kmom, nf, Par_dsum(), and rank.

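Putting the contributions together, line 350 of the listing assembles the action and the Hamiltonian as

\(S = S_g + S_f\), \(\qquad H = S + \frac{1}{2}\sum_i p_i^2\)

where \(S_g\) is hg from Average_Plaquette(), \(S_f\) is hf accumulated per flavour as \(\mathrm{Re}\left(\Phi^\dagger\left(M^\dagger M\right)^{-1}\Phi\right)\) using the Congradq() solve, and hp holds the momentum sum.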

◆ Init()

int Init ( int istart,
int ibound,
int iread,
float beta,
float fmu,
float akappa,
Complex_f ajq,
Complex * u[2],
Complex * ut[2],
Complex_f * ut_f[2],
Complex * gamval,
Complex_f * gamval_f,
int * gamin,
double * dk[2],
float * dk_f[2],
unsigned int * iu,
unsigned int * id )

Initialises the system.

Parameters
istartZero for a cold start, positive for a hot start, negative to skip initialisation
iboundPeriodic boundary conditions
ireadRead configuration from file
betaInverse gauge coupling
fmuChemical potential
akappaHopping parameter
ajqDiquark source
uGauge fields
utTrial gauge field
ut_fTrial gauge field (single precision)
dk\(\left(1+\gamma_0\right)e^{-\mu}\) and \(\left(1-\gamma_0\right)e^\mu\)
dk_f\(\left(1+\gamma_0\right)e^{-\mu}\) and \(\left(1-\gamma_0\right)e^\mu\) float
iu,idUp and down halo indices
gaminGamma matrix indices
gamvalDouble precision gamma matrices rescaled by kappa
gamval_fSingle precision gamma matrices rescaled by kappa
Returns
Zero on success, integer error code otherwise

Definition at line 19 of file su2hmc.c.

24 {
25 /*
26 * Initialises the system
27 *
28 * Calls:
29 * ======
30 * Addrc, Check_addr, ran2, DHalo_swap_dir, Par_sread, Par_ranset, Reunitarise
31 *
32 * Globals:
33 * =======
34 * Complex gamval: Gamma Matrices
35 * Complex_f gamval_f: Float Gamma matrices:
36 *
37 * Parameters:
38 * ==========
 39 * int istart: Zero for cold, positive for hot, negative for none
40 * int ibound: Periodic boundary conditions
41 * int iread: Read configuration from file
 42 * float beta: Inverse gauge coupling
43 * float fmu: Chemical potential
44 * float akappa: Hopping parameter
45 * Complex_f ajq: Diquark source
46 * Complex *u[0]: First colour field
47 * Complex *u[1]: Second colour field
48 * Complex *ut[0]: First colour trial field
49 * Complex *ut[1]: Second colour trial field
50 * Complex_f *ut_f[0]: First float trial field
51 * Complex_f *ut_f[1]: Second float trial field
 52 * double *dk[0]: $exp(-\mu)$
53 * double *dk[1]: $exp(\mu)$
54 * float *dk_f[0]: $exp(-\mu)$ float
55 * float *dk_f[1]: $exp(\mu)$ float
56 * unsigned int *iu: Up halo indices
57 * unsigned int *id: Down halo indices
58 *
59 * Returns:
60 * =======
61 * Zero on success, integer error code otherwise
62 */
63 const char *funcname = "Init";
64
65#ifdef _OPENMP
66 omp_set_num_threads(nthreads);
67#ifdef __INTEL_MKL__
68 mkl_set_num_threads(nthreads);
69#endif
70#endif
71 //First things first, calculate a few constants for coordinates
72 Addrc(iu, id);
73 //And confirm they're legit
76#ifdef _DEBUG
77 printf("Checked addresses\n");
78#endif
79 double chem1=exp(fmu); double chem2 = 1/chem1;
80 //CUDA this. Only limit will be the bus speed
81#pragma omp parallel for simd //aligned(dk[0],dk[1]:AVX)
82 for(int i = 0; i<kvol; i++){
83 dk[1][i]=akappa*chem1;
84 dk[0][i]=akappa*chem2;
85 }
86 //Anti periodic Boundary Conditions. Flip the terms at the edge of the time
87 //direction
88 if(ibound == -1 && pcoord[3+ndim*rank]==npt-1){
89#ifdef _DEBUG
90 printf("Implementing antiperiodic boundary conditions on rank %i\n", rank);
91#endif
92#pragma omp parallel for simd //aligned(dk[0],dk[1]:AVX)
93 for(int k= kvol-1; k>=kvol-kvol3; k--){
94 //int k = kvol - kvol3 + i;
95 dk[1][k]*=-1;
96 dk[0][k]*=-1;
97 }
98 }
99 //These are constant so swap the halos when initialising and be done with it
100 //May need to add a synchronisation statement here first
101#if(npt>1)
102 DHalo_swap_dir(dk[1], 1, 3, UP);
103 DHalo_swap_dir(dk[0], 1, 3, UP);
104#endif
105 //Float versions
106#ifdef __NVCC__
107 cuReal_convert(dk_f[1],dk[1],kvol+halo,true,dimBlock,dimGrid);
108 cuReal_convert(dk_f[0],dk[0],kvol+halo,true,dimBlock,dimGrid);
109#else
110#pragma omp parallel for simd //aligned(dk[0],dk[1],dk_f[0],dk_f[1]:AVX)
111 for(int i=0;i<kvol+halo;i++){
112 dk_f[1][i]=(float)dk[1][i];
113 dk_f[0][i]=(float)dk[0][i];
114 }
115#endif
116 //What row of each dirac/sigma matrix contains the entry acting on element i of the spinor
117 int __attribute__((aligned(AVX))) gamin_t[4][4] = {{3,2,1,0},{3,2,1,0},{2,3,0,1},{2,3,0,1}};
118 //Gamma Matrices in Chiral Representation
119 //See Appendix 8.1.2 of Montvay and Munster
120 //_t is for temp. We copy these into the real gamvals later
121#ifdef __NVCC__
122 cudaMemcpy(gamin,gamin_t,4*4*sizeof(int),cudaMemcpyDefault);
123#else
124 memcpy(gamin,gamin_t,4*4*sizeof(int));
125#endif
126 //Each row of the dirac matrix contains only one non-zero entry, so that's all we encode here
127 Complex __attribute__((aligned(AVX))) gamval_t[5][4] = {{-I,-I,I,I},{-1,1,1,-1},{-I,I,I,-I},{1,1,1,1},{1,1,-1,-1}};
128 //Each gamma matrix is rescaled by akappa by flattening the gamval array
129#if defined USE_BLAS
130 //Don't cuBLAS this. It is small and won't saturate the GPU. Let the CPU handle
131 //it and just copy it later
132 cblas_zdscal(5*4, akappa, gamval_t, 1);
133#else
 134#pragma omp parallel for simd collapse(2) aligned(gamval_t:AVX)
135 for(int i=0;i<5;i++)
136 for(int j=0;j<4;j++)
137 gamval_t[i][j]*=akappa;
138#endif
139
140
141#ifdef __NVCC__
142 cudaMemcpy(gamval,gamval_t,5*4*sizeof(Complex),cudaMemcpyDefault);
143 cuComplex_convert(gamval_f,gamval,20,true,dimBlockOne,dimGridOne);
144#else
145 memcpy(gamval,gamval_t,5*4*sizeof(Complex));
146 for(int i=0;i<5*4;i++)
147 gamval_f[i]=(Complex_f)gamval[i];
148#endif
149
150#ifdef __CLOVER__
151 int __attribute__((aligned(AVX))) sigin_t[7][4] = {{0,1,2,3},{0,1,2,3},{1,0,3,2},{1,0,3,2},{1,0,3,2},{1,0,3,2},{0,1,2,3}};
152 //The sigma matrices are the commutators of the gamma matrices. These are antisymmetric when you swap the indices
153 //Zero is the identical index case.
154 //1 is sigma_1,2
155 //2 is sigma_1,3
156 //3 is sigma_1,4
157 //4 is sigma_2,3
158 //5 is sigma_2,4
159 //6 is sigma_3,4
 160 //TODO: Do we multiply by 1/2 and kappa here or not? I say yes to be consistent with gamma. It means the factor of 2
 161 //in the sigma matrices gets dropped here
162 Complex __attribute__((aligned(AVX))) sigval_t[7][4] = {{0,0,0,0},{-1,1,-1,1},{-I,I,-I,I},{1,1,-1,-1},{-1,-1,-1,-1},{-I,I,I,-I},{1,-1,-1,1}};
163#if defined USE_BLAS
 164 cblas_zdscal(6*4, akappa, (Complex *)sigval_t+4, 1);
165#else
 166#pragma omp parallel for simd collapse(2) aligned(sigval_t:AVX)
167 for(int i=1;i<7;i++)
168 for(int j=0;j<4;j++)
169 sigval_t[i][j]*=akappa;
170#endif
171
172#ifdef __NVCC__
173 cudaMemcpy(sigval,sigval_t,7*4*sizeof(Complex),cudaMemcpyDefault);
174 cuComplex_convert(sigval_f,sigval,28,true,dimBlockOne,dimGridOne);
175#else
176 memcpy(sigval,sigval_t,7*4*sizeof(Complex));
177 for(int i=0;i<7*4;i++)
178 sigval_f[i]=(Complex_f)sigval[i];
179#endif
180#endif
181
182 if(iread){
183 if(!rank) printf("Calling Par_sread() for configuration: %i\n", iread);
184 Par_sread(iread, beta, fmu, akappa, ajq,u[0],u[1],ut[0],ut[1]);
185 Par_ranset(&seed,iread);
186 }
187 else{
188 Par_ranset(&seed,iread);
189 if(istart==0){
 190 //Initialise a cold start: set every link to the identity
 191 //(u11=1, u12=0 in the two-complex-number representation)
192#pragma omp parallel for simd //aligned(ut[0]:AVX)
193 //Leave it to the GPU?
194 for(int i=0; i<kvol*ndim;i++){
195 ut[0][i]=1; ut[1][i]=0;
196 }
197 }
198 else if(istart>0){
199 //Ideally, we can use gsl_ranlux as the PRNG
200#ifdef __RANLUX__
201 for(int i=0; i<kvol*ndim;i++){
202 ut[0][i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
203 ut[1][i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
204 }
205 //If not, the Intel Vectorise Mersenne Twister
206#elif (defined __INTEL_MKL__&&!defined USE_RAN2)
207 //Good news, casting works for using a double to create random complex numbers
 208 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*ndim*kvol, (double *)ut[0], -1, 1);
 209 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*ndim*kvol, (double *)ut[1], -1, 1);
210 //Last resort, Numerical Recipes' Ran2
211#else
212 for(int i=0; i<kvol*ndim;i++){
213 ut[0][i]=2*(ran2(&seed)-0.5+I*(ran2(&seed)-0.5));
214 ut[1][i]=2*(ran2(&seed)-0.5+I*(ran2(&seed)-0.5));
215 }
216#endif
217 }
218 else
219 fprintf(stderr,"Warning %i in %s: Gauge fields are not initialised.\n", NOINIT, funcname);
220
221#ifdef __NVCC__
222 int device=-1;
223 cudaGetDevice(&device);
224 cudaMemPrefetchAsync(ut[0], ndim*kvol*sizeof(Complex),device,streams[0]);
225 cudaMemPrefetchAsync(ut[1], ndim*kvol*sizeof(Complex),device,streams[1]);
226#endif
227 //Send trials to accelerator for reunitarisation
228 Reunitarise(ut);
229 //Get trials back
230#ifdef __NVCC__
231 cudaMemcpyAsync(u[0],ut[0],ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[0]);
232 cudaMemPrefetchAsync(u[0], ndim*kvol*sizeof(Complex),device,streams[0]);
233 cudaMemcpyAsync(u[1],ut[1],ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[1]);
234 cudaMemPrefetchAsync(u[1], ndim*kvol*sizeof(Complex),device,streams[1]);
235#else
236 memcpy(u[0], ut[0], ndim*kvol*sizeof(Complex));
237 memcpy(u[1], ut[1], ndim*kvol*sizeof(Complex));
238#endif
239 }
240#ifdef _DEBUG
241 printf("Initialisation Complete\n");
242#endif
243 return 0;
244}
int Check_addr(unsigned int *table, int lns, int lnt, int imin, int imax)
Definition coord.c:334
int Addrc(unsigned int *iu, unsigned int *id)
Loads the addresses required during the update.
Definition coord.c:16
#define UP
Flag for send up.
Definition par_mpi.h:37
int Par_sread(const int iread, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t)
Reads and assigns the gauges from file.
Definition par_mpi.c:127
int DHalo_swap_dir(double *d, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int * pcoord
The processor grid.
Definition par_mpi.c:19
int Par_ranset(long *seed, int iread)
Uses the rank to get a new seed. Copying from the FORTRAN description here c create new seeds in rang...
Definition random.c:135
double ran2(long *idum)
Generates uniformly distributed random double between zero and one as described in numerical recipes....
Definition random.c:440
long seed
RAN2 seed.
Definition random.c:27
#define ksizet
Sublattice t extent.
Definition sizes.h:149
#define nthreads
Number of threads for OpenMP, which can be overwritten at runtime.
Definition sizes.h:135
#define ksize
Sublattice spatial extent for a cubic lattice.
Definition sizes.h:146
#define kvol3
Sublattice spatial volume.
Definition sizes.h:156
#define npt
Processor grid t extent.
Definition sizes.h:124
int Reunitarise(Complex *ut[2])
Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
Definition matrices.c:1012

References Addrc(), AVX, Check_addr(), Complex, Complex_f, DHalo_swap_dir(), halo, ksize, ksizet, kvol, kvol3, ndim, npt, nthreads, Par_ranset(), Par_sread(), pcoord, ran2(), rank, Reunitarise(), seed, and UP.

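Because each row of the gamma matrices initialised above has exactly one non-zero entry, applying a kappa-rescaled \(\gamma_\mu\) to a four-component spinor reduces to one gather and one multiply per component. A minimal sketch of how the flattened gamval/gamin layout is consumed; gamma_apply, out and in are hypothetical names, not part of the library:

#include <complex.h>
typedef double complex Complex;

/* Sketch: apply the kappa-rescaled gamma_mu to a four-component spinor
 * using the one-non-zero-entry-per-row encoding set up in Init().
 * gamval[mu*4+i] is the entry in row i; gamin[mu*4+i] is the spinor
 * component it acts on (cf. igork1=gamin[3*ndirac+igorkov] in Measure()). */
static void gamma_apply(Complex *out, const Complex *in,
		const Complex *gamval, const int *gamin, const int mu){
	for(int i=0; i<4; i++)
		out[i] = gamval[mu*4+i]*in[gamin[mu*4+i]];
}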

◆ Measure()

int Measure ( double * pbp,
double * endenf,
double * denf,
Complex * qq,
Complex * qbqb,
double res,
int * itercg,
Complex * ut[2],
Complex_f * ut_f[2],
unsigned int * iu,
unsigned int * id,
Complex * gamval,
Complex_f * gamval_f,
int * gamin,
double * dk[2],
float * dk_f[2],
Complex_f jqq,
float akappa,
Complex * Phi,
Complex * R1 )

Calculate fermion expectation values via a noisy estimator.

Matrix inversion via the conjugate gradient algorithm. Solves \(MX=X_1\) (Numerical Recipes section 2.10, pp. 70-73)
using the new lookup tables. Implemented in Congradp().

Parameters
pbp\(\langle\bar{\Psi}\Psi\rangle\)
endenfEnergy density
denfNumber Density
qqDiquark condensate
qbqbAntidiquark condensate
resConjugate Gradient Residue
itercgIterations of Conjugate Gradient
utDouble precision gauge fields
ut_fSingle precision gauge fields
iu,idLattice indices
gamvalDouble precision gamma matrices rescaled by kappa
gamval_fSingle precision gamma matrices rescaled by kappa
gaminIndices for Dirac terms
dk\(\left(1+\gamma_0\right)e^{-\mu}\) and \(\left(1-\gamma_0\right)e^\mu\) double
dk_f\(\left(1+\gamma_0\right)e^{-\mu}\) and \(\left(1-\gamma_0\right)e^\mu\) float
jqqDiquark source
akappaHopping parameter
PhiPseudofermion field
R1A useful array for holding things that was already assigned in main. In particular, we'll be using it to catch the output of \( M^\dagger\Xi\) before the inversion, then used to store the output of the inversion
Returns
Zero on success, integer error code otherwise

Definition at line 8 of file fermionic.c.

11 {
12 /*
13 * @brief Calculate fermion expectation values via a noisy estimator
14 *
15 * Matrix inversion via conjugate gradient algorithm
 16 * Solves @f$Mx=x_1@f$
17 * (Numerical Recipes section 2.10 pp.70-73)
18 * uses NEW lookup tables **
19 * Implemented in Congradq
20 *
 21 * @param pbp: @f$\langle\bar{\Psi}\Psi\rangle@f$
22 * @param endenf: Energy density
23 * @param denf: Number Density
24 * @param qq: Diquark condensate
25 * @param qbqb: Antidiquark condensate
26 * @param res: Conjugate Gradient Residue
27 * @param itercg: Iterations of Conjugate Gradient
 28 * @param ut: Double precision gauge fields
 29 * @param ut_f: Single precision gauge fields
 30 * @param iu,id: Lattice indices
 31 * @param gamval,gamval_f: Gamma matrices (double and single precision)
 32 * @param gamin: Indices for Dirac terms
 33 * @param dk[0],dk_f[0]: @f$e^{-\mu}@f$ (double and float)
 34 * @param dk[1],dk_f[1]: @f$e^{\mu}@f$ (double and float)
35 * @param jqq: Diquark source
36 * @param akappa: Hopping parameter
37 * @param Phi: Pseudofermion field
38 * @param R1: A useful array for holding things that was already assigned in main.
39 * In particular, we'll be using it to catch the output of
40 * @f$ M^\dagger\Xi@f$ before the inversion, then used to store the
41 * output of the inversion
42 *
43 * @return Zero on success, integer error code otherwise
44 */
45 const char *funcname = "Measure";
46 //This x is just a storage container
47
48#ifdef __NVCC__
49 int device=-1;
50 cudaGetDevice(&device);
51 Complex *x, *xi; Complex_f *xi_f, *R1_f;
52#ifdef _DEBUG
53 cudaMallocManaged((void **)&R1_f,kfermHalo*sizeof(Complex_f), cudaMemAttachGlobal);
54#else
55 cudaMallocAsync((void **)&R1_f,kfermHalo*sizeof(Complex_f),streams[1]);
56#endif
57 cudaMallocManaged((void **)&x,kfermHalo*sizeof(Complex), cudaMemAttachGlobal);
58 cudaMallocManaged((void **)&xi,kferm*sizeof(Complex), cudaMemAttachGlobal);
59 cudaMallocManaged((void **)&xi_f,kfermHalo*sizeof(Complex_f), cudaMemAttachGlobal);
60#else
61 Complex *x =(Complex *)aligned_alloc(AVX,kfermHalo*sizeof(Complex));
62 Complex *xi =(Complex *)aligned_alloc(AVX,kferm*sizeof(Complex));
63 Complex_f *xi_f =(Complex_f *)aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
64 Complex_f *R1_f = (Complex_f *)aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
65#endif
66 //Setting up noise.
67#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
68 Gauss_c(xi_f, kferm, 0, (float)(1/sqrt(2)));
69#else
 70 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, (float *)xi_f, 0, 1/sqrt(2));
71#endif
72#ifdef __NVCC__
73 cudaMemPrefetchAsync(xi_f,kferm*sizeof(Complex_f),device,streams[0]);
74 cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
75 //Transpose needed here for Dslashd
76 Transpose_c(xi_f,ngorkov*nc,kvol);
77 //Flip all the gauge fields around so memory is coalesced
78 cudaMemcpyAsync(x, xi, kferm*sizeof(Complex),cudaMemcpyDefault,0);
79#else
80#pragma omp parallel for simd aligned(xi,xi_f:AVX)
81 for(int i=0;i<kferm;i++)
82 xi[i]=(Complex)xi_f[i];
83 memcpy(x, xi, kferm*sizeof(Complex));
84#endif
85 //R_1= @f$M^\dagger\Xi@f$
 86 //R1 is local in FORTRAN but since it's going to be reset anyway I'm going to recycle the
87 //global
88 Dslashd_f(R1_f,xi_f,ut_f[0],ut_f[1],iu,id,gamval_f,gamin,dk_f,jqq,akappa);
89#ifdef __NVCC__
90 cudaDeviceSynchronise();
91 cudaFree(xi_f);
92 Transpose_c(R1_f,kvol,ngorkov*nc);
93 cuComplex_convert(R1_f,R1,kferm,false,dimBlock,dimGrid);
94 cudaMemcpy(Phi, R1, kferm*sizeof(Complex),cudaMemcpyDefault);
95#else
96#pragma omp parallel for simd aligned(R1,R1_f:AVX)
97 for(int i=0;i<kferm;i++)
98 R1[i]=(Complex)R1_f[i];
99 //Copying R1 to the first (zeroth) flavour index of Phi
100 //This should be safe with memcpy since the pointer name
101 //references the first block of memory for that pointer
102 memcpy(Phi, R1, kferm*sizeof(Complex));
103#endif
104 //Evaluate xi = (M^† M)^-1 R_1
105 // Congradp(0, res, R1_f, itercg);
106 //If the conjugate gradient fails to converge for some reason, restart it.
107 //That's causing issues with NaN's. Plan B is to not record the measurements.
108 if(Congradp(0, res, Phi, R1,ut_f,iu,id,gamval_f,gamin,dk_f,jqq,akappa,itercg)==ITERLIM)
109 return ITERLIM;
110 //itercg=0;
111 //if(!rank) fprintf(stderr, "Restarting conjugate gradient from %s\n", funcname);
112 //Congradp(0, res, Phi, R1_f,ut_f[0],ut_f[1],iu,id,gamval_f,gamin,dk_f[0],dk_f[1],jqq,akappa,itercg);
113 //itercg+=niterc;
114 /*
115#pragma omp parallel for simd aligned(R1,R1_f:AVX)
116for(int i=0;i<kferm;i++)
117xi[i]=(Complex)R1_f[i];
118*/
119#ifdef __NVCC__
120 cudaMemcpyAsync(xi,R1,kferm*sizeof(Complex),cudaMemcpyDefault,streams[0]);
121#ifdef _DEBUG
122 cudaFree(R1_f);
123#else
124 cudaFreeAsync(R1_f,streams[1]);
125#endif
126#else
127 memcpy(xi,R1,kferm*sizeof(Complex));
128 free(xi_f); free(R1_f);
129#endif
130#ifdef USE_BLAS
131 Complex buff;
132#ifdef __NVCC__
133 cublasZdotc(cublas_handle,kferm,(cuDoubleComplex *)x,1,(cuDoubleComplex *)xi,1,(cuDoubleComplex *)&buff);
134 cudaDeviceSynchronise();
135#elif defined USE_BLAS
136 cblas_zdotc_sub(kferm, x, 1, xi, 1, &buff);
137#endif
138 *pbp=creal(buff);
139#else
140 *pbp = 0;
141#pragma unroll
142 for(int i=0;i<kferm;i++)
143 *pbp+=creal(conj(x[i])*xi[i]);
144#endif
145#if(nproc>1)
146 Par_dsum(pbp);
147#endif
148 *pbp/=4*gvol;
149
150 *qbqb=*qq=0;
151#if defined USE_BLAS
152 for(int idirac = 0; idirac<ndirac; idirac++){
153 int igork=idirac+4;
 154 //Unrolling the colour indices, then it's just (γ_5*x)*Ξ or (γ_5*Ξ)*x
155#pragma unroll
156 for(int ic = 0; ic<nc; ic++){
157 Complex dot;
158 //Because we have kvol on the outer index and are summing over it, we set the
159 //step for BLAS to be ngorkov*nc=16.
160 //Does this make sense to do on the GPU?
161#ifdef __NVCC__
162 cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+idirac*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+igork*nc+ic), ngorkov*nc,(cuDoubleComplex *)&dot);
163#else
164 cblas_zdotc_sub(kvol, &x[idirac*nc+ic], ngorkov*nc, &xi[igork*nc+ic], ngorkov*nc, &dot);
165#endif
166 *qbqb+=gamval[4*ndirac+idirac]*dot;
167#ifdef __NVCC__
168 cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+igork*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+idirac*nc+ic), ngorkov*nc,(cuDoubleComplex *)&dot);
169#else
170 cblas_zdotc_sub(kvol, &x[igork*nc+ic], ngorkov*nc, &xi[idirac*nc+ic], ngorkov*nc, &dot);
171#endif
172 *qq-=gamval[4*ndirac+idirac]*dot;
173 }
174 }
175#else
176#pragma unroll(2)
177 for(int i=0; i<kvol; i++)
178 //What is the optimal order to evaluate these in?
179 for(int idirac = 0; idirac<ndirac; idirac++){
180 int igork=idirac+4;
181 *qbqb+=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+idirac)*nc])*xi[(i*ngorkov+igork)*nc];
182 *qq-=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+igork)*nc])*xi[(i*ngorkov+idirac)*nc];
183 *qbqb+=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+idirac)*nc+1])*xi[(i*ngorkov+igork)*nc+1];
184 *qq-=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+igork)*nc+1])*xi[(i*ngorkov+idirac)*nc+1];
185 }
186#endif
187 //In the FORTRAN Code dsum was used instead despite qq and qbqb being complex
188 //Since we only care about the real part this shouldn't cause (m)any serious issues
189#if(nproc>1)
190 Par_dsum((double *)qq); Par_dsum((double *)qbqb);
191#endif
192 *qq=(*qq+*qbqb)/(2*gvol);
193 Complex xu, xd, xuu, xdd;
194 xu=xd=xuu=xdd=0;
195
196 //Halos
197#if(npt>1)
198 ZHalo_swap_dir(x,16,3,DOWN); ZHalo_swap_dir(x,16,3,UP);
199#endif
200 //Pesky halo exchange indices again
201 //The halo exchange for the trial fields was done already at the end of the trajectory
202 //No point doing it again
203
204 //Instead of typing id[i*ndim+3] a lot, we'll just assign them to variables.
205 //Idea. One loop instead of two loops but for xuu and xdd just use ngorkov-(igorkov+1) instead
 206 //Dirty CUDA workaround since it won't convert thrust::complex to double
207 //TODO: get a reduction routine ready for CUDA
208#ifdef __NVCC__
209 //Swapping back the gauge fields to SoA since the rest of the code is running on CPU and hasn't been ported
210 Transpose_z(ut[0],kvol,ndim);
211 Transpose_z(ut[1],kvol,ndim);
212 //Set up index arrays for CPU
213 Transpose_U(iu,kvol,ndim);
214 Transpose_U(id,kvol,ndim);
215 cudaDeviceSynchronise();
216#else
217#pragma omp parallel for reduction(+:xd,xu,xdd,xuu)
218#endif
219 for(int i = 0; i<kvol; i++){
220 int did=id[3+ndim*i];
221 int uid=iu[3+ndim*i];
222 for(int igorkov=0; igorkov<4; igorkov++){
223 int igork1=gamin[3*ndirac+igorkov];
224 //For the C Version I'll try and factorise where possible
225 xu+=dk[1][did]*(conj(x[(did*ngorkov+igorkov)*nc])*(\
226 ut[0][did*ndim+3]*(xi[(i*ngorkov+igork1)*nc]-xi[(i*ngorkov+igorkov)*nc])+\
227 ut[1][did*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]-xi[(i*ngorkov+igorkov)*nc+1]) )+\
228 conj(x[(did*ngorkov+igorkov)*nc+1])*(\
229 conj(ut[0][did*ndim+3])*(xi[(i*ngorkov+igork1)*nc+1]-xi[(i*ngorkov+igorkov)*nc+1])+\
230 conj(ut[1][did*ndim+3])*(xi[(i*ngorkov+igorkov)*nc]-xi[(i*ngorkov+igork1)*nc])));
231 }
232 for(int igorkov=0; igorkov<4; igorkov++){
233 int igork1=gamin[3*ndirac+igorkov];
234 xd+=dk[0][i]*(conj(x[(uid*ngorkov+igorkov)*nc])*(\
235 conj(ut[0][i*ndim+3])*(xi[(i*ngorkov+igork1)*nc]+xi[(i*ngorkov+igorkov)*nc])-\
236 ut[1][i*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]+xi[(i*ngorkov+igorkov)*nc+1]) )+\
237 conj(x[(uid*ngorkov+igorkov)*nc+1])*(\
238 ut[0][i*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]+xi[(i*ngorkov+igorkov)*nc+1])+\
239 conj(ut[1][i*ndim+3])*(xi[(i*ngorkov+igorkov)*nc]+xi[(i*ngorkov+igork1)*nc]) ) );
240 }
241 for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
242 int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
243 xuu-=dk[0][did]*(conj(x[(did*ngorkov+igorkovPP)*nc])*(\
244 ut[0][did*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc]-xi[(i*ngorkov+igorkovPP)*nc])+\
245 ut[1][did*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]-xi[(i*ngorkov+igorkovPP)*nc+1]) )+\
246 conj(x[(did*ngorkov+igorkovPP)*nc+1])*(\
247 conj(ut[0][did*ndim+3])*(xi[(i*ngorkov+igork1PP)*nc+1]-xi[(i*ngorkov+igorkovPP)*nc+1])+\
248 conj(ut[1][did*ndim+3])*(xi[(i*ngorkov+igorkovPP)*nc]-xi[(i*ngorkov+igork1PP)*nc]) ) );
249 }
250 for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
251 int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
252 xdd-=dk[1][i]*(conj(x[(uid*ngorkov+igorkovPP)*nc])*(\
253 conj(ut[0][i*ndim+3])*(xi[(i*ngorkov+igork1PP)*nc]+xi[(i*ngorkov+igorkovPP)*nc])-\
254 ut[1][i*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]+xi[(i*ngorkov+igorkovPP)*nc+1]) )+\
255 conj(x[(uid*ngorkov+igorkovPP)*nc+1])*(\
256 ut[0][i*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]+xi[(i*ngorkov+igorkovPP)*nc+1])+\
257 conj(ut[1][i*ndim+3])*(xi[(i*ngorkov+igorkovPP)*nc]+xi[(i*ngorkov+igork1PP)*nc]) ) );
258 }
259 }
260 *endenf=creal(xu-xd-xuu+xdd);
261 *denf=creal(xu+xd+xuu+xdd);
262
263#if(nproc>1)
264 Par_dsum(endenf); Par_dsum(denf);
265#endif
266 *endenf/=2*gvol; *denf/=2*gvol;
267 //Future task. Chiral susceptibility measurements
268#ifdef __NVCC__
269 cudaFree(x); cudaFree(xi);
270 //Revert index and gauge arrays
271 Transpose_z(ut[0],ndim,kvol);
272 Transpose_z(ut[1],ndim,kvol);
273 Transpose_U(iu,ndim,kvol);
274 Transpose_U(id,ndim,kvol);
275#else
276 free(x); free(xi);
277#endif
278 return 0;
279}
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single precision complex numbers using the Box-Mull...
Definition random.c:260
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \((M^\dagger)Mx=\Phi\). The matrix multipl...
Definition congrad.c:250

References AVX, Complex, Complex_f, Congradp(), DOWN, Dslashd_f(), Gauss_c(), gvol, kferm, kfermHalo, kvol, nc, ndim, ndirac, ngorkov, Par_dsum(), UP, and ZHalo_swap_dir().

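For reference, the estimator assembled above: x holds the Gaussian noise \(\xi\), and after the Congradp() solve xi holds \(M^{-1}\xi\), so the chiral condensate is estimated as

\(\langle\bar{\Psi}\Psi\rangle \approx \frac{1}{4V}\,\mathrm{Re}\left(\xi^\dagger M^{-1}\xi\right)\)

with \(V\) the lattice volume gvol. The diquark condensates qq and qbqb follow the same pattern with an extra \(\gamma_5\) insertion, which is why the gamval[4*ndirac+idirac] factors appear in those dot products.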

◆ Polyakov()

double Polyakov ( Complex_f * ut[2])

Calculate the Polyakov loop (no prizes for guessing that one...)

Parameters
ut[0],ut[1]The trial gauge fields
See also
Par_tmul, Par_dsum
Returns
Double corresponding to the polyakov loop


Definition at line 127 of file bosonic.c.

127 {
139 const char *funcname = "Polyakov";
140 double poly = 0;
141 Complex_f *Sigma[2];
142#ifdef __NVCC__
143 cuPolyakov(Sigma,ut,dimGrid,dimBlock);
144#else
145 Sigma[0] = (Complex_f *)aligned_alloc(AVX,kvol3*sizeof(Complex_f));
146 Sigma[1] = (Complex_f *)aligned_alloc(AVX,kvol3*sizeof(Complex_f));
147
148 //Extract the time component from each site and save in corresponding Sigma
149#ifdef USE_BLAS
150 cblas_ccopy(kvol3, ut[0]+3, ndim, Sigma[0], 1);
151 cblas_ccopy(kvol3, ut[1]+3, ndim, Sigma[1], 1);
152#else
153 for(int i=0; i<kvol3; i++){
154 Sigma[0][i]=ut[0][i*ndim+3];
155 Sigma[1][i]=ut[1][i*ndim+3];
156 }
157#endif
158 /* Some Fortran commentary
159 Changed this routine.
160 ut[0] and ut[1] now defined as normal ie (kvol+halo,4).
161 Copy of Sigma[0] and Sigma[1] is changed so that it copies
162 in blocks of ksizet.
163 Variable indexu also used to select correct element of ut[0] and ut[1]
164 in loop 10 below.
165
166 Change the order of multiplication so that it can
167 be done in parallel. Start at t=1 and go up to t=T:
168 previously started at t+T and looped back to 1, 2, ... T-1
169 Buffers
170 There is a dependency. Can only parallelise the inner loop
171 */
172#pragma unroll
173 for(int it=1;it<ksizet;it++)
174#pragma omp parallel for simd
175 for(int i=0;i<kvol3;i++){
176 //Seems a bit more efficient to increment indexu instead of reassigning
177 //it every single loop
178 int indexu=it*kvol3+i;
179 Complex_f a11=Sigma[0][i]*ut[0][indexu*ndim+3]-Sigma[1][i]*conj(ut[1][indexu*ndim+3]);
180 //Instead of having to store a second buffer just assign it directly
181 Sigma[1][i]=Sigma[0][i]*ut[1][indexu*ndim+3]+Sigma[1][i]*conj(ut[0][indexu*ndim+3]);
182 Sigma[0][i]=a11;
183 }
184 free(Sigma[1]);
185#endif
186
187 //Multiply this partial loop with the contributions of the other cores in the
188 //Time-like dimension
189 //
190 //Par_tmul does nothing if there is only a single processor in the time direction. So we only compile
191 //its call if it is required
192#if (npt>1)
 193#ifdef __NVCC__
 194#error Par_tmul is not yet implemented in CUDA as Sigma[1] is device only memory
195#endif
196#ifdef _DEBUG
197 printf("Multiplying with MPI\n");
198#endif
199 Par_tmul(Sigma[0], Sigma[1]);
200 //end of #if(npt>1)
201#endif
202 /*Now all cores have the value for the complete Polyakov line at all spacial sites
203 We need to globally sum over spacial processors but not across time as these
204 are duplicates. So we zero the value for all but t=0
205 This is (according to the FORTRAN code) a bit of a hack
206 I will expand on this hack and completely avoid any work
207 for this case rather than calculating everything just to set it to zero
208 */
209 if(!pcoord[3+rank*ndim])
210#pragma omp parallel for simd reduction(+:poly)
211 for(int i=0;i<kvol3;i++)
212 poly+=creal(Sigma[0][i]);
213#ifdef __NVCC__
214 cudaFree(Sigma[0]);
215#else
216 free(Sigma[0]);
217#endif
218
219#if(nproc>1)
220 Par_dsum(&poly);
221#endif
222 poly/=gvol3;
223 return poly;
224}
#define gvol3
Lattice spatial volume.
Definition sizes.h:94

References AVX, Complex_f, gvol3, ksizet, kvol3, ndim, Par_dsum(), pcoord, and rank.

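The Sigma update in the time-slice loop above is SU(2) matrix multiplication in the two-complex-number representation. A minimal sketch of that convention; su2_mul is a hypothetical helper, not part of the library:

#include <complex.h>
typedef float complex Complex_f;

/* Multiply two SU(2) matrices stored as (a,b), where
 *   U = (  a        b      )
 *       ( -conj(b)  conj(a))
 * The body matches the in-place Sigma update in Polyakov(). */
static void su2_mul(const Complex_f a1, const Complex_f b1,
		const Complex_f a2, const Complex_f b2,
		Complex_f *a_out, Complex_f *b_out){
	*a_out = a1*a2 - b1*conjf(b2);
	*b_out = a1*b2 + b1*conjf(a2);
}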

◆ Reunitarise()

int Reunitarise ( Complex * ut[2])
inline

Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.

If you're looking at the FORTRAN code be careful. There are two header files for the /trial/ header. One with u11 u12 (which was included here originally) and the other with u11t and u12t.

Parameters
utTrial fields to be reunitarised
Returns
Zero on success, integer error code otherwise

Definition at line 1012 of file matrices.c.

1012 {
1013 /*
1014 * @brief Reunitarises ut[0] and ut[1] as in conj(ut[0][i])*ut[0][i]+conj(ut[1][i])*ut[1][i]=1
1015 *
1016 * If you're looking at the FORTRAN code be careful. There are two header files
1017 * for the /trial/ header. One with u11 u12 (which was included here originally)
1018 * and the other with ut[0] and ut[1].
1019 *
1020 * @see cuReunitarise (CUDA Wrapper)
1021 *
1022 * @param ut[0], ut[1] Trial fields to be reunitarised
1023 *
1024 * @return Zero on success, integer error code otherwise
1025 */
1026 const char *funcname = "Reunitarise";
1027#ifdef __NVCC__
1028 cuReunitarise(ut[0],ut[1],dimGrid,dimBlock);
1029#else
1030#pragma omp parallel for simd
1031 for(int i=0; i<kvol*ndim; i++){
1032 //Declaring anorm inside the loop will hopefully let the compiler know it
1033 //is safe to vectorise aggressively
 1034 double anorm=sqrt(creal(conj(ut[0][i])*ut[0][i]+conj(ut[1][i])*ut[1][i]));
1035 // Exception handling code. May be faster to leave out as the exit prevents vectorisation.
1036 // if(anorm==0){
1037 // fprintf(stderr, "Error %i in %s on rank %i: anorm = 0 for μ=%i and i=%i.\nExiting...\n\n",
1038 // DIVZERO, funcname, rank, mu, i);
1039 // MPI_Finalise();
1040 // exit(DIVZERO);
1041 // }
1042 ut[0][i]/=anorm;
1043 ut[1][i]/=anorm;
1044 }
1045#endif
1046 return 0;
1047}

References Complex, kvol, and ndim.

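In the two-complex-number representation the projection is a single rescaling per link:

\(u_{11} \to u_{11}/\sqrt{|u_{11}|^2+|u_{12}|^2}\), \(\quad u_{12} \to u_{12}/\sqrt{|u_{11}|^2+|u_{12}|^2}\)

which restores \(\det U = |u_{11}|^2+|u_{12}|^2 = 1\) after floating-point drift during the molecular dynamics update.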

◆ SU2plaq()

int SU2plaq ( Complex_f * ut[2],
Complex_f Sigma[2],
unsigned int * iu,
int i,
int mu,
int nu )
inline

Calculates the plaquette at site i in the \(\mu--\nu\) direction.

Parameters
utTrial fields
SigmaPlaquette components (output)
iLattice site
iuUpper halo indices
mu,nuPlaquette direction. Note that mu and nu can be negative to facilitate calculating plaquettes for Clover terms. No sanity checks are conducted on them in this routine.
Returns
Zero on success; the plaquette components are written to Sigma


Definition at line 85 of file bosonic.c.

85 {
102 const char *funcname = "SU2plaq";
103 int uidm = iu[mu+ndim*i];
104 /***
105 * Let's take a quick moment to compare this to the analysis code.
 106 * The analysis code stores the gauge field as a 4 component real valued vector, whereas the production code
107 * used two complex numbers.
108 *
109 * Analysis code: u=(Re(u11),Im(u12),Re(u12),Im(u11))
110 * Production code: u11=u[0]+I*u[3] u12=u[2]+I*u[1]
111 *
112 * This applies to the Sigmas and a's below too
113 */
114
115 Sigma[0]=ut[0][i*ndim+mu]*ut[0][uidm*ndim+nu]-ut[1][i*ndim+mu]*conj(ut[1][uidm*ndim+nu]);
116 Sigma[1]=ut[0][i*ndim+mu]*ut[1][uidm*ndim+nu]+ut[1][i*ndim+mu]*conj(ut[0][uidm*ndim+nu]);
117
118 int uidn = iu[nu+ndim*i];
119 Complex_f a11=Sigma[0]*conj(ut[0][uidn*ndim+mu])+Sigma[1]*conj(ut[1][uidn*ndim+mu]);
120 Complex_f a12=-Sigma[0]*ut[1][uidn*ndim+mu]+Sigma[1]*ut[0][uidn*ndim+mu];
121
122 Sigma[0]=a11*conj(ut[0][i*ndim+nu])+a12*conj(ut[1][i*ndim+nu]);
 123 Sigma[1]=-a11*ut[1][i*ndim+nu]+a12*ut[0][i*ndim+nu];
124 return 0;
125}

References Complex_f, and ndim.

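Since an SU(2) matrix stored as \((u_{11},u_{12})\) has \(\mathrm{Tr}\,U = u_{11}+\bar{u}_{11} = 2\,\mathrm{Re}(u_{11})\), only the real part of Sigma[0] is needed to build the Wilson action. A minimal usage sketch, assuming the caller (e.g. Average_Plaquette()) consumes Sigma this way:

Complex_f Sigma[2];
SU2plaq(ut, Sigma, iu, i, mu, nu);
/* Half-trace of the plaquette; the full trace is 2*crealf(Sigma[0]) */
float plaq = crealf(Sigma[0]);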

◆ UpDownPart()

int UpDownPart ( const int na,
Complex * X0,
Complex * R1 )
inline

Definition at line 416 of file su2hmc.c.

416 {
417#ifdef __NVCC__
418 cuUpDownPart(na,X0,R1,dimBlock,dimGrid);
419 cudaDeviceSynchronise();
420#else
421 //The only reason this was removed from the original function is for diagnostics
422#pragma omp parallel for simd collapse(2) aligned(X0,R1:AVX)
423 for(int i=0; i<kvol; i++)
424 for(int idirac = 0; idirac < ndirac; idirac++){
425 X0[((na*kvol+i)*ndirac+idirac)*nc]=R1[(i*ngorkov+idirac)*nc];
426 X0[((na*kvol+i)*ndirac+idirac)*nc+1]=R1[(i*ngorkov+idirac)*nc+1];
427 }
428#endif
429 return 0;
430}
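For context, the index layout the copy above assumes, inferred from the loop itself rather than any documented contract:

/* R1: one full Gor'kov spinor per site,
 *     R1[(i*ngorkov + idirac)*nc + ic]
 * X0: one up/down-partitioned spinor per site and flavour,
 *     X0[((na*kvol + i)*ndirac + idirac)*nc + ic]
 * The loop copies the first ndirac (= ngorkov/2) components of R1 into
 * flavour na's block of X0, i.e. the "up" half of the partitioning. */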

◆ Z_gather()

int Z_gather ( Complex * x,
Complex * y,
int n,
unsigned int * table,
unsigned int mu )
inline

Extracts all the double precision gauge links in the \(\mu\) direction only.

Parameters
xThe output
yThe gauge field for a particular colour
nNumber of sites in the gauge field. This is typically kvol
tableTable containing information on nearest neighbours. Usually id or iu
muDirection we're interested in extracting
Returns
Zero on success, integer error code otherwise

Definition at line 371 of file su2hmc.c.

372{
373 const char *funcname = "Z_gather";
374 //FORTRAN had a second parameter m giving the size of y (kvol+halo) normally
375 //Pointers mean that's not an issue for us so I'm leaving it out
376#ifdef __NVCC__
377 cuZ_gather(x,y,n,table,mu,dimBlock,dimGrid);
378#else
379#pragma omp parallel for simd aligned (x,y,table:AVX)
380 for(int i=0; i<n; i++)
381 x[i]=y[table[i*ndim+mu]*ndim+mu];
382#endif
383 return 0;
384}

References Complex, and ndim.
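A minimal usage sketch; the buffer z and the choice \(\mu=3\) are illustrative, while ut, id, kvol and AVX are as used elsewhere in this file:

/* Gather the time-direction (mu=3) links of the first colour component
 * from each site's down neighbour. */
Complex *z = (Complex *)aligned_alloc(AVX, kvol*sizeof(Complex));
Z_gather(z, ut[0], kvol, id, 3);
/* ... use z, then ... */
free(z);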