su2hmc
su2hmc.h File Reference

Function declarations for most of the routines. More...

#include <errorcodes.h>
#include <integrate.h>
#include <sizes.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

Functions

int Force (double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancg)
 Calculates the force \(\frac{dS}{d\pi}\) at each intermediate time.
 
int Gauge_force (double *dSdpi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, float beta)
 Calculates the gauge force due to the Wilson Action at each intermediate time.
 
int Init (int istart, int ibound, int iread, float beta, float fmu, float akappa, Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, unsigned int *iu, unsigned int *id)
 Initialises the system.
 
int Hamilton (double *h, double *s, double res2, double *pp, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancgh, int traj)
 Calculate the Hamiltonian.
 
int Congradq (int na, double res, Complex *X1, Complex *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, int *itercg)
 Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \(M^\dagger Mx=\Phi\). Implements up/down partitioning. The matrix multiplication step is done at single precision, while the update is done at double.
 
int Congradp (int na, double res, Complex *Phi, Complex *xi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
 Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \(M^\dagger Mx=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double.
 
int Measure (double *pbp, double *endenf, double *denf, Complex *qq, Complex *qbqb, double res, int *itercg, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, Complex *Phi, Complex *R1)
 Calculate fermion expectation values via a noisy estimator.
 
int Average_Plaquette (double *hg, double *avplaqs, double *avplaqt, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, float beta)
 Calculates the gauge action using the new (how new?) lookup table.
 
float SU2plaq (Complex_f *u11t, Complex_f *u12t, unsigned int *iu, int i, int mu, int nu)
 Calculates the plaquette at site i in the \(\mu\)-\(\nu\) direction.
 
double Polyakov (Complex_f *u11t, Complex_f *u12t)
 Calculate the Polyakov loop (no prizes for guessing that one...)
 
int C_gather (Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
 Extracts all the single precision gauge links in the \(\mu\) direction only.
 
int Z_gather (Complex *x, Complex *y, int n, unsigned int *table, unsigned int mu)
 Extracts all the double precision gauge links in the \(\mu\) direction only.
 
int Fill_Small_Phi (int na, Complex *smallPhi, Complex *Phi)
 Copies necessary (2*4*kvol) elements of Phi into a vector variable.
 
int UpDownPart (const int na, Complex *X0, Complex *R1)
 
int Reunitarise (Complex *u11t, Complex *u12t)
 Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
 

Detailed Description

Function declarations for most of the routines.

Definition in file su2hmc.h.

Function Documentation

◆ Average_Plaquette()

int Average_Plaquette ( double * hg,
double * avplaqs,
double * avplaqt,
Complex_f * u11t,
Complex_f * u12t,
unsigned int * iu,
float beta )

Calculates the gauge action using the new (how new?) lookup table.

Follows a routine called qedplaq in some QED3 code

Parameters
hg: Gauge component of the Hamiltonian
avplaqs: Average spatial plaquette
avplaqt: Average temporal plaquette
u11t,u12t: The trial fields
iu: Upper halo indices
beta: Inverse gauge coupling
See also
Par_dsum
Returns
Zero on success, integer error code otherwise

Definition at line 8 of file bosonic.c.

8 {
9 /*
10 * Calculates the gauge action using new (how new?) lookup table
11 * Follows a routine called qedplaq in some QED3 code
12 *
13 * Parameters:
14 * =========
15 * hg Gauge component of Hamilton
16 * avplaqs Average spatial Plaquette
17 * avplaqt Average Temporal Plaquette
18 * u11t,u12t The trial fields
19 * iu Upper halo indices
20 * beta Inverse gauge coupling
21 *
22 * Calls:
23 * =====
24 * Par_dsum
25 *
26 * Return:
27 * ======
28 * Zero on success, integer error code otherwise
29 */
30 const char *funcname = "Average_Plaquette";
31 /*There was a halo exchange here but moved it outside
32 The FORTRAN code used several consecutive loops to get the plaquette
33 Instead we'll just make the arrays variables and do everything in one loop
34 Should work since in the FORTRAN Sigma11[i] only depends on i components for example
35 Since the ν loop doesn't get called for μ=0 we'll start at μ=1
36 */
37#ifdef __NVCC__
38 __managed__ double hgs = 0; __managed__ double hgt = 0;
39 cuAverage_Plaquette(&hgs, &hgt, u11t, u12t, iu,dimGrid,dimBlock);
40#else
41 double hgs = 0; double hgt = 0;
42 for(int mu=1;mu<ndim;mu++)
43 for(int nu=0;nu<mu;nu++)
44 //Don't merge into a single loop. Makes vectorisation easier?
45 //Or merge into a single loop and dispense with the a arrays?
46#pragma omp parallel for simd aligned(u11t,u12t,iu:AVX) reduction(+:hgs,hgt)
47 for(int i=0;i<kvol;i++){
48 //Save us from typing iu[mu+ndim*i] everywhere
49 switch(mu){
50 //Time component
51 case(ndim-1): hgt -= SU2plaq(u11t,u12t,iu,i,mu,nu);
52 break;
53 //Space component
54 default: hgs -= SU2plaq(u11t,u12t,iu,i,mu,nu);
55 break;
56 }
57 }
58#endif
59#if(nproc>1)
60 Par_dsum(&hgs); Par_dsum(&hgt);
61#endif
62 *avplaqs=-hgs/(3.0*gvol); *avplaqt=-hgt/(gvol*3.0);
63 *hg=(hgs+hgt)*beta;
64#ifdef _DEBUG
65 if(!rank)
66 printf("hgs=%e hgt=%e hg=%e\n", hgs, hgt, *hg);
67#endif
68 return 0;
69}
float SU2plaq(Complex_f *u11t, Complex_f *u12t, unsigned int *iu, int i, int mu, int nu)
Calculates the plaquette at site i in the \(\mu\)-\(\nu\) direction.
Definition bosonic.c:72
int rank
The MPI rank.
Definition par_mpi.c:22
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define kvol
Sublattice volume.
Definition sizes.h:154
#define gvol
Lattice volume.
Definition sizes.h:92
#define ndim
Dimensions.
Definition sizes.h:179

References AVX, gvol, kvol, ndim, Par_dsum(), rank, and SU2plaq().
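
The whole code stores an SU(2) matrix as the two complex numbers of its first row, \(U=\begin{pmatrix}u_{11}&u_{12}\\-u_{12}^*&u_{11}^*\end{pmatrix}\); the second row is fixed by unitarity. As a minimal sketch (illustrative names, not library code), the product of two such matrices stays in this form, which is what SU2plaq relies on when multiplying the four links of a plaquette:

#include <complex.h>

typedef float complex Cf; /* stand-in for Complex_f */

/* First row of C = A*B for SU(2) matrices stored as (a11,a12) and (b11,b12).
 * The second row never needs to be stored or computed. */
static void su2_mul(Cf a11, Cf a12, Cf b11, Cf b12, Cf *c11, Cf *c12){
	*c11 = a11*b11 - a12*conjf(b12);
	*c12 = a11*b12 + a12*conjf(b11);
}
/* For a plaquette P assembled this way, Re tr P = 2*crealf(p11). */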


◆ C_gather()

int C_gather ( Complex_f * x,
Complex_f * y,
int n,
unsigned int * table,
unsigned int mu )
inline

Extracts all the single precision gauge links in the \(\mu\) direction only.

Parameters
x: The output
y: The gauge field for a particular colour
n: Number of sites in the gauge field. This is typically kvol
table: Table containing information on nearest neighbours. Usually id or iu
mu: Direction we're interested in extracting
Returns
Zero on success, integer error code otherwise

Definition at line 321 of file su2hmc.c.

322{
323 const char *funcname = "C_gather";
324 //FORTRAN had a second parameter m giving the size of y (kvol+halo) normally
325 //Pointers mean that's not an issue for us so I'm leaving it out
326#ifdef __NVCC__
327 cuC_gather(x,y,n,table,mu,dimBlock,dimGrid);
328#else
329#pragma omp parallel for simd aligned (x,y,table:AVX)
330 for(int i=0; i<n; i++)
331 x[i]=y[table[i*ndim+mu]*ndim+mu];
332#endif
333 return 0;
334}

References ndim.
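
As a usage sketch (assuming the flattened link layout u11t[i*ndim+mu] used throughout, and id as in par_mpi), gathering the first-colour links from one site down in the time direction looks like:

Complex_f *u11sh = aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
C_gather(u11sh, u11t, kvol, id, 3);	//u11sh[i] = u11t[id[i*ndim+3]*ndim+3]

This is exactly how Gauge_force prepares u11sh and u12sh before the halo exchange.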


◆ Congradp()

int Congradp ( int na,
double res,
Complex * Phi,
Complex * xi,
Complex_f * u11t,
Complex_f * u12t,
unsigned int * iu,
unsigned int * id,
Complex_f * gamval,
int * gamin,
float * dk4m,
float * dk4p,
Complex_f jqq,
float akappa,
int * itercg )

Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \(M^\dagger Mx=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double.

Parameters
na: Flavour index
res: Limit for conjugate gradient
Phi: Pseudofermion field
xi: Returned as \((M^\dagger M)^{-1} \Phi\)
u11t: First colour's trial field
u12t: Second colour's trial field
iu: Upper halo indices
id: Lower halo indices
gamval: Single precision gamma matrices rescaled by kappa
gamin: Dirac indices
dk4m: \(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p: \(\left(1-\gamma_0\right)e^\mu\)
jqq: Diquark source
akappa: Hopping Parameter
itercg: Counts the iterations of the conjugate gradient
Returns
0 on success, integer error code otherwise

Definition at line 262 of file congrad.c.

263 {
264 /*
265 * @brief Matrix Inversion via Conjugate Gradient
266 * Solves @f$(M^\dagger)Mx=\Phi@f$
267 * No even/odd partitioning.
268 * The matrix multiplication step is done at single precision, while the update is done at double
269 *
270 * @param na: Flavour index
271 * @param res: Limit for conjugate gradient
272 * @param Phi: @f$\Phi@f$ initially,
273 * @param xi: Returned as @f$(M^\dagger M)^{-1} \Phi@f$
274 * @param u11t: First colour's trial field
275 * @param u12t: Second colour's trial field
276 * @param iu: Upper halo indices
277 * @param id: Lower halo indices
278 * @param gamval: Gamma matrices
279 * @param gamin: Dirac indices
280 * @param dk4m: @f$\left(1+\gamma_0\right)e^{-\mu}@f$
281 * @param dk4p: @f$\left(1-\gamma_0\right)e^\mu@f$
282 * @param jqq: Diquark source
283 * @param akappa: Hopping Parameter
284 * @param itercg: Counts the iterations of the conjugate gradient
285 *
286 * @return 0 on success, integer error code otherwise
287 */
288 const char *funcname = "Congradp";
289 //Return value
290 int ret_val=0;
291 const double resid = res*res;
292 //These were evaluated only in the first loop of niterx so we'll just do it outside of the loop.
293 //These alpha and beta terms should be double, but that causes issues with BLAS. Instead we declare
294 //them Complex and work with the real part (especially for α_d)
295 //Give initial values Will be overwritten if niterx>0
296#ifdef __NVCC__
297 Complex_f *p_f, *r_f, *xi_f, *x1_f, *x2_f;
298 int device; cudaGetDevice(&device);
299#ifdef _DEBUG
300 cudaMallocManaged((void **)&p_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
301 cudaMallocManaged((void **)&r_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
302 cudaMallocManaged((void **)&x1_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
303 cudaMallocManaged((void **)&x2_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
304 cudaMallocManaged((void **)&xi_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
305#else
306 cudaMalloc((void **)&p_f, kfermHalo*sizeof(Complex_f));
307 cudaMalloc((void **)&r_f, kferm*sizeof(Complex_f));
308 cudaMalloc((void **)&x1_f, kfermHalo*sizeof(Complex_f));
309 cudaMalloc((void **)&x2_f, kferm*sizeof(Complex_f));
310 cudaMalloc((void **)&xi_f, kferm*sizeof(Complex_f));
311#endif
312#else
313 Complex_f *p_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
314 Complex_f *r_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
315 Complex_f *x1_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
316 Complex_f *x2_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
317 Complex_f *xi_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
318#endif
319 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
320 double alphan=0.0;
321 //Instead of copying element-wise in a loop, use memcpy.
322#ifdef __NVCC__
323 //Get xi in single precision, then swap to AoS format
324 cuComplex_convert(p_f,xi,kferm,true,dimBlock,dimGrid);
325 Transpose_c(p_f,ngorkov*nc,kvol,dimGrid,dimBlock);
326 cudaMemcpy(xi_f,p_f,kferm*sizeof(Complex_f),cudaMemcpyDefault);
327
328 //And repeat for r
329 cuComplex_convert(r_f,Phi+na*kferm,kferm,true,dimBlock,dimGrid);
330 Transpose_c(r_f,ngorkov*nc,kvol,dimGrid,dimBlock);
331
332 //Flip all the gauge fields around so memory is coalesced
333 Transpose_c(u11t,ndim,kvol,dimGrid,dimBlock);
334 Transpose_c(u12t,ndim,kvol,dimGrid,dimBlock);
335#else
336#pragma omp parallel for simd aligned(p_f,xi_f,xi,r_f,Phi:AVX)
337 for(int i =0;i<kferm;i++){
338 p_f[i]=xi_f[i]=(Complex_f)xi[i];
339 r_f[i]=(Complex_f)Phi[na*kferm+i];
340 }
341#endif
342
343 // Declaring placeholder arrays
344 // This x1 is NOT related to the /common/vectorp/X1 in the FORTRAN code and should not
345 // be confused with X1 the global variable
346
347 //niterx isn't called as an index but we'll start from zero with the C code to make the
348 //if statements quicker to type
349 double betan;
350#ifdef __NVCC__
351 cudaDeviceSynchronise();
352#endif
353 for((*itercg)=0; (*itercg)<=niterc; (*itercg)++){
354 //Don't overwrite on first run.
355 //x2=(M^†)x1=(M^†)Mp
356 Dslash_f(x1_f,p_f,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa);
357 Dslashd_f(x2_f,x1_f,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa);
358#ifdef __NVCC__
359 cudaDeviceSynchronise();
360#endif
361 //We can't evaluate α on the first niterx because we need to get β_n.
362 if(*itercg){
363 //x*.x
364#ifdef USE_BLAS
365 float alphad_f;
366#ifdef __NVCC__
367 cublasScnrm2(cublas_handle,kferm,(cuComplex*) x1_f, 1,(float *)&alphad_f);
368 alphad = alphad_f*alphad_f;
369#else
370 alphad_f = cblas_scnrm2(kferm, x1_f, 1);
371#endif
372 alphad = alphad_f*alphad_f;
373#else
374 alphad=0;
375 for(int i = 0; i<kferm; i++)
376 alphad+=conj(x1_f[i])*x1_f[i];
377#endif
378#if(nproc>1)
379 Par_fsum((float *)&alphad);
380#endif
381 //α=(r.r)/p(M^†)Mp
382 alpha=alphan/creal(alphad);
383 // Complex_f alpha_f = (Complex_f)alpha;
384 //x+αp
385#ifdef USE_BLAS
386 Complex_f alpha_f=(float)alpha;
387#ifdef __NVCC__
388 cublasCaxpy(cublas_handle,kferm,(cuComplex*) &alpha_f,(cuComplex*) p_f,1,(cuComplex*) xi_f,1);
389#else
390 cblas_caxpy(kferm, (Complex_f*)&alpha_f,(Complex_f*)p_f, 1, (Complex_f*)xi_f, 1);
391#endif
392#else
393#pragma omp parallel for simd aligned(xi_f,p_f:AVX)
394 for(int i = 0; i<kferm; i++)
395 xi_f[i]+=alpha*p_f[i];
396#endif
397 }
398
399 //r=α(M^†)Mp and β_n=r*.r
400#if defined USE_BLAS
401 Complex_f alpha_m=(Complex_f)(-alpha);
402 float betan_f=0;
403#ifdef __NVCC__
404 cublasCaxpy(cublas_handle,kferm, (cuComplex *)&alpha_m,(cuComplex *) x2_f, 1,(cuComplex *) r_f, 1);
405 //cudaDeviceSynchronise();
406 //r*.r
407 cublasScnrm2(cublas_handle,kferm,(cuComplex *) r_f,1,(float *)&betan_f);
408#else
409 cblas_caxpy(kferm,(Complex_f*) &alpha_m,(Complex_f*) x2_f, 1,(Complex_f*) r_f, 1);
410 //r*.r
411 betan_f = cblas_scnrm2(kferm, (Complex_f*)r_f,1);
412#endif
413 //Gotta square it to "undo" the norm
414 betan=betan_f*betan_f;
415#else
416 //Just like Congradq, this loop could be unrolled but will need a reduction to deal with the betan
417 //addition.
418 betan = 0;
419 //If we get a small enough β_n before hitting the iteration cap we break
420#pragma omp parallel for simd aligned(x2_f,r_f:AVX) reduction(+:betan)
421 for(int i = 0; i<kferm;i++){
422 r_f[i]-=alpha*x2_f[i];
423 betan+=conj(r_f[i])*r_f[i];
424 }
425#endif
426 //This is basically just congradq at the end. Check there for comments
427#if(nproc>1)
428 Par_dsum(&betan);
429#endif
430#ifdef _DEBUG
431#ifdef _DEBUGCG
432 char *endline = "\n";
433#else
434 char *endline = "\r";
435#endif
436 if(!rank) printf("Iter (CG) = %i β_n= %e α= %e%s", *itercg, betan, creal(alpha), endline);
437#endif
438 if(betan<resid){
439 //Started counting from zero so add one to make it accurate
440 (*itercg)++;
441#ifdef _DEBUG
442 if(!rank) printf("\nIter (CG) = %i resid = %e toler = %e\n", *itercg, betan, resid);
443#endif
444 ret_val=0; break;
445 }
446 else if(*itercg==niterc-1){
447 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n",
448 ITERLIM, funcname, niterc, betan);
449 ret_val=ITERLIM; break;
450 }
451 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
452 Complex beta = (*itercg) ? betan/betad : 0;
453 betad=betan; alphan=betan;
454 //BLAS for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
455 //β instead of x.
456 //There is cblas_zaxpby in the MKL though, set a = 1 and b = β.
457#ifdef USE_BLAS
458 Complex_f beta_f = (Complex_f)beta;
459 Complex_f a = 1.0;
460#ifdef __NVCC__
461 cublasCscal(cublas_handle,kferm,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
462 cublasCaxpy(cublas_handle,kferm,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
463 cudaDeviceSynchronise();
464#elif (defined __INTEL_MKL__ || defined AMD_BLAS)
465 cblas_caxpby(kferm, &a, r_f, 1, &beta_f, p_f, 1);
466#else
467 cblas_cscal(kferm,&beta_f,p_f,1);
468 cblas_caxpy(kferm,&a,r_f,1,p_f,1);
469#endif
470#else
471#pragma omp parallel for simd aligned(r_f,p_f:AVX)
472 for(int i=0; i<kferm; i++)
473 p_f[i]=r_f[i]+beta*p_f[i];
474#endif
475 }
476#ifdef __NVCC__
477 Transpose_c(xi_f,kvol,ngorkov*nc,dimGrid,dimBlock);
478 Transpose_c(r_f,kvol,ngorkov*nc,dimGrid,dimBlock);
479
480 Transpose_c(u11t,kvol,ndim,dimGrid,dimBlock);
481 Transpose_c(u12t,kvol,ndim,dimGrid,dimBlock);
482 cudaDeviceSynchronise();
483 cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
484#else
485#pragma omp simd
486 for(int i = 0; i <kferm;i++){
487 xi[i]=(Complex)xi_f[i];
488 }
489#endif
490#ifdef __NVCC__
491 cudaFree(p_f); cudaFree(r_f);cudaFree(x1_f); cudaFree(x2_f); cudaFree(xi_f);
492#else
493 free(p_f); free(r_f); free(x1_f); free(x2_f); free(xi_f);
494#endif
495 return ret_val;
496}
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates \(\Phi=M r\) in single precision.
Definition matrices.c:462
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates \(\Phi=M^\dagger r\) in single precision.
Definition matrices.c:584
int Par_fsum(float *dval)
Performs a reduction on a float dval to get a sum which is then distributed to all ranks.
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define niterc
Hard limit for runaway trajectories in Conjugate gradient.
Definition sizes.h:163
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225

References AVX, Complex, Complex_f, Dslash_f(), Dslashd_f(), kferm, kfermHalo, kvol, nc, ndim, ngorkov, niterc, Par_dsum(), Par_fsum(), and rank.
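
Stripped of the precision juggling and BLAS branches, the loop above is a textbook conjugate gradient on the normal equations. A minimal double-precision sketch of the same recurrence (apply_A stands in for Dslash_f followed by Dslashd_f; all names here are illustrative):

#include <complex.h>

typedef double complex Cplx;

/* Solves A x = b for Hermitian positive-definite A = M†M.
 * On entry x = 0 and r = b; p and Ap are scratch arrays of length n. */
static int cg_normal(int n, double tol, Cplx *x, Cplx *r, Cplx *p, Cplx *Ap,
		void (*apply_A)(Cplx *out, const Cplx *in, int n)){
	double rr = 0;
	for(int i=0; i<n; i++){ p[i]=r[i]; rr += creal(conj(r[i])*r[i]); }
	for(int iter=0; iter<1000; iter++){
		apply_A(Ap, p, n);			//x2 = (M†M)p in the code above
		double pAp = 0;
		for(int i=0; i<n; i++) pAp += creal(conj(p[i])*Ap[i]);	//α_d
		double alpha = rr/pAp;		//α = (r.r)/p(M†M)p
		double rr_new = 0;
		for(int i=0; i<n; i++){
			x[i] += alpha*p[i];		//xi_f += αp in the code above
			r[i] -= alpha*Ap[i];
			rr_new += creal(conj(r[i])*r[i]);	//β_n
		}
		if(rr_new < tol*tol) return 0;	//matches the betan<resid test
		double beta = rr_new/rr; rr = rr_new;
		for(int i=0; i<n; i++) p[i] = r[i] + beta*p[i];	//not an axpy
	}
	return 1;	//iteration limit, cf. ITERLIM
}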


◆ Congradq()

int Congradq ( int na,
double res,
Complex * X1,
Complex * r,
Complex_f * u11t_f,
Complex_f * u12t_f,
unsigned int * iu,
unsigned int * id,
Complex_f * gamval_f,
int * gamin,
float * dk4m_f,
float * dk4p_f,
Complex_f jqq,
float akappa,
int * itercg )

Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \(M^\dagger Mx=\Phi\). Implements up/down partitioning. The matrix multiplication step is done at single precision, while the update is done at double.

Parameters
na: Flavour index
res: Limit for conjugate gradient
X1: Pseudofermion field \(\Phi\) initially, returned as \((M^\dagger M)^{-1} \Phi\)
r: Partition of \(\Phi\) being used. Gets recycled as the residual vector
u11t_f: First colour's trial field
u12t_f: Second colour's trial field
iu: Upper halo indices
id: Lower halo indices
gamval_f: Single precision gamma matrices rescaled by kappa
gamin: Dirac indices
dk4m_f: \(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p_f: \(\left(1-\gamma_0\right)e^\mu\)
jqq: Diquark source
akappa: Hopping Parameter
itercg: Counts the iterations of the conjugate gradient
Returns
0 on success, integer error code otherwise

Definition at line 7 of file congrad.c.

8 {
9 /*
10 * @brief Matrix Inversion via Mixed Precision Conjugate Gradient
11 * Solves @f$(M^\dagger)Mx=\Phi@f$
12 * Implements up/down partitioning
13 * The matrix multiplication step is done at single precision, while the update is done at double
14 *
15 * @param na: Flavour index
16 * @param res: Limit for conjugate gradient
17 * @param X1: @f$\Phi@f$ initially, returned as @f$(M^\dagger M)^{-1} \Phi@f$
18 * @param r: Partition of @f$\Phi@f$ being used. Gets recycled as the residual vector
19 * @param u11t: First colour's trial field
20 * @param u12t: Second colour's trial field
21 * @param iu: Upper halo indices
22 * @param id: Lower halo indices
23 * @param gamval_f: Gamma matrices
24 * @param gamin: Dirac indices
25 * @param dk4m: @f$\left(1+\gamma_0\right)e^{-\mu}@f$
26 * @param dk4p: @f$\left(1-\gamma_0\right)e^\mu@f$
27 * @param jqq: Diquark source
28 * @param akappa: Hopping Parameter
29 * @param itercg: Counts the iterations of the conjugate gradient
30 *
31 * @see Hdslash_f(), Hdslashd_f(), Par_fsum(), Par_dsum()
32 *
33 * @return 0 on success, integer error code otherwise
34 */
35 const char *funcname = "Congradq";
36 int ret_val=0;
37 const double resid = res*res;
38 //The κ^2 factor is needed to normalise the fields correctly
39 //jqq is the diquark condensate and is global scope.
40 const Complex_f fac_f = conj(jqq)*jqq*akappa*akappa;
41 //These were evaluated only in the first loop of niterx so we'll just do it outside of the loop.
42 //n suffix is numerator, d is denominator
43 double alphan=1;
44 //The alpha and beta terms should be double, but that causes issues with BLAS pointers. Instead we declare
45 //them complex and work with the real part (especially for α_d)
46 //Give initial values Will be overwritten if niterx>0
47 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
48 //Because we're dealing with flattened arrays here we can call cblas safely without the halo
49#ifdef __NVCC__
50 Complex_f *p_f, *x1_f, *x2_f, *r_f, *X1_f;
51 int device=-1; cudaGetDevice(&device);
52
53#ifdef _DEBUG
54 cudaMallocManaged((void **)&p_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
55 cudaMallocManaged((void **)&x1_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
56 cudaMallocManaged((void **)&x2_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
57 cudaMallocManaged((void **)&r_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
58 cudaMallocManaged((void **)&X1_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
59#else
60 //First two have halo exchanges, so getting NCCL working is important
61 cudaMallocAsync((void **)&p_f, kferm2Halo*sizeof(Complex_f),streams[0]);
62 cudaMallocAsync((void **)&x1_f, kferm2Halo*sizeof(Complex_f),streams[1]);
63 cudaMallocAsync((void **)&x2_f, kferm2*sizeof(Complex_f),streams[2]);
64 cudaMallocAsync((void **)&r_f, kferm2*sizeof(Complex_f),streams[3]);
65 cudaMallocAsync((void **)&X1_f, kferm2*sizeof(Complex_f),streams[4]);
66#endif
67#else
68 Complex_f *p_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
69 Complex_f *x1_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
70 Complex_f *x2_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
71 Complex_f *X1_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
72 Complex_f *r_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
73#endif
74 //Instead of copying element-wise in a loop, use memcpy.
75#ifdef __NVCC__
76 //Get X1 in single precision, then swap to AoS format
77 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
78 Transpose_c(X1_f,ndirac*nc,kvol,dimGrid,dimBlock);
79
80 //And repeat for r
81 cuComplex_convert(r_f,r,kferm2,true,dimBlock,dimGrid);
82 Transpose_c(r_f,ndirac*nc,kvol,dimGrid,dimBlock);
83
84 //cudaMemcpy is blocking, so use async instead
85 cudaMemcpyAsync(p_f, X1_f, kferm2*sizeof(Complex_f),cudaMemcpyDeviceToDevice,NULL);
86 //Flip all the gauge fields around so memory is coalesced
87 Transpose_c(u11t,ndim,kvol,dimGrid,dimBlock);
88 Transpose_c(u12t,ndim,kvol,dimGrid,dimBlock);
89 Transpose_I(iu,ndim,kvol,dimGrid,dimBlock);
90 Transpose_I(id,ndim,kvol,dimGrid,dimBlock);
91#else
92#pragma omp parallel for simd
93 for(int i=0;i<kferm2;i++){
94 r_f[i]=(Complex_f)r[i];
95 X1_f[i]=(Complex_f)X1[i];
96 }
97 memcpy(p_f, X1_f, kferm2*sizeof(Complex_f));
98#endif
99
100 //niterx isn't called as an index but we'll start from zero with the C code to make the
101 //if statements quicker to type
102 double betan; bool pf=true;
103 for(*itercg=0; *itercg<niterc; (*itercg)++){
104 //x2 = (M^†M)p
105 //No need to synchronise here. The memcpy in Hdslash is blocking
106 Hdslash_f(x1_f,p_f,u11t,u12t,iu,id,gamval_f,gamin,dk4m,dk4p,akappa);
107 Hdslashd_f(x2_f,x1_f,u11t,u12t,iu,id,gamval_f,gamin,dk4m,dk4p,akappa);
108#ifdef __NVCC__
109 cudaDeviceSynchronise();
110#endif
111 //x2 = (M^†M+J^2)p
112 //No point adding zero a couple of hundred times if the diquark source is zero
113 if(fac_f!=0){
114#ifdef __NVCC__
115 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&fac_f,(cuComplex *)p_f,1,(cuComplex *)x2_f,1);
116#elif defined USE_BLAS
117 cblas_caxpy(kferm2, &fac_f, p_f, 1, x2_f, 1);
118#else
119#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
120 for(int i=0; i<kferm2; i++)
121 x2_f[i]+=fac_f*p_f[i];
122#endif
123 }
124 //We can't evaluate α on the first *itercg because we need to get β_n.
125 if(*itercg){
126 //α_d= p* (M^†M+J^2)p
127#ifdef __NVCC__
128 cublasCdotc(cublas_handle,kferm2,(cuComplex *)p_f,1,(cuComplex *)x2_f,1,(cuComplex *)&alphad);
129#elif defined USE_BLAS
130 cblas_cdotc_sub(kferm2, p_f, 1, x2_f, 1, &alphad);
131#else
132 alphad=0;
133#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
134 for(int i=0; i<kferm2; i++)
135 alphad+=conj(p_f[i])*x2_f[i];
136#endif
137 //For now I'll cast it into a float for the reduction. Each rank only sends and writes
138 //to the real part so this is fine
139#if(nproc>1)
140 Par_fsum((float *)&alphad);
141#endif
142 //α=α_n/α_d = (r.r)/p(M^†M)p
143 alpha=alphan/creal(alphad);
144 //x-αp,
145#ifdef __NVCC__
146 Complex_f alpha_f = (Complex_f)alpha;
147 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&alpha_f,(cuComplex *)p_f,1,(cuComplex *)X1_f,1);
148#elif defined USE_BLAS
149 Complex_f alpha_f = (Complex_f)alpha;
150 cblas_caxpy(kferm2, &alpha_f, p_f, 1, X1_f, 1);
151#else
152 for(int i=0; i<kferm2; i++)
153 X1_f[i]+=alpha*p_f[i];
154#endif
155 }
156 // r_n+1 = r_n-α(M^† M)p_n and β_n=r*.r
157#ifdef __NVCC__
158 Complex_f alpha_m=(Complex_f)(-alpha);
159 cublasCaxpy(cublas_handle, kferm2,(cuComplex *)&alpha_m,(cuComplex *)x2_f,1,(cuComplex *)r_f,1);
160 float betan_f;
161 cublasScnrm2(cublas_handle,kferm2,(cuComplex *)r_f,1,&betan_f);
162 betan = betan_f*betan_f;
163#elif defined USE_BLAS
164 Complex_f alpha_m = (Complex_f)(-alpha);
165 cblas_caxpy(kferm2, &alpha_m, x2_f, 1, r_f, 1);
166 //Undo the negation for the BLAS routine
167 float betan_f = cblas_scnrm2(kferm2, r_f,1);
168 //Gotta square it to "undo" the norm
169 betan = betan_f*betan_f;
170#else
171 betan=0;
172#pragma omp parallel for simd aligned(r_f,x2_f:AVX) reduction(+:betan)
173 for(int i=0; i<kferm2; i++){
174 r_f[i]-=alpha*x2_f[i];
175 betan += conj(r_f[i])*r_f[i];
176 }
177#endif
178 //And... reduce.
179#if(nproc>1)
180 Par_dsum(&betan);
181#endif
182#ifdef _DEBUGCG
183#warning "CG Debugging"
184 char *endline = "\n";
185#else
186 char *endline = "\r";
187#endif
188#ifdef _DEBUG
189 if(!rank) printf("Iter(CG)=%i\tβ_n=%e\tα=%e%s", *itercg, betan, creal(alpha), endline);
190#endif
191 if(betan<resid){
192 (*itercg)++;
193#ifdef _DEBUG
194 if(!rank) printf("\nIter(CG)=%i\tResidue: %e\tTolerance: %e\n", *itercg, betan, resid);
195#endif
196 ret_val=0; break;
197 }
198 else if(*itercg==niterc-1){
199 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n", ITERLIM, funcname, *itercg, betan);
200 ret_val=ITERLIM; break;
201 }
202 //Here we evaluate β=(r_{k+1}.r_{k+1})/(r_k.r_k) and then shuffle our indices down the line.
203 //On the first iteration we define beta to be zero.
204 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
205 Complex beta = (*itercg) ? betan/betad : 0;
206 betad=betan; alphan=betan;
207 //BLAS for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
208 //β instead of x.
209#ifdef __NVCC__
210 Complex_f beta_f=(Complex_f)beta;
211 __managed__ Complex_f a = 1.0;
212 cublasCscal(cublas_handle,kferm2,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
213 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
214#elif (defined __INTEL_MKL__)
215 Complex_f a = 1.0;
216 Complex_f beta_f=(Complex_f)beta;
217 //There is cblas_?axpby in the MKL and AMD though, set a = 1 and b = β.
218 //If we get a small enough β_n before hitting the iteration cap we break
219 cblas_caxpby(kferm2, &a, r_f, 1, &beta_f, p_f, 1);
220#elif defined USE_BLAS
221 Complex_f beta_f=(Complex_f)beta;
222 cblas_cscal(kferm2,&beta_f,p_f,1);
223 Complex_f a = 1.0;
224 cblas_caxpy(kferm2,&a,r_f,1,p_f,1);
225#else
226 for(int i=0; i<kferm2; i++)
227 p_f[i]=r_f[i]+beta*p_f[i];
228#endif
229 }
230#ifdef __NVCC__
231//Restore arrays back to their previous layout
232 Transpose_c(X1_f,kvol,ndirac*nc,dimGrid,dimBlock);
233 cuComplex_convert(X1_f,X1,kferm2,false,dimBlock,dimGrid);
234 Transpose_c(r_f,kvol,ndirac*nc,dimGrid,dimBlock);
235 cuComplex_convert(r_f,r,kferm2,false,dimBlock,dimGrid);
236 Transpose_c(u11t,kvol,ndim,dimGrid,dimBlock);
237 Transpose_c(u12t,kvol,ndim,dimGrid,dimBlock);
238 Transpose_I(iu,kvol,ndim,dimGrid,dimBlock);
239 Transpose_I(id,kvol,ndim,dimGrid,dimBlock);
240#else
241 for(int i=0;i<kferm2;i++){
242 X1[i]=(Complex)X1_f[i];
243 r[i]=(Complex)r_f[i];
244 }
245#endif
246#ifdef __NVCC__
247#ifdef _DEBUG
248 cudaDeviceSynchronise();
249 cudaFree(x1_f);cudaFree(x2_f); cudaFree(p_f);
250 cudaFree(r_f);cudaFree(X1_f);
251#else
252 //streams match the ones that allocated them.
253 cudaFreeAsync(p_f,streams[0]);cudaFreeAsync(x1_f,streams[1]);cudaFreeAsync(x2_f,streams[2]);
254 cudaDeviceSynchronise();
255 cudaFreeAsync(r_f,streams[3]);cudaFreeAsync(X1_f,streams[4]);
256#endif
257#else
258 free(x1_f);free(x2_f); free(p_f); free(r_f); free(X1_f);
259#endif
260 return ret_val;
261}
int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates \(\Phi=M^\dagger r\) in single precision.
Definition matrices.c:802
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates \(\Phi=M r\) in single precision.
Definition matrices.c:711
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define ndirac
Dirac indices.
Definition sizes.h:177
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188

References AVX, Complex, Complex_f, Hdslash_f(), Hdslashd_f(), kferm2, kferm2Halo, kvol, nc, ndim, ndirac, niterc, Par_dsum(), Par_fsum(), and rank.
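
The fac_f term above makes explicit which system is actually solved: with a nonzero diquark source the operator is shifted, so each iteration applies

\[x_2=\left(M^\dagger M+\kappa^2|j_{qq}|^2\right)p,\]

which is why the single extra axpy with fac_f \(=\kappa^2|j_{qq}|^2\) follows the Hdslash_f/Hdslashd_f pair, and why it is skipped entirely when jqq is zero.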


◆ Fill_Small_Phi()

int Fill_Small_Phi ( int na,
Complex * smallPhi,
Complex * Phi )
inline

Copies necessary (2*4*kvol) elements of Phi into a vector variable

Parameters
na: flavour index
smallPhi: The partitioned output
Phi: The pseudofermion field
Returns
Zero on success, integer error code otherwise

Definition at line 349 of file su2hmc.c.

350{
351 /*Copies necessary (2*4*kvol) elements of Phi into a vector variable
352 *
353 * Globals:
354 * =======
355 * Phi: The source array
356 *
357 * Parameters:
358 * ==========
359 * int na: flavour index
360 * Complex *smallPhi: The target array
361 *
362 * Returns:
363 * =======
364 * Zero on success, integer error code otherwise
365 */
366 const char *funcname = "Fill_Small_Phi";
367 //BIG and small phi index
368#ifdef __NVCC__
369 cuFill_Small_Phi(na,smallPhi,Phi,dimBlock,dimGrid);
370#else
371#pragma omp parallel for simd aligned(smallPhi,Phi:AVX) collapse(3)
372 for(int i = 0; i<kvol;i++)
373 for(int idirac = 0; idirac<ndirac; idirac++)
374 for(int ic= 0; ic<nc; ic++)
375 // PHI_index=i*16+j*2+k;
376 smallPhi[(i*ndirac+idirac)*nc+ic]=Phi[((na*kvol+i)*ngorkov+idirac)*nc+ic];
377#endif
378 return 0;
379}

References kvol, nc, ndirac, and ngorkov.
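
A short sketch of the index arithmetic (constants as in sizes.h): Phi is flattened as [flavour][site][Gor'kov][colour] while smallPhi is [site][Dirac][colour], so only the first ndirac of the ngorkov components of flavour na get copied:

int src = ((na*kvol + i)*ngorkov + idirac)*nc + ic;	//idirac < ndirac
int dst = (i*ndirac + idirac)*nc + ic;
smallPhi[dst] = Phi[src];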


◆ Force()

int Force ( double * dSdpi,
int iflag,
double res1,
Complex * X0,
Complex * X1,
Complex * Phi,
Complex * u11t,
Complex * u12t,
Complex_f * u11t_f,
Complex_f * u12t_f,
unsigned int * iu,
unsigned int * id,
Complex * gamval,
Complex_f * gamval_f,
int * gamin,
double * dk4m,
double * dk4p,
float * dk4m_f,
float * dk4p_f,
Complex_f jqq,
float akappa,
float beta,
double * ancg )

Calculates the force \(\frac{dS}{d\pi}\) at each intermediate time.

Parameters
dSdpi: The force
iflag: Invert before evaluating the force. 0 to invert, 1 not to. Blame FORTRAN...
res1: Conjugate gradient residual
X0: Up/down partitioned pseudofermion field
X1: Holder for the partitioned fermion field, then the conjugate gradient output
Phi: Pseudofermion field
u11t,u12t: Double precision colour fields
u11t_f,u12t_f: Single precision colour fields
iu,id: Lattice indices
gamin: Gamma indices
gamval: Double precision gamma matrices rescaled by kappa
gamval_f: Single precision gamma matrices rescaled by kappa
dk4m: \(e^{-\mu}\)
dk4p: \(e^\mu\)
dk4m_f: \(e^{-\mu}\) float
dk4p_f: \(e^\mu\) float
jqq: Diquark source
akappa: Hopping parameter
beta: Inverse gauge coupling
ancg: Counter for conjugate gradient iterations
Returns
Zero on success, integer error code otherwise

Definition at line 131 of file force.c.

134 {
135 /*
136 * @brief Calculates the force @f$\frac{dS}{d\pi}@f$ at each intermediate time
137 *
138 * @param dSdpi: The force
139 * @param iflag: Invert before evaluating the force?
140 * @param res1: Conjugate gradient residual
141 * @param X0: Up/down partitioned pseudofermion field
142 * @param X1: Holder for the partitioned fermion field, then the conjugate gradient output
143 * @param Phi: Pseudofermion field
144 * @param u11t,u12t Double precision colour fields
145 * @param u11t_f,u12t_f: Single precision colour fields
146 * @param iu,id: Lattice indices
147 * @param gamin: Gamma indices
148 * @param gamval: Double precision gamma matrices
149 * @param gamval_f: Single precision gamma matrices
150 * @param dk4m: @f$\left(1+\gamma_0\right)e^{-\mu}@f$
151 * @param dk4p: @f$\left(1-\gamma_0\right)e^\mu@f$
152 * @param dk4m_f: @f$\left(1+\gamma_0\right)e^{-\mu}@f$ float
153 * @param dk4p_f: @f$\left(1-\gamma_0\right)e^\mu@f$ float
154 * @param jqq: Diquark source
155 * @param akappa: Hopping parameter
156 * @param beta: Inverse gauge coupling
157 * @param ancg: Counter for conjugate gradient iterations
158 *
159 * @return Zero on success, integer error code otherwise
160 */
161 const char *funcname = "Force";
162#ifdef __NVCC__
163 int device=-1;
164 cudaGetDevice(&device);
165#endif
166#ifndef NO_GAUGE
167 Gauge_force(dSdpi,u11t_f,u12t_f,iu,id,beta);
168#endif
169 //X1=(M†M)^{-1} Phi
170 int itercg=1;
171#ifdef __NVCC__
172 Complex *X2;
173 cudaMallocManaged((void **)&X2,kferm2Halo*sizeof(Complex),cudaMemAttachGlobal);
174#else
175 Complex *X2= (Complex *)aligned_alloc(AVX,kferm2Halo*sizeof(Complex));
176#endif
177 for(int na = 0; na<nf; na++){
178#ifdef __NVCC__
179 cudaMemcpyAsync(X1,X0+na*kferm2,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,NULL);
180#else
181 memcpy(X1,X0+na*kferm2,kferm2*sizeof(Complex));
182#endif
183 if(!iflag){
184#ifdef __NVCC__
185 Complex *smallPhi;
186 cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),streams[0]);
187#else
188 Complex *smallPhi = (Complex *)aligned_alloc(AVX,kferm2*sizeof(Complex));
189#endif
190 Fill_Small_Phi(na, smallPhi, Phi);
191 // Congradq(na, res1,smallPhi, &itercg );
192 Congradq(na,res1,X1,smallPhi,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,&itercg);
193#ifdef __NVCC__
194 cudaFreeAsync(smallPhi,streams[0]);
195#else
196 free(smallPhi);
197#endif
198 *ancg+=itercg;
199#ifdef __NVCC__
200 Complex blasa=2.0; double blasb=-1.0;
201 cublasZdscal(cublas_handle,kferm2,&blasb,(cuDoubleComplex *)(X0+na*kferm2),1);
202 cublasZaxpy(cublas_handle,kferm2,(cuDoubleComplex *)&blasa,(cuDoubleComplex *)X1,1,(cuDoubleComplex *)(X0+na*kferm2),1);
203 //HDslash launches a different stream so we need a barrier
204 cudaDeviceSynchronise();
205#elif (defined __INTEL_MKL__)
206 Complex blasa=2.0; Complex blasb=-1.0;
207 //This is not a general BLAS Routine. BLIS and MKl support it
208 //CUDA and GSL does not support it
209 cblas_zaxpby(kferm2, &blasa, X1, 1, &blasb, X0+na*kferm2, 1);
210#elif defined USE_BLAS
211 Complex blasa=2.0; double blasb=-1.0;
212 cblas_zdscal(kferm2,blasb,X0+na*kferm2,1);
213 cblas_zaxpy(kferm2,&blasa,X1,1,X0+na*kferm2,1);
214#else
215#pragma omp parallel for simd collapse(2)
216 for(int i=0;i<kvol;i++)
217 for(int idirac=0;idirac<ndirac;idirac++){
218 X0[((na*kvol+i)*ndirac+idirac)*nc]=
219 2*X1[(i*ndirac+idirac)*nc]-X0[((na*kvol+i)*ndirac+idirac)*nc];
220 X0[((na*kvol+i)*ndirac+idirac)*nc+1]=
221 2*X1[(i*ndirac+idirac)*nc+1]-X0[((na*kvol+i)*ndirac+idirac)*nc+1];
222 }
223#endif
224 }
225 Hdslash(X2,X1,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,akappa);
226#ifdef __NVCC__
227 double blasd=2.0;
228 cudaDeviceSynchronise();
229 cublasZdscal(cublas_handle,kferm2, &blasd, (cuDoubleComplex *)X2, 1);
230#elif defined USE_BLAS
231 double blasd=2.0;
232 cblas_zdscal(kferm2, blasd, X2, 1);
233#else
234#pragma unroll
235 for(int i=0;i<kferm2;i++)
236 X2[i]*=2;
237#endif
238#if(npx>1)
239 ZHalo_swap_dir(X1,8,0,DOWN);
240 ZHalo_swap_dir(X2,8,0,DOWN);
241#endif
242#if(npy>1)
243 ZHalo_swap_dir(X1,8,1,DOWN);
244 ZHalo_swap_dir(X2,8,1,DOWN);
245#endif
246#if(npz>1)
247 ZHalo_swap_dir(X1,8,2,DOWN);
248 ZHalo_swap_dir(X2,8,2,DOWN);
249#endif
250#if(npt>1)
251 ZHalo_swap_dir(X1,8,3,DOWN);
252 ZHalo_swap_dir(X2,8,3,DOWN);
253#endif
254
255 // The original FORTRAN Comment:
256 // dSdpi=dSdpi-Re(X1*(d(Mdagger)dp)*X2) -- Yikes!
257 // we're gonna need drugs for this one......
258 //
259 // Makes references to X1(.,.,iu(i,mu)) AND X2(.,.,iu(i,mu))
260 // as a result, need to swap the DOWN halos in all dirs for
261 // both these arrays, each of which has 8 cpts
262 //
263#ifdef __NVCC__
264 Complex_f *X1_f, *X2_f;
265 cudaMallocAsync((void **)&X1_f,kferm2*sizeof(Complex_f),NULL);
266 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
267 Transpose_c(X1_f,ndirac*nc,kvol,dimGrid,dimBlock);
268
269 cudaMallocAsync((void **)&X2_f,kferm2*sizeof(Complex_f),NULL);
270 cuComplex_convert(X2_f,X2,kferm2,true,dimBlock,dimGrid);
271 Transpose_c(X2_f,ndirac*nc,kvol,dimGrid,dimBlock);
272// Transpose_z(X1,kvol,ndirac*nc,dimGrid,dimBlock); Transpose_z(X2,kvol,ndirac*nc,dimGrid,dimBlock);
273 cuForce(dSdpi,u11t_f,u12t_f,X1_f,X2_f,gamval_f,dk4m_f,dk4p_f,iu,gamin,akappa,dimGrid,dimBlock);
274 cudaDeviceSynchronise();
275 cudaFreeAsync(X1_f,NULL); cudaFreeAsync(X2_f,NULL);
276#else
277#pragma omp parallel for
278 for(int i=0;i<kvol;i++)
279 for(int idirac=0;idirac<ndirac;idirac++){
280 int mu, uid, igork1;
281#ifndef NO_SPACE
282#pragma omp simd aligned(dSdpi,X1,X2,u11t,u12t,iu:AVX)
283 for(mu=0; mu<3; mu++){
284 //Long term ambition. I used the diff command on the different
285 //spatial components of dSdpi and saw a lot of the values required
286 //for them are duplicates (u11(i,mu)*X2(1,idirac,i) is used again with
287 //a minus in front, for example). Why not evaluate them first and then plug
288 //them into the equation? Reduce the number of evaluations needed and look
289 //a bit neater (although harder to follow as a consequence).
290
291 //Up indices
292 uid = iu[mu+ndim*i];
293 igork1 = gamin[mu*ndirac+idirac];
294
295 //REMINDER. Gamma is already scaled by kappa when we defined them. So if yer trying to rederive this from
296 //Montvay and Munster and notice a missing kappa in the code, that is why.
297 dSdpi[(i*nadj)*ndim+mu]+=akappa*creal(I*
298 (conj(X1[(i*ndirac+idirac)*nc])*
299 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
300 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
301 +conj(X1[(uid*ndirac+idirac)*nc])*
302 ( u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
303 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
304 +conj(X1[(i*ndirac+idirac)*nc+1])*
305 (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
306 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
307 +conj(X1[(uid*ndirac+idirac)*nc+1])*
308 (-u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
309 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
310 dSdpi[(i*nadj)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
311 (conj(X1[(i*ndirac+idirac)*nc])*
312 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
313 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
314 +conj(X1[(uid*ndirac+idirac)*nc])*
315 (-u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
316 +conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
317 +conj(X1[(i*ndirac+idirac)*nc+1])*
318 (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
319 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
320 +conj(X1[(uid*ndirac+idirac)*nc+1])*
321 (u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
322 +conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));
323
324 dSdpi[(i*nadj+1)*ndim+mu]+=akappa*creal(
325 (conj(X1[(i*ndirac+idirac)*nc])*
326 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
327 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
328 +conj(X1[(uid*ndirac+idirac)*nc])*
329 (-u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
330 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
331 +conj(X1[(i*ndirac+idirac)*nc+1])*
332 (-u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
333 -u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
334 +conj(X1[(uid*ndirac+idirac)*nc+1])*
335 (u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
336 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
337 dSdpi[(i*nadj+1)*ndim+mu]+=creal(gamval[mu*ndirac+idirac]*
338 (conj(X1[(i*ndirac+idirac)*nc])*
339 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
340 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
341 +conj(X1[(uid*ndirac+idirac)*nc])*
342 (u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
343 +conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
344 +conj(X1[(i*ndirac+idirac)*nc+1])*
345 (-u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
346 -u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
347 +conj(X1[(uid*ndirac+idirac)*nc+1])*
348 (-u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
349 +conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));
350
351 dSdpi[(i*nadj+2)*ndim+mu]+=akappa*creal(I*
352 (conj(X1[(i*ndirac+idirac)*nc])*
353 (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
354 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
355 +conj(X1[(uid*ndirac+idirac)*nc])*
356 (-conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
357 -u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])
358 +conj(X1[(i*ndirac+idirac)*nc+1])*
359 (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
360 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
361 +conj(X1[(uid*ndirac+idirac)*nc+1])*
362 (-conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
363 +u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])));
364 dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
365 (conj(X1[(i*ndirac+idirac)*nc])*
366 (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
367 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
368 +conj(X1[(uid*ndirac+idirac)*nc])*
369 (conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
370 +u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])
371 +conj(X1[(i*ndirac+idirac)*nc+1])*
372 (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
373 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
374 +conj(X1[(uid*ndirac+idirac)*nc+1])*
375 (conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
376 -u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])));
377
378 }
379#endif
380 //We're not done tripping yet!! Time like term is different. dk4? shows up
381 //For consistency we'll leave mu in instead of hard coding.
382 mu=3;
383 uid = iu[mu+ndim*i];
384 igork1 = gamin[mu*ndirac+idirac];
385#ifndef NO_TIME
386 dSdpi[(i*nadj)*ndim+mu]+=creal(I*
387 (conj(X1[(i*ndirac+idirac)*nc])*
388 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
389 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
390 +conj(X1[(uid*ndirac+idirac)*nc])*
391 (dk4p[i]* (+u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
392 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
393 +conj(X1[(i*ndirac+idirac)*nc+1])*
394 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
395 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
396 +conj(X1[(uid*ndirac+idirac)*nc+1])*
397 (dk4p[i]* (-u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
398 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))))
399 +creal(I*
400 (conj(X1[(i*ndirac+idirac)*nc])*
401 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
402 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
403 +conj(X1[(uid*ndirac+idirac)*nc])*
404 (-dk4p[i]* (u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
405 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
406 +conj(X1[(i*ndirac+idirac)*nc+1])*
407 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
408 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
409 +conj(X1[(uid*ndirac+idirac)*nc+1])*
410 (-dk4p[i]* (-u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
411 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));
412
413 dSdpi[(i*nadj+1)*ndim+mu]+=creal(
414 conj(X1[(i*ndirac+idirac)*nc])*
415 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
416 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
417 +conj(X1[(uid*ndirac+idirac)*nc])*
418 (dk4p[i]* (-u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
419 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
420 +conj(X1[(i*ndirac+idirac)*nc+1])*
421 (dk4m[i]* (-u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
422 -u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
423 +conj(X1[(uid*ndirac+idirac)*nc+1])*
424 (dk4p[i]* ( u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
425 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])))
426 +creal(
427 (conj(X1[(i*ndirac+idirac)*nc])*
428 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
429 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
430 +conj(X1[(uid*ndirac+idirac)*nc])*
431 (-dk4p[i]* (-u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
432 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
433 +conj(X1[(i*ndirac+idirac)*nc+1])*
434 (dk4m[i]* (-u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
435 -u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
436 +conj(X1[(uid*ndirac+idirac)*nc+1])*
437 (-dk4p[i]* (u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
438 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));
439
440 dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*
441 (conj(X1[(i*ndirac+idirac)*nc])*
442 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
443 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
444 +conj(X1[(uid*ndirac+idirac)*nc])*
445 (dk4p[i]*(-conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
446 -u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))
447 +conj(X1[(i*ndirac+idirac)*nc+1])*
448 (dk4m[i]* (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
449 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
450 +conj(X1[(uid*ndirac+idirac)*nc+1])*
451 (dk4p[i]*(-conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
452 +u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))))
453 +creal(I*
454 (conj(X1[(i*ndirac+idirac)*nc])*
455 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
456 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
457 +conj(X1[(uid*ndirac+idirac)*nc])*
458 (-dk4p[i]*(-conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
459 -u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))
460 +conj(X1[(i*ndirac+idirac)*nc+1])*
461 (dk4m[i]* (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
462 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
463 +conj(X1[(uid*ndirac+idirac)*nc+1])*
464 (-dk4p[i]*(-conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
465 +u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))));
466
467#endif
468 }
469#endif
470 }
471#ifdef __NVCC__
472 cudaFree(X2);
473#else
474 free(X2);
475#endif
476 return 0;
477}
int Gauge_force(double *dSdpi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
Definition force.c:6
int Hdslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
Evaluates \(\Phi=M r\) in double precision.
Definition matrices.c:267
#define DOWN
Flag for send down.
Definition par_mpi.h:35
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define nadj
adjacent spatial indices
Definition sizes.h:175
#define nf
Fermion flavours (double it)
Definition sizes.h:151
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \(M^\dagger Mx=\Phi\). Implements up/down partitioning.
Definition congrad.c:7
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
Definition su2hmc.c:349

References AVX, Complex, Complex_f, Congradq(), DOWN, Fill_Small_Phi(), Gauge_force(), Hdslash(), kferm2, kferm2Halo, kvol, nadj, nc, ndim, ndirac, nf, and ZHalo_swap_dir().
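
For reference, the force is flattened as dSdpi[(i*nadj+a)*ndim+mu], with a running over the nadj=3 adjoint directions of SU(2). A hedged sketch of how an integrator (see integrate.h) would consume it for a momentum half-step, with pp as in Hamilton() and dt purely illustrative:

for(int i=0; i<kvol; i++)
	for(int a=0; a<nadj; a++)
		for(int mu=0; mu<ndim; mu++)
			//π -= (dt/2) dS/dπ, one component per site, generator and direction
			pp[(i*nadj+a)*ndim+mu] -= 0.5*dt*dSdpi[(i*nadj+a)*ndim+mu];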


◆ Gauge_force()

int Gauge_force ( double * dSdpi,
Complex_f * u11t,
Complex_f * u12t,
unsigned int * iu,
unsigned int * id,
float beta )

Calculates the gauge force due to the Wilson Action at each intermediate time.

Parameters
dSdpi: The force
u11t,u12t: Gauge fields
iu,id: Lattice indices
beta: Inverse gauge coupling
Returns
Zero on success, integer error code otherwise

Definition at line 6 of file force.c.

6 {
7 /*
8 * Calculates dSdpi due to the Wilson Action at each intermediate time
9 *
10 * Calls:
11 * =====
12 * C_Halo_swap_all, C_gather, C_Halo_swap_dir
13 *
14 * Parameters:
15 * =======
16 * double *dSdpi
17 * Complex_f *u11t
18 * Complex_f *u12t
19 * unsigned int *iu
20 * unsigned int *id
21 * float beta
22 *
23 * Returns:
24 * =======
25 * Zero on success, integer error code otherwise
26 */
27 const char *funcname = "Gauge_force";
28
29 //We define zero halos for debugging
30 // #ifdef _DEBUG
31 // memset(u11t[kvol], 0, ndim*halo*sizeof(Complex_f));
32 // memset(u12t[kvol], 0, ndim*halo*sizeof(Complex_f));
33 // #endif
34 //Was a trial field halo exchange here at one point.
35#ifdef __NVCC__
36 int device=-1;
37 cudaGetDevice(&device);
38 Complex_f *Sigma11, *Sigma12, *u11sh, *u12sh;
39 cudaMallocAsync((void **)&Sigma11,kvol*sizeof(Complex_f),streams[0]);
40 cudaMallocAsync((void **)&Sigma12,kvol*sizeof(Complex_f),streams[1]);
41 cudaMallocManaged((void **)&u11sh,(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
42 cudaMallocManaged((void **)&u12sh,(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
43#else
44 Complex_f *Sigma11 = (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
45 Complex_f *Sigma12= (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
46 Complex_f *u11sh = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
47 Complex_f *u12sh = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
48#endif
49 //Holders for directions
50 for(int mu=0; mu<ndim; mu++){
51#ifdef __NVCC__
52 cudaMemset(Sigma11,0, kvol*sizeof(Complex_f));
53 cudaMemset(Sigma12,0, kvol*sizeof(Complex_f));
54#else
55 memset(Sigma11,0, kvol*sizeof(Complex_f));
56 memset(Sigma12,0, kvol*sizeof(Complex_f));
57#endif
58 for(int nu=0; nu<ndim; nu++)
59 if(nu!=mu){
60 //The +ν Staple
61#ifdef __NVCC__
62 cuPlus_staple(mu,nu,iu,Sigma11,Sigma12,u11t,u12t,dimGrid,dimBlock);
63#else
64#pragma omp parallel for simd aligned(u11t,u12t,Sigma11,Sigma12,iu:AVX)
65 for(int i=0;i<kvol;i++){
66 int uidm = iu[mu+ndim*i];
67 int uidn = iu[nu+ndim*i];
68 Complex_f a11=u11t[uidm*ndim+nu]*conj(u11t[uidn*ndim+mu])+\
69 u12t[uidm*ndim+nu]*conj(u12t[uidn*ndim+mu]);
70 Complex_f a12=-u11t[uidm*ndim+nu]*u12t[uidn*ndim+mu]+\
71 u12t[uidm*ndim+nu]*u11t[uidn*ndim+mu];
72 Sigma11[i]+=a11*conj(u11t[i*ndim+nu])+a12*conj(u12t[i*ndim+nu]);
73 Sigma12[i]+=-a11*u12t[i*ndim+nu]+a12*u11t[i*ndim+nu];
74 }
75#endif
76 C_gather(u11sh, u11t, kvol, id, nu);
77 C_gather(u12sh, u12t, kvol, id, nu);
78#if(nproc>1)
79#ifdef __NVCC__
80 //Prefetch to the CPU for now, until we get NCCL working
81 cudaMemPrefetchAsync(u11sh, kvol*sizeof(Complex_f),cudaCpuDeviceId,streams[0]);
82 cudaMemPrefetchAsync(u12sh, kvol*sizeof(Complex_f),cudaCpuDeviceId,streams[1]);
83#endif
84 CHalo_swap_dir(u11sh, 1, mu, DOWN); CHalo_swap_dir(u12sh, 1, mu, DOWN);
85#ifdef __NVCC__
86 cudaMemPrefetchAsync(u11sh+kvol, halo*sizeof(Complex_f),device,streams[0]);
87 cudaMemPrefetchAsync(u12sh+kvol, halo*sizeof(Complex_f),device,streams[1]);
88#endif
89#endif
90 //Next up, the -ν staple
91#ifdef __NVCC__
92 cudaDeviceSynchronise();
93 cuMinus_staple(mu,nu,iu,id,Sigma11,Sigma12,u11sh,u12sh,u11t,u12t,dimGrid,dimBlock);
94#else
95#pragma omp parallel for simd aligned(u11t,u12t,u11sh,u12sh,Sigma11,Sigma12,iu,id:AVX)
96 for(int i=0;i<kvol;i++){
97 int uidm = iu[mu+ndim*i];
98 int didn = id[nu+ndim*i];
99 //uidm is correct here
100 Complex_f a11=conj(u11sh[uidm])*conj(u11t[didn*ndim+mu])-\
101 u12sh[uidm]*conj(u12t[didn*ndim+mu]);
102 Complex_f a12=-conj(u11sh[uidm])*u12t[didn*ndim+mu]-\
103 u12sh[uidm]*u11t[didn*ndim+mu];
104 Sigma11[i]+=a11*u11t[didn*ndim+nu]-a12*conj(u12t[didn*ndim+nu]);
105 Sigma12[i]+=a11*u12t[didn*ndim+nu]+a12*conj(u11t[didn*ndim+nu]);
106 }
107#endif
108 }
109#ifdef __NVCC__
110 cuGauge_force(mu,Sigma11,Sigma12,u11t,u12t,dSdpi,beta,dimGrid,dimBlock);
111#else
112#pragma omp parallel for simd aligned(u11t,u12t,Sigma11,Sigma12,dSdpi:AVX)
113 for(int i=0;i<kvol;i++){
114 Complex_f a11 = u11t[i*ndim+mu]*Sigma12[i]+u12t[i*ndim+mu]*conj(Sigma11[i]);
115 Complex_f a12 = u11t[i*ndim+mu]*Sigma11[i]+conj(u12t[i*ndim+mu])*Sigma12[i];
116
117 dSdpi[(i*nadj)*ndim+mu]=(double)(beta*cimag(a11));
118 dSdpi[(i*nadj+1)*ndim+mu]=(double)(beta*creal(a11));
119 dSdpi[(i*nadj+2)*ndim+mu]=(double)(beta*cimag(a12));
120 }
121#endif
122 }
123#ifdef __NVCC__
124 cudaDeviceSynchronise();
125 cudaFreeAsync(Sigma11,streams[0]); cudaFreeAsync(Sigma12,streams[1]); cudaFree(u11sh); cudaFree(u12sh);
126#else
127 free(u11sh); free(u12sh); free(Sigma11); free(Sigma12);
128#endif
129 return 0;
130}
int CHalo_swap_dir(Complex_f *c, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define halo
Total Halo size.
Definition sizes.h:222
int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
Extracts all the single precision gauge links in the \(\mu\) direction only.
Definition su2hmc.c:321

References AVX, C_gather(), CHalo_swap_dir(), Complex_f, DOWN, halo, kvol, nadj, and ndim.
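
In formulas (up to the sign and normalisation conventions of this code), the nu loop assembles the staple sum

\[\Sigma_\mu(x)=\sum_{\nu\neq\mu}\left[U_\nu(x+\hat\mu)U^\dagger_\mu(x+\hat\nu)U^\dagger_\nu(x)+U^\dagger_\nu(x+\hat\mu-\hat\nu)U^\dagger_\mu(x-\hat\nu)U_\nu(x-\hat\nu)\right]\]

and the final loop projects out the adjoint components, \(\frac{dS}{d\pi^a_\mu(x)}\propto\beta\,\mathrm{Im}\,\mathrm{tr}\left[\tau^a U_\mu(x)\Sigma_\mu(x)\right]\), via the a11 and a12 combinations above.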


◆ Hamilton()

int Hamilton ( double * h,
double * s,
double res2,
double * pp,
Complex * X0,
Complex * X1,
Complex * Phi,
Complex * u11t,
Complex * u12t,
Complex_f * u11t_f,
Complex_f * u12t_f,
unsigned int * iu,
unsigned int * id,
Complex_f * gamval_f,
int * gamin,
float * dk4m_f,
float * dk4p_f,
Complex_f jqq,
float akappa,
float beta,
double * ancgh,
int traj )

Calculate the Hamiltonian.

Parameters
h: Hamiltonian
s: Action
res2: Limit for conjugate gradient
pp: Momentum field
X0: Up/down partitioned pseudofermion field
X1: Holder for the partitioned fermion field, then the conjugate gradient output
Phi: Pseudofermion field
u11t,u12t: Gauge fields
u11t_f,u12t_f: Gauge fields (single precision)
iu,id: Lattice indices
gamval_f: Single precision gamma matrices rescaled by kappa
gamin: Gamma indices
dk4m_f: \(\left(1+\gamma_0\right)e^{-\mu}\) float
dk4p_f: \(\left(1-\gamma_0\right)e^\mu\) float
jqq: Diquark source
akappa: Hopping parameter
beta: Inverse gauge coupling
ancgh: Conjugate gradient iterations counter
traj: Calling trajectory for error reporting
Returns
Zero on success. Integer Error code otherwise.

Definition at line 208 of file su2hmc.c.

211 {
212 /*
213 * @brief Calculate the Hamiltonian
214 *
215 *
216 * @param h: Hamiltonian
217 * @param s: Action
218 * @param res2: Limit for conjugate gradient
 219 * @param X0: Up/down partitioned pseudofermion field
 220 * @param X1: Holder for the partitioned fermion field, then the conjugate gradient output
 221 * @param Phi: Pseudofermion field
222 * @param u11t,u12t: Gauge fields
223 * @param u11t_f,u12t_f: Gauge fields
224 * @param iu,id: Lattice indices
225 * @param gamval_f: Gamma matrices
226 * @param gamin: Gamma indices
227 * @param dk4m_f: $exp(-\mu)$ float
228 * @param dk4p_f: $exp(\mu)$ float
229 * @param jqq: Diquark source
230 * @param akappa: Hopping parameter
231 * @param beta: Inverse gauge coupling
232 * @param ancgh: Conjugate gradient iterations counter
233 *
234 * @return Zero on success. Integer Error code otherwise.
235 */
236 const char *funcname = "Hamilton";
237 //Iterate over momentum terms.
238#ifdef __NVCC__
239 double hp;
240 int device=-1;
241 cudaGetDevice(&device);
242 cudaMemPrefetchAsync(pp,kmom*sizeof(double),device,NULL);
243 cublasDnrm2(cublas_handle, kmom, pp, 1,&hp);
244 hp*=hp;
245#elif defined USE_BLAS
246 double hp = cblas_dnrm2(kmom, pp, 1);
247 hp*=hp;
248#else
249 double hp=0;
250 for(int i = 0; i<kmom; i++)
251 hp+=pp[i]*pp[i];
252#endif
253 hp*=0.5;
254 double avplaqs, avplaqt;
255 double hg = 0;
256 //avplaq? isn't seen again here.
257 Average_Plaquette(&hg,&avplaqs,&avplaqt,u11t_f,u12t_f,iu,beta);
258
259 double hf = 0; int itercg = 0;
260#ifdef __NVCC__
261 Complex *smallPhi;
262 cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),NULL);
263#else
264 Complex *smallPhi = aligned_alloc(AVX,kferm2*sizeof(Complex));
265#endif
266 //Iterating over flavours
267 for(int na=0;na<nf;na++){
268#ifdef __NVCC__
269#ifdef _DEBUG
270 cudaDeviceSynchronise();
271#endif
272 cudaMemcpyAsync(X1,X0+na*kferm2,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,streams[0]);
273#ifdef _DEBUG
274 cudaDeviceSynchronise();
275#endif
276#else
277 memcpy(X1,X0+na*kferm2,kferm2*sizeof(Complex));
278#endif
279 Fill_Small_Phi(na, smallPhi, Phi);
280 if(Congradq(na,res2,X1,smallPhi,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,&itercg))
281 fprintf(stderr,"Trajectory %d\n", traj);
282
283 *ancgh+=itercg;
284#ifdef __NVCC__
285 cudaMemcpyAsync(X0+na*kferm2,X1,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,streams[0]);
286#else
287 memcpy(X0+na*kferm2,X1,kferm2*sizeof(Complex));
288#endif
289 Fill_Small_Phi(na, smallPhi,Phi);
290#ifdef __NVCC__
291 Complex dot;
292 cublasZdotc(cublas_handle,kferm2,(cuDoubleComplex *)smallPhi,1,(cuDoubleComplex *) X1,1,(cuDoubleComplex *) &dot);
293 hf+=creal(dot);
294#elif defined USE_BLAS
295 Complex dot;
296 cblas_zdotc_sub(kferm2, smallPhi, 1, X1, 1, &dot);
297 hf+=creal(dot);
298#else
299 //It is a dot product of the flattened arrays, could use
300 //a module to convert index to coordinate array...
301 for(int j=0;j<kferm2;j++)
302 hf+=creal(conj(smallPhi[j])*X1[j]);
303#endif
304 }
305#ifdef __NVCC__
306 cudaFreeAsync(smallPhi,NULL);
307#else
308 free(smallPhi);
309#endif
310 //hg was summed over inside of Average_Plaquette.
311#if(nproc>1)
312 Par_dsum(&hp); Par_dsum(&hf);
313#endif
314 *s=hg+hf; *h=(*s)+hp;
315#ifdef _DEBUG
316 if(!rank)
317 printf("hg=%.5e; hf=%.5e; hp=%.5e; h=%.5e\n", hg, hf, hp, *h);
318#endif
319 return 0;
320}
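
Editorial summary, inferred directly from the listing: the routine assembles

\[
H = \frac{1}{2}\sum_i p_i^2 \;+\; S_g \;+\; \sum_{\text{flavours}}\Phi^\dagger\left(M^\dagger M\right)^{-1}\Phi,
\]

with the three terms held in hp, hg and hf respectively and the action returned as \(s = hg + hf\). Here hg comes from Average_Plaquette(), and each flavour's \(\left(M^\dagger M\right)^{-1}\Phi\) is produced by Congradq() before the dot product with the partitioned \(\Phi\).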
#define kmom
sublattice momentum sites
Definition sizes.h:184
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
Definition su2hmc.c:349
int Average_Plaquette(double *hg, double *avplaqs, double *avplaqt, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, float beta)
Calculates the gauge action using new (how new?) lookup table.
Definition bosonic.c:8

References Average_Plaquette(), AVX, Complex, Congradq(), Fill_Small_Phi(), kferm2, kmom, nf, Par_dsum(), and rank.


◆ Init()

int Init ( int istart,
int ibound,
int iread,
float beta,
float fmu,
float akappa,
Complex_f ajq,
Complex * u11,
Complex * u12,
Complex * u11t,
Complex * u12t,
Complex_f * u11t_f,
Complex_f * u12t_f,
Complex * gamval,
Complex_f * gamval_f,
int * gamin,
double * dk4m,
double * dk4p,
float * dk4m_f,
float * dk4p_f,
unsigned int * iu,
unsigned int * id )

Initialises the system.

Parameters
istart: Zero for cold, >0 for hot, <0 for none
ibound: Boundary conditions: -1 for antiperiodic in time, otherwise periodic
iread: Configuration to read in. Zero for none
beta: Inverse gauge coupling
fmu: Chemical potential
akappa: Hopping parameter
ajq: Diquark source
u11,u12: Gauge fields
u11t,u12t: Trial gauge fields
u11t_f,u12t_f: Trial gauge fields (single precision)
dk4m: \(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p: \(\left(1-\gamma_0\right)e^\mu\)
dk4m_f: \(\left(1+\gamma_0\right)e^{-\mu}\) float
dk4p_f: \(\left(1-\gamma_0\right)e^\mu\) float
iu,id: Up and down halo indices
gamin: Gamma matrix indices
gamval: Double precision gamma matrices rescaled by kappa
gamval_f: Single precision gamma matrices rescaled by kappa
Returns
Zero on success, integer error code otherwise

Definition at line 19 of file su2hmc.c.

22 {
23 /*
24 * Initialises the system
25 *
26 * Calls:
27 * ======
28 * Addrc, Check_addr, ran2, DHalo_swap_dir, Par_sread, Par_ranset, Reunitarise
29 *
30 * Globals:
31 * =======
32 * Complex gamval: Gamma Matrices
33 * Complex_f gamval_f: Float Gamma matrices:
34 *
35 * Parameters:
36 * ==========
37 * int istart: Zero for cold, >1 for hot, <1 for none
38 * int ibound: Periodic boundary conditions
39 * int iread: Read configuration from file
40 * float beta: beta
41 * float fmu: Chemical potential
42 * float akappa: Hopping parameter
43 * Complex_f ajq: Diquark source
44 * Complex *u11: First colour field
45 * Complex *u12: Second colour field
46 * Complex *u11t: First colour trial field
47 * Complex *u12t: Second colour trial field
48 * Complex_f *u11t_f: First float trial field
49 * Complex_f *u12t_f: Second float trial field
50 * double *dk4m $exp(-\mu)$
51 * double *dk4p: $exp(\mu)$
52 * float *dk4m_f: $exp(-\mu)$ float
53 * float *dk4p_f: $exp(\mu)$ float
54 * unsigned int *iu: Up halo indices
55 * unsigned int *id: Down halo indices
56 *
57 * Returns:
58 * =======
59 * Zero on success, integer error code otherwise
60 */
61 const char *funcname = "Init";
62
63#ifdef _OPENMP
64 omp_set_num_threads(nthreads);
65#ifdef __INTEL_MKL__
66 mkl_set_num_threads(nthreads);
67#endif
68#endif
69 //First things first, calculate a few constants for coordinates
70 Addrc(iu, id);
 71 //And confirm they're legit
 72 Check_addr(iu, ksize, ksizet, 0, kvol+halo);
 73 Check_addr(id, ksize, ksizet, 0, kvol+halo);
 74#ifdef _DEBUG
75 printf("Checked addresses\n");
76#endif
77 double chem1=exp(fmu); double chem2 = 1/chem1;
78 //CUDA this. Only limit will be the bus speed
79#pragma omp parallel for simd aligned(dk4m,dk4p:AVX)
80 for(int i = 0; i<kvol; i++){
81 dk4p[i]=akappa*chem1;
82 dk4m[i]=akappa*chem2;
83 }
84 //Anti periodic Boundary Conditions. Flip the terms at the edge of the time
85 //direction
86 if(ibound == -1 && pcoord[3+ndim*rank]==npt-1){
87#ifdef _DEBUG
88 printf("Implementing antiperiodic boundary conditions on rank %i\n", rank);
89#endif
90#pragma omp parallel for simd aligned(dk4m,dk4p:AVX)
91 for(int k= kvol-1; k>=kvol-kvol3; k--){
92 //int k = kvol - kvol3 + i;
93 dk4p[k]*=-1;
94 dk4m[k]*=-1;
95 }
96 }
97 //These are constant so swap the halos when initialising and be done with it
98 //May need to add a synchronisation statement here first
99#if(npt>1)
100 DHalo_swap_dir(dk4p, 1, 3, UP);
101 DHalo_swap_dir(dk4m, 1, 3, UP);
102#endif
103 //Float versions
104#ifdef __NVCC__
105 cuReal_convert(dk4p_f,dk4p,kvol+halo,true,dimBlock,dimGrid);
106 cuReal_convert(dk4m_f,dk4m,kvol+halo,true,dimBlock,dimGrid);
107#else
108#pragma omp parallel for simd aligned(dk4m,dk4p,dk4m_f,dk4p_f:AVX)
109 for(int i=0;i<kvol+halo;i++){
110 dk4p_f[i]=(float)dk4p[i];
111 dk4m_f[i]=(float)dk4m[i];
112 }
113#endif
114 int __attribute__((aligned(AVX))) gamin_t[4][4] = {{3,2,1,0},{3,2,1,0},{2,3,0,1},{2,3,0,1}};
115 //Gamma Matrices in Chiral Representation
116 //Gattringer and Lang have a nice crash course in appendix A.2 of
117 //Quantum Chromodynamics on the Lattice (530.14 GAT)
118 //_t is for temp. We copy these into the real gamvals later
119#ifdef __NVCC__
120 cudaMemcpy(gamin,gamin_t,4*4*sizeof(int),cudaMemcpyDefault);
121#else
122 memcpy(gamin,gamin_t,4*4*sizeof(int));
123#endif
124 Complex __attribute__((aligned(AVX))) gamval_t[5][4] = {{-I,-I,I,I},{-1,1,1,-1},{-I,I,I,-I},{1,1,1,1},{1,1,-1,-1}};
125 //Each gamma matrix is rescaled by akappa by flattening the gamval array
126#if defined USE_BLAS
127 //Don't cuBLAS this. It is small and won't saturate the GPU. Let the CPU handle
128 //it and just copy it later
129 cblas_zdscal(5*4, akappa, gamval_t, 1);
130#else
131#pragma omp parallel for simd collapse(2) aligned(gamval,gamval_f:AVX)
132 for(int i=0;i<5;i++)
133 for(int j=0;j<4;j++)
134 gamval_t[i][j]*=akappa;
135#endif
136
137#ifdef __NVCC__
138 cudaMemcpy(gamval,gamval_t,5*4*sizeof(Complex),cudaMemcpyDefault);
139 cuComplex_convert(gamval_f,gamval,20,true,dimBlockOne,dimGridOne);
140#else
141 memcpy(gamval,gamval_t,5*4*sizeof(Complex));
142 for(int i=0;i<5*4;i++)
143 gamval_f[i]=(Complex_f)gamval[i];
144#endif
145 if(iread){
146 if(!rank) printf("Calling Par_sread() for configuration: %i\n", iread);
147 Par_sread(iread, beta, fmu, akappa, ajq,u11,u12,u11t,u12t);
148 Par_ranset(&seed,iread);
149 }
150 else{
151 Par_ranset(&seed,iread);
152 if(istart==0){
 153 //Initialise a cold start: set every link to the identity,
 154 //u11t=1 and u12t=0
155#pragma omp parallel for simd aligned(u11t:AVX)
156 //Leave it to the GPU?
157 for(int i=0; i<kvol*ndim;i++){
158 u11t[i]=1; u12t[i]=0;
159 }
160 }
161 else if(istart>0){
162 //Ideally, we can use gsl_ranlux as the PRNG
163#ifdef __RANLUX__
164 for(int i=0; i<kvol*ndim;i++){
165 u11t[i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
166 u12t[i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
167 }
168 //If not, the Intel Vectorise Mersenne Twister
169#elif (defined __INTEL_MKL__&&!defined USE_RAN2)
170 //Good news, casting works for using a double to create random complex numbers
171 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*ndim*kvol, u11t, -1, 1);
172 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*ndim*kvol, u12t, -1, 1);
173 //Last resort, Numerical Recipes' Ran2
174#else
175 for(int i=0; i<kvol*ndim;i++){
176 u11t[i]=2*(ran2(&seed)-0.5+I*(ran2(&seed)-0.5));
177 u12t[i]=2*(ran2(&seed)-0.5+I*(ran2(&seed)-0.5));
178 }
179#endif
180 }
181 else
182 fprintf(stderr,"Warning %i in %s: Gauge fields are not initialised.\n", NOINIT, funcname);
183
184#ifdef __NVCC__
185 int device=-1;
186 cudaGetDevice(&device);
187 cudaMemPrefetchAsync(u11t, ndim*kvol*sizeof(Complex),device,streams[0]);
188 cudaMemPrefetchAsync(u12t, ndim*kvol*sizeof(Complex),device,streams[1]);
189#endif
190 //Send trials to accelerator for reunitarisation
191 Reunitarise(u11t,u12t);
192 //Get trials back
193#ifdef __NVCC__
194 cudaMemcpyAsync(u11,u11t,ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[0]);
195 cudaMemPrefetchAsync(u11, ndim*kvol*sizeof(Complex),device,streams[0]);
196 cudaMemcpyAsync(u12,u12t,ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[1]);
197 cudaMemPrefetchAsync(u12, ndim*kvol*sizeof(Complex),device,streams[1]);
198#else
199 memcpy(u11, u11t, ndim*kvol*sizeof(Complex));
200 memcpy(u12, u12t, ndim*kvol*sizeof(Complex));
201#endif
202 }
203#ifdef _DEBUG
204 printf("Initialisation Complete\n");
205#endif
206 return 0;
207}
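
Editorial summary of two details in the listing above. The hopping terms are initialised as

\[
\texttt{dk4p}(i)=\kappa\, e^{+\mu},\qquad \texttt{dk4m}(i)=\kappa\, e^{-\mu},
\]

with both negated on the final timeslice (on the rank that owns it) when ibound is -1, giving antiperiodic boundary conditions in time. The \(\gamma\)-matrices are stored sparsely: in the chiral representation each row has exactly one non-zero entry, so only its value and column index are kept,

\[
(\gamma_\mu)_{ij} = \texttt{gamval}[\mu][i]\;\delta_{j,\;\texttt{gamin}[\mu][i]},
\]

with gamval rescaled by \(\kappa\) before being copied out.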
int Check_addr(unsigned int *table, int lns, int lnt, int imin, int imax)
Definition coord.c:334
int Addrc(unsigned int *iu, unsigned int *id)
Loads the addresses required during the update.
Definition coord.c:16
#define UP
Flag for send up.
Definition par_mpi.h:37
int Par_sread(const int iread, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t)
Reads and assigns the gauges from file.
Definition par_mpi.c:127
int DHalo_swap_dir(double *d, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int * pcoord
The processor grid.
Definition par_mpi.c:19
int Par_ranset(long *seed, int iread)
Uses the rank to get a new seed. Copying from the FORTRAN description here c create new seeds in rang...
Definition random.c:135
double ran2(long *idum)
Generates uniformly distributed random double between zero and one as described in numerical recipes....
Definition random.c:440
long seed
RAN2 seed.
Definition random.c:27
#define ksizet
Sublattice t extent.
Definition sizes.h:149
#define nthreads
Number of threads for OpenMP, which can be overwritten at runtime.
Definition sizes.h:135
#define ksize
Sublattice spatial extent for a cubic lattice.
Definition sizes.h:146
#define kvol3
Sublattice spatial volume.
Definition sizes.h:156
#define npt
Processor grid t extent.
Definition sizes.h:124
int Reunitarise(Complex *u11t, Complex *u12t)
Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
Definition matrices.c:904

References Addrc(), AVX, Check_addr(), Complex, Complex_f, DHalo_swap_dir(), halo, ksize, ksizet, kvol, kvol3, ndim, npt, nthreads, Par_ranset(), Par_sread(), pcoord, ran2(), rank, Reunitarise(), seed, and UP.


◆ Measure()

int Measure ( double * pbp,
double * endenf,
double * denf,
Complex * qq,
Complex * qbqb,
double res,
int * itercg,
Complex * u11t,
Complex * u12t,
Complex_f * u11t_f,
Complex_f * u12t_f,
unsigned int * iu,
unsigned int * id,
Complex * gamval,
Complex_f * gamval_f,
int * gamin,
double * dk4m,
double * dk4p,
float * dk4m_f,
float * dk4p_f,
Complex_f jqq,
float akappa,
Complex * Phi,
Complex * R1 )

Calculate fermion expectation values via a noisy estimator.

Matrix inversion via the conjugate gradient algorithm: solves \(Mx=X_1\) (Numerical Recipes section 2.10, pp. 70-73) using the new lookup tables. Implemented in Congradp().

Parameters
pbp: \(\langle\bar{\Psi}\Psi\rangle\)
endenf: Energy density
denf: Number density
qq: Diquark condensate
qbqb: Antidiquark condensate
res: Conjugate gradient residue
itercg: Iterations of conjugate gradient
u11t,u12t: Double precision gauge fields
u11t_f,u12t_f: Single precision gauge fields
iu,id: Lattice indices
gamval: Double precision gamma matrices rescaled by kappa
gamval_f: Single precision gamma matrices rescaled by kappa
gamin: Indices for Dirac terms
dk4m: \(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p: \(\left(1-\gamma_0\right)e^\mu\)
dk4m_f: \(\left(1+\gamma_0\right)e^{-\mu}\) float
dk4p_f: \(\left(1-\gamma_0\right)e^\mu\) float
jqq: Diquark source
akappa: Hopping parameter
Phi: Pseudofermion field
R1: A useful array for holding things that was already assigned in main. In particular, we use it to catch the output of \(M^\dagger\Xi\) before the inversion, then to store the output of the inversion
Returns
Zero on success, integer error code otherwise

Definition at line 8 of file fermionic.c.

11 {
12 /*
13 * @brief Calculate fermion expectation values via a noisy estimator
14 *
15 * Matrix inversion via conjugate gradient algorithm
 16 * Solves @f$Mx=x_1@f$
 17 * (Numerical Recipes section 2.10 pp.70-73)
 18 * uses new lookup tables
 19 * Implemented in Congradp
20 *
 21 * @param pbp: @f$\langle\bar{\Psi}\Psi\rangle@f$
22 * @param endenf: Energy density
23 * @param denf: Number Density
24 * @param qq: Diquark condensate
25 * @param qbqb: Antidiquark condensate
26 * @param res: Conjugate Gradient Residue
27 * @param itercg: Iterations of Conjugate Gradient
 28 * @param u11t,u12t Double precision gauge fields
29 * @param u11t_f,u12t_f: Single precision gauge fields
30 * @param iu,id Lattice indices
31 * @param gamval_f: Gamma matrices
32 * @param gamin: Indices for Dirac terms
33 * @param dk4m_f: $exp(-\mu)$ float
34 * @param dk4p_f: $exp(\mu)$ float
35 * @param jqq: Diquark source
36 * @param akappa: Hopping parameter
37 * @param Phi: Pseudofermion field
38 * @param R1: A useful array for holding things that was already assigned in main.
39 * In particular, we'll be using it to catch the output of
40 * @f$ M^\dagger\Xi@f$ before the inversion, then used to store the
41 * output of the inversion
42 *
43 * @return Zero on success, integer error code otherwise
44 */
45 const char *funcname = "Measure";
46 //This x is just a storage container
47
48#ifdef __NVCC__
49 int device=-1;
50 cudaGetDevice(&device);
51 Complex *x, *xi; Complex_f *xi_f, *R1_f;
52 #ifdef _DEBUG
53 cudaMallocManaged((void **)&R1_f,kfermHalo*sizeof(Complex_f), cudaMemAttachGlobal);
54 #else
55 cudaMallocAsync((void **)&R1_f,kfermHalo*sizeof(Complex_f),streams[1]);
56 #endif
57 cudaMallocManaged((void **)&x,kfermHalo*sizeof(Complex), cudaMemAttachGlobal);
58 cudaMallocManaged((void **)&xi,kferm*sizeof(Complex), cudaMemAttachGlobal);
59 cudaMallocManaged((void **)&xi_f,kfermHalo*sizeof(Complex_f), cudaMemAttachGlobal);
60#else
61 Complex *x =(Complex *)aligned_alloc(AVX,kfermHalo*sizeof(Complex));
62 Complex *xi =(Complex *)aligned_alloc(AVX,kferm*sizeof(Complex));
63 Complex_f *xi_f =(Complex_f *)aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
64 Complex_f *R1_f = (Complex_f *)aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
65#endif
66 //Setting up noise.
67#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
68 Gauss_c(xi_f, kferm, 0, (float)(1/sqrt(2)));
69#else
70 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, xi_f, 0, 1/sqrt(2));
71#endif
72#ifdef __NVCC__
73 cudaMemPrefetchAsync(xi_f,kferm*sizeof(Complex_f),device,streams[0]);
74 cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
75 //Transpose needed here for Dslashd
76 Transpose_c(xi_f,ngorkov*nc,kvol,dimGrid,dimBlock);
77 //Flip all the gauge fields around so memory is coalesced
78 Transpose_c(u11t_f,ndim,kvol,dimGrid,dimBlock);
79 Transpose_c(u12t_f,ndim,kvol,dimGrid,dimBlock);
80 cudaMemcpyAsync(x, xi, kferm*sizeof(Complex),cudaMemcpyDefault,0);
81#else
82#pragma omp parallel for simd aligned(xi,xi_f:AVX)
83 for(int i=0;i<kferm;i++)
84 xi[i]=(Complex)xi_f[i];
85 memcpy(x, xi, kferm*sizeof(Complex));
86#endif
87 //R_1= @f$M^\dagger\Xi@f$
88 //R1 is local in FORTRAN but since its going to be reset anyway I'm going to recycle the
89 //global
90 Dslashd_f(R1_f,xi_f,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa);
91#ifdef __NVCC__
92 cudaDeviceSynchronise();
93 cudaFree(xi_f);
94 Transpose_c(R1_f,kvol,ngorkov*nc,dimGrid,dimBlock);
95 Transpose_c(u11t_f,kvol,ndim,dimGrid,dimBlock);
96 Transpose_c(u12t_f,kvol,ndim,dimGrid,dimBlock);
97 cuComplex_convert(R1_f,R1,kferm,false,dimBlock,dimGrid);
98 cudaMemcpy(Phi, R1, kferm*sizeof(Complex),cudaMemcpyDefault);
99#else
100#pragma omp parallel for simd aligned(R1,R1_f:AVX)
101 for(int i=0;i<kferm;i++)
102 R1[i]=(Complex)R1_f[i];
103 //Copying R1 to the first (zeroth) flavour index of Phi
104 //This should be safe with memcpy since the pointer name
105 //references the first block of memory for that pointer
106 memcpy(Phi, R1, kferm*sizeof(Complex));
107#endif
108 //Evaluate xi = (M^† M)^-1 R_1
109 // Congradp(0, res, R1_f, itercg);
110 //If the conjugate gradient fails to converge for some reason, restart it.
111 //That's causing issues with NaN's. Plan B is to not record the measurements.
112 if(Congradp(0, res, Phi, R1,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,itercg)==ITERLIM)
113 return ITERLIM;
114 //itercg=0;
115 //if(!rank) fprintf(stderr, "Restarting conjugate gradient from %s\n", funcname);
116 //Congradp(0, res, Phi, R1_f,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,itercg);
117 //itercg+=niterc;
118 /*
119#pragma omp parallel for simd aligned(R1,R1_f:AVX)
120for(int i=0;i<kferm;i++)
121xi[i]=(Complex)R1_f[i];
122*/
123#ifdef __NVCC__
124 cudaMemcpyAsync(xi,R1,kferm*sizeof(Complex),cudaMemcpyDefault,streams[0]);
125 #ifdef _DEBUG
126 cudaFree(R1_f);
127 #else
128 cudaFreeAsync(R1_f,streams[1]);
129 #endif
130#else
131 memcpy(xi,R1,kferm*sizeof(Complex));
132 free(xi_f); free(R1_f);
133#endif
134#ifdef USE_BLAS
135 Complex buff;
136#ifdef __NVCC__
137 cublasZdotc(cublas_handle,kferm,(cuDoubleComplex *)x,1,(cuDoubleComplex *)xi,1,(cuDoubleComplex *)&buff);
138 cudaDeviceSynchronise();
139#elif defined USE_BLAS
140 cblas_zdotc_sub(kferm, x, 1, xi, 1, &buff);
141#endif
142 *pbp=creal(buff);
143#else
144 *pbp = 0;
145#pragma unroll
146 for(int i=0;i<kferm;i++)
147 *pbp+=creal(conj(x[i])*xi[i]);
148#endif
149#if(nproc>1)
150 Par_dsum(pbp);
151#endif
152 *pbp/=4*gvol;
153
154 *qbqb=*qq=0;
155#if defined USE_BLAS
156 for(int idirac = 0; idirac<ndirac; idirac++){
157 int igork=idirac+4;
158 //Unrolling the colour indices, Then its just (γ_5*x)*Ξ or (γ_5*Ξ)*x
159#pragma unroll
160 for(int ic = 0; ic<nc; ic++){
161 Complex dot;
162 //Because we have kvol on the outer index and are summing over it, we set the
163 //step for BLAS to be ngorkov*nc=16.
164 //Does this make sense to do on the GPU?
165#ifdef __NVCC__
166 cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+idirac*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+igork*nc+ic), ngorkov*nc,(cuDoubleComplex *)&dot);
167#else
168 cblas_zdotc_sub(kvol, &x[idirac*nc+ic], ngorkov*nc, &xi[igork*nc+ic], ngorkov*nc, &dot);
169#endif
170 *qbqb+=gamval[4*ndirac+idirac]*dot;
171#ifdef __NVCC__
172 cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+igork*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+idirac*nc+ic), ngorkov*nc,(cuDoubleComplex *)&dot);
173#else
174 cblas_zdotc_sub(kvol, &x[igork*nc+ic], ngorkov*nc, &xi[idirac*nc+ic], ngorkov*nc, &dot);
175#endif
176 *qq-=gamval[4*ndirac+idirac]*dot;
177 }
178 }
179#else
180#pragma unroll(2)
181 for(int i=0; i<kvol; i++)
182 //What is the optimal order to evaluate these in?
183 for(int idirac = 0; idirac<ndirac; idirac++){
184 int igork=idirac+4;
185 *qbqb+=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+idirac)*nc])*xi[(i*ngorkov+igork)*nc];
186 *qq-=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+igork)*nc])*xi[(i*ngorkov+idirac)*nc];
187 *qbqb+=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+idirac)*nc+1])*xi[(i*ngorkov+igork)*nc+1];
188 *qq-=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+igork)*nc+1])*xi[(i*ngorkov+idirac)*nc+1];
189 }
190#endif
191 //In the FORTRAN Code dsum was used instead despite qq and qbqb being complex
192 //Since we only care about the real part this shouldn't cause (m)any serious issues
193#if(nproc>1)
194 Par_dsum((double *)qq); Par_dsum((double *)qbqb);
195#endif
196 *qq=(*qq+*qbqb)/(2*gvol);
197 Complex xu, xd, xuu, xdd;
198 xu=xd=xuu=xdd=0;
199
200 //Halos
201#if(npt>1)
202 ZHalo_swap_dir(x,16,3,DOWN); ZHalo_swap_dir(x,16,3,UP);
203#endif
204 //Pesky halo exchange indices again
205 //The halo exchange for the trial fields was done already at the end of the trajectory
206 //No point doing it again
207
208 //Instead of typing id[i*ndim+3] a lot, we'll just assign them to variables.
209 //Idea. One loop instead of two loops but for xuu and xdd just use ngorkov-(igorkov+1) instead
210 //Dirty CUDA work around since it won't convert thrust<complex> to double
211 //TODO: get a reduction routine ready for CUDA
212#ifndef __NVCC__
213#pragma omp parallel for reduction(+:xd,xu,xdd,xuu)
214#endif
215 for(int i = 0; i<kvol; i++){
216 int did=id[3+ndim*i];
217 int uid=iu[3+ndim*i];
218#ifndef __NVCC__
219#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xu)
220#endif
221 for(int igorkov=0; igorkov<4; igorkov++){
222 int igork1=gamin[3*ndirac+igorkov];
223 //For the C Version I'll try and factorise where possible
224 xu+=dk4p[did]*(conj(x[(did*ngorkov+igorkov)*nc])*(\
225 u11t[did*ndim+3]*(xi[(i*ngorkov+igork1)*nc]-xi[(i*ngorkov+igorkov)*nc])+\
226 u12t[did*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]-xi[(i*ngorkov+igorkov)*nc+1]) )+\
227 conj(x[(did*ngorkov+igorkov)*nc+1])*(\
228 conj(u11t[did*ndim+3])*(xi[(i*ngorkov+igork1)*nc+1]-xi[(i*ngorkov+igorkov)*nc+1])+\
229 conj(u12t[did*ndim+3])*(xi[(i*ngorkov+igorkov)*nc]-xi[(i*ngorkov+igork1)*nc])));
230 }
231#ifndef __NVCC__
232#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xd)
233#endif
234 for(int igorkov=0; igorkov<4; igorkov++){
235 int igork1=gamin[3*ndirac+igorkov];
236 xd+=dk4m[i]*(conj(x[(uid*ngorkov+igorkov)*nc])*(\
237 conj(u11t[i*ndim+3])*(xi[(i*ngorkov+igork1)*nc]+xi[(i*ngorkov+igorkov)*nc])-\
238 u12t[i*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]+xi[(i*ngorkov+igorkov)*nc+1]) )+\
239 conj(x[(uid*ngorkov+igorkov)*nc+1])*(\
240 u11t[i*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]+xi[(i*ngorkov+igorkov)*nc+1])+\
241 conj(u12t[i*ndim+3])*(xi[(i*ngorkov+igorkov)*nc]+xi[(i*ngorkov+igork1)*nc]) ) );
242 }
243#ifndef __NVCC__
244#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xuu)
245#endif
246 for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
247 int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
248 xuu-=dk4m[did]*(conj(x[(did*ngorkov+igorkovPP)*nc])*(\
249 u11t[did*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc]-xi[(i*ngorkov+igorkovPP)*nc])+\
250 u12t[did*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]-xi[(i*ngorkov+igorkovPP)*nc+1]) )+\
251 conj(x[(did*ngorkov+igorkovPP)*nc+1])*(\
252 conj(u11t[did*ndim+3])*(xi[(i*ngorkov+igork1PP)*nc+1]-xi[(i*ngorkov+igorkovPP)*nc+1])+\
253 conj(u12t[did*ndim+3])*(xi[(i*ngorkov+igorkovPP)*nc]-xi[(i*ngorkov+igork1PP)*nc]) ) );
254 }
255#ifndef __NVCC__
256#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xdd)
257#endif
258 for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
259 int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
260 xdd-=dk4p[i]*(conj(x[(uid*ngorkov+igorkovPP)*nc])*(\
261 conj(u11t[i*ndim+3])*(xi[(i*ngorkov+igork1PP)*nc]+xi[(i*ngorkov+igorkovPP)*nc])-\
262 u12t[i*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]+xi[(i*ngorkov+igorkovPP)*nc+1]) )+\
263 conj(x[(uid*ngorkov+igorkovPP)*nc+1])*(\
264 u11t[i*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]+xi[(i*ngorkov+igorkovPP)*nc+1])+\
265 conj(u12t[i*ndim+3])*(xi[(i*ngorkov+igorkovPP)*nc]+xi[(i*ngorkov+igork1PP)*nc]) ) );
266 }
267 }
268 *endenf=creal(xu-xd-xuu+xdd);
269 *denf=creal(xu+xd+xuu+xdd);
270
271#if(nproc>1)
272 Par_dsum(endenf); Par_dsum(denf);
273#endif
274 *endenf/=2*gvol; *denf/=2*gvol;
275 //Future task. Chiral susceptibility measurements
276#ifdef __NVCC__
277 cudaFree(x); cudaFree(xi);
278#else
279 free(x); free(xi);
280#endif
281 return 0;
282}
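
Editorial note on the estimator: with Gaussian noise \(\xi\) and \(R_1=M^\dagger\xi\), the inversion yields \(M^{-1}\xi=(M^\dagger M)^{-1}M^\dagger\xi\), so the condensate computed above is the standard noisy estimate

\[
\langle\bar{\Psi}\Psi\rangle \approx \frac{1}{4V}\,\mathrm{Re}\!\left(\xi^\dagger M^{-1}\xi\right),
\]

with the normalisation \(4V\) (V being gvol) used by the code; at the point the dot product is taken, x holds the original noise and xi holds the solution.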
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single precision complex numbers using the Box-Mull...
Definition random.c:260
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves The matrix multipl...
Definition congrad.c:262

References AVX, Complex, Complex_f, Congradp(), DOWN, Dslashd_f(), Gauss_c(), gvol, kferm, kfermHalo, kvol, nc, ndim, ndirac, ngorkov, Par_dsum(), UP, and ZHalo_swap_dir().


◆ Polyakov()

double Polyakov ( Complex_f * u11t,
Complex_f * u12t )

Calculate the Polyakov loop (no prizes for guessing that one...)

Parameters
u11t,u12t: The gauge fields
See also
Par_tmul, Par_dsum
Returns
Double corresponding to the Polyakov loop

Definition at line 105 of file bosonic.c.

105 {
106 /*
107 * Calculate the Polyakov loop (no prizes for guessing that one...)
108 *
109 * Parameters:
110 * =========
111 * u11t, u12t The gauge fields
112 *
113 * Calls:
114 * ======
115 * Par_tmul, Par_dsum
116 *
117 * Return:
118 * ======
119 * Double corresponding to the polyakov loop
120 */
121 const char *funcname = "Polyakov";
122 double poly = 0;
123#ifdef __NVCC__
124 int device=-1;
125 cudaGetDevice(&device);
126 Complex_f *Sigma11,*Sigma12;
127 cudaMallocManaged((void **)&Sigma11,kvol3*sizeof(Complex_f),cudaMemAttachGlobal);
128#ifdef _DEBUG
129 cudaMallocManaged((void **)&Sigma12,kvol3*sizeof(Complex_f),cudaMemAttachGlobal);
130#else
131 cudaMallocAsync((void **)&Sigma12,kvol3*sizeof(Complex_f),streams[0]);
132#endif
133#else
134 Complex_f *Sigma11 = aligned_alloc(AVX,kvol3*sizeof(Complex_f));
135 Complex_f *Sigma12 = aligned_alloc(AVX,kvol3*sizeof(Complex_f));
136#endif
137
138 //Extract the time component from each site and save in corresponding Sigma
139#ifdef __NVCC__
140 cublasCcopy(cublas_handle,kvol3, (cuComplex *)(u11t+3), ndim, (cuComplex *)Sigma11, 1);
141 cublasCcopy(cublas_handle,kvol3, (cuComplex *)(u12t+3), ndim, (cuComplex *)Sigma12, 1);
142#elif defined USE_BLAS
143 cblas_ccopy(kvol3, u11t+3, ndim, Sigma11, 1);
144 cblas_ccopy(kvol3, u12t+3, ndim, Sigma12, 1);
145#else
146 for(int i=0; i<kvol3; i++){
147 Sigma11[i]=u11t[i*ndim+3];
148 Sigma12[i]=u12t[i*ndim+3];
149 }
150#endif
151 /* Some Fortran commentary
152 Changed this routine.
153 u11t and u12t now defined as normal ie (kvol+halo,4).
154 Copy of Sigma11 and Sigma12 is changed so that it copies
155 in blocks of ksizet.
156 Variable indexu also used to select correct element of u11t and u12t
157 in loop 10 below.
158
159 Change the order of multiplication so that it can
160 be done in parallel. Start at t=1 and go up to t=T:
161 previously started at t+T and looped back to 1, 2, ... T-1
162 Buffers
163 There is a dependency. Can only parallelise the inner loop
164 */
165#ifdef __NVCC__
166 cudaDeviceSynchronise();
167 cuPolyakov(Sigma11,Sigma12,u11t,u12t,dimGrid,dimBlock);
168 cudaMemPrefetchAsync(Sigma11,kvol3*sizeof(Complex_f),cudaCpuDeviceId,NULL);
169#else
170#pragma unroll
171 for(int it=1;it<ksizet;it++)
172#pragma omp parallel for simd aligned(u11t,u12t,Sigma11,Sigma12:AVX)
173 for(int i=0;i<kvol3;i++){
174 //Seems a bit more efficient to increment indexu instead of reassigning
175 //it every single loop
176 int indexu=it*kvol3+i;
177 Complex_f a11=Sigma11[i]*u11t[indexu*ndim+3]-Sigma12[i]*conj(u12t[indexu*ndim+3]);
178 //Instead of having to store a second buffer just assign it directly
179 Sigma12[i]=Sigma11[i]*u12t[indexu*ndim+3]+Sigma12[i]*conj(u11t[indexu*ndim+3]);
180 Sigma11[i]=a11;
181 }
182 //Multiply this partial loop with the contributions of the other cores in the
183 //Time-like dimension
184#endif
185 //End of CUDA-CPU pre-processor for evaluating Polyakov
186 //
187 //Par_tmul does nothing if there is only a single processor in the time direction. So we only compile
188 //its call if it is required
189#if (npt>1)
 190#ifdef __NVCC__
 191#error Par_tmul is not yet implemented in CUDA as Sigma12 is device-only memory
192#endif
193#ifdef _DEBUG
194 printf("Multiplying with MPI\n");
195#endif
196 Par_tmul(Sigma11, Sigma12);
197 //end of #if(npt>1)
198#endif
 199 /*Now all cores have the value for the complete Polyakov line at all spatial sites
 200 We need to globally sum over spatial processors but not across time as these
 201 are duplicates. So we zero the value for all but t=0
202 This is (according to the FORTRAN code) a bit of a hack
203 I will expand on this hack and completely avoid any work
204 for this case rather than calculating everything just to set it to zero
205 */
 206 if(!pcoord[3+rank*ndim]){ //Braced so the loop, not just the synchronise, is guarded under CUDA
207#ifdef __NVCC__
208 cudaDeviceSynchronise();
209#pragma omp parallel for simd reduction(+:poly)
210#else
211#pragma omp parallel for simd reduction(+:poly) aligned(Sigma11:AVX)
212#endif
213 for(int i=0;i<kvol3;i++)
 214 poly+=creal(Sigma11[i]); }
215#ifdef __NVCC__
216 cudaFree(Sigma11);
217#ifdef _DEBUG
218 cudaFree(Sigma12);
219#else
220 cudaFreeAsync(Sigma12,streams[0]);
221#endif
222#else
223 free(Sigma11); free(Sigma12);
224#endif
225
226#if(nproc>1)
227 Par_dsum(&poly);
228#endif
229 poly/=gvol3;
230 return poly;
231}
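
Editorial note: up to the SU(2) trace normalisation (for a link stored as the pair (u11, u12), \(\mathrm{Re}\,u_{11}\) is half the trace), the returned value is the volume-averaged Polyakov loop

\[
L = \frac{1}{V_3}\sum_{\vec{x}}\,\mathrm{Re}\,\mathrm{tr}\prod_{t=0}^{N_t-1} U_4(\vec{x},t).
\]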
#define gvol3
Lattice spatial volume.
Definition sizes.h:94

References AVX, Complex_f, gvol3, ksizet, kvol3, ndim, Par_dsum(), pcoord, and rank.


◆ Reunitarise()

int Reunitarise ( Complex * u11t,
Complex * u12t )
inline

Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.

If you're looking at the FORTRAN code be careful. There are two header files for the /trial/ header. One with u11 u12 (which was included here originally) and the other with u11t and u12t.

Parameters
u11t,u12t: Trial fields to be reunitarised
Returns
Zero on success, integer error code otherwise

Definition at line 904 of file matrices.c.

904 {
905 /*
906 * @brief Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1
907 *
908 * If you're looking at the FORTRAN code be careful. There are two header files
909 * for the /trial/ header. One with u11 u12 (which was included here originally)
910 * and the other with u11t and u12t.
911 *
912 * @see cuReunitarise (CUDA Wrapper)
913 *
914 * @param u11t, u12t Trial fields to be reunitarised
915 *
916 * @return Zero on success, integer error code otherwise
917 */
918 const char *funcname = "Reunitarise";
919#ifdef __NVCC__
920 cuReunitarise(u11t,u12t,dimGrid,dimBlock);
921#else
922#pragma omp parallel for simd aligned(u11t,u12t:AVX)
923 for(int i=0; i<kvol*ndim; i++){
924 //Declaring anorm inside the loop will hopefully let the compiler know it
925 //is safe to vectorise aggressively
926 double anorm=sqrt(conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]);
927 // Exception handling code. May be faster to leave out as the exit prevents vectorisation.
928 // if(anorm==0){
929 // fprintf(stderr, "Error %i in %s on rank %i: anorm = 0 for μ=%i and i=%i.\nExiting...\n\n",
930 // DIVZERO, funcname, rank, mu, i);
931 // MPI_Finalise();
932 // exit(DIVZERO);
933 // }
934 u11t[i]/=anorm;
935 u12t[i]/=anorm;
936 }
937#endif
938 return 0;
939}
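
Editorial note: each link is stored as the pair (u11, u12) representing

\[
U = \begin{pmatrix} u_{11} & u_{12} \\ -u_{12}^* & u_{11}^* \end{pmatrix},\qquad \det U = |u_{11}|^2+|u_{12}|^2,
\]

so dividing both components by \(\sqrt{|u_{11}|^2+|u_{12}|^2}\) restores \(\det U=1\) and hence \(U\in SU(2)\).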

References kvol, and ndim.


◆ SU2plaq()

float SU2plaq ( Complex_f * u11t,
Complex_f * u12t,
unsigned int * iu,
int i,
int mu,
int nu )
inline

Calculates the plaquette at site i in the \(\mu\)-\(\nu\) direction.

Parameters
u11t,u12t: Trial fields
i: Lattice site
iu: Upper halo indices
mu,nu: Plaquette direction. Note that mu and nu can be negative to facilitate calculating plaquettes for Clover terms. No sanity checks are conducted on them in this routine.
Returns
float corresponding to the plaquette value

Definition at line 72 of file bosonic.c.

72 {
73 /*
74 * Calculates the plaquette at site i in the μ-ν direction
75 *
76 * Parameters:
77 * ==========
78 * u11t, u12t: Trial fields
79 * int *iu: Upper halo indices
80 * mu, nu: Plaquette direction. Note that mu and nu can be negative
81 * to facilitate calculating plaquettes for Clover terms. No
82 * sanity checks are conducted on them in this routine.
83 *
84 * Return:
85 * =======
 86 * float corresponding to the plaquette value
87 *
88 */
89 const char *funcname = "SU2plaq";
90 int uidm = iu[mu+ndim*i];
91
92 Complex_f Sigma11=u11t[i*ndim+mu]*u11t[uidm*ndim+nu]-u12t[i*ndim+mu]*conj(u12t[uidm*ndim+nu]);
93 Complex_f Sigma12=u11t[i*ndim+mu]*u12t[uidm*ndim+nu]+u12t[i*ndim+mu]*conj(u11t[uidm*ndim+nu]);
94
95 int uidn = iu[nu+ndim*i];
96 Complex_f a11=Sigma11*conj(u11t[uidn*ndim+mu])+Sigma12*conj(u12t[uidn*ndim+mu]);
97 Complex_f a12=-Sigma11*u12t[uidn*ndim+mu]+Sigma12*u11t[uidn*ndim+mu];
98
99 Sigma11=a11*conj(u11t[i*ndim+nu])+a12*conj(u12t[i*ndim+nu]);
100 // Sigma12[i]=-a11[i]*u12t[i*ndim+nu]+a12*u11t[i*ndim+mu];
101 // Not needed in final result as it traces out
102 return creal(Sigma11);
103}
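
Editorial note: the three multiplications above build the anticlockwise product around the plaquette, and the returned real part is half its trace,

\[
P_{\mu\nu}(i) = \tfrac{1}{2}\,\mathrm{tr}\!\left[U_\mu(i)\,U_\nu(i+\hat{\mu})\,U^\dagger_\mu(i+\hat{\nu})\,U^\dagger_\nu(i)\right] = \mathrm{Re}\,\Sigma_{11},
\]

which is why the final Sigma12 is never needed.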

References Complex_f, and ndim.


◆ UpDownPart()

int UpDownPart ( const int na,
Complex * X0,
Complex * R1 )
inline

Definition at line 380 of file su2hmc.c.

380 {
381#ifdef __NVCC__
382 cuUpDownPart(na,X0,R1,dimBlock,dimGrid);
383 cudaDeviceSynchronise();
384#else
385 //The only reason this was removed from the original function is for diagnostics
386#pragma omp parallel for simd collapse(2) aligned(X0,R1:AVX)
387 for(int i=0; i<kvol; i++)
388 for(int idirac = 0; idirac < ndirac; idirac++){
389 X0[((na*kvol+i)*ndirac+idirac)*nc]=R1[(i*ngorkov+idirac)*nc];
390 X0[((na*kvol+i)*ndirac+idirac)*nc+1]=R1[(i*ngorkov+idirac)*nc+1];
391 }
392#endif
393 return 0;
394}
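
Editorial note: the loop above copies the first ndirac Dirac components of each Gor'kov spinor in R1 into flavour na of X0, i.e. the up/down partitioning used by Congradq(). A minimal usage sketch (hypothetical, but it mirrors how the routine is invoked once per flavour elsewhere in the code):

 //Partition each flavour's spinor into the pseudofermion field X0
 for(int na=0; na<nf; na++)
     UpDownPart(na, X0, R1);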

◆ Z_gather()

int Z_gather ( Complex * x,
Complex * y,
int n,
unsigned int * table,
unsigned int mu )
inline

Extracts all the double precision gauge links in the \(\mu\) direction only.

Parameters
x: The output
y: The gauge field for a particular colour
n: Number of sites in the gauge field. This is typically kvol
table: Table containing information on nearest neighbours. Usually id or iu
mu: Direction we're interested in extracting
Returns
Zero on success, integer error code otherwise

Definition at line 335 of file su2hmc.c.

336{
337 const char *funcname = "Z_gather";
338 //FORTRAN had a second parameter m giving the size of y (kvol+halo) normally
339 //Pointers mean that's not an issue for us so I'm leaving it out
340#ifdef __NVCC__
341 cuZ_gather(x,y,n,table,mu,dimBlock,dimGrid);
342#else
343#pragma omp parallel for simd aligned (x,y,table:AVX)
344 for(int i=0; i<n; i++)
345 x[i]=y[table[i*ndim+mu]*ndim+mu];
346#endif
347 return 0;
348}
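
Editorial note: a sketch of the usual gather-then-exchange pattern, assuming a buffer u11sh of size kvol+halo. This mirrors the single precision C_gather()/CHalo_swap_dir() usage visible in Gauge_force(); the exact call site below is illustrative, not taken from the source:

 //Pull the mu-direction links into a contiguous buffer, then
 //exchange the halo so neighbouring staples can be built locally
 Z_gather(u11sh, u11t, kvol, id, mu);
#if(nproc>1)
 ZHalo_swap_dir(u11sh, 1, mu, DOWN);
#endif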

References ndim.