su2hmc
congrad.c File Reference

Conjugate gradient routines: Congradq for the flavour-partitioned solver and Congradp for the unpartitioned (Gor'kov-index) inversion. More...

#include <matrices.h>
Include dependency graph for congrad.c:


Functions

int Congradq (int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
 Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \(M^\dagger M x=\Phi\). Implements up/down partitioning. The matrix multiplication step is done at single precision, while the update is done at double.
 
int Congradp (int na, double res, Complex *Phi, Complex *xi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
 Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \(M^\dagger M x=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double.
 

Detailed Description

Conjugate gradient routines: Congradq for the flavour-partitioned solver and Congradp for the unpartitioned (Gor'kov-index) inversion.

Definition in file congrad.c.
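
Both routines share the mixed-precision structure noted above: the Dslash applications and the field vectors are held in single precision (Complex_f), while the CG scalars α and β_n are accumulated in double. The following is a minimal sketch of that accumulation step in plain C99 complex types; it does not use the Complex_f/Complex macros from sizes.h.

#include <complex.h>

/* Sketch: accumulate the squared residual norm of a single-precision vector
 * in double, mirroring how betan is built from r_f in Congradq/Congradp.   */
static double residual_norm_sq(const float complex *r_f, int n)
{
    double betan = 0.0;                                  /* double accumulator */
    for(int i = 0; i < n; i++)
        betan += (double)crealf(conjf(r_f[i]) * r_f[i]); /* |r_i|^2 in single  */
    return betan;                                        /* compare to res*res */
}

The convergence test then compares this double-precision sum against resid = res*res, exactly as both listings below do.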

Function Documentation

◆ Congradp()

int Congradp (int na, double res, Complex *Phi, Complex *xi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)

Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \(M^\dagger M x=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double.

Parameters
na      Flavour index
res     Limit for conjugate gradient
Phi     Pseudofermion field.
xi      Returned as \((M^\dagger M)^{-1} \Phi\)
ut      Trial colour fields
iu      Upper halo indices
id      Lower halo indices
gamval  Single precision gamma matrices rescaled by kappa
gamin   Dirac indices
dk      \(\left(1+\gamma_0\right)e^{-\mu}\) and \(\left(1-\gamma_0\right)e^\mu\)
jqq     Diquark source
akappa  Hopping Parameter
itercg  Counts the iterations of the conjugate gradient
Returns
0 on success, integer error code otherwise
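
For orientation, the recurrence Congradp implements is plain conjugate gradient on the normal equations. The sketch below is a double-precision schematic only, not the library routine: apply_MdagM() is a hypothetical stand-in for the Dslash_f()/Dslashd_f() pair, n stands in for kferm, and the real code interleaves the x update slightly differently and mixes precisions as described above.

#include <complex.h>
#include <stdlib.h>

/* Schematic CG for (M^dagger M) x = Phi. On entry r holds the initial residual
 * (just Phi when x = 0). Returns 0 on convergence, 1 on hitting the cap.      */
int cg_sketch(complex double *x, complex double *r, int n, double resid, int niterc,
              void (*apply_MdagM)(complex double *out, const complex double *in))
{
    complex double *p  = malloc(n * sizeof(complex double));
    complex double *Ap = malloc(n * sizeof(complex double));
    double rr = 0.0;
    for(int i = 0; i < n; i++){ p[i] = r[i]; rr += creal(conj(r[i]) * r[i]); }
    int ret = 1;                                          /* pessimistic default */
    for(int iter = 0; iter < niterc; iter++){
        apply_MdagM(Ap, p);                               /* Ap = (M^dagger M)p  */
        double pAp = 0.0;
        for(int i = 0; i < n; i++) pAp += creal(conj(p[i]) * Ap[i]);
        double alpha = rr / pAp;                          /* alpha = r.r/p.Ap    */
        double rr_new = 0.0;
        for(int i = 0; i < n; i++){
            x[i] += alpha * p[i];                         /* x <- x + alpha p    */
            r[i] -= alpha * Ap[i];                        /* r <- r - alpha Ap   */
            rr_new += creal(conj(r[i]) * r[i]);
        }
        if(rr_new < resid){ ret = 0; break; }             /* converged           */
        double beta = rr_new / rr;                        /* beta = r'.r'/r.r    */
        for(int i = 0; i < n; i++) p[i] = r[i] + beta * p[i];
        rr = rr_new;
    }
    free(p); free(Ap);
    return ret;
}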

Definition at line 250 of file congrad.c.

251 {
252 /*
253 * @brief Matrix Inversion via Conjugate Gradient
254 * Solves @f$(M^\dagger)Mx=\Phi@f$
255 * No even/odd partitioning.
256 * The matrix multiplication step is done at single precision, while the update is done at double
257 *
258 * @param na: Flavour index
259 * @param res: Limit for conjugate gradient
260 * @param Phi: Pseudofermion field @f$\Phi@f$
261 * @param xi: Returned as @f$(M^\dagger M)^{-1} \Phi@f$
262 * @param ut[0]: First colour's trial field
263 * @param ut[1]: Second colour's trial field
264 * @param iu: Upper halo indices
265 * @param id: Lower halo indices
266 * @param gamval: Gamma matrices
267 * @param gamin: Dirac indices
268 * @param dk[0]:
269 * @param dk[1]:
270 * @param jqq: Diquark source
271 * @param akappa: Hopping Parameter
272 * @param itercg: Counts the iterations of the conjugate gradient
273 *
274 * @return 0 on success, integer error code otherwise
275 */
276 const char *funcname = "Congradp";
277 //Return value
278 int ret_val=0;
279 const double resid = res*res;
280 //These were evaluated only in the first loop of niterx so we'll just do it outside of the loop.
281 //These alpha and beta terms should be double, but that causes issues with BLAS. Instead we declare
282 //them Complex and work with the real part (especially for α_d)
283 //Give initial values. Will be overwritten if niterx>0
284#ifdef __NVCC__
285 Complex_f *p_f, *r_f, *xi_f, *x1_f, *x2_f;
286 int device; cudaGetDevice(&device);
287#ifdef _DEBUG
288 cudaMallocManaged((void **)&p_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
289 cudaMallocManaged((void **)&r_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
290 cudaMallocManaged((void **)&x1_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
291 cudaMallocManaged((void **)&x2_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
292 cudaMallocManaged((void **)&xi_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
293#else
294 cudaMalloc((void **)&p_f, kfermHalo*sizeof(Complex_f));
295 cudaMalloc((void **)&r_f, kferm*sizeof(Complex_f));
296 cudaMalloc((void **)&x1_f, kfermHalo*sizeof(Complex_f));
297 cudaMalloc((void **)&x2_f, kferm*sizeof(Complex_f));
298 cudaMalloc((void **)&xi_f, kferm*sizeof(Complex_f));
299#endif
300#else
301 Complex_f *p_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
302 Complex_f *r_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
303 Complex_f *x1_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
304 Complex_f *x2_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
305 Complex_f *xi_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
306#endif
307 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
308 double alphan=0.0;
309 //Instead of copying element-wise in a loop, use memcpy.
310#ifdef __NVCC__
311 //Get xi in single precision, then swap to AoS format
312 cuComplex_convert(p_f,xi,kferm,true,dimBlock,dimGrid);
313 Transpose_c(p_f,ngorkov*nc,kvol);
314 cudaMemcpy(xi_f,p_f,kferm*sizeof(Complex_f),cudaMemcpyDefault);
315
316 //And repeat for r
317 cuComplex_convert(r_f,Phi+na*kferm,kferm,true,dimBlock,dimGrid);
318 Transpose_c(r_f,ngorkov*nc,kvol);
319#else
320#pragma omp parallel for simd aligned(p_f,xi_f,xi,r_f,Phi:AVX)
321 for(int i =0;i<kferm;i++){
322 p_f[i]=xi_f[i]=(Complex_f)xi[i];
323 r_f[i]=(Complex_f)Phi[na*kferm+i];
324 }
325#endif
326
327 // Declaring placeholder arrays
328 // This x1 is NOT related to the /common/vectorp/X1 in the FORTRAN code and should not
329 // be confused with X1 the global variable
330
331 //niterx isn't called as an index but we'll start from zero with the C code to make the
332 //if statements quicker to type
333 double betan;
334#ifdef __NVCC__
335 cudaDeviceSynchronise();
336#endif
337 for((*itercg)=0; (*itercg)<=niterc; (*itercg)++){
338 //Don't overwrite on first run.
339 //x2=(M^†)x1=(M^†)Mp
340 Dslash_f(x1_f,p_f,ut[0],ut[1],iu,id,gamval,gamin,dk,jqq,akappa);
341 Dslashd_f(x2_f,x1_f,ut[0],ut[1],iu,id,gamval,gamin,dk,jqq,akappa);
342#ifdef __NVCC__
343 cudaDeviceSynchronise();
344#endif
345 //We can't evaluate α on the first niterx because we need to get β_n.
346 if(*itercg){
347 //x*.x
348#ifdef USE_BLAS
349 float alphad_f;
350#ifdef __NVCC__
351 cublasScnrm2(cublas_handle,kferm,(cuComplex*) x1_f, 1,(float *)&alphad_f);
352 alphad = alphad_f*alphad_f;
353#else
354 alphad_f = cblas_scnrm2(kferm, x1_f, 1);
355#endif
356 alphad = alphad_f*alphad_f;
357#else
358 alphad=0;
359 for(int i = 0; i<kferm; i++)
360 alphad+=conj(x1_f[i])*x1_f[i];
361#endif
362#if(nproc>1)
363 Par_fsum((float *)&alphad);
364#endif
365 //α=(r.r)/p(M^†)Mp
366 alpha=alphan/creal(alphad);
367 // Complex_f alpha_f = (Complex_f)alpha;
368 //x+αp
369#ifdef USE_BLAS
370 Complex_f alpha_f=(float)alpha;
371#ifdef __NVCC__
372 cublasCaxpy(cublas_handle,kferm,(cuComplex*) &alpha_f,(cuComplex*) p_f,1,(cuComplex*) xi_f,1);
373#else
374 cblas_caxpy(kferm, (Complex_f*)&alpha_f,(Complex_f*)p_f, 1, (Complex_f*)xi_f, 1);
375#endif
376#else
377#pragma omp parallel for simd aligned(xi_f,p_f:AVX)
378 for(int i = 0; i<kferm; i++)
379 xi_f[i]+=alpha*p_f[i];
380#endif
381 }
382
383 //r=α(M^†)Mp and β_n=r*.r
384#if defined USE_BLAS
385 Complex_f alpha_m=(Complex_f)(-alpha);
386 float betan_f=0;
387#ifdef __NVCC__
388 cublasCaxpy(cublas_handle,kferm, (cuComplex *)&alpha_m,(cuComplex *) x2_f, 1,(cuComplex *) r_f, 1);
389 //cudaDeviceSynchronise();
390 //r*.r
391 cublasScnrm2(cublas_handle,kferm,(cuComplex *) r_f,1,(float *)&betan_f);
392#else
393 cblas_caxpy(kferm,(Complex_f*) &alpha_m,(Complex_f*) x2_f, 1,(Complex_f*) r_f, 1);
394 //r*.r
395 betan_f = cblas_scnrm2(kferm, (Complex_f*)r_f,1);
396#endif
397 //Gotta square it to "undo" the norm
398 betan=betan_f*betan_f;
399#else
400 //Just like Congradq, this loop could be unrolled but will need a reduction to deal with the betan
401 //addition.
402 betan = 0;
403 //If we get a small enough β_n before hitting the iteration cap we break
404#pragma omp parallel for simd aligned(x2_f,r_f:AVX) reduction(+:betan)
405 for(int i = 0; i<kferm;i++){
406 r_f[i]-=alpha*x2_f[i];
407 betan+=conj(r_f[i])*r_f[i];
408 }
409#endif
410 //This is basically just congradq at the end. Check there for comments
411#if(nproc>1)
412 Par_dsum(&betan);
413#endif
414#ifdef _DEBUG
415#ifdef _DEBUGCG
416 char *endline = "\n";
417#else
418 char *endline = "\r";
419#endif
420 if(!rank) printf("Iter (CG) = %i β_n= %e α= %e%s", *itercg, betan, creal(alpha),endline);
421#endif
422 if(betan<resid){
423 //Started counting from zero so add one to make it accurate
424 (*itercg)++;
425#ifdef _DEBUG
426 if(!rank) printf("\nIter (CG) = %i resid = %e toler = %e\n", *itercg, betan, resid);
427#endif
428 ret_val=0; break;
429 }
430 else if(*itercg==niterc-1){
431 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n",
432 ITERLIM, funcname, niterc, betan);
433 ret_val=ITERLIM; break;
434 }
435 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
436 Complex beta = (*itercg) ? betan/betad : 0;
437 betad=betan; alphan=betan;
438 //BLAS for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
439 //β instead of x.
440 //There is cblas_zaxpby in the MKL though, set a = 1 and b = β.
441#ifdef USE_BLAS
442 Complex_f beta_f = (Complex_f)beta;
443 Complex_f a = 1.0;
444#ifdef __NVCC__
445 cublasCscal(cublas_handle,kferm,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
446 cublasCaxpy(cublas_handle,kferm,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
447 cudaDeviceSynchronise();
448#elif (defined __INTEL_MKL__ || defined AMD_BLAS)
449 cblas_caxpby(kferm, &a, r_f, 1, &beta_f, p_f, 1);
450#else
451 cblas_cscal(kferm,&beta_f,p_f,1);
452 cblas_caxpy(kferm,&a,r_f,1,p_f,1);
453#endif
454#else
455#pragma omp parallel for simd aligned(r_f,p_f:AVX)
456 for(int i=0; i<kferm; i++)
457 p_f[i]=r_f[i]+beta*p_f[i];
458#endif
459 }
460#ifdef __NVCC__
461 Transpose_c(xi_f,kvol,ngorkov*nc);
462 Transpose_c(r_f,kvol,ngorkov*nc);
463
464 cudaDeviceSynchronise();
465 cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
466#else
467#pragma omp simd
468 for(int i = 0; i <kferm;i++){
469 xi[i]=(Complex)xi_f[i];
470 }
471#endif
472#ifdef __NVCC__
473 cudaFree(p_f); cudaFree(r_f);cudaFree(x1_f); cudaFree(x2_f); cudaFree(xi_f);
474#else
475 free(p_f); free(r_f); free(x1_f); free(x2_f); free(xi_f);
476#endif
477 return ret_val;
478}
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:463
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:585
int Par_fsum(float *dval)
Performs a reduction on a float dval to get a sum which is then distributed to all ranks.
int rank
The MPI rank.
Definition par_mpi.c:22
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define niterc
Hard limit for runaway trajectories in Conjugate gradient.
Definition sizes.h:163
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225

References AVX, Complex, Complex_f, Dslash_f(), Dslashd_f(), kferm, kfermHalo, kvol, nc, ngorkov, niterc, Par_dsum(), Par_fsum(), and rank.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Congradq()

int Congradq (int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)

Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \(M^\dagger M x=\Phi\). Implements up/down partitioning. The matrix multiplication step is done at single precision, while the update is done at double.

Parameters
na        Flavour index
res       Limit for conjugate gradient
X1        Pseudofermion field \(\Phi\) initially, returned as \((M^\dagger M)^{-1} \Phi\)
r         Partition of \(\Phi\) being used. Gets recycled as the residual vector
ut        Trial colour fields
iu        Upper halo indices
id        Lower halo indices
gamval_f  Single precision gamma matrices rescaled by kappa
gamin     Dirac indices
dk        \(\left(1+\gamma_0\right)e^{-\mu}\) and \(\left(1-\gamma_0\right)e^\mu\)
jqq       Diquark source
akappa    Hopping Parameter
itercg    Counts the iterations of the conjugate gradient
Returns
0 on success, integer error code otherwise
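
Beyond the Hdslash_f()/Hdslashd_f() pair, the operator Congradq inverts carries the diquark source as a constant shift, \(x_2 = (M^\dagger M + \kappa^2|j_{qq}|^2)p\); this is what the fac_f constant in the listing below implements. A minimal single-precision sketch of that shift follows, with n standing in for kferm2 and the assumption that the Hdslash applications have already filled x2_f.

#include <complex.h>

/* Sketch of the diquark shift applied after the Hdslash pair in Congradq:
 * x2 <- x2 + kappa^2 |jqq|^2 p, skipped entirely when the source is zero. */
static void add_diquark_shift(float complex *x2_f, const float complex *p_f,
                              int n, float complex jqq, float akappa)
{
    const float complex fac_f = conjf(jqq) * jqq * akappa * akappa;
    if(fac_f != 0)                      /* no point adding zero each iteration */
        for(int i = 0; i < n; i++)
            x2_f[i] += fac_f * p_f[i];
}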

Definition at line 7 of file congrad.c.

8 {
9 /*
10 * @brief Matrix Inversion via Mixed Precision Conjugate Gradient
11 * Solves @f$(M^\dagger)Mx=\Phi@f$
12 * Implements up/down partitioning
13 * The matrix multiplication step is done at single precision, while the update is done at double
14 *
15 * @param na: Flavour index
16 * @param res: Limit for conjugate gradient
17 * @param X1: @f$\Phi@f$ initially, returned as @f$(M^\dagger M)^{-1} \Phi@f$
18 * @param r: Partition of @f$\Phi@f$ being used. Gets recycled as the residual vector
19 * @param ut[0]: First colour's trial field
20 * @param ut[1]: Second colour's trial field
21 * @param iu: Upper halo indices
22 * @param id: Lower halo indices
23 * @param gamval_f: Gamma matrices
24 * @param gamin: Dirac indices
25 * @param dk[0]:
26 * @param dk[1]:
27 * @param jqq: Diquark source
28 * @param akappa: Hopping Parameter
29 * @param itercg: Counts the iterations of the conjugate gradient
30 *
31 * @see Hdslash_f(), Hdslashd_f(), Par_fsum(), Par_dsum()
32 *
33 * @return 0 on success, integer error code otherwise
34 */
35 const char *funcname = "Congradq";
36 int ret_val=0;
37 const double resid = res*res;
38 //The κ^2 factor is needed to normalise the fields correctly
39 //jqq is the diquark condensate and is global scope.
40 const Complex_f fac_f = conj(jqq)*jqq*akappa*akappa;
41 //These were evaluated only in the first loop of niterx so we'll just do it outside of the loop.
42 //n suffix is numerator, d is denominator
43 double alphan=1;
44 //The alpha and beta terms should be double, but that causes issues with BLAS pointers. Instead we declare
45 //them complex and work with the real part (especially for α_d)
46 //Give initial values. Will be overwritten if niterx>0
47 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
48 //Because we're dealing with flattened arrays here we can call cblas safely without the halo
49#ifdef __NVCC__
50 Complex_f *p_f, *x1_f, *x2_f, *r_f, *X1_f;
51 int device=-1; cudaGetDevice(&device);
52
53#ifdef _DEBUG
54 cudaMallocManaged((void **)&p_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
55 cudaMallocManaged((void **)&x1_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
56 cudaMallocManaged((void **)&x2_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
57 cudaMallocManaged((void **)&r_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
58 cudaMallocManaged((void **)&X1_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
59#else
60 //First two have halo exchanges, so getting NCCL working is important
61 cudaMallocAsync((void **)&p_f, kferm2Halo*sizeof(Complex_f),streams[0]);
62 cudaMallocAsync((void **)&x1_f, kferm2Halo*sizeof(Complex_f),streams[1]);
63 cudaMallocAsync((void **)&x2_f, kferm2*sizeof(Complex_f),streams[2]);
64 cudaMallocAsync((void **)&r_f, kferm2*sizeof(Complex_f),streams[3]);
65 cudaMallocAsync((void **)&X1_f, kferm2*sizeof(Complex_f),streams[4]);
66#endif
67#else
68 Complex_f *p_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
69 Complex_f *x1_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
70 Complex_f *x2_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
71 Complex_f *X1_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
72 Complex_f *r_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
73#endif
74 //Instead of copying element-wise in a loop, use memcpy.
75#ifdef __NVCC__
76 //Get X1 in single precision, then swap to AoS format
77 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
78
79 //And repeat for r
80 cuComplex_convert(r_f,r,kferm2,true,dimBlock,dimGrid);
81
82 //cudaMemcpy is blocking, so use async instead
83 cudaMemcpyAsync(p_f, X1_f, kferm2*sizeof(Complex_f),cudaMemcpyDeviceToDevice,NULL);
84 //Flip all the gauge fields around so memory is coalesced
85#else
86#pragma omp parallel for simd
87 for(int i=0;i<kferm2;i++){
88 r_f[i]=(Complex_f)r[i];
89 X1_f[i]=(Complex_f)X1[i];
90 }
91 memcpy(p_f, X1_f, kferm2*sizeof(Complex_f));
92#endif
93
94 //niterx isn't called as an index but we'll start from zero with the C code to make the
95 //if statements quicker to type
96 double betan; bool pf=true;
97 for(*itercg=0; *itercg<niterc; (*itercg)++){
98 //x2 = (M^†M)p
99 //No need to synchronise here. The memcpy in Hdslash is blocking
100 Hdslash_f(x1_f,p_f,ut,iu,id,gamval_f,gamin,dk,akappa);
101 Hdslashd_f(x2_f,x1_f,ut,iu,id,gamval_f,gamin,dk,akappa);
102#ifdef __NVCC__
103 cudaDeviceSynchronise();
104#endif
105 //x2 = (M^†M+J^2)p
106 //No point adding zero a couple of hundred times if the diquark source is zero
107 if(fac_f!=0){
108#ifdef __NVCC__
109 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&fac_f,(cuComplex *)p_f,1,(cuComplex *)x2_f,1);
110#elif defined USE_BLAS
111 cblas_caxpy(kferm2, &fac_f, p_f, 1, x2_f, 1);
112#else
113#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
114 for(int i=0; i<kferm2; i++)
115 x2_f[i]+=fac_f*p_f[i];
116#endif
117 }
118 //We can't evaluate α on the first *itercg because we need to get β_n.
119 if(*itercg){
120 //α_d= p* (M^†M+J^2)p
121#ifdef __NVCC__
122 cublasCdotc(cublas_handle,kferm2,(cuComplex *)p_f,1,(cuComplex *)x2_f,1,(cuComplex *)&alphad);
123#elif defined USE_BLAS
124 cblas_cdotc_sub(kferm2, p_f, 1, x2_f, 1, &alphad);
125#else
126 alphad=0;
127#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
128 for(int i=0; i<kferm2; i++)
129 alphad+=conj(p_f[i])*x2_f[i];
130#endif
131 //For now I'll cast it into a float for the reduction. Each rank only sends and writes
132 //to the real part so this is fine
133#if(nproc>1)
134 Par_fsum((float *)&alphad);
135#endif
136 //α=α_n/α_d = (r.r)/p(M^†M)p
137 alpha=alphan/creal(alphad);
138 //x-αp,
139#ifdef __NVCC__
140 Complex_f alpha_f = (Complex_f)alpha;
141 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&alpha_f,(cuComplex *)p_f,1,(cuComplex *)X1_f,1);
142#elif defined USE_BLAS
143 Complex_f alpha_f = (Complex_f)alpha;
144 cblas_caxpy(kferm2, &alpha_f, p_f, 1, X1_f, 1);
145#else
146 for(int i=0; i<kferm2; i++)
147 X1_f[i]+=alpha*p_f[i];
148#endif
149 }
150 // r_n+1 = r_n-α(M^† M)p_n and β_n=r*.r
151#ifdef __NVCC__
152 Complex_f alpha_m=(Complex_f)(-alpha);
153 cublasCaxpy(cublas_handle, kferm2,(cuComplex *)&alpha_m,(cuComplex *)x2_f,1,(cuComplex *)r_f,1);
154 float betan_f;
155 cublasScnrm2(cublas_handle,kferm2,(cuComplex *)r_f,1,&betan_f);
156 betan = betan_f*betan_f;
157#elif defined USE_BLAS
158 Complex_f alpha_m = (Complex_f)(-alpha);
159 cblas_caxpy(kferm2, &alpha_m, x2_f, 1, r_f, 1);
160 //Undo the negation for the BLAS routine
161 float betan_f = cblas_scnrm2(kferm2, r_f,1);
162 //Gotta square it to "undo" the norm
163 betan = betan_f*betan_f;
164#else
165 betan=0;
166#pragma omp parallel for simd aligned(r_f,x2_f:AVX) reduction(+:betan)
167 for(int i=0; i<kferm2; i++){
168 r_f[i]-=alpha*x2_f[i];
169 betan += conj(r_f[i])*r_f[i];
170 }
171#endif
172 //And... reduce.
173#if(nproc>1)
174 Par_dsum(&betan);
175#endif
176#ifdef _DEBUGCG
177#warning "CG Debugging"
178 char *endline = "\n";
179#else
180 char *endline = "\r";
181#endif
182#ifdef _DEBUG
183 if(!rank) printf("Iter(CG)=%i\tβ_n=%e\tα=%e%s", *itercg, betan, creal(alpha),endline);
184#endif
185 if(betan<resid){
186 (*itercg)++;
187#ifdef _DEBUG
188 if(!rank) printf("\nIter(CG)=%i\tResidue: %e\tTolerance: %e\n", *itercg, betan, resid);
189#endif
190 ret_val=0; break;
191 }
192 else if(*itercg==niterc-1){
193 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n", ITERLIM, funcname, *itercg, betan);
194 ret_val=ITERLIM; break;
195 }
196 //Here we evaluate β=(r_{k+1}.r_{k+1})/(r_k.r_k) and then shuffle our indices down the line.
197 //On the first iteration we define beta to be zero.
198 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
199 Complex beta = (*itercg) ? betan/betad : 0;
200 betad=betan; alphan=betan;
201 //BLAS for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
202 //β instead of x.
203#ifdef __NVCC__
204 Complex_f beta_f=(Complex_f)beta;
205 __managed__ Complex_f a = 1.0;
206 cublasCscal(cublas_handle,kferm2,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
207 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
208#elif (defined __INTEL_MKL__)
209 Complex_f a = 1.0;
210 Complex_f beta_f=(Complex_f)beta;
211 //There is cblas_?axpby in the MKL and AMD though, set a = 1 and b = β.
212 //If we get a small enough β_n before hitting the iteration cap we break
213 cblas_caxpby(kferm2, &a, r_f, 1, &beta_f, p_f, 1);
214#elif defined USE_BLAS
215 Complex_f beta_f=(Complex_f)beta;
216 cblas_cscal(kferm2,&beta_f,p_f,1);
217 Complex_f a = 1.0;
218 cblas_caxpy(kferm2,&a,r_f,1,p_f,1);
219#else
220 for(int i=0; i<kferm2; i++)
221 p_f[i]=r_f[i]+beta*p_f[i];
222#endif
223 }
224#ifdef __NVCC__
225 //Restore arrays back to their previous layout
226 cuComplex_convert(X1_f,X1,kferm2,false,dimBlock,dimGrid);
227 cuComplex_convert(r_f,r,kferm2,false,dimBlock,dimGrid);
228#else
229 for(int i=0;i<kferm2;i++){
230 X1[i]=(Complex)X1_f[i];
231 r[i]=(Complex)r_f[i];
232 }
233#endif
234#ifdef __NVCC__
235#ifdef _DEBUG
236 cudaDeviceSynchronise();
237 cudaFree(x1_f);cudaFree(x2_f); cudaFree(p_f);
238 cudaFree(r_f);cudaFree(X1_f);
239#else
240 //streams match the ones that allocated them.
241 cudaFreeAsync(p_f,streams[0]);cudaFreeAsync(x1_f,streams[1]);cudaFreeAsync(x2_f,streams[2]);
242 cudaDeviceSynchronise();
243 cudaFreeAsync(r_f,streams[3]);cudaFreeAsync(X1_f,streams[4]);
244#endif
245#else
246 free(x1_f);free(x2_f); free(p_f); free(r_f); free(X1_f);
247#endif
248 return ret_val;
249}
int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates in single precision.
Definition matrices.c:855
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates in single precision.
Definition matrices.c:712
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188

References AVX, Complex, Complex_f, Hdslash_f(), Hdslashd_f(), kferm2, kferm2Halo, niterc, Par_dsum(), Par_fsum(), and rank.

Here is the call graph for this function:
Here is the caller graph for this function: