// sizes.h (listing extracted from su2hmc/sizes_8h_source.html)

#ifndef  SIZES

#define  SIZES

// Host BLAS backend selection.
// Exactly one CBLAS header is pulled in, chosen by the first of
// __INTEL_MKL__, GSL_BLAS or AMD_BLAS that is defined, and USE_BLAS is
// defined alongside it. When none of the three is defined, nothing is
// included and USE_BLAS is left untouched by this section.
#if defined(__INTEL_MKL__)
#  define USE_BLAS
#  include <mkl.h>
#elif defined(GSL_BLAS)
#  define USE_BLAS
#  include <gsl/gsl_cblas.h>
#elif defined(AMD_BLAS)
#  define USE_BLAS
#  include <cblas.h>
#endif

// CUDA-only section: active when the translation unit is built by nvcc.
#if defined(__NVCC__)
// CUDA and cuBLAS headers.
#  include <cuda.h>
#  include <cuda_runtime_api.h>
#  include <cublas_v2.h>

// cuBLAS handle/status and the CUDA memory pool. These are only declared
// here (extern); the definitions are not part of this header.
extern cublasHandle_t cublas_handle;
extern cublasStatus_t cublas_status;
extern cudaMemPool_t mempool;

// Alternative-spelling alias that forwards to cudaDeviceSynchronize().
#  define cudaDeviceSynchronise() cudaDeviceSynchronize()
#endif

// Complex-number support.
#ifndef __CUDACC__
// Non-CUDA compilers: use the C99 complex types and expose them under the
// names Complex_f (single precision) and Complex (double precision).
#  include <complex.h>
#  define Complex_f float complex
#  define Complex double complex
#else
// CUDA compilers: pull in the thrust-based headers instead.
// NOTE(review): Complex/Complex_f are presumably provided by
// thrust_complex.h in this configuration -- confirm against that header.
#  include <thrust_complex.h>
#  include <thrust/reduce.h>
#  include <thrust/device_vector.h>
#endif


// NOTE(review): FILELEN is presumably the size of file-name buffers --
// confirm against its users.
#define FILELEN 64

// Common block definition for parallel variables.
//
// Global lattice extents. Each extent must be at least 1; a violation is
// reported as a compile-time error.
#define nx 8
#if nx < 1
#  error "nx is expected it to be greater than or equal to 1"
#endif

// Keep original restriction of single spatial extent: ny and nz are tied
// to nx.
#define ny nx
#if ny < 1
#  error "ny is expected it to be greater than or equal to 1"
#endif

#define nz nx
#if nz < 1
#  error "nz is expected it to be greater than or equal to 1"
#endif

// Extent in the fourth (t) direction, set independently of nx.
#define nt 16
#if nt < 1
#  error "nt is expected it to be greater than or equal to 1"
#endif

// Global lattice volume (all four extents) and the three-extent volume
// without nt.
#define gvol (nx * ny * nz * nt)
#define gvol3 (nx * ny * nz)


// Process grid: number of processes along each lattice direction.
// Every count must be at least 1 and must divide the matching lattice
// extent; both conditions are enforced at compile time.
#define npx 1
#if npx < 1
#  error "npx is expected it to be greater than or equal to 1"
#elif (nx % npx) != 0
#  error "npx should be a divisor of nx"
#endif

// Initially restrict to npz = npy = npx.
// This allows us to have a single ksize variable.
#define npy npx
#if npy < 1
#  error "npy is expected it to be greater than or equal to 1"
#elif (ny % npy) != 0
#  error "npy should be a divisor of ny"
#endif

#define npz npx
#if npz < 1
#  error "npz is expected it to be greater than or equal to 1"
#elif (nz % npz) != 0
#  error "npz should be a divisor of nz"
#endif

// Process count along t, set independently of npx.
#define npt 1
#if npt < 1
#  error "npt is expected it to be greater than or equal to 1"
#elif (nt % npt) != 0
#  error "npt should be a divisor of nt"
#endif

// Total number of processes in the grid.
#define nproc (npx * npy * npz * npt)

// NOTE(review): presumably the per-process thread count -- confirm
// against its users.
#define nthreads 16


// Existing parameter definitions.
//
// Per-process sub-lattice extents: each global extent divided by the
// number of processes along that direction.
#define ksizex (nx / npx)
#define ksizey (ny / npy)
#define ksizez (nz / npz)

// Single shorthand for the sub-extent; it aliases the x value.
#define ksize ksizex

#define ksizet (nt / npt)

#define nf 1

// Sub-lattice volume over all four directions, and over x, y, z only.
#define kvol (ksizet * ksizez * ksizey * ksizex)
#define kvol3 (ksizez * ksizey * ksizex)


// Historic settings, kept for reference:
//     integer, parameter :: niterc=2*gvol
//     #define niterc 2*gvol
//
// jis: hard limit to avoid runaway trajectories.
//   - global volume of at most 16384 sites : niterc = gvol
//   - otherwise, nx >= (3*nt)/2            : niterc = gvol3
//   - otherwise                            : niterc = gvol/4
// The comparisons use integer preprocessor arithmetic.
#if (nx * ny * nz * nt) <= 16384
#  define niterc gvol
#elif nx >= ((3 * nt) / 2)
#  define niterc gvol3
#else
#  define niterc (gvol / 4)
#endif

// Constants for dimensions.
// NOTE(review): the names suggest colours, adjoint generators, Dirac
// indices, space-time dimensions and Gor'kov indices respectively --
// confirm against the code that uses them.
#define nc 2
#define nadj 3
#define ndirac 4
#define ndim 4
#define ngorkov 8

// Element counts over one sub-lattice (kvol sites), without halos:
//   kmom   = ndim * nadj    per site
//   kferm  = nc   * ngorkov per site
//   kferm2 = nc   * ndirac  per site
#define kmom (ndim * nadj * kvol)
#define kferm (nc * ngorkov * kvol)
#define kferm2 (nc * ndirac * kvol)

// Halo sizes.
// For each direction the halo is the product of the other three
// sub-lattice extents, but only when that direction is split over more
// than one process; with a single process along a direction it is 0.
#if npx > 1
#  define halox (ksizey * ksizez * ksizet)
#else
#  define halox 0
#endif

#if npy > 1
#  define haloy (ksizex * ksizez * ksizet)
#else
#  define haloy 0
#endif

#if npz > 1
#  define haloz (ksizex * ksizey * ksizet)
#else
#  define haloz 0
#endif

#if npt > 1
#  define halot (ksizex * ksizey * ksizez)
#else
#  define halot 0
#endif

// Total halo: twice the sum of the four per-direction halos.
#define halo (2 * (halox + haloy + haloz + halot))

// Halo-inclusive counterparts of kferm, kferm2 and kmom: the same
// per-site factors applied to (kvol + halo) sites.
#define kfermHalo (nc * ngorkov * (kvol + halo))
#define kferm2Halo (nc * ndirac * (kvol + halo))
#define kmomHalo (ndim * nadj * (kvol + halo))


// Default residues.
//
// These all used to be multiplied by kferm or kferm2 at the start of
// Congradq or Congradp. On 20240516 (Room 2.19 of the Lloyd building of
// Trinity) it was spotted that doing so makes the residue larger when
// running on a smaller number of cores. In the extreme GPU case the
// subvolume is the entire volume, so the residue can be several orders of
// magnitude larger than in the smallest sublattice case.
//
// Instead, all the default residues are rescaled here by sqrt(kferm) or
// sqrt(kferm2). Whatever sublattice size is used now, the residue matches
// that of a 2^3x4 sublattice as used in the earlier FORTRAN runs.
#define respbp 3.2e-5
#define rescgg 2.26e-5
#define rescga 2.26e-8


// SIMD width detection.
//
// AVX is set from the compiler's predefined instruction-set macros:
//   __AVX512F__ defined -> 64
//   __AVX__ defined     -> 32
//   neither             -> 16 (SSE is assumed)
// The values equal the vector register width in bytes (512/256/128 bits).
// NOTE(review): presumably used as an alignment for allocations --
// confirm against the users of AVX.
//
// A build-time note reports which case was taken: #warning on __unix__
// targets, #pragma message on Windows, nothing elsewhere.
//
// The Windows test uses defined() for both WIN32 and _WIN32. Evaluating
// _WIN32 as a bare value trips -Wundef when it is not defined and is a
// syntax error if it is ever defined to nothing.
#if defined(__AVX512F__)
#  if defined(__unix__)
#    warning AVX512 detected
#  elif defined(WIN32) || defined(_WIN32)
#    pragma message("AVX512 detected")
#  endif
#  define AVX 64
#elif defined(__AVX__)
#  if defined(__unix__)
#    warning AVX or AVX2 detected
#  elif defined(WIN32) || defined(_WIN32)
#    pragma message("AVX or AVX2 detected")
#  endif
#  define AVX 32
#else
#  if defined(__unix__)
#    warning No AVX detected, assuming SSE is present
#  elif defined(WIN32) || defined(_WIN32)
#    pragma message("No AVX detected, assuming SSE is present")
#  endif
#  define AVX 16
#endif


// CUDA launch geometry: active when the translation unit is built by nvcc.
#if defined(__NVCC__)
/*
 * @section gridblock Grids and Blocks
 *
 * Threads are grouped into warps of 32 threads, so it is best to keep the
 * block dimension (ksizex*ksizey) a multiple of 32, usually between 128
 * and 256.
 * Note that from Volta/Turing each SM (group of processors) is smaller
 * than on previous generations of GPUs.
 */
// Launch dimensions, declared here and defined elsewhere. The values in
// the trailing notes are the ones recorded in the original header.
extern dim3 dimBlock; // =dim3(nx,ny,nz);
extern dim3 dimGrid;  // =dim3(nt,1,1);

// For copying over gamval
extern dim3 dimBlockOne; // =dim3(nx,ny,nz);
extern dim3 dimGridOne;  // =dim3(nt,1,1);

// nvcc builds always define USE_BLAS.
#  define USE_BLAS
#endif

#endif