su2hmc
Loading...
Searching...
No Matches
su2hmc.c File Reference

An ecclectic collection of functions used in the HMC. More...

#include <assert.h>
#include <coord.h>
#include <matrices.h>
#include <par_mpi.h>
#include <random.h>
#include <string.h>
#include <su2hmc.h>
Include dependency graph for su2hmc.c:

Go to the source code of this file.

Functions

int Init (int istart, int ibound, int iread, float beta, float fmu, float akappa, Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, unsigned int *iu, unsigned int *id)
 Initialises the system.
 
int Hamilton (double *h, double *s, double res2, double *pp, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancgh, int traj)
 Calculate the Hamiltonian.
 
int C_gather (Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
 Extracts all the single precision gauge links in the \(\mu\) direction only.
 
int Z_gather (Complex *x, Complex *y, int n, unsigned int *table, unsigned int mu)
 Extracts all the double precision gauge links in the \(\mu\) direction only.
 
int Fill_Small_Phi (int na, Complex *smallPhi, Complex *Phi)
 
int UpDownPart (const int na, Complex *X0, Complex *R1)
 

Detailed Description

An ecclectic collection of functions used in the HMC.

Definition in file su2hmc.c.

Function Documentation

◆ C_gather()

int C_gather ( Complex_f * x,
Complex_f * y,
int n,
unsigned int * table,
unsigned int mu )
inline

Extracts all the single precision gauge links in the \(\mu\) direction only.

Parameters
xThe output
yThe gauge field for a particular colour
nNumber of sites in the gauge field. This is typically kvol
tableTable containing information on nearest neighbours. Usually id or iu
muDireciton we're interested in extractng
Returns
Zero on success, integer error code otherwise

Definition at line 321 of file su2hmc.c.

322{
323 const char *funcname = "C_gather";
324 //FORTRAN had a second parameter m giving the size of y (kvol+halo) normally
325 //Pointers mean that's not an issue for us so I'm leaving it out
326#ifdef __NVCC__
327 cuC_gather(x,y,n,table,mu,dimBlock,dimGrid);
328#else
329#pragma omp parallel for simd aligned (x,y,table:AVX)
330 for(int i=0; i<n; i++)
331 x[i]=y[table[i*ndim+mu]*ndim+mu];
332#endif
333 return 0;
334}
#define ndim
Dimensions.
Definition sizes.h:179

References ndim.

Here is the caller graph for this function:

◆ Fill_Small_Phi()

int Fill_Small_Phi ( int na,
Complex * smallPhi,
Complex * Phi )
inline

Copies necessary (2*4*kvol) elements of Phi into a vector variable

Parameters
naflavour index
smallPhiThe partitioned output
PhiThe pseudofermion field
Returns
Zero on success, integer error code otherwise

Definition at line 349 of file su2hmc.c.

350{
351 /*Copies necessary (2*4*kvol) elements of Phi into a vector variable
352 *
353 * Globals:
354 * =======
355 * Phi: The source array
356 *
357 * Parameters:
358 * ==========
359 * int na: flavour index
360 * Complex *smallPhi: The target array
361 *
362 * Returns:
363 * =======
364 * Zero on success, integer error code otherwise
365 */
366 const char *funcname = "Fill_Small_Phi";
367 //BIG and small phi index
368#ifdef __NVCC__
369 cuFill_Small_Phi(na,smallPhi,Phi,dimBlock,dimGrid);
370#else
371#pragma omp parallel for simd aligned(smallPhi,Phi:AVX) collapse(3)
372 for(int i = 0; i<kvol;i++)
373 for(int idirac = 0; idirac<ndirac; idirac++)
374 for(int ic= 0; ic<nc; ic++)
375 // PHI_index=i*16+j*2+k;
376 smallPhi[(i*ndirac+idirac)*nc+ic]=Phi[((na*kvol+i)*ngorkov+idirac)*nc+ic];
377#endif
378 return 0;
379}
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define kvol
Sublattice volume.
Definition sizes.h:154
#define ndirac
Dirac indices.
Definition sizes.h:177

References kvol, nc, ndirac, and ngorkov.

Here is the caller graph for this function:

◆ Hamilton()

int Hamilton ( double * h,
double * s,
double res2,
double * pp,
Complex * X0,
Complex * X1,
Complex * Phi,
Complex * u11t,
Complex * u12t,
Complex_f * u11t_f,
Complex_f * u12t_f,
unsigned int * iu,
unsigned int * id,
Complex_f * gamval_f,
int * gamin,
float * dk4m_f,
float * dk4p_f,
Complex_f jqq,
float akappa,
float beta,
double * ancgh,
int traj )

Calculate the Hamiltonian.

Parameters
hHamiltonian
sAction
res2Limit for conjugate gradient
ppMomentum field
X0Up/down partitioned pseudofermion field
X1Holder for the partitioned fermion field, then the conjugate gradient output
PhiPseudofermion field
u11t,u12tGauge fields
u11t_f,u12t_fGauge fields (single precision)
iu,idLattice indices
gamval_fSingle precision gamma matrices rescaled by kappa
gaminGamma indices
dk4m_f\(\left(1+\gamma_0\right)e^{-\mu}\) float
dk4p_f\(\left(1-\gamma_0\right)e^\mu\) float
jqqDiquark source
akappaHopping parameter
betaInverse gauge coupling
ancghConjugate gradient iterations counter
trajCalling trajectory for error reporting
Returns
Zero on success. Integer Error code otherwise.

Definition at line 208 of file su2hmc.c.

211 {
212 /*
213 * @brief Calculate the Hamiltonian
214 *
215 *
216 * @param h: Hamiltonian
217 * @param s: Action
218 * @param res2: Limit for conjugate gradient
219 * @param X0:
220 * @param X1:
221 * @param Phi:
222 * @param u11t,u12t: Gauge fields
223 * @param u11t_f,u12t_f: Gauge fields
224 * @param iu,id: Lattice indices
225 * @param gamval_f: Gamma matrices
226 * @param gamin: Gamma indices
227 * @param dk4m_f: $exp(-\mu)$ float
228 * @param dk4p_f: $exp(\mu)$ float
229 * @param jqq: Diquark source
230 * @param akappa: Hopping parameter
231 * @param beta: Inverse gauge coupling
232 * @param ancgh: Conjugate gradient iterations counter
233 *
234 * @return Zero on success. Integer Error code otherwise.
235 */
236 const char *funcname = "Hamilton";
237 //Iterate over momentum terms.
238#ifdef __NVCC__
239 double hp;
240 int device=-1;
241 cudaGetDevice(&device);
242 cudaMemPrefetchAsync(pp,kmom*sizeof(double),device,NULL);
243 cublasDnrm2(cublas_handle, kmom, pp, 1,&hp);
244 hp*=hp;
245#elif defined USE_BLAS
246 double hp = cblas_dnrm2(kmom, pp, 1);
247 hp*=hp;
248#else
249 double hp=0;
250 for(int i = 0; i<kmom; i++)
251 hp+=pp[i]*pp[i];
252#endif
253 hp*=0.5;
254 double avplaqs, avplaqt;
255 double hg = 0;
256 //avplaq? isn't seen again here.
257 Average_Plaquette(&hg,&avplaqs,&avplaqt,u11t_f,u12t_f,iu,beta);
258
259 double hf = 0; int itercg = 0;
260#ifdef __NVCC__
261 Complex *smallPhi;
262 cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),NULL);
263#else
264 Complex *smallPhi = aligned_alloc(AVX,kferm2*sizeof(Complex));
265#endif
266 //Iterating over flavours
267 for(int na=0;na<nf;na++){
268#ifdef __NVCC__
269#ifdef _DEBUG
270 cudaDeviceSynchronise();
271#endif
272 cudaMemcpyAsync(X1,X0+na*kferm2,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,streams[0]);
273#ifdef _DEBUG
274 cudaDeviceSynchronise();
275#endif
276#else
277 memcpy(X1,X0+na*kferm2,kferm2*sizeof(Complex));
278#endif
279 Fill_Small_Phi(na, smallPhi, Phi);
280 if(Congradq(na,res2,X1,smallPhi,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,&itercg))
281 fprintf(stderr,"Trajectory %d\n", traj);
282
283 *ancgh+=itercg;
284#ifdef __NVCC__
285 cudaMemcpyAsync(X0+na*kferm2,X1,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,streams[0]);
286#else
287 memcpy(X0+na*kferm2,X1,kferm2*sizeof(Complex));
288#endif
289 Fill_Small_Phi(na, smallPhi,Phi);
290#ifdef __NVCC__
291 Complex dot;
292 cublasZdotc(cublas_handle,kferm2,(cuDoubleComplex *)smallPhi,1,(cuDoubleComplex *) X1,1,(cuDoubleComplex *) &dot);
293 hf+=creal(dot);
294#elif defined USE_BLAS
295 Complex dot;
296 cblas_zdotc_sub(kferm2, smallPhi, 1, X1, 1, &dot);
297 hf+=creal(dot);
298#else
299 //It is a dot product of the flattened arrays, could use
300 //a module to convert index to coordinate array...
301 for(int j=0;j<kferm2;j++)
302 hf+=creal(conj(smallPhi[j])*X1[j]);
303#endif
304 }
305#ifdef __NVCC__
306 cudaFreeAsync(smallPhi,NULL);
307#else
308 free(smallPhi);
309#endif
310 //hg was summed over inside of Average_Plaquette.
311#if(nproc>1)
312 Par_dsum(&hp); Par_dsum(&hf);
313#endif
314 *s=hg+hf; *h=(*s)+hp;
315#ifdef _DEBUG
316 if(!rank)
317 printf("hg=%.5e; hf=%.5e; hp=%.5e; h=%.5e\n", hg, hf, hp, *h);
318#endif
319 return 0;
320}
int rank
The MPI rank.
Definition par_mpi.c:22
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define kmom
sublattice momentum sites
Definition sizes.h:184
#define Complex
Double precision complex number.
Definition sizes.h:58
#define nf
Fermion flavours (double it)
Definition sizes.h:151
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
Definition su2hmc.c:349
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves Implements up/down pa...
Definition congrad.c:7
int Average_Plaquette(double *hg, double *avplaqs, double *avplaqt, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, float beta)
Calculates the gauge action using new (how new?) lookup table.
Definition bosonic.c:8

References Average_Plaquette(), AVX, Complex, Congradq(), Fill_Small_Phi(), kferm2, kmom, nf, Par_dsum(), and rank.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Init()

int Init ( int istart,
int ibound,
int iread,
float beta,
float fmu,
float akappa,
Complex_f ajq,
Complex * u11,
Complex * u12,
Complex * u11t,
Complex * u12t,
Complex_f * u11t_f,
Complex_f * u12t_f,
Complex * gamval,
Complex_f * gamval_f,
int * gamin,
double * dk4m,
double * dk4p,
float * dk4m_f,
float * dk4p_f,
unsigned int * iu,
unsigned int * id )

Initialises the system.

Parameters
istartZero for cold, >1 for hot, <1 for none
iboundPeriodic boundary conditions
ireadRead configuration from file
betaInverse gauge coupling
fmuChemical potential
akappaHopping parameter
ajqDiquark source
u11,u12Gauge fields
u11t,u12tTrial gauge field
u11t_f,u12t_fTrial gauge field (single precision)
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)^\mu\)
dk4m_f\(\left(1+\gamma_0\right)e^{-\mu}\) float
dk4p_f\(\left(1-\gamma_0\right)e^\mu\) float
iu,idUp halo indices
gaminGamma matrix indices
gamvalDouble precision gamma matrices rescaled by kappa
gamval_fSingle precision gamma matrices rescaled by kappa
Returns
Zero on success, integer error code otherwise

Definition at line 19 of file su2hmc.c.

22 {
23 /*
24 * Initialises the system
25 *
26 * Calls:
27 * ======
28 * Addrc, Check_addr, ran2, DHalo_swap_dir, Par_sread, Par_ranset, Reunitarise
29 *
30 * Globals:
31 * =======
32 * Complex gamval: Gamma Matrices
33 * Complex_f gamval_f: Float Gamma matrices:
34 *
35 * Parameters:
36 * ==========
37 * int istart: Zero for cold, >1 for hot, <1 for none
38 * int ibound: Periodic boundary conditions
39 * int iread: Read configuration from file
40 * float beta: beta
41 * float fmu: Chemical potential
42 * float akappa: Hopping parameter
43 * Complex_f ajq: Diquark source
44 * Complex *u11: First colour field
45 * Complex *u12: Second colour field
46 * Complex *u11t: First colour trial field
47 * Complex *u12t: Second colour trial field
48 * Complex_f *u11t_f: First float trial field
49 * Complex_f *u12t_f: Second float trial field
50 * double *dk4m $exp(-\mu)$
51 * double *dk4p: $exp(\mu)$
52 * float *dk4m_f: $exp(-\mu)$ float
53 * float *dk4p_f: $exp(\mu)$ float
54 * unsigned int *iu: Up halo indices
55 * unsigned int *id: Down halo indices
56 *
57 * Returns:
58 * =======
59 * Zero on success, integer error code otherwise
60 */
61 const char *funcname = "Init";
62
63#ifdef _OPENMP
64 omp_set_num_threads(nthreads);
65#ifdef __INTEL_MKL__
66 mkl_set_num_threads(nthreads);
67#endif
68#endif
69 //First things first, calculate a few constants for coordinates
70 Addrc(iu, id);
71 //And confirm they're legit
74#ifdef _DEBUG
75 printf("Checked addresses\n");
76#endif
77 double chem1=exp(fmu); double chem2 = 1/chem1;
78 //CUDA this. Only limit will be the bus speed
79#pragma omp parallel for simd aligned(dk4m,dk4p:AVX)
80 for(int i = 0; i<kvol; i++){
81 dk4p[i]=akappa*chem1;
82 dk4m[i]=akappa*chem2;
83 }
84 //Anti periodic Boundary Conditions. Flip the terms at the edge of the time
85 //direction
86 if(ibound == -1 && pcoord[3+ndim*rank]==npt-1){
87#ifdef _DEBUG
88 printf("Implementing antiperiodic boundary conditions on rank %i\n", rank);
89#endif
90#pragma omp parallel for simd aligned(dk4m,dk4p:AVX)
91 for(int k= kvol-1; k>=kvol-kvol3; k--){
92 //int k = kvol - kvol3 + i;
93 dk4p[k]*=-1;
94 dk4m[k]*=-1;
95 }
96 }
97 //These are constant so swap the halos when initialising and be done with it
98 //May need to add a synchronisation statement here first
99#if(npt>1)
100 DHalo_swap_dir(dk4p, 1, 3, UP);
101 DHalo_swap_dir(dk4m, 1, 3, UP);
102#endif
103 //Float versions
104#ifdef __NVCC__
105 cuReal_convert(dk4p_f,dk4p,kvol+halo,true,dimBlock,dimGrid);
106 cuReal_convert(dk4m_f,dk4m,kvol+halo,true,dimBlock,dimGrid);
107#else
108#pragma omp parallel for simd aligned(dk4m,dk4p,dk4m_f,dk4p_f:AVX)
109 for(int i=0;i<kvol+halo;i++){
110 dk4p_f[i]=(float)dk4p[i];
111 dk4m_f[i]=(float)dk4m[i];
112 }
113#endif
114 int __attribute__((aligned(AVX))) gamin_t[4][4] = {{3,2,1,0},{3,2,1,0},{2,3,0,1},{2,3,0,1}};
115 //Gamma Matrices in Chiral Representation
116 //Gattringer and Lang have a nice crash course in appendix A.2 of
117 //Quantum Chromodynamics on the Lattice (530.14 GAT)
118 //_t is for temp. We copy these into the real gamvals later
119#ifdef __NVCC__
120 cudaMemcpy(gamin,gamin_t,4*4*sizeof(int),cudaMemcpyDefault);
121#else
122 memcpy(gamin,gamin_t,4*4*sizeof(int));
123#endif
124 Complex __attribute__((aligned(AVX))) gamval_t[5][4] = {{-I,-I,I,I},{-1,1,1,-1},{-I,I,I,-I},{1,1,1,1},{1,1,-1,-1}};
125 //Each gamma matrix is rescaled by akappa by flattening the gamval array
126#if defined USE_BLAS
127 //Don't cuBLAS this. It is small and won't saturate the GPU. Let the CPU handle
128 //it and just copy it later
129 cblas_zdscal(5*4, akappa, gamval_t, 1);
130#else
131#pragma omp parallel for simd collapse(2) aligned(gamval,gamval_f:AVX)
132 for(int i=0;i<5;i++)
133 for(int j=0;j<4;j++)
134 gamval_t[i][j]*=akappa;
135#endif
136
137#ifdef __NVCC__
138 cudaMemcpy(gamval,gamval_t,5*4*sizeof(Complex),cudaMemcpyDefault);
139 cuComplex_convert(gamval_f,gamval,20,true,dimBlockOne,dimGridOne);
140#else
141 memcpy(gamval,gamval_t,5*4*sizeof(Complex));
142 for(int i=0;i<5*4;i++)
143 gamval_f[i]=(Complex_f)gamval[i];
144#endif
145 if(iread){
146 if(!rank) printf("Calling Par_sread() for configuration: %i\n", iread);
147 Par_sread(iread, beta, fmu, akappa, ajq,u11,u12,u11t,u12t);
148 Par_ranset(&seed,iread);
149 }
150 else{
151 Par_ranset(&seed,iread);
152 if(istart==0){
153 //Initialise a cold start to zero
154 //memset is safe to use here because zero is zero
155#pragma omp parallel for simd aligned(u11t:AVX)
156 //Leave it to the GPU?
157 for(int i=0; i<kvol*ndim;i++){
158 u11t[i]=1; u12t[i]=0;
159 }
160 }
161 else if(istart>0){
162 //Ideally, we can use gsl_ranlux as the PRNG
163#ifdef __RANLUX__
164 for(int i=0; i<kvol*ndim;i++){
165 u11t[i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
166 u12t[i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
167 }
168 //If not, the Intel Vectorise Mersenne Twister
169#elif (defined __INTEL_MKL__&&!defined USE_RAN2)
170 //Good news, casting works for using a double to create random complex numbers
171 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*ndim*kvol, u11t, -1, 1);
172 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*ndim*kvol, u12t, -1, 1);
173 //Last resort, Numerical Recipes' Ran2
174#else
175 for(int i=0; i<kvol*ndim;i++){
176 u11t[i]=2*(ran2(&seed)-0.5+I*(ran2(&seed)-0.5));
177 u12t[i]=2*(ran2(&seed)-0.5+I*(ran2(&seed)-0.5));
178 }
179#endif
180 }
181 else
182 fprintf(stderr,"Warning %i in %s: Gauge fields are not initialised.\n", NOINIT, funcname);
183
184#ifdef __NVCC__
185 int device=-1;
186 cudaGetDevice(&device);
187 cudaMemPrefetchAsync(u11t, ndim*kvol*sizeof(Complex),device,streams[0]);
188 cudaMemPrefetchAsync(u12t, ndim*kvol*sizeof(Complex),device,streams[1]);
189#endif
190 //Send trials to accelerator for reunitarisation
191 Reunitarise(u11t,u12t);
192 //Get trials back
193#ifdef __NVCC__
194 cudaMemcpyAsync(u11,u11t,ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[0]);
195 cudaMemPrefetchAsync(u11, ndim*kvol*sizeof(Complex),device,streams[0]);
196 cudaMemcpyAsync(u12,u12t,ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[1]);
197 cudaMemPrefetchAsync(u12, ndim*kvol*sizeof(Complex),device,streams[1]);
198#else
199 memcpy(u11, u11t, ndim*kvol*sizeof(Complex));
200 memcpy(u12, u12t, ndim*kvol*sizeof(Complex));
201#endif
202 }
203#ifdef _DEBUG
204 printf("Initialisation Complete\n");
205#endif
206 return 0;
207}
int Check_addr(unsigned int *table, int lns, int lnt, int imin, int imax)
Definition coord.c:334
int Addrc(unsigned int *iu, unsigned int *id)
Loads the addresses required during the update.
Definition coord.c:16
#define UP
Flag for send up.
Definition par_mpi.h:37
int Par_sread(const int iread, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t)
Reads and assigns the gauges from file.
Definition par_mpi.c:127
int DHalo_swap_dir(double *d, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int * pcoord
The processor grid.
Definition par_mpi.c:19
int Par_ranset(long *seed, int iread)
Uses the rank to get a new seed. Copying from the FORTRAN description here c create new seeds in rang...
Definition random.c:135
double ran2(long *idum)
Generates uniformly distributed random double between zero and one as described in numerical recipes....
Definition random.c:440
long seed
RAN2 seed.
Definition random.c:27
#define ksizet
Sublattice t extent.
Definition sizes.h:149
#define nthreads
Number of threads for OpenMP, which can be overwritten at runtime.
Definition sizes.h:135
#define ksize
Sublattice spatial extent for a cubic lattice.
Definition sizes.h:146
#define kvol3
Sublattice spatial volume.
Definition sizes.h:156
#define halo
Total Halo size.
Definition sizes.h:222
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define npt
Processor grid t extent.
Definition sizes.h:124
int Reunitarise(Complex *u11t, Complex *u12t)
Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
Definition matrices.c:904

References Addrc(), AVX, Check_addr(), Complex, Complex_f, DHalo_swap_dir(), halo, ksize, ksizet, kvol, kvol3, ndim, npt, nthreads, Par_ranset(), Par_sread(), pcoord, ran2(), rank, Reunitarise(), seed, and UP.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ UpDownPart()

int UpDownPart ( const int na,
Complex * X0,
Complex * R1 )
inline

Definition at line 380 of file su2hmc.c.

380 {
381#ifdef __NVCC__
382 cuUpDownPart(na,X0,R1,dimBlock,dimGrid);
383 cudaDeviceSynchronise();
384#else
385 //The only reason this was removed from the original function is for diagnostics
386#pragma omp parallel for simd collapse(2) aligned(X0,R1:AVX)
387 for(int i=0; i<kvol; i++)
388 for(int idirac = 0; idirac < ndirac; idirac++){
389 X0[((na*kvol+i)*ndirac+idirac)*nc]=R1[(i*ngorkov+idirac)*nc];
390 X0[((na*kvol+i)*ndirac+idirac)*nc+1]=R1[(i*ngorkov+idirac)*nc+1];
391 }
392#endif
393 return 0;
394}

◆ Z_gather()

int Z_gather ( Complex * x,
Complex * y,
int n,
unsigned int * table,
unsigned int mu )
inline

Extracts all the double precision gauge links in the \(\mu\) direction only.

Parameters
xThe output
yThe gauge field for a particular colour
nNumber of sites in the gauge field. This is typically kvol
tableTable containing information on nearest neighbours. Usually id or iu
muDireciton we're interested in extractng
Returns
Zero on success, integer error code otherwise

Definition at line 335 of file su2hmc.c.

336{
337 const char *funcname = "Z_gather";
338 //FORTRAN had a second parameter m giving the size of y (kvol+halo) normally
339 //Pointers mean that's not an issue for us so I'm leaving it out
340#ifdef __NVCC__
341 cuZ_gather(x,y,n,table,mu,dimBlock,dimGrid);
342#else
343#pragma omp parallel for simd aligned (x,y,table:AVX)
344 for(int i=0; i<n; i++)
345 x[i]=y[table[i*ndim+mu]*ndim+mu];
346#endif
347 return 0;
348}

References ndim.