su2hmc
su2hmc.h File Reference

Function declarations for most of the routines. More...

#include <errorcodes.h>
#include <integrate.h>
#include <sizes.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

Functions

int Force (double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancg)
 Calculates the force \(\frac{dS}{d\pi}\) at each intermediate time.
 
int Gauge_force (double *dSdpi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, float beta)
 Calculates the gauge force due to the Wilson Action at each intermediate time.
 
int Init (int istart, int ibound, int iread, float beta, float fmu, float akappa, Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, unsigned int *iu, unsigned int *id)
 Initialises the system.
 
int Hamilton (double *h, double *s, double res2, double *pp, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancgh, int traj)
 Calculate the Hamiltonian.
 
int Congradq (int na, double res, Complex *X1, Complex *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, int *itercg)
 Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \(M^\dagger Mx=\Phi\). Implements up/down partitioning. The matrix multiplication step is done at single precision, while the update is done at double.
 
int Congradp (int na, double res, Complex *Phi, Complex *xi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
 Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \(M^\dagger Mx=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double.
 
int Measure (double *pbp, double *endenf, double *denf, Complex *qq, Complex *qbqb, double res, int *itercg, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, Complex *Phi, Complex *R1)
 Calculate fermion expectation values via a noisy estimator.
 
int Average_Plaquette (double *hg, double *avplaqs, double *avplaqt, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, float beta)
 Calculates the gauge action using the new (how new?) lookup table.
 
float SU2plaq (Complex_f *u11t, Complex_f *u12t, unsigned int *iu, int i, int mu, int nu)
 Calculates the plaquette at site i in the \(\mu\)-\(\nu\) direction.
 
double Polyakov (Complex_f *u11t, Complex_f *u12t)
 Calculate the Polyakov loop (no prizes for guessing that one...)
 
int C_gather (Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
 Extracts all the single precision gauge links in the \(\mu\) direction only.
 
int Z_gather (Complex *x, Complex *y, int n, unsigned int *table, unsigned int mu)
 Extracts all the double precision gauge links in the \(\mu\) direction only.
 
int Fill_Small_Phi (int na, Complex *smallPhi, Complex *Phi)
 Copies necessary (2*4*kvol) elements of Phi into a vector variable.
 
int UpDownPart (const int na, Complex *X0, Complex *R1)
 
int Reunitarise (Complex *u11t, Complex *u12t)
 Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
 

Detailed Description

Function declarations for most of the routines.

Definition in file su2hmc.h.

Function Documentation

◆ Average_Plaquette()

int Average_Plaquette ( double * hg,
double * avplaqs,
double * avplaqt,
Complex_f * u11t,
Complex_f * u12t,
unsigned int * iu,
float beta )

Calculates the gauge action using the new (how new?) lookup table.

Follows a routine called qedplaq in some QED3 code

Parameters
hg: Gauge component of the Hamiltonian
avplaqs: Average spatial plaquette
avplaqt: Average temporal plaquette
u11t,u12t: The trial fields
iu: Upper halo indices
beta: Inverse gauge coupling
See also
Par_dsum
Returns
Zero on success, integer error code otherwise

Definition at line 8 of file bosonic.c.

8 {
9 /*
10 * Calculates the gauge action using new (how new?) lookup table
11 * Follows a routine called qedplaq in some QED3 code
12 *
13 * Parameters:
14 * =========
15 * hg Gauge component of Hamilton
16 * avplaqs Average spatial Plaquette
17 * avplaqt Average Temporal Plaquette
18 * u11t,u12t The trial fields
19 * iu Upper halo indices
20 * beta Inverse gauge coupling
21 *
22 * Calls:
23 * =====
24 * Par_dsum
25 *
26 * Return:
27 * ======
28 * Zero on success, integer error code otherwise
29 */
30 const char *funcname = "Average_Plaquette";
31 /*There was a halo exchange here but moved it outside
32 The FORTRAN code used several consecutive loops to get the plaquette
33 Instead we'll just make the arrays variables and do everything in one loop
34 Should work since in the FORTRAN Sigma11[i] only depends on i components for example
35 Since the ν loop doesn't get called for μ=0 we'll start at μ=1
36 */
37#ifdef __NVCC__
38 __managed__ double hgs = 0; __managed__ double hgt = 0;
39 cuAverage_Plaquette(&hgs, &hgt, u11t, u12t, iu,dimGrid,dimBlock);
40#else
41 double hgs = 0; double hgt = 0;
42 for(int mu=1;mu<ndim;mu++)
43 for(int nu=0;nu<mu;nu++)
44 //Don't merge into a single loop. Makes vectorisation easier?
45 //Or merge into a single loop and dispense with the a arrays?
46#pragma omp parallel for simd aligned(u11t,u12t,iu:AVX) reduction(+:hgs,hgt)
47 for(int i=0;i<kvol;i++){
48 //Save us from typing iu[mu+ndim*i] everywhere
49 switch(mu){
50 //Time component
51 case(ndim-1): hgt -= SU2plaq(u11t,u12t,iu,i,mu,nu);
52 break;
53 //Space component
54 default: hgs -= SU2plaq(u11t,u12t,iu,i,mu,nu);
55 break;
56 }
57 }
58#endif
59#if(nproc>1)
60 Par_dsum(&hgs); Par_dsum(&hgt);
61#endif
62 *avplaqs=-hgs/(3.0*gvol); *avplaqt=-hgt/(gvol*3.0);
63 *hg=(hgs+hgt)*beta;
64#ifdef _DEBUG
65 if(!rank)
66 printf("hgs=%e hgt=%e hg=%e\n", hgs, hgt, *hg);
67#endif
68 return 0;
69}
float SU2plaq(Complex_f *u11t, Complex_f *u12t, unsigned int *iu, int i, int mu, int nu)
Calculates the plaquette at site i in the \(\mu\)-\(\nu\) direction.
Definition bosonic.c:72
int rank
The MPI rank.
Definition par_mpi.c:22
int Par_dsum(double *dval)
Performs a reduction on a double dval to get a sum which is then distributed to all ranks.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define kvol
Sublattice volume.
Definition sizes.h:154
#define gvol
Lattice volume.
Definition sizes.h:92
#define ndim
Dimensions.
Definition sizes.h:179

References AVX, gvol, kvol, ndim, Par_dsum(), rank, and SU2plaq().
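
The whole code stores an SU(2) matrix as the two complex numbers of its first row, \(U=\begin{pmatrix}u_{11}&u_{12}\\-u_{12}^*&u_{11}^*\end{pmatrix}\); the second row is fixed by unitarity. As a minimal sketch (illustrative names, not library code), the product of two such matrices stays in this form, which is what SU2plaq relies on when multiplying the four links of a plaquette:

#include <complex.h>

typedef float complex Cf; /* stand-in for Complex_f */

/* First row of C = A*B for SU(2) matrices stored as (a11,a12) and (b11,b12).
 * The second row never needs to be stored or computed. */
static void su2_mul(Cf a11, Cf a12, Cf b11, Cf b12, Cf *c11, Cf *c12){
	*c11 = a11*b11 - a12*conjf(b12);
	*c12 = a11*b12 + a12*conjf(b11);
}
/* For a plaquette P assembled this way, Re tr P = 2*crealf(p11). */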


◆ C_gather()

int C_gather ( Complex_f * x,
Complex_f * y,
int n,
unsigned int * table,
unsigned int mu )
inline

Extracts all the single precision gauge links in the \(\mu\) direction only.

Parameters
x: The output
y: The gauge field for a particular colour
n: Number of sites in the gauge field. This is typically kvol
table: Table containing information on nearest neighbours. Usually id or iu
mu: Direction we're interested in extracting
Returns
Zero on success, integer error code otherwise

Definition at line 321 of file su2hmc.c.

322{
323 const char *funcname = "C_gather";
324 //FORTRAN had a second parameter m giving the size of y (kvol+halo) normally
325 //Pointers mean that's not an issue for us so I'm leaving it out
326#ifdef __NVCC__
327 cuC_gather(x,y,n,table,mu,dimBlock,dimGrid);
328#else
329#pragma omp parallel for simd aligned (x,y,table:AVX)
330 for(int i=0; i<n; i++)
331 x[i]=y[table[i*ndim+mu]*ndim+mu];
332#endif
333 return 0;
334}

References ndim.
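
As a usage sketch (assuming the flattened link layout u11t[i*ndim+mu] used throughout, and id as in par_mpi), gathering the first-colour links from one site down in the time direction looks like:

Complex_f *u11sh = aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
C_gather(u11sh, u11t, kvol, id, 3);	//u11sh[i] = u11t[id[i*ndim+3]*ndim+3]

This is exactly how Gauge_force prepares u11sh and u12sh before the halo exchange.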


◆ Congradp()

int Congradp ( int na,
double res,
Complex * Phi,
Complex * xi,
Complex_f * u11t,
Complex_f * u12t,
unsigned int * iu,
unsigned int * id,
Complex_f * gamval,
int * gamin,
float * dk4m,
float * dk4p,
Complex_f jqq,
float akappa,
int * itercg )

Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves \(M^\dagger Mx=\Phi\). The matrix multiplication step is done at single precision, while the update is done at double.

Parameters
na: Flavour index
res: Limit for conjugate gradient
Phi: Pseudofermion field
xi: Returned as \((M^\dagger M)^{-1} \Phi\)
u11t: First colour's trial field
u12t: Second colour's trial field
iu: Upper halo indices
id: Lower halo indices
gamval: Single precision gamma matrices rescaled by kappa
gamin: Dirac indices
dk4m: \(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p: \(\left(1-\gamma_0\right)e^\mu\)
jqq: Diquark source
akappa: Hopping Parameter
itercg: Counts the iterations of the conjugate gradient
Returns
0 on success, integer error code otherwise

Definition at line 262 of file congrad.c.

263 {
264 /*
265 * @brief Matrix Inversion via Conjugate Gradient
266 * Solves @f$(M^\dagger)Mx=\Phi@f$
267 * No even/odd partitioning.
268 * The matrix multiplication step is done at single precision, while the update is done at double
269 *
270 * @param na: Flavour index
271 * @param res: Limit for conjugate gradient
272 * @param Phi: @f$\Phi@f$ initially,
273 * @param xi: Returned as @f$(M^\dagger M)^{-1} \Phi@f$
274 * @param u11t: First colour's trial field
275 * @param u12t: Second colour's trial field
276 * @param iu: Upper halo indices
277 * @param id: Lower halo indices
278 * @param gamval: Gamma matrices
279 * @param gamin: Dirac indices
280 * @param dk4m: @f$\left(1+\gamma_0\right)e^{-\mu}@f$
281 * @param dk4p: @f$\left(1-\gamma_0\right)e^\mu@f$
282 * @param jqq: Diquark source
283 * @param akappa: Hopping Parameter
284 * @param itercg: Counts the iterations of the conjugate gradient
285 *
286 * @return 0 on success, integer error code otherwise
287 */
288 const char *funcname = "Congradp";
289 //Return value
290 int ret_val=0;
291 const double resid = res*res;
292 //These were evaluated only in the first loop of niterx so we'll just do it outside of the loop.
293 //These alpha and beta terms should be double, but that causes issues with BLAS. Instead we declare
294 //them Complex and work with the real part (especially for α_d)
295 //Give initial values Will be overwritten if niterx>0
296#ifdef __NVCC__
297 Complex_f *p_f, *r_f, *xi_f, *x1_f, *x2_f;
298 int device; cudaGetDevice(&device);
299#ifdef _DEBUG
300 cudaMallocManaged((void **)&p_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
301 cudaMallocManaged((void **)&r_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
302 cudaMallocManaged((void **)&x1_f, kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
303 cudaMallocManaged((void **)&x2_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
304 cudaMallocManaged((void **)&xi_f, kferm*sizeof(Complex_f),cudaMemAttachGlobal);
305#else
306 cudaMalloc((void **)&p_f, kfermHalo*sizeof(Complex_f));
307 cudaMalloc((void **)&r_f, kferm*sizeof(Complex_f));
308 cudaMalloc((void **)&x1_f, kfermHalo*sizeof(Complex_f));
309 cudaMalloc((void **)&x2_f, kferm*sizeof(Complex_f));
310 cudaMalloc((void **)&xi_f, kferm*sizeof(Complex_f));
311#endif
312#else
313 Complex_f *p_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
314 Complex_f *r_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
315 Complex_f *x1_f = aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
316 Complex_f *x2_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
317 Complex_f *xi_f = aligned_alloc(AVX,kferm*sizeof(Complex_f));
318#endif
319 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
320 double alphan=0.0;
321 //Instead of copying element-wise in a loop, use memcpy.
322#ifdef __NVCC__
323 //Get xi in single precision, then swap to AoS format
324 cuComplex_convert(p_f,xi,kferm,true,dimBlock,dimGrid);
325 Transpose_c(p_f,ngorkov*nc,kvol,dimGrid,dimBlock);
326 cudaMemcpy(xi_f,p_f,kferm*sizeof(Complex_f),cudaMemcpyDefault);
327
328 //And repeat for r
329 cuComplex_convert(r_f,Phi+na*kferm,kferm,true,dimBlock,dimGrid);
330 Transpose_c(r_f,ngorkov*nc,kvol,dimGrid,dimBlock);
331
332 //Flip all the gauge fields around so memory is coalesced
333 Transpose_c(u11t,ndim,kvol,dimGrid,dimBlock);
334 Transpose_c(u12t,ndim,kvol,dimGrid,dimBlock);
335#else
336#pragma omp parallel for simd aligned(p_f,xi_f,xi,r_f,Phi:AVX)
337 for(int i =0;i<kferm;i++){
338 p_f[i]=xi_f[i]=(Complex_f)xi[i];
339 r_f[i]=(Complex_f)Phi[na*kferm+i];
340 }
341#endif
342
343 // Declaring placeholder arrays
344 // This x1 is NOT related to the /common/vectorp/X1 in the FORTRAN code and should not
345 // be confused with X1 the global variable
346
347 //niterx isn't called as an index but we'll start from zero with the C code to make the
348 //if statements quicker to type
349 double betan;
350#ifdef __NVCC__
351 cudaDeviceSynchronise();
352#endif
353 for((*itercg)=0; (*itercg)<=niterc; (*itercg)++){
354 //Don't overwrite on first run.
355 //x2=(M^†)x1=(M^†)Mp
356 Dslash_f(x1_f,p_f,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa);
357 Dslashd_f(x2_f,x1_f,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa);
358#ifdef __NVCC__
359 cudaDeviceSynchronise();
360#endif
361 //We can't evaluate α on the first niterx because we need to get β_n.
362 if(*itercg){
363 //x*.x
364#ifdef USE_BLAS
365 float alphad_f;
366#ifdef __NVCC__
367 cublasScnrm2(cublas_handle,kferm,(cuComplex*) x1_f, 1,(float *)&alphad_f);
368 alphad = alphad_f*alphad_f;
369#else
370 alphad_f = cblas_scnrm2(kferm, x1_f, 1);
371#endif
372 alphad = alphad_f*alphad_f;
373#else
374 alphad=0;
375 for(int i = 0; i<kferm; i++)
376 alphad+=conj(x1_f[i])*x1_f[i];
377#endif
378#if(nproc>1)
379 Par_fsum((float *)&alphad);
380#endif
381 //α=(r.r)/p(M^†)Mp
382 alpha=alphan/creal(alphad);
383 // Complex_f alpha_f = (Complex_f)alpha;
384 //x+αp
385#ifdef USE_BLAS
386 Complex_f alpha_f=(float)alpha;
387#ifdef __NVCC__
388 cublasCaxpy(cublas_handle,kferm,(cuComplex*) &alpha_f,(cuComplex*) p_f,1,(cuComplex*) xi_f,1);
389#else
390 cblas_caxpy(kferm, (Complex_f*)&alpha_f,(Complex_f*)p_f, 1, (Complex_f*)xi_f, 1);
391#endif
392#else
393#pragma omp parallel for simd aligned(xi_f,p_f:AVX)
394 for(int i = 0; i<kferm; i++)
395 xi_f[i]+=alpha*p_f[i];
396#endif
397 }
398
399 //r=α(M^†)Mp and β_n=r*.r
400#if defined USE_BLAS
401 Complex_f alpha_m=(Complex_f)(-alpha);
402 float betan_f=0;
403#ifdef __NVCC__
404 cublasCaxpy(cublas_handle,kferm, (cuComplex *)&alpha_m,(cuComplex *) x2_f, 1,(cuComplex *) r_f, 1);
405 //cudaDeviceSynchronise();
406 //r*.r
407 cublasScnrm2(cublas_handle,kferm,(cuComplex *) r_f,1,(float *)&betan_f);
408#else
409 cblas_caxpy(kferm,(Complex_f*) &alpha_m,(Complex_f*) x2_f, 1,(Complex_f*) r_f, 1);
410 //r*.r
411 betan_f = cblas_scnrm2(kferm, (Complex_f*)r_f,1);
412#endif
413 //Gotta square it to "undo" the norm
414 betan=betan_f*betan_f;
415#else
416 //Just like Congradq, this loop could be unrolled but will need a reduction to deal with the betan
417 //addition.
418 betan = 0;
419 //If we get a small enough β_n before hitting the iteration cap we break
420#pragma omp parallel for simd aligned(x2_f,r_f:AVX) reduction(+:betan)
421 for(int i = 0; i<kferm;i++){
422 r_f[i]-=alpha*x2_f[i];
423 betan+=conj(r_f[i])*r_f[i];
424 }
425#endif
426 //This is basically just congradq at the end. Check there for comments
427#if(nproc>1)
428 Par_dsum(&betan);
429#endif
430#ifdef _DEBUG
431#ifdef _DEBUGCG
432 char *endline = "\n";
433#else
434 char *endline = "\r";
435#endif
436 if(!rank) printf("Iter (CG) = %i β_n= %e α= %e%s", *itercg, betan, creal(alpha), endline);
437#endif
438 if(betan<resid){
439 //Started counting from zero so add one to make it accurate
440 (*itercg)++;
441#ifdef _DEBUG
442 if(!rank) printf("\nIter (CG) = %i resid = %e toler = %e\n", *itercg, betan, resid);
443#endif
444 ret_val=0; break;
445 }
446 else if(*itercg==niterc-1){
447 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n",
448 ITERLIM, funcname, niterc, betan);
449 ret_val=ITERLIM; break;
450 }
451 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
452 Complex beta = (*itercg) ? betan/betad : 0;
453 betad=betan; alphan=betan;
454 //BLAS for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
455 //β instead of x.
456 //There is cblas_zaxpby in the MKL though, set a = 1 and b = β.
457#ifdef USE_BLAS
458 Complex_f beta_f = (Complex_f)beta;
459 Complex_f a = 1.0;
460#ifdef __NVCC__
461 cublasCscal(cublas_handle,kferm,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
462 cublasCaxpy(cublas_handle,kferm,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
463 cudaDeviceSynchronise();
464#elif (defined __INTEL_MKL__ || defined AMD_BLAS)
465 cblas_caxpby(kferm, &a, r_f, 1, &beta_f, p_f, 1);
466#else
467 cblas_cscal(kferm,&beta_f,p_f,1);
468 cblas_caxpy(kferm,&a,r_f,1,p_f,1);
469#endif
470#else
471#pragma omp parallel for simd aligned(r_f,p_f:AVX)
472 for(int i=0; i<kferm; i++)
473 p_f[i]=r_f[i]+beta*p_f[i];
474#endif
475 }
476#ifdef __NVCC__
477 Transpose_c(xi_f,kvol,ngorkov*nc,dimGrid,dimBlock);
478 Transpose_c(r_f,kvol,ngorkov*nc,dimGrid,dimBlock);
479
480 Transpose_c(u11t,kvol,ndim,dimGrid,dimBlock);
481 Transpose_c(u12t,kvol,ndim,dimGrid,dimBlock);
482 cudaDeviceSynchronise();
483 cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
484#else
485#pragma omp simd
486 for(int i = 0; i <kferm;i++){
487 xi[i]=(Complex)xi_f[i];
488 }
489#endif
490#ifdef __NVCC__
491 cudaFree(p_f); cudaFree(r_f);cudaFree(x1_f); cudaFree(x2_f); cudaFree(xi_f);
492#else
493 free(p_f); free(r_f); free(x1_f); free(x2_f); free(xi_f);
494#endif
495 return ret_val;
496}
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates \(\Phi=M r\) in single precision.
Definition matrices.c:462
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates \(\Phi=M^\dagger r\) in single precision.
Definition matrices.c:584
int Par_fsum(float *dval)
Performs a reduction on a float dval to get a sum which is then distributed to all ranks.
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define niterc
Hard limit for runaway trajectories in Conjugate gradient.
Definition sizes.h:163
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225

References AVX, Complex, Complex_f, Dslash_f(), Dslashd_f(), kferm, kfermHalo, kvol, nc, ndim, ngorkov, niterc, Par_dsum(), Par_fsum(), and rank.
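
Stripped of the precision juggling and BLAS branches, the loop above is a textbook conjugate gradient on the normal equations. A minimal double-precision sketch of the same recurrence (apply_A stands in for Dslash_f followed by Dslashd_f; all names here are illustrative):

#include <complex.h>

typedef double complex Cplx;

/* Solves A x = b for Hermitian positive-definite A = M†M.
 * On entry x = 0 and r = b; p and Ap are scratch arrays of length n. */
static int cg_normal(int n, double tol, Cplx *x, Cplx *r, Cplx *p, Cplx *Ap,
		void (*apply_A)(Cplx *out, const Cplx *in, int n)){
	double rr = 0;
	for(int i=0; i<n; i++){ p[i]=r[i]; rr += creal(conj(r[i])*r[i]); }
	for(int iter=0; iter<1000; iter++){
		apply_A(Ap, p, n);			//x2 = (M†M)p in the code above
		double pAp = 0;
		for(int i=0; i<n; i++) pAp += creal(conj(p[i])*Ap[i]);	//α_d
		double alpha = rr/pAp;		//α = (r.r)/p(M†M)p
		double rr_new = 0;
		for(int i=0; i<n; i++){
			x[i] += alpha*p[i];		//xi_f += αp in the code above
			r[i] -= alpha*Ap[i];
			rr_new += creal(conj(r[i])*r[i]);	//β_n
		}
		if(rr_new < tol*tol) return 0;	//matches the betan<resid test
		double beta = rr_new/rr; rr = rr_new;
		for(int i=0; i<n; i++) p[i] = r[i] + beta*p[i];	//not an axpy
	}
	return 1;	//iteration limit, cf. ITERLIM
}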


◆ Congradq()

int Congradq ( int na,
double res,
Complex * X1,
Complex * r,
Complex_f * u11t_f,
Complex_f * u12t_f,
unsigned int * iu,
unsigned int * id,
Complex_f * gamval_f,
int * gamin,
float * dk4m_f,
float * dk4p_f,
Complex_f jqq,
float akappa,
int * itercg )

Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \(M^\dagger Mx=\Phi\). Implements up/down partitioning. The matrix multiplication step is done at single precision, while the update is done at double.

Parameters
na: Flavour index
res: Limit for conjugate gradient
X1: Pseudofermion field \(\Phi\) initially, returned as \((M^\dagger M)^{-1} \Phi\)
r: Partition of \(\Phi\) being used. Gets recycled as the residual vector
u11t_f: First colour's trial field
u12t_f: Second colour's trial field
iu: Upper halo indices
id: Lower halo indices
gamval_f: Single precision gamma matrices rescaled by kappa
gamin: Dirac indices
dk4m_f: \(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p_f: \(\left(1-\gamma_0\right)e^\mu\)
jqq: Diquark source
akappa: Hopping Parameter
itercg: Counts the iterations of the conjugate gradient
Returns
0 on success, integer error code otherwise

Definition at line 7 of file congrad.c.

8 {
9 /*
10 * @brief Matrix Inversion via Mixed Precision Conjugate Gradient
11 * Solves @f$(M^\dagger)Mx=\Phi@f$
12 * Implements up/down partitioning
13 * The matrix multiplication step is done at single precision, while the update is done at double
14 *
15 * @param na: Flavour index
16 * @param res: Limit for conjugate gradient
17 * @param X1: @f$\Phi@f$ initially, returned as @f$(M^\dagger M)^{-1} \Phi@f$
18 * @param r: Partition of @f$\Phi@f$ being used. Gets recycled as the residual vector
19 * @param u11t: First colour's trial field
20 * @param u12t: Second colour's trial field
21 * @param iu: Upper halo indices
22 * @param id: Lower halo indices
23 * @param gamval_f: Gamma matrices
24 * @param gamin: Dirac indices
25 * @param dk4m: @f$\left(1+\gamma_0\right)e^{-\mu}@f$
26 * @param dk4p: @f$\left(1-\gamma_0\right)e^\mu@f$
27 * @param jqq: Diquark source
28 * @param akappa: Hopping Parameter
29 * @param itercg: Counts the iterations of the conjugate gradient
30 *
31 * @see Hdslash_f(), Hdslashd_f(), Par_fsum(), Par_dsum()
32 *
33 * @return 0 on success, integer error code otherwise
34 */
35 const char *funcname = "Congradq";
36 int ret_val=0;
37 const double resid = res*res;
38 //The κ^2 factor is needed to normalise the fields correctly
39 //jqq is the diquark condensate and is global scope.
40 const Complex_f fac_f = conj(jqq)*jqq*akappa*akappa;
41 //These were evaluated only in the first loop of niterx so we'll just do it outside of the loop.
42 //n suffix is numerator, d is denominator
43 double alphan=1;
44 //The alpha and beta terms should be double, but that causes issues with BLAS pointers. Instead we declare
45 //them complex and work with the real part (especially for α_d)
46 //Give initial values Will be overwritten if niterx>0
47 double betad = 1.0; Complex_f alphad=0; Complex alpha = 1;
48 //Because we're dealing with flattened arrays here we can call cblas safely without the halo
49#ifdef __NVCC__
50 Complex_f *p_f, *x1_f, *x2_f, *r_f, *X1_f;
51 int device=-1; cudaGetDevice(&device);
52
53#ifdef _DEBUG
54 cudaMallocManaged((void **)&p_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
55 cudaMallocManaged((void **)&x1_f, kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
56 cudaMallocManaged((void **)&x2_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
57 cudaMallocManaged((void **)&r_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
58 cudaMallocManaged((void **)&X1_f, kferm2*sizeof(Complex_f),cudaMemAttachGlobal);
59#else
60 //First two have halo exchanges, so getting NCCL working is important
61 cudaMallocAsync((void **)&p_f, kferm2Halo*sizeof(Complex_f),streams[0]);
62 cudaMallocAsync((void **)&x1_f, kferm2Halo*sizeof(Complex_f),streams[1]);
63 cudaMallocAsync((void **)&x2_f, kferm2*sizeof(Complex_f),streams[2]);
64 cudaMallocAsync((void **)&r_f, kferm2*sizeof(Complex_f),streams[3]);
65 cudaMallocAsync((void **)&X1_f, kferm2*sizeof(Complex_f),streams[4]);
66#endif
67#else
68 Complex_f *p_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
69 Complex_f *x1_f=aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
70 Complex_f *x2_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
71 Complex_f *X1_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
72 Complex_f *r_f=aligned_alloc(AVX,kferm2*sizeof(Complex_f));
73#endif
74 //Instead of copying element-wise in a loop, use memcpy.
75#ifdef __NVCC__
76 //Get X1 in single precision, then swap to AoS format
77 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
78 Transpose_c(X1_f,ndirac*nc,kvol,dimGrid,dimBlock);
79
80 //And repeat for r
81 cuComplex_convert(r_f,r,kferm2,true,dimBlock,dimGrid);
82 Transpose_c(r_f,ndirac*nc,kvol,dimGrid,dimBlock);
83
84 //cudaMemcpy is blocking, so use async instead
85 cudaMemcpyAsync(p_f, X1_f, kferm2*sizeof(Complex_f),cudaMemcpyDeviceToDevice,NULL);
86 //Flip all the gauge fields around so memory is coalesced
87 Transpose_c(u11t,ndim,kvol,dimGrid,dimBlock);
88 Transpose_c(u12t,ndim,kvol,dimGrid,dimBlock);
89 Transpose_I(iu,ndim,kvol,dimGrid,dimBlock);
90 Transpose_I(id,ndim,kvol,dimGrid,dimBlock);
91#else
92#pragma omp parallel for simd
93 for(int i=0;i<kferm2;i++){
94 r_f[i]=(Complex_f)r[i];
95 X1_f[i]=(Complex_f)X1[i];
96 }
97 memcpy(p_f, X1_f, kferm2*sizeof(Complex_f));
98#endif
99
100 //niterx isn't called as an index but we'll start from zero with the C code to make the
101 //if statements quicker to type
102 double betan; bool pf=true;
103 for(*itercg=0; *itercg<niterc; (*itercg)++){
104 //x2 = (M^†M)p
105 //No need to synchronise here. The memcpy in Hdslash is blocking
106 Hdslash_f(x1_f,p_f,u11t,u12t,iu,id,gamval_f,gamin,dk4m,dk4p,akappa);
107 Hdslashd_f(x2_f,x1_f,u11t,u12t,iu,id,gamval_f,gamin,dk4m,dk4p,akappa);
108#ifdef __NVCC__
109 cudaDeviceSynchronise();
110#endif
111 //x2 = (M^†M+J^2)p
112 //No point adding zero a couple of hundred times if the diquark source is zero
113 if(fac_f!=0){
114#ifdef __NVCC__
115 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&fac_f,(cuComplex *)p_f,1,(cuComplex *)x2_f,1);
116#elif defined USE_BLAS
117 cblas_caxpy(kferm2, &fac_f, p_f, 1, x2_f, 1);
118#else
119#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
120 for(int i=0; i<kferm2; i++)
121 x2_f[i]+=fac_f*p_f[i];
122#endif
123 }
124 //We can't evaluate α on the first *itercg because we need to get β_n.
125 if(*itercg){
126 //α_d= p* (M^†M+J^2)p
127#ifdef __NVCC__
128 cublasCdotc(cublas_handle,kferm2,(cuComplex *)p_f,1,(cuComplex *)x2_f,1,(cuComplex *)&alphad);
129#elif defined USE_BLAS
130 cblas_cdotc_sub(kferm2, p_f, 1, x2_f, 1, &alphad);
131#else
132 alphad=0;
133#pragma omp parallel for simd aligned(p_f,x2_f:AVX)
134 for(int i=0; i<kferm2; i++)
135 alphad+=conj(p_f[i])*x2_f[i];
136#endif
137 //For now I'll cast it into a float for the reduction. Each rank only sends and writes
138 //to the real part so this is fine
139#if(nproc>1)
140 Par_fsum((float *)&alphad);
141#endif
142 //α=α_n/α_d = (r.r)/p(M^†M)p
143 alpha=alphan/creal(alphad);
144 //x-αp,
145#ifdef __NVCC__
146 Complex_f alpha_f = (Complex_f)alpha;
147 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&alpha_f,(cuComplex *)p_f,1,(cuComplex *)X1_f,1);
148#elif defined USE_BLAS
149 Complex_f alpha_f = (Complex_f)alpha;
150 cblas_caxpy(kferm2, &alpha_f, p_f, 1, X1_f, 1);
151#else
152 for(int i=0; i<kferm2; i++)
153 X1_f[i]+=alpha*p_f[i];
154#endif
155 }
156 // r_n+1 = r_n-α(M^† M)p_n and β_n=r*.r
157#ifdef __NVCC__
158 Complex_f alpha_m=(Complex_f)(-alpha);
159 cublasCaxpy(cublas_handle, kferm2,(cuComplex *)&alpha_m,(cuComplex *)x2_f,1,(cuComplex *)r_f,1);
160 float betan_f;
161 cublasScnrm2(cublas_handle,kferm2,(cuComplex *)r_f,1,&betan_f);
162 betan = betan_f*betan_f;
163#elif defined USE_BLAS
164 Complex_f alpha_m = (Complex_f)(-alpha);
165 cblas_caxpy(kferm2, &alpha_m, x2_f, 1, r_f, 1);
166 //Undo the negation for the BLAS routine
167 float betan_f = cblas_scnrm2(kferm2, r_f,1);
168 //Gotta square it to "undo" the norm
169 betan = betan_f*betan_f;
170#else
171 betan=0;
172#pragma omp parallel for simd aligned(r_f,x2_f:AVX) reduction(+:betan)
173 for(int i=0; i<kferm2; i++){
174 r_f[i]-=alpha*x2_f[i];
175 betan += conj(r_f[i])*r_f[i];
176 }
177#endif
178 //And... reduce.
179#if(nproc>1)
180 Par_dsum(&betan);
181#endif
182#ifdef _DEBUGCG
183#warning "CG Debugging"
184 char *endline = "\n";
185#else
186 char *endline = "\r";
187#endif
188#ifdef _DEBUG
189 if(!rank) printf("Iter(CG)=%i\tβ_n=%e\tα=%e%s", *itercg, betan, creal(alpha), endline);
190#endif
191 if(betan<resid){
192 (*itercg)++;
193#ifdef _DEBUG
194 if(!rank) printf("\nIter(CG)=%i\tResidue: %e\tTolerance: %e\n", *itercg, betan, resid);
195#endif
196 ret_val=0; break;
197 }
198 else if(*itercg==niterc-1){
199 if(!rank) fprintf(stderr, "Warning %i in %s: Exceeded iteration limit %i β_n=%e\n", ITERLIM, funcname, *itercg, betan);
200 ret_val=ITERLIM; break;
201 }
202 //Here we evaluate β=(r_{k+1}.r_{k+1})/(r_k.r_k) and then shuffle our indices down the line.
203 //On the first iteration we define beta to be zero.
204 //Note that beta below is not the global beta and scoping is used to avoid conflict between them
205 Complex beta = (*itercg) ? betan/betad : 0;
206 betad=betan; alphan=betan;
207 //BLAS for p=r+βp doesn't exist in standard BLAS. This is NOT an axpy case as we're multiplying y by
208 //β instead of x.
209#ifdef __NVCC__
210 Complex_f beta_f=(Complex_f)beta;
211 __managed__ Complex_f a = 1.0;
212 cublasCscal(cublas_handle,kferm2,(cuComplex *)&beta_f,(cuComplex *)p_f,1);
213 cublasCaxpy(cublas_handle,kferm2,(cuComplex *)&a,(cuComplex *)r_f,1,(cuComplex *)p_f,1);
214#elif (defined __INTEL_MKL__)
215 Complex_f a = 1.0;
216 Complex_f beta_f=(Complex_f)beta;
217 //There is cblas_?axpby in the MKL and AMD though, set a = 1 and b = β.
218 //If we get a small enough β_n before hitting the iteration cap we break
219 cblas_caxpby(kferm2, &a, r_f, 1, &beta_f, p_f, 1);
220#elif defined USE_BLAS
221 Complex_f beta_f=(Complex_f)beta;
222 cblas_cscal(kferm2,&beta_f,p_f,1);
223 Complex_f a = 1.0;
224 cblas_caxpy(kferm2,&a,r_f,1,p_f,1);
225#else
226 for(int i=0; i<kferm2; i++)
227 p_f[i]=r_f[i]+beta*p_f[i];
228#endif
229 }
230#ifdef __NVCC__
231//Restore arrays back to their previous layout
232 Transpose_c(X1_f,kvol,ndirac*nc,dimGrid,dimBlock);
233 cuComplex_convert(X1_f,X1,kferm2,false,dimBlock,dimGrid);
234 Transpose_c(r_f,kvol,ndirac*nc,dimGrid,dimBlock);
235 cuComplex_convert(r_f,r,kferm2,false,dimBlock,dimGrid);
236 Transpose_c(u11t,kvol,ndim,dimGrid,dimBlock);
237 Transpose_c(u12t,kvol,ndim,dimGrid,dimBlock);
238 Transpose_I(iu,kvol,ndim,dimGrid,dimBlock);
239 Transpose_I(id,kvol,ndim,dimGrid,dimBlock);
240#else
241 for(int i=0;i<kferm2;i++){
242 X1[i]=(Complex)X1_f[i];
243 r[i]=(Complex)r_f[i];
244 }
245#endif
246#ifdef __NVCC__
247#ifdef _DEBUG
248 cudaDeviceSynchronise();
249 cudaFree(x1_f);cudaFree(x2_f); cudaFree(p_f);
250 cudaFree(r_f);cudaFree(X1_f);
251#else
252 //streams match the ones that allocated them.
253 cudaFreeAsync(p_f,streams[0]);cudaFreeAsync(x1_f,streams[1]);cudaFreeAsync(x2_f,streams[2]);
254 cudaDeviceSynchronise();
255 cudaFreeAsync(r_f,streams[3]);cudaFreeAsync(X1_f,streams[4]);
256#endif
257#else
258 free(x1_f);free(x2_f); free(p_f); free(r_f); free(X1_f);
259#endif
260 return ret_val;
261}
int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates \(\Phi=M^\dagger r\) in single precision.
Definition matrices.c:802
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates \(\Phi=M r\) in single precision.
Definition matrices.c:711
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define ndirac
Dirac indices.
Definition sizes.h:177
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188

References AVX, Complex, Complex_f, Hdslash_f(), Hdslashd_f(), kferm2, kferm2Halo, kvol, nc, ndim, ndirac, niterc, Par_dsum(), Par_fsum(), and rank.
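
The fac_f term above makes explicit which system is actually solved: with a nonzero diquark source the operator is shifted, so each iteration applies

\[x_2=\left(M^\dagger M+\kappa^2|j_{qq}|^2\right)p,\]

which is why the single extra axpy with fac_f \(=\kappa^2|j_{qq}|^2\) follows the Hdslash_f/Hdslashd_f pair, and why it is skipped entirely when jqq is zero.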


◆ Fill_Small_Phi()

int Fill_Small_Phi ( int na,
Complex * smallPhi,
Complex * Phi )
inline

Copies necessary (2*4*kvol) elements of Phi into a vector variable

Parameters
na: flavour index
smallPhi: The partitioned output
Phi: The pseudofermion field
Returns
Zero on success, integer error code otherwise

Definition at line 349 of file su2hmc.c.

350{
351 /*Copies necessary (2*4*kvol) elements of Phi into a vector variable
352 *
353 * Globals:
354 * =======
355 * Phi: The source array
356 *
357 * Parameters:
358 * ==========
359 * int na: flavour index
360 * Complex *smallPhi: The target array
361 *
362 * Returns:
363 * =======
364 * Zero on success, integer error code otherwise
365 */
366 const char *funcname = "Fill_Small_Phi";
367 //BIG and small phi index
368#ifdef __NVCC__
369 cuFill_Small_Phi(na,smallPhi,Phi,dimBlock,dimGrid);
370#else
371#pragma omp parallel for simd aligned(smallPhi,Phi:AVX) collapse(3)
372 for(int i = 0; i<kvol;i++)
373 for(int idirac = 0; idirac<ndirac; idirac++)
374 for(int ic= 0; ic<nc; ic++)
375 // PHI_index=i*16+j*2+k;
376 smallPhi[(i*ndirac+idirac)*nc+ic]=Phi[((na*kvol+i)*ngorkov+idirac)*nc+ic];
377#endif
378 return 0;
379}

References kvol, nc, ndirac, and ngorkov.
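
A short sketch of the index arithmetic (constants as in sizes.h): Phi is flattened as [flavour][site][Gor'kov][colour] while smallPhi is [site][Dirac][colour], so only the first ndirac of the ngorkov components of flavour na get copied:

int src = ((na*kvol + i)*ngorkov + idirac)*nc + ic;	//idirac < ndirac
int dst = (i*ndirac + idirac)*nc + ic;
smallPhi[dst] = Phi[src];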


◆ Force()

int Force ( double * dSdpi,
int iflag,
double res1,
Complex * X0,
Complex * X1,
Complex * Phi,
Complex * u11t,
Complex * u12t,
Complex_f * u11t_f,
Complex_f * u12t_f,
unsigned int * iu,
unsigned int * id,
Complex * gamval,
Complex_f * gamval_f,
int * gamin,
double * dk4m,
double * dk4p,
float * dk4m_f,
float * dk4p_f,
Complex_f jqq,
float akappa,
float beta,
double * ancg )

Calculates the force \(\frac{dS}{d\pi}\) at each intermediate time.

Parameters
dSdpi: The force
iflag: Invert before evaluating the force. 0 to invert, 1 not to. Blame FORTRAN...
res1: Conjugate gradient residual
X0: Up/down partitioned pseudofermion field
X1: Holder for the partitioned fermion field, then the conjugate gradient output
Phi: Pseudofermion field
u11t,u12t: Double precision colour fields
u11t_f,u12t_f: Single precision colour fields
iu,id: Lattice indices
gamin: Gamma indices
gamval: Double precision gamma matrices rescaled by kappa
gamval_f: Single precision gamma matrices rescaled by kappa
dk4m: \(e^{-\mu}\)
dk4p: \(e^\mu\)
dk4m_f: \(e^{-\mu}\) float
dk4p_f: \(e^\mu\) float
jqq: Diquark source
akappa: Hopping parameter
beta: Inverse gauge coupling
ancg: Counter for conjugate gradient iterations
Returns
Zero on success, integer error code otherwise

Definition at line 131 of file force.c.

134 {
135 /*
136 * @brief Calculates the force @f$\frac{dS}{d\pi}@f$ at each intermediate time
137 *
138 * @param dSdpi: The force
139 * @param iflag: Invert before evaluating the force?
140 * @param res1: Conjugate gradient residual
141 * @param X0: Up/down partitioned pseudofermion field
142 * @param X1: Holder for the partitioned fermion field, then the conjugate gradient output
143 * @param Phi: Pseudofermion field
144 * @param u11t,u12t Double precision colour fields
145 * @param u11t_f,u12t_f: Single precision colour fields
146 * @param iu,id: Lattice indices
147 * @param gamin: Gamma indices
148 * @param gamval: Double precision gamma matrices
149 * @param gamval_f: Single precision gamma matrices
150 * @param dk4m: @f$\left(1+\gamma_0\right)e^{-\mu}@f$
151 * @param dk4p: @f$\left(1-\gamma_0\right)e^\mu@f$
152 * @param dk4m_f: @f$\left(1+\gamma_0\right)e^{-\mu}@f$ float
153 * @param dk4p_f: @f$\left(1-\gamma_0\right)e^\mu@f$ float
154 * @param jqq: Diquark source
155 * @param akappa: Hopping parameter
156 * @param beta: Inverse gauge coupling
157 * @param ancg: Counter for conjugate gradient iterations
158 *
159 * @return Zero on success, integer error code otherwise
160 */
161 const char *funcname = "Force";
162#ifdef __NVCC__
163 int device=-1;
164 cudaGetDevice(&device);
165#endif
166#ifndef NO_GAUGE
167 Gauge_force(dSdpi,u11t_f,u12t_f,iu,id,beta);
168#endif
169 //X1=(M†M)^{-1} Phi
170 int itercg=1;
171#ifdef __NVCC__
172 Complex *X2;
173 cudaMallocManaged((void **)&X2,kferm2Halo*sizeof(Complex),cudaMemAttachGlobal);
174#else
175 Complex *X2= (Complex *)aligned_alloc(AVX,kferm2Halo*sizeof(Complex));
176#endif
177 for(int na = 0; na<nf; na++){
178#ifdef __NVCC__
179 cudaMemcpyAsync(X1,X0+na*kferm2,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,NULL);
180#else
181 memcpy(X1,X0+na*kferm2,kferm2*sizeof(Complex));
182#endif
183 if(!iflag){
184#ifdef __NVCC__
185 Complex *smallPhi;
186 cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),streams[0]);
187#else
188 Complex *smallPhi = (Complex *)aligned_alloc(AVX,kferm2*sizeof(Complex));
189#endif
190 Fill_Small_Phi(na, smallPhi, Phi);
191 // Congradq(na, res1,smallPhi, &itercg );
192 Congradq(na,res1,X1,smallPhi,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,&itercg);
193#ifdef __NVCC__
194 cudaFreeAsync(smallPhi,streams[0]);
195#else
196 free(smallPhi);
197#endif
198 *ancg+=itercg;
199#ifdef __NVCC__
200 Complex blasa=2.0; double blasb=-1.0;
201 cublasZdscal(cublas_handle,kferm2,&blasb,(cuDoubleComplex *)(X0+na*kferm2),1);
202 cublasZaxpy(cublas_handle,kferm2,(cuDoubleComplex *)&blasa,(cuDoubleComplex *)X1,1,(cuDoubleComplex *)(X0+na*kferm2),1);
203 //HDslash launches a different stream so we need a barrier
204 cudaDeviceSynchronise();
205#elif (defined __INTEL_MKL__)
206 Complex blasa=2.0; Complex blasb=-1.0;
207 //This is not a general BLAS Routine. BLIS and MKl support it
208 //CUDA and GSL does not support it
209 cblas_zaxpby(kferm2, &blasa, X1, 1, &blasb, X0+na*kferm2, 1);
210#elif defined USE_BLAS
211 Complex blasa=2.0; double blasb=-1.0;
212 cblas_zdscal(kferm2,blasb,X0+na*kferm2,1);
213 cblas_zaxpy(kferm2,&blasa,X1,1,X0+na*kferm2,1);
214#else
215#pragma omp parallel for simd collapse(2)
216 for(int i=0;i<kvol;i++)
217 for(int idirac=0;idirac<ndirac;idirac++){
218 X0[((na*kvol+i)*ndirac+idirac)*nc]=
219 2*X1[(i*ndirac+idirac)*nc]-X0[((na*kvol+i)*ndirac+idirac)*nc];
220 X0[((na*kvol+i)*ndirac+idirac)*nc+1]=
221 2*X1[(i*ndirac+idirac)*nc+1]-X0[((na*kvol+i)*ndirac+idirac)*nc+1];
222 }
223#endif
224 }
225 Hdslash(X2,X1,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,akappa);
226#ifdef __NVCC__
227 double blasd=2.0;
228 cudaDeviceSynchronise();
229 cublasZdscal(cublas_handle,kferm2, &blasd, (cuDoubleComplex *)X2, 1);
230#elif defined USE_BLAS
231 double blasd=2.0;
232 cblas_zdscal(kferm2, blasd, X2, 1);
233#else
234#pragma unroll
235 for(int i=0;i<kferm2;i++)
236 X2[i]*=2;
237#endif
238#if(npx>1)
239 ZHalo_swap_dir(X1,8,0,DOWN);
240 ZHalo_swap_dir(X2,8,0,DOWN);
241#endif
242#if(npy>1)
243 ZHalo_swap_dir(X1,8,1,DOWN);
244 ZHalo_swap_dir(X2,8,1,DOWN);
245#endif
246#if(npz>1)
247 ZHalo_swap_dir(X1,8,2,DOWN);
248 ZHalo_swap_dir(X2,8,2,DOWN);
249#endif
250#if(npt>1)
251 ZHalo_swap_dir(X1,8,3,DOWN);
252 ZHalo_swap_dir(X2,8,3,DOWN);
253#endif
254
255 // The original FORTRAN Comment:
256 // dSdpi=dSdpi-Re(X1*(d(Mdagger)dp)*X2) -- Yikes!
257 // we're gonna need drugs for this one......
258 //
259 // Makes references to X1(.,.,iu(i,mu)) AND X2(.,.,iu(i,mu))
260 // as a result, need to swap the DOWN halos in all dirs for
261 // both these arrays, each of which has 8 cpts
262 //
263#ifdef __NVCC__
264 Complex_f *X1_f, *X2_f;
265 cudaMallocAsync((void **)&X1_f,kferm2*sizeof(Complex_f),NULL);
266 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
267 Transpose_c(X1_f,ndirac*nc,kvol,dimGrid,dimBlock);
268
269 cudaMallocAsync((void **)&X2_f,kferm2*sizeof(Complex_f),NULL);
270 cuComplex_convert(X2_f,X2,kferm2,true,dimBlock,dimGrid);
271 Transpose_c(X2_f,ndirac*nc,kvol,dimGrid,dimBlock);
272// Transpose_z(X1,kvol,ndirac*nc,dimGrid,dimBlock); Transpose_z(X2,kvol,ndirac*nc,dimGrid,dimBlock);
273 cuForce(dSdpi,u11t_f,u12t_f,X1_f,X2_f,gamval_f,dk4m_f,dk4p_f,iu,gamin,akappa,dimGrid,dimBlock);
274 cudaDeviceSynchronise();
275 cudaFreeAsync(X1_f,NULL); cudaFreeAsync(X2_f,NULL);
276#else
277#pragma omp parallel for
278 for(int i=0;i<kvol;i++)
279 for(int idirac=0;idirac<ndirac;idirac++){
280 int mu, uid, igork1;
281#ifndef NO_SPACE
282#pragma omp simd aligned(dSdpi,X1,X2,u11t,u12t,iu:AVX)
283 for(mu=0; mu<3; mu++){
284 //Long term ambition. I used the diff command on the different
285 //spatial components of dSdpi and saw a lot of the values required
286 //for them are duplicates (u11(i,mu)*X2(1,idirac,i) is used again with
287 //a minus in front, for example). Why not evaluate them first and then plug
288 //them into the equation? Reduce the number of evaluations needed and look
289 //a bit neater (although harder to follow as a consequence).
290
291 //Up indices
292 uid = iu[mu+ndim*i];
293 igork1 = gamin[mu*ndirac+idirac];
294
295 //REMINDER. Gamma is already scaled by kappa when we defined them. So if yer trying to rederive this from
296 //Montvay and Munster and notice a missing kappa in the code, that is why.
297 dSdpi[(i*nadj)*ndim+mu]+=akappa*creal(I*
298 (conj(X1[(i*ndirac+idirac)*nc])*
299 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
300 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
301 +conj(X1[(uid*ndirac+idirac)*nc])*
302 ( u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
303 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
304 +conj(X1[(i*ndirac+idirac)*nc+1])*
305 (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
306 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
307 +conj(X1[(uid*ndirac+idirac)*nc+1])*
308 (-u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
309 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
310 dSdpi[(i*nadj)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
311 (conj(X1[(i*ndirac+idirac)*nc])*
312 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
313 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
314 +conj(X1[(uid*ndirac+idirac)*nc])*
315 (-u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
316 +conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
317 +conj(X1[(i*ndirac+idirac)*nc+1])*
318 (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
319 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
320 +conj(X1[(uid*ndirac+idirac)*nc+1])*
321 (u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
322 +conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));
323
324 dSdpi[(i*nadj+1)*ndim+mu]+=akappa*creal(
325 (conj(X1[(i*ndirac+idirac)*nc])*
326 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
327 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
328 +conj(X1[(uid*ndirac+idirac)*nc])*
329 (-u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
330 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
331 +conj(X1[(i*ndirac+idirac)*nc+1])*
332 (-u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
333 -u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
334 +conj(X1[(uid*ndirac+idirac)*nc+1])*
335 (u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
336 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
337 dSdpi[(i*nadj+1)*ndim+mu]+=creal(gamval[mu*ndirac+idirac]*
338 (conj(X1[(i*ndirac+idirac)*nc])*
339 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
340 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
341 +conj(X1[(uid*ndirac+idirac)*nc])*
342 (u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
343 +conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
344 +conj(X1[(i*ndirac+idirac)*nc+1])*
345 (-u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
346 -u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
347 +conj(X1[(uid*ndirac+idirac)*nc+1])*
348 (-u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
349 +conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));
350
351 dSdpi[(i*nadj+2)*ndim+mu]+=akappa*creal(I*
352 (conj(X1[(i*ndirac+idirac)*nc])*
353 (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
354 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
355 +conj(X1[(uid*ndirac+idirac)*nc])*
356 (-conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
357 -u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])
358 +conj(X1[(i*ndirac+idirac)*nc+1])*
359 (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
360 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
361 +conj(X1[(uid*ndirac+idirac)*nc+1])*
362 (-conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
363 +u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])));
364 dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
365 (conj(X1[(i*ndirac+idirac)*nc])*
366 (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
367 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
368 +conj(X1[(uid*ndirac+idirac)*nc])*
369 (conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
370 +u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])
371 +conj(X1[(i*ndirac+idirac)*nc+1])*
372 (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
373 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
374 +conj(X1[(uid*ndirac+idirac)*nc+1])*
375 (conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
376 -u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])));
377
378 }
379#endif
380 //We're not done tripping yet!! Time like term is different. dk4? shows up
381 //For consistency we'll leave mu in instead of hard coding.
382 mu=3;
383 uid = iu[mu+ndim*i];
384 igork1 = gamin[mu*ndirac+idirac];
385#ifndef NO_TIME
386 dSdpi[(i*nadj)*ndim+mu]+=creal(I*
387 (conj(X1[(i*ndirac+idirac)*nc])*
388 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
389 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
390 +conj(X1[(uid*ndirac+idirac)*nc])*
391 (dk4p[i]* (+u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
392 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
393 +conj(X1[(i*ndirac+idirac)*nc+1])*
394 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
395 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
396 +conj(X1[(uid*ndirac+idirac)*nc+1])*
397 (dk4p[i]* (-u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
398 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))))
399 +creal(I*
400 (conj(X1[(i*ndirac+idirac)*nc])*
401 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
402 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
403 +conj(X1[(uid*ndirac+idirac)*nc])*
404 (-dk4p[i]* (u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
405 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
406 +conj(X1[(i*ndirac+idirac)*nc+1])*
407 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
408 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
409 +conj(X1[(uid*ndirac+idirac)*nc+1])*
410 (-dk4p[i]* (-u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
411 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));
412
413 dSdpi[(i*nadj+1)*ndim+mu]+=creal(
414 conj(X1[(i*ndirac+idirac)*nc])*
415 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
416 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
417 +conj(X1[(uid*ndirac+idirac)*nc])*
418 (dk4p[i]* (-u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
419 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
420 +conj(X1[(i*ndirac+idirac)*nc+1])*
421 (dk4m[i]* (-u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
422 -u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
423 +conj(X1[(uid*ndirac+idirac)*nc+1])*
424 (dk4p[i]* ( u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
425 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])))
426 +creal(
427 (conj(X1[(i*ndirac+idirac)*nc])*
428 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
429 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
430 +conj(X1[(uid*ndirac+idirac)*nc])*
431 (-dk4p[i]* (-u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
432 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
433 +conj(X1[(i*ndirac+idirac)*nc+1])*
434 (dk4m[i]* (-u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
435 -u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
436 +conj(X1[(uid*ndirac+idirac)*nc+1])*
437 (-dk4p[i]* (u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
438 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));
439
440 dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*
441 (conj(X1[(i*ndirac+idirac)*nc])*
442 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
443 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
444 +conj(X1[(uid*ndirac+idirac)*nc])*
445 (dk4p[i]*(-conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
446 -u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))
447 +conj(X1[(i*ndirac+idirac)*nc+1])*
448 (dk4m[i]* (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
449 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
450 +conj(X1[(uid*ndirac+idirac)*nc+1])*
451 (dk4p[i]*(-conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
452 +u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))))
453 +creal(I*
454 (conj(X1[(i*ndirac+idirac)*nc])*
455 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
456 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
457 +conj(X1[(uid*ndirac+idirac)*nc])*
458 (-dk4p[i]*(-conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
459 -u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))
460 +conj(X1[(i*ndirac+idirac)*nc+1])*
461 (dk4m[i]* (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
462 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
463 +conj(X1[(uid*ndirac+idirac)*nc+1])*
464 (-dk4p[i]*(-conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
465 +u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))));
466
467#endif
468 }
469#endif
470 }
471#ifdef __NVCC__
472 cudaFree(X2);
473#else
474 free(X2);
475#endif
476 return 0;
477}
int Gauge_force(double *dSdpi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
Definition force.c:6
int Hdslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
Evaluates \(\Phi=M r\) in double precision.
Definition matrices.c:267
#define DOWN
Flag for send down.
Definition par_mpi.h:35
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define nadj
adjacent spatial indices
Definition sizes.h:175
#define nf
Fermion flavours (double it)
Definition sizes.h:151
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves \(M^\dagger Mx=\Phi\). Implements up/down partitioning.
Definition congrad.c:7
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
Definition su2hmc.c:349

References AVX, Complex, Complex_f, Congradq(), DOWN, Fill_Small_Phi(), Gauge_force(), Hdslash(), kferm2, kferm2Halo, kvol, nadj, nc, ndim, ndirac, nf, and ZHalo_swap_dir().
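
For reference, the force is flattened as dSdpi[(i*nadj+a)*ndim+mu], with a running over the nadj=3 adjoint directions of SU(2). A hedged sketch of how an integrator (see integrate.h) would consume it for a momentum half-step, with pp as in Hamilton() and dt purely illustrative:

for(int i=0; i<kvol; i++)
	for(int a=0; a<nadj; a++)
		for(int mu=0; mu<ndim; mu++)
			//π -= (dt/2) dS/dπ, one component per site, generator and direction
			pp[(i*nadj+a)*ndim+mu] -= 0.5*dt*dSdpi[(i*nadj+a)*ndim+mu];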


◆ Gauge_force()

int Gauge_force ( double * dSdpi,
Complex_f * u11t,
Complex_f * u12t,
unsigned int * iu,
unsigned int * id,
float beta )

Calculates the gauge force due to the Wilson Action at each intermediate time.

Parameters
dSdpi: The force
u11t,u12t: Gauge fields
iu,id: Lattice indices
beta: Inverse gauge coupling
Returns
Zero on success, integer error code otherwise

Definition at line 6 of file force.c.

6 {
7 /*
8 * Calculates dSdpi due to the Wilson Action at each intermediate time
9 *
10 * Calls:
11 * =====
12 * C_Halo_swap_all, C_gather, C_Halo_swap_dir
13 *
14 * Parameters:
15 * =======
16 * double *dSdpi
17 * Complex_f *u11t
18 * Complex_f *u12t
19 * unsigned int *iu
20 * unsigned int *id
21 * float beta
22 *
23 * Returns:
24 * =======
25 * Zero on success, integer error code otherwise
26 */
27 const char *funcname = "Gauge_force";
28
29 //We define zero halos for debugging
30 // #ifdef _DEBUG
31 // memset(u11t[kvol], 0, ndim*halo*sizeof(Complex_f));
32 // memset(u12t[kvol], 0, ndim*halo*sizeof(Complex_f));
33 // #endif
34 //Was a trial field halo exchange here at one point.
35#ifdef __NVCC__
36 int device=-1;
37 cudaGetDevice(&device);
38 Complex_f *Sigma11, *Sigma12, *u11sh, *u12sh;
39 cudaMallocAsync((void **)&Sigma11,kvol*sizeof(Complex_f),streams[0]);
40 cudaMallocAsync((void **)&Sigma12,kvol*sizeof(Complex_f),streams[1]);
41 cudaMallocManaged((void **)&u11sh,(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
42 cudaMallocManaged((void **)&u12sh,(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
43#else
44 Complex_f *Sigma11 = (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
45 Complex_f *Sigma12= (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
46 Complex_f *u11sh = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
47 Complex_f *u12sh = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
48#endif
49 //Holders for directions
50 for(int mu=0; mu<ndim; mu++){
51#ifdef __NVCC__
52 cudaMemset(Sigma11,0, kvol*sizeof(Complex_f));
53 cudaMemset(Sigma12,0, kvol*sizeof(Complex_f));
54#else
55 memset(Sigma11,0, kvol*sizeof(Complex_f));
56 memset(Sigma12,0, kvol*sizeof(Complex_f));
57#endif
58 for(int nu=0; nu<ndim; nu++)
59 if(nu!=mu){
60 //The +ν Staple
61#ifdef __NVCC__
62 cuPlus_staple(mu,nu,iu,Sigma11,Sigma12,u11t,u12t,dimGrid,dimBlock);
63#else
64#pragma omp parallel for simd aligned(u11t,u12t,Sigma11,Sigma12,iu:AVX)
65 for(int i=0;i<kvol;i++){
66 int uidm = iu[mu+ndim*i];
67 int uidn = iu[nu+ndim*i];
68 Complex_f a11=u11t[uidm*ndim+nu]*conj(u11t[uidn*ndim+mu])+\
69 u12t[uidm*ndim+nu]*conj(u12t[uidn*ndim+mu]);
70 Complex_f a12=-u11t[uidm*ndim+nu]*u12t[uidn*ndim+mu]+\
71 u12t[uidm*ndim+nu]*u11t[uidn*ndim+mu];
72 Sigma11[i]+=a11*conj(u11t[i*ndim+nu])+a12*conj(u12t[i*ndim+nu]);
73 Sigma12[i]+=-a11*u12t[i*ndim+nu]+a12*u11t[i*ndim+nu];
74 }
75#endif
76 C_gather(u11sh, u11t, kvol, id, nu);
77 C_gather(u12sh, u12t, kvol, id, nu);
78#if(nproc>1)
79#ifdef __NVCC__
80 //Prefetch to the CPU for now, until we get NCCL working
81 cudaMemPrefetchAsync(u11sh, kvol*sizeof(Complex_f),cudaCpuDeviceId,streams[0]);
82 cudaMemPrefetchAsync(u12sh, kvol*sizeof(Complex_f),cudaCpuDeviceId,streams[1]);
83#endif
84 CHalo_swap_dir(u11sh, 1, mu, DOWN); CHalo_swap_dir(u12sh, 1, mu, DOWN);
85#ifdef __NVCC__
86 cudaMemPrefetchAsync(u11sh+kvol, halo*sizeof(Complex_f),device,streams[0]);
87 cudaMemPrefetchAsync(u12sh+kvol, halo*sizeof(Complex_f),device,streams[1]);
88#endif
89#endif
90 //Next up, the -ν staple
91#ifdef __NVCC__
92 cudaDeviceSynchronise();
93 cuMinus_staple(mu,nu,iu,id,Sigma11,Sigma12,u11sh,u12sh,u11t,u12t,dimGrid,dimBlock);
94#else
95#pragma omp parallel for simd aligned(u11t,u12t,u11sh,u12sh,Sigma11,Sigma12,iu,id:AVX)
96 for(int i=0;i<kvol;i++){
97 int uidm = iu[mu+ndim*i];
98 int didn = id[nu+ndim*i];
99 //uidm is correct here
100 Complex_f a11=conj(u11sh[uidm])*conj(u11t[didn*ndim+mu])-\
101 u12sh[uidm]*conj(u12t[didn*ndim+mu]);
102 Complex_f a12=-conj(u11sh[uidm])*u12t[didn*ndim+mu]-\
103 u12sh[uidm]*u11t[didn*ndim+mu];
104 Sigma11[i]+=a11*u11t[didn*ndim+nu]-a12*conj(u12t[didn*ndim+nu]);
105 Sigma12[i]+=a11*u12t[didn*ndim+nu]+a12*conj(u11t[didn*ndim+nu]);
106 }
107#endif
108 }
109#ifdef __NVCC__
110 cuGauge_force(mu,Sigma11,Sigma12,u11t,u12t,dSdpi,beta,dimGrid,dimBlock);
111#else
112#pragma omp parallel for simd aligned(u11t,u12t,Sigma11,Sigma12,dSdpi:AVX)
113 for(int i=0;i<kvol;i++){
114 Complex_f a11 = u11t[i*ndim+mu]*Sigma12[i]+u12t[i*ndim+mu]*conj(Sigma11[i]);
115 Complex_f a12 = u11t[i*ndim+mu]*Sigma11[i]+conj(u12t[i*ndim+mu])*Sigma12[i];
116
117 dSdpi[(i*nadj)*ndim+mu]=(double)(beta*cimag(a11));
118 dSdpi[(i*nadj+1)*ndim+mu]=(double)(beta*creal(a11));
119 dSdpi[(i*nadj+2)*ndim+mu]=(double)(beta*cimag(a12));
120 }
121#endif
122 }
123#ifdef __NVCC__
124 cudaDeviceSynchronise();
125 cudaFreeAsync(Sigma11,streams[0]); cudaFreeAsync(Sigma12,streams[1]); cudaFree(u11sh); cudaFree(u12sh);
126#else
127 free(u11sh); free(u12sh); free(Sigma11); free(Sigma12);
128#endif
129 return 0;
130}
int CHalo_swap_dir(Complex_f *c, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define halo
Total Halo size.
Definition sizes.h:222
int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
Extracts all the single precision gauge links in the \(\mu\) direction only.
Definition su2hmc.c:321

References AVX, C_gather(), CHalo_swap_dir(), Complex_f, DOWN, halo, kvol, nadj, and ndim.
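
In formulas (up to the sign and normalisation conventions of this code), the nu loop assembles the staple sum

\[\Sigma_\mu(x)=\sum_{\nu\neq\mu}\left[U_\nu(x+\hat\mu)U^\dagger_\mu(x+\hat\nu)U^\dagger_\nu(x)+U^\dagger_\nu(x+\hat\mu-\hat\nu)U^\dagger_\mu(x-\hat\nu)U_\nu(x-\hat\nu)\right]\]

and the final loop projects out the adjoint components, \(\frac{dS}{d\pi^a_\mu(x)}\propto\beta\,\mathrm{Im}\,\mathrm{tr}\left[\tau^a U_\mu(x)\Sigma_\mu(x)\right]\), via the a11 and a12 combinations above.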


◆ Hamilton()

int Hamilton ( double * h,
double * s,
double res2,
double * pp,
Complex * X0,
Complex * X1,
Complex * Phi,
Complex * u11t,
Complex * u12t,
Complex_f * u11t_f,
Complex_f * u12t_f,
unsigned int * iu,
unsigned int * id,
Complex_f * gamval_f,
int * gamin,
float * dk4m_f,
float * dk4p_f,
Complex_f jqq,
float akappa,
float beta,
double * ancgh,
int traj )

Calculate the Hamiltonian.

Parameters
h: Hamiltonian
s: Action
res2: Limit for conjugate gradient
pp: Momentum field
X0: Up/down partitioned pseudofermion field
X1: Holder for the partitioned fermion field, then the conjugate gradient output
Phi: Pseudofermion field
u11t,u12t: Gauge fields
u11t_f,u12t_f: Gauge fields (single precision)
iu,id: Lattice indices
gamval_f: Single precision gamma matrices rescaled by kappa
gamin: Gamma indices
dk4m_f: \(\left(1+\gamma_0\right)e^{-\mu}\) float
dk4p_f: \(\left(1-\gamma_0\right)e^\mu\) float
jqq: Diquark source
akappa: Hopping parameter
beta: Inverse gauge coupling
ancgh: Conjugate gradient iterations counter
traj: Calling trajectory for error reporting
Returns
Zero on success. Integer Error code otherwise.

Definition at line 208 of file su2hmc.c.

211 {
212 /*
213 * @brief Calculate the Hamiltonian
214 *
215 *
216 * @param h: Hamiltonian
217 * @param s: Action
218 * @param res2: Limit for conjugate gradient
 219 * @param X0: Up/down partitioned pseudofermion field
 220 * @param X1: Holder for the partitioned fermion field, then the conjugate gradient output
 221 * @param Phi: Pseudofermion field
222 * @param u11t,u12t: Gauge fields
223 * @param u11t_f,u12t_f: Gauge fields
224 * @param iu,id: Lattice indices
225 * @param gamval_f: Gamma matrices
226 * @param gamin: Gamma indices
227 * @param dk4m_f: $exp(-\mu)$ float
228 * @param dk4p_f: $exp(\mu)$ float
229 * @param jqq: Diquark source
230 * @param akappa: Hopping parameter
231 * @param beta: Inverse gauge coupling
232 * @param ancgh: Conjugate gradient iterations counter
233 *
234 * @return Zero on success. Integer Error code otherwise.
235 */
236 const char *funcname = "Hamilton";
237 //Iterate over momentum terms.
238#ifdef __NVCC__
239 double hp;
240 int device=-1;
241 cudaGetDevice(&device);
242 cudaMemPrefetchAsync(pp,kmom*sizeof(double),device,NULL);
243 cublasDnrm2(cublas_handle, kmom, pp, 1,&hp);
244 hp*=hp;
245#elif defined USE_BLAS
246 double hp = cblas_dnrm2(kmom, pp, 1);
247 hp*=hp;
248#else
249 double hp=0;
250 for(int i = 0; i<kmom; i++)
251 hp+=pp[i]*pp[i];
252#endif
253 hp*=0.5;
254 double avplaqs, avplaqt;
255 double hg = 0;
256 //avplaq? isn't seen again here.
257 Average_Plaquette(&hg,&avplaqs,&avplaqt,u11t_f,u12t_f,iu,beta);
258
259 double hf = 0; int itercg = 0;
260#ifdef __NVCC__
261 Complex *smallPhi;
262 cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),NULL);
263#else
264 Complex *smallPhi = aligned_alloc(AVX,kferm2*sizeof(Complex));
265#endif
266 //Iterating over flavours
267 for(int na=0;na<nf;na++){
268#ifdef __NVCC__
269#ifdef _DEBUG
270 cudaDeviceSynchronise();
271#endif
272 cudaMemcpyAsync(X1,X0+na*kferm2,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,streams[0]);
273#ifdef _DEBUG
274 cudaDeviceSynchronise();
275#endif
276#else
277 memcpy(X1,X0+na*kferm2,kferm2*sizeof(Complex));
278#endif
279 Fill_Small_Phi(na, smallPhi, Phi);
280 if(Congradq(na,res2,X1,smallPhi,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,&itercg))
281 fprintf(stderr,"Trajectory %d\n", traj);
282
283 *ancgh+=itercg;
284#ifdef __NVCC__
285 cudaMemcpyAsync(X0+na*kferm2,X1,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,streams[0]);
286#else
287 memcpy(X0+na*kferm2,X1,kferm2*sizeof(Complex));
288#endif
289 Fill_Small_Phi(na, smallPhi,Phi);
290#ifdef __NVCC__
291 Complex dot;
292 cublasZdotc(cublas_handle,kferm2,(cuDoubleComplex *)smallPhi,1,(cuDoubleComplex *) X1,1,(cuDoubleComplex *) &dot);
293 hf+=creal(dot);
294#elif defined USE_BLAS
295 Complex dot;
296 cblas_zdotc_sub(kferm2, smallPhi, 1, X1, 1, &dot);
297 hf+=creal(dot);
298#else
299 //It is a dot product of the flattened arrays, could use
300 //a module to convert index to coordinate array...
301 for(int j=0;j<kferm2;j++)
302 hf+=creal(conj(smallPhi[j])*X1[j]);
303#endif
304 }
305#ifdef __NVCC__
306 cudaFreeAsync(smallPhi,NULL);
307#else
308 free(smallPhi);
309#endif
310 //hg was summed over inside of Average_Plaquette.
311#if(nproc>1)
312 Par_dsum(&hp); Par_dsum(&hf);
313#endif
314 *s=hg+hf; *h=(*s)+hp;
315#ifdef _DEBUG
316 if(!rank)
317 printf("hg=%.5e; hf=%.5e; hp=%.5e; h=%.5e\n", hg, hf, hp, *h);
318#endif
319 return 0;
320}
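
Editorial summary, inferred directly from the listing: the routine assembles

\[
H = \frac{1}{2}\sum_i p_i^2 \;+\; S_g \;+\; \sum_{\text{flavours}}\Phi^\dagger\left(M^\dagger M\right)^{-1}\Phi,
\]

with the three terms held in hp, hg and hf respectively and the action returned as \(s = hg + hf\). Here hg comes from Average_Plaquette(), and each flavour's \(\left(M^\dagger M\right)^{-1}\Phi\) is produced by Congradq() before the dot product with the partitioned \(\Phi\).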
#define kmom
sublattice momentum sites
Definition sizes.h:184
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
Definition su2hmc.c:349
int Average_Plaquette(double *hg, double *avplaqs, double *avplaqt, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, float beta)
Calculates the gauge action using new (how new?) lookup table.
Definition bosonic.c:8

References Average_Plaquette(), AVX, Complex, Congradq(), Fill_Small_Phi(), kferm2, kmom, nf, Par_dsum(), and rank.


◆ Init()

int Init ( int istart,
int ibound,
int iread,
float beta,
float fmu,
float akappa,
Complex_f ajq,
Complex * u11,
Complex * u12,
Complex * u11t,
Complex * u12t,
Complex_f * u11t_f,
Complex_f * u12t_f,
Complex * gamval,
Complex_f * gamval_f,
int * gamin,
double * dk4m,
double * dk4p,
float * dk4m_f,
float * dk4p_f,
unsigned int * iu,
unsigned int * id )

Initialises the system.

Parameters
istart: Zero for cold, >0 for hot, <0 for none
ibound: Boundary conditions: -1 for antiperiodic in time, otherwise periodic
iread: Configuration to read in. Zero for none
beta: Inverse gauge coupling
fmu: Chemical potential
akappa: Hopping parameter
ajq: Diquark source
u11,u12: Gauge fields
u11t,u12t: Trial gauge fields
u11t_f,u12t_f: Trial gauge fields (single precision)
dk4m: \(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p: \(\left(1-\gamma_0\right)e^\mu\)
dk4m_f: \(\left(1+\gamma_0\right)e^{-\mu}\) float
dk4p_f: \(\left(1-\gamma_0\right)e^\mu\) float
iu,id: Up and down halo indices
gamin: Gamma matrix indices
gamval: Double precision gamma matrices rescaled by kappa
gamval_f: Single precision gamma matrices rescaled by kappa
Returns
Zero on success, integer error code otherwise

Definition at line 19 of file su2hmc.c.

22 {
23 /*
24 * Initialises the system
25 *
26 * Calls:
27 * ======
28 * Addrc, Check_addr, ran2, DHalo_swap_dir, Par_sread, Par_ranset, Reunitarise
29 *
30 * Globals:
31 * =======
32 * Complex gamval: Gamma Matrices
33 * Complex_f gamval_f: Float Gamma matrices:
34 *
35 * Parameters:
36 * ==========
37 * int istart: Zero for cold, >1 for hot, <1 for none
38 * int ibound: Periodic boundary conditions
39 * int iread: Read configuration from file
40 * float beta: beta
41 * float fmu: Chemical potential
42 * float akappa: Hopping parameter
43 * Complex_f ajq: Diquark source
44 * Complex *u11: First colour field
45 * Complex *u12: Second colour field
46 * Complex *u11t: First colour trial field
47 * Complex *u12t: Second colour trial field
48 * Complex_f *u11t_f: First float trial field
49 * Complex_f *u12t_f: Second float trial field
50 * double *dk4m $exp(-\mu)$
51 * double *dk4p: $exp(\mu)$
52 * float *dk4m_f: $exp(-\mu)$ float
53 * float *dk4p_f: $exp(\mu)$ float
54 * unsigned int *iu: Up halo indices
55 * unsigned int *id: Down halo indices
56 *
57 * Returns:
58 * =======
59 * Zero on success, integer error code otherwise
60 */
61 const char *funcname = "Init";
62
63#ifdef _OPENMP
64 omp_set_num_threads(nthreads);
65#ifdef __INTEL_MKL__
66 mkl_set_num_threads(nthreads);
67#endif
68#endif
69 //First things first, calculate a few constants for coordinates
70 Addrc(iu, id);
 71 //And confirm they're legit
 72 Check_addr(iu, ksize, ksizet, 0, kvol+halo);
 73 Check_addr(id, ksize, ksizet, 0, kvol+halo);
 74#ifdef _DEBUG
75 printf("Checked addresses\n");
76#endif
77 double chem1=exp(fmu); double chem2 = 1/chem1;
78 //CUDA this. Only limit will be the bus speed
79#pragma omp parallel for simd aligned(dk4m,dk4p:AVX)
80 for(int i = 0; i<kvol; i++){
81 dk4p[i]=akappa*chem1;
82 dk4m[i]=akappa*chem2;
83 }
84 //Anti periodic Boundary Conditions. Flip the terms at the edge of the time
85 //direction
86 if(ibound == -1 && pcoord[3+ndim*rank]==npt-1){
87#ifdef _DEBUG
88 printf("Implementing antiperiodic boundary conditions on rank %i\n", rank);
89#endif
90#pragma omp parallel for simd aligned(dk4m,dk4p:AVX)
91 for(int k= kvol-1; k>=kvol-kvol3; k--){
92 //int k = kvol - kvol3 + i;
93 dk4p[k]*=-1;
94 dk4m[k]*=-1;
95 }
96 }
97 //These are constant so swap the halos when initialising and be done with it
98 //May need to add a synchronisation statement here first
99#if(npt>1)
100 DHalo_swap_dir(dk4p, 1, 3, UP);
101 DHalo_swap_dir(dk4m, 1, 3, UP);
102#endif
103 //Float versions
104#ifdef __NVCC__
105 cuReal_convert(dk4p_f,dk4p,kvol+halo,true,dimBlock,dimGrid);
106 cuReal_convert(dk4m_f,dk4m,kvol+halo,true,dimBlock,dimGrid);
107#else
108#pragma omp parallel for simd aligned(dk4m,dk4p,dk4m_f,dk4p_f:AVX)
109 for(int i=0;i<kvol+halo;i++){
110 dk4p_f[i]=(float)dk4p[i];
111 dk4m_f[i]=(float)dk4m[i];
112 }
113#endif
114 int __attribute__((aligned(AVX))) gamin_t[4][4] = {{3,2,1,0},{3,2,1,0},{2,3,0,1},{2,3,0,1}};
115 //Gamma Matrices in Chiral Representation
116 //Gattringer and Lang have a nice crash course in appendix A.2 of
117 //Quantum Chromodynamics on the Lattice (530.14 GAT)
118 //_t is for temp. We copy these into the real gamvals later
119#ifdef __NVCC__
120 cudaMemcpy(gamin,gamin_t,4*4*sizeof(int),cudaMemcpyDefault);
121#else
122 memcpy(gamin,gamin_t,4*4*sizeof(int));
123#endif
124 Complex __attribute__((aligned(AVX))) gamval_t[5][4] = {{-I,-I,I,I},{-1,1,1,-1},{-I,I,I,-I},{1,1,1,1},{1,1,-1,-1}};
125 //Each gamma matrix is rescaled by akappa by flattening the gamval array
126#if defined USE_BLAS
127 //Don't cuBLAS this. It is small and won't saturate the GPU. Let the CPU handle
128 //it and just copy it later
129 cblas_zdscal(5*4, akappa, gamval_t, 1);
130#else
131#pragma omp parallel for simd collapse(2) aligned(gamval,gamval_f:AVX)
132 for(int i=0;i<5;i++)
133 for(int j=0;j<4;j++)
134 gamval_t[i][j]*=akappa;
135#endif
136
137#ifdef __NVCC__
138 cudaMemcpy(gamval,gamval_t,5*4*sizeof(Complex),cudaMemcpyDefault);
139 cuComplex_convert(gamval_f,gamval,20,true,dimBlockOne,dimGridOne);
140#else
141 memcpy(gamval,gamval_t,5*4*sizeof(Complex));
142 for(int i=0;i<5*4;i++)
143 gamval_f[i]=(Complex_f)gamval[i];
144#endif
145 if(iread){
146 if(!rank) printf("Calling Par_sread() for configuration: %i\n", iread);
147 Par_sread(iread, beta, fmu, akappa, ajq,u11,u12,u11t,u12t);
148 Par_ranset(&seed,iread);
149 }
150 else{
151 Par_ranset(&seed,iread);
152 if(istart==0){
 153 //Initialise a cold start: set every link to the identity,
 154 //u11t=1 and u12t=0
155#pragma omp parallel for simd aligned(u11t:AVX)
156 //Leave it to the GPU?
157 for(int i=0; i<kvol*ndim;i++){
158 u11t[i]=1; u12t[i]=0;
159 }
160 }
161 else if(istart>0){
162 //Ideally, we can use gsl_ranlux as the PRNG
163#ifdef __RANLUX__
164 for(int i=0; i<kvol*ndim;i++){
165 u11t[i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
166 u12t[i]=2*(gsl_rng_uniform(ranlux_instd)-0.5+I*(gsl_rng_uniform(ranlux_instd)-0.5));
167 }
168 //If not, the Intel Vectorise Mersenne Twister
169#elif (defined __INTEL_MKL__&&!defined USE_RAN2)
170 //Good news, casting works for using a double to create random complex numbers
171 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*ndim*kvol, u11t, -1, 1);
172 vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, 2*ndim*kvol, u12t, -1, 1);
173 //Last resort, Numerical Recipes' Ran2
174#else
175 for(int i=0; i<kvol*ndim;i++){
176 u11t[i]=2*(ran2(&seed)-0.5+I*(ran2(&seed)-0.5));
177 u12t[i]=2*(ran2(&seed)-0.5+I*(ran2(&seed)-0.5));
178 }
179#endif
180 }
181 else
182 fprintf(stderr,"Warning %i in %s: Gauge fields are not initialised.\n", NOINIT, funcname);
183
184#ifdef __NVCC__
185 int device=-1;
186 cudaGetDevice(&device);
187 cudaMemPrefetchAsync(u11t, ndim*kvol*sizeof(Complex),device,streams[0]);
188 cudaMemPrefetchAsync(u12t, ndim*kvol*sizeof(Complex),device,streams[1]);
189#endif
190 //Send trials to accelerator for reunitarisation
191 Reunitarise(u11t,u12t);
192 //Get trials back
193#ifdef __NVCC__
194 cudaMemcpyAsync(u11,u11t,ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[0]);
195 cudaMemPrefetchAsync(u11, ndim*kvol*sizeof(Complex),device,streams[0]);
196 cudaMemcpyAsync(u12,u12t,ndim*kvol*sizeof(Complex),cudaMemcpyDefault,streams[1]);
197 cudaMemPrefetchAsync(u12, ndim*kvol*sizeof(Complex),device,streams[1]);
198#else
199 memcpy(u11, u11t, ndim*kvol*sizeof(Complex));
200 memcpy(u12, u12t, ndim*kvol*sizeof(Complex));
201#endif
202 }
203#ifdef _DEBUG
204 printf("Initialisation Complete\n");
205#endif
206 return 0;
207}
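
Editorial summary of two details in the listing above. The hopping terms are initialised as

\[
\texttt{dk4p}(i)=\kappa\, e^{+\mu},\qquad \texttt{dk4m}(i)=\kappa\, e^{-\mu},
\]

with both negated on the final timeslice (on the rank that owns it) when ibound is -1, giving antiperiodic boundary conditions in time. The \(\gamma\)-matrices are stored sparsely: in the chiral representation each row has exactly one non-zero entry, so only its value and column index are kept,

\[
(\gamma_\mu)_{ij} = \texttt{gamval}[\mu][i]\;\delta_{j,\;\texttt{gamin}[\mu][i]},
\]

with gamval rescaled by \(\kappa\) before being copied out.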
int Check_addr(unsigned int *table, int lns, int lnt, int imin, int imax)
Definition coord.c:334
int Addrc(unsigned int *iu, unsigned int *id)
Loads the addresses required during the update.
Definition coord.c:16
#define UP
Flag for send up.
Definition par_mpi.h:37
int Par_sread(const int iread, const float beta, const float fmu, const float akappa, const Complex_f ajq, Complex *u11, Complex *u12, Complex *u11t, Complex *u12t)
Reads and assigns the gauges from file.
Definition par_mpi.c:127
int DHalo_swap_dir(double *d, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
int * pcoord
The processor grid.
Definition par_mpi.c:19
int Par_ranset(long *seed, int iread)
Uses the rank to get a new seed. Copying from the FORTRAN description here c create new seeds in rang...
Definition random.c:135
double ran2(long *idum)
Generates uniformly distributed random double between zero and one as described in numerical recipes....
Definition random.c:440
long seed
RAN2 seed.
Definition random.c:27
#define ksizet
Sublattice t extent.
Definition sizes.h:149
#define nthreads
Number of threads for OpenMP, which can be overwritten at runtime.
Definition sizes.h:135
#define ksize
Sublattice spatial extent for a cubic lattice.
Definition sizes.h:146
#define kvol3
Sublattice spatial volume.
Definition sizes.h:156
#define npt
Processor grid t extent.
Definition sizes.h:124
int Reunitarise(Complex *u11t, Complex *u12t)
Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
Definition matrices.c:904

References Addrc(), AVX, Check_addr(), Complex, Complex_f, DHalo_swap_dir(), halo, ksize, ksizet, kvol, kvol3, ndim, npt, nthreads, Par_ranset(), Par_sread(), pcoord, ran2(), rank, Reunitarise(), seed, and UP.


◆ Measure()

int Measure ( double * pbp,
double * endenf,
double * denf,
Complex * qq,
Complex * qbqb,
double res,
int * itercg,
Complex * u11t,
Complex * u12t,
Complex_f * u11t_f,
Complex_f * u12t_f,
unsigned int * iu,
unsigned int * id,
Complex * gamval,
Complex_f * gamval_f,
int * gamin,
double * dk4m,
double * dk4p,
float * dk4m_f,
float * dk4p_f,
Complex_f jqq,
float akappa,
Complex * Phi,
Complex * R1 )

Calculate fermion expectation values via a noisy estimator.

Matrix inversion via the conjugate gradient algorithm: solves \(Mx=X_1\) (Numerical Recipes section 2.10, pp. 70-73) using the new lookup tables. Implemented in Congradp().

Parameters
pbp: \(\langle\bar{\Psi}\Psi\rangle\)
endenf: Energy density
denf: Number density
qq: Diquark condensate
qbqb: Antidiquark condensate
res: Conjugate gradient residue
itercg: Iterations of conjugate gradient
u11t,u12t: Double precision gauge fields
u11t_f,u12t_f: Single precision gauge fields
iu,id: Lattice indices
gamval: Double precision gamma matrices rescaled by kappa
gamval_f: Single precision gamma matrices rescaled by kappa
gamin: Indices for Dirac terms
dk4m: \(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p: \(\left(1-\gamma_0\right)e^\mu\)
dk4m_f: \(\left(1+\gamma_0\right)e^{-\mu}\) float
dk4p_f: \(\left(1-\gamma_0\right)e^\mu\) float
jqq: Diquark source
akappa: Hopping parameter
Phi: Pseudofermion field
R1: A useful array for holding things that was already assigned in main. In particular, we use it to catch the output of \(M^\dagger\Xi\) before the inversion, then to store the output of the inversion
Returns
Zero on success, integer error code otherwise

Definition at line 8 of file fermionic.c.

11 {
12 /*
13 * @brief Calculate fermion expectation values via a noisy estimator
14 *
15 * Matrix inversion via conjugate gradient algorithm
 16 * Solves @f$Mx=x_1@f$
 17 * (Numerical Recipes section 2.10 pp.70-73)
 18 * uses new lookup tables
 19 * Implemented in Congradp
20 *
 21 * @param pbp: @f$\langle\bar{\Psi}\Psi\rangle@f$
22 * @param endenf: Energy density
23 * @param denf: Number Density
24 * @param qq: Diquark condensate
25 * @param qbqb: Antidiquark condensate
26 * @param res: Conjugate Gradient Residue
27 * @param itercg: Iterations of Conjugate Gradient
 28 * @param u11t,u12t Double precision gauge fields
29 * @param u11t_f,u12t_f: Single precision gauge fields
30 * @param iu,id Lattice indices
31 * @param gamval_f: Gamma matrices
32 * @param gamin: Indices for Dirac terms
33 * @param dk4m_f: $exp(-\mu)$ float
34 * @param dk4p_f: $exp(\mu)$ float
35 * @param jqq: Diquark source
36 * @param akappa: Hopping parameter
37 * @param Phi: Pseudofermion field
38 * @param R1: A useful array for holding things that was already assigned in main.
39 * In particular, we'll be using it to catch the output of
40 * @f$ M^\dagger\Xi@f$ before the inversion, then used to store the
41 * output of the inversion
42 *
43 * @return Zero on success, integer error code otherwise
44 */
45 const char *funcname = "Measure";
46 //This x is just a storage container
47
48#ifdef __NVCC__
49 int device=-1;
50 cudaGetDevice(&device);
51 Complex *x, *xi; Complex_f *xi_f, *R1_f;
52 #ifdef _DEBUG
53 cudaMallocManaged((void **)&R1_f,kfermHalo*sizeof(Complex_f), cudaMemAttachGlobal);
54 #else
55 cudaMallocAsync((void **)&R1_f,kfermHalo*sizeof(Complex_f),streams[1]);
56 #endif
57 cudaMallocManaged((void **)&x,kfermHalo*sizeof(Complex), cudaMemAttachGlobal);
58 cudaMallocManaged((void **)&xi,kferm*sizeof(Complex), cudaMemAttachGlobal);
59 cudaMallocManaged((void **)&xi_f,kfermHalo*sizeof(Complex_f), cudaMemAttachGlobal);
60#else
61 Complex *x =(Complex *)aligned_alloc(AVX,kfermHalo*sizeof(Complex));
62 Complex *xi =(Complex *)aligned_alloc(AVX,kferm*sizeof(Complex));
63 Complex_f *xi_f =(Complex_f *)aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
64 Complex_f *R1_f = (Complex_f *)aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
65#endif
66 //Setting up noise.
67#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
68 Gauss_c(xi_f, kferm, 0, (float)(1/sqrt(2)));
69#else
70 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, xi_f, 0, 1/sqrt(2));
71#endif
72#ifdef __NVCC__
73 cudaMemPrefetchAsync(xi_f,kferm*sizeof(Complex_f),device,streams[0]);
74 cuComplex_convert(xi_f,xi,kferm,false,dimBlock,dimGrid);
75 //Transpose needed here for Dslashd
76 Transpose_c(xi_f,ngorkov*nc,kvol,dimGrid,dimBlock);
77 //Flip all the gauge fields around so memory is coalesced
78 Transpose_c(u11t_f,ndim,kvol,dimGrid,dimBlock);
79 Transpose_c(u12t_f,ndim,kvol,dimGrid,dimBlock);
80 cudaMemcpyAsync(x, xi, kferm*sizeof(Complex),cudaMemcpyDefault,0);
81#else
82#pragma omp parallel for simd aligned(xi,xi_f:AVX)
83 for(int i=0;i<kferm;i++)
84 xi[i]=(Complex)xi_f[i];
85 memcpy(x, xi, kferm*sizeof(Complex));
86#endif
87 //R_1= @f$M^\dagger\Xi@f$
88 //R1 is local in FORTRAN but since its going to be reset anyway I'm going to recycle the
89 //global
90 Dslashd_f(R1_f,xi_f,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa);
91#ifdef __NVCC__
92 cudaDeviceSynchronise();
93 cudaFree(xi_f);
94 Transpose_c(R1_f,kvol,ngorkov*nc,dimGrid,dimBlock);
95 Transpose_c(u11t_f,kvol,ndim,dimGrid,dimBlock);
96 Transpose_c(u12t_f,kvol,ndim,dimGrid,dimBlock);
97 cuComplex_convert(R1_f,R1,kferm,false,dimBlock,dimGrid);
98 cudaMemcpy(Phi, R1, kferm*sizeof(Complex),cudaMemcpyDefault);
99#else
100#pragma omp parallel for simd aligned(R1,R1_f:AVX)
101 for(int i=0;i<kferm;i++)
102 R1[i]=(Complex)R1_f[i];
103 //Copying R1 to the first (zeroth) flavour index of Phi
104 //This should be safe with memcpy since the pointer name
105 //references the first block of memory for that pointer
106 memcpy(Phi, R1, kferm*sizeof(Complex));
107#endif
108 //Evaluate xi = (M^† M)^-1 R_1
109 // Congradp(0, res, R1_f, itercg);
110 //If the conjugate gradient fails to converge for some reason, restart it.
111 //That's causing issues with NaN's. Plan B is to not record the measurements.
112 if(Congradp(0, res, Phi, R1,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,itercg)==ITERLIM)
113 return ITERLIM;
114 //itercg=0;
115 //if(!rank) fprintf(stderr, "Restarting conjugate gradient from %s\n", funcname);
116 //Congradp(0, res, Phi, R1_f,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,itercg);
117 //itercg+=niterc;
118 /*
119#pragma omp parallel for simd aligned(R1,R1_f:AVX)
120for(int i=0;i<kferm;i++)
121xi[i]=(Complex)R1_f[i];
122*/
123#ifdef __NVCC__
124 cudaMemcpyAsync(xi,R1,kferm*sizeof(Complex),cudaMemcpyDefault,streams[0]);
125 #ifdef _DEBUG
126 cudaFree(R1_f);
127 #else
128 cudaFreeAsync(R1_f,streams[1]);
129 #endif
130#else
131 memcpy(xi,R1,kferm*sizeof(Complex));
132 free(xi_f); free(R1_f);
133#endif
134#ifdef USE_BLAS
135 Complex buff;
136#ifdef __NVCC__
137 cublasZdotc(cublas_handle,kferm,(cuDoubleComplex *)x,1,(cuDoubleComplex *)xi,1,(cuDoubleComplex *)&buff);
138 cudaDeviceSynchronise();
139#elif defined USE_BLAS
140 cblas_zdotc_sub(kferm, x, 1, xi, 1, &buff);
141#endif
142 *pbp=creal(buff);
143#else
144 *pbp = 0;
145#pragma unroll
146 for(int i=0;i<kferm;i++)
147 *pbp+=creal(conj(x[i])*xi[i]);
148#endif
149#if(nproc>1)
150 Par_dsum(pbp);
151#endif
152 *pbp/=4*gvol;
153
154 *qbqb=*qq=0;
155#if defined USE_BLAS
156 for(int idirac = 0; idirac<ndirac; idirac++){
157 int igork=idirac+4;
158 //Unrolling the colour indices, Then its just (γ_5*x)*Ξ or (γ_5*Ξ)*x
159#pragma unroll
160 for(int ic = 0; ic<nc; ic++){
161 Complex dot;
162 //Because we have kvol on the outer index and are summing over it, we set the
163 //step for BLAS to be ngorkov*nc=16.
164 //Does this make sense to do on the GPU?
165#ifdef __NVCC__
166 cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+idirac*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+igork*nc+ic), ngorkov*nc,(cuDoubleComplex *)&dot);
167#else
168 cblas_zdotc_sub(kvol, &x[idirac*nc+ic], ngorkov*nc, &xi[igork*nc+ic], ngorkov*nc, &dot);
169#endif
170 *qbqb+=gamval[4*ndirac+idirac]*dot;
171#ifdef __NVCC__
172 cublasZdotc(cublas_handle,kvol,(cuDoubleComplex *)(x+igork*nc+ic),ngorkov*nc,(cuDoubleComplex *)(xi+idirac*nc+ic), ngorkov*nc,(cuDoubleComplex *)&dot);
173#else
174 cblas_zdotc_sub(kvol, &x[igork*nc+ic], ngorkov*nc, &xi[idirac*nc+ic], ngorkov*nc, &dot);
175#endif
176 *qq-=gamval[4*ndirac+idirac]*dot;
177 }
178 }
179#else
180#pragma unroll(2)
181 for(int i=0; i<kvol; i++)
182 //What is the optimal order to evaluate these in?
183 for(int idirac = 0; idirac<ndirac; idirac++){
184 int igork=idirac+4;
185 *qbqb+=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+idirac)*nc])*xi[(i*ngorkov+igork)*nc];
186 *qq-=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+igork)*nc])*xi[(i*ngorkov+idirac)*nc];
187 *qbqb+=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+idirac)*nc+1])*xi[(i*ngorkov+igork)*nc+1];
188 *qq-=gamval[4*ndirac+idirac]*conj(x[(i*ngorkov+igork)*nc+1])*xi[(i*ngorkov+idirac)*nc+1];
189 }
190#endif
191 //In the FORTRAN Code dsum was used instead despite qq and qbqb being complex
192 //Since we only care about the real part this shouldn't cause (m)any serious issues
193#if(nproc>1)
194 Par_dsum((double *)qq); Par_dsum((double *)qbqb);
195#endif
196 *qq=(*qq+*qbqb)/(2*gvol);
197 Complex xu, xd, xuu, xdd;
198 xu=xd=xuu=xdd=0;
199
200 //Halos
201#if(npt>1)
202 ZHalo_swap_dir(x,16,3,DOWN); ZHalo_swap_dir(x,16,3,UP);
203#endif
204 //Pesky halo exchange indices again
205 //The halo exchange for the trial fields was done already at the end of the trajectory
206 //No point doing it again
207
208 //Instead of typing id[i*ndim+3] a lot, we'll just assign them to variables.
209 //Idea. One loop instead of two loops but for xuu and xdd just use ngorkov-(igorkov+1) instead
210 //Dirty CUDA work around since it won't convert thrust<complex> to double
211 //TODO: get a reduction routine ready for CUDA
212#ifndef __NVCC__
213#pragma omp parallel for reduction(+:xd,xu,xdd,xuu)
214#endif
215 for(int i = 0; i<kvol; i++){
216 int did=id[3+ndim*i];
217 int uid=iu[3+ndim*i];
218#ifndef __NVCC__
219#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xu)
220#endif
221 for(int igorkov=0; igorkov<4; igorkov++){
222 int igork1=gamin[3*ndirac+igorkov];
223 //For the C Version I'll try and factorise where possible
224 xu+=dk4p[did]*(conj(x[(did*ngorkov+igorkov)*nc])*(\
225 u11t[did*ndim+3]*(xi[(i*ngorkov+igork1)*nc]-xi[(i*ngorkov+igorkov)*nc])+\
226 u12t[did*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]-xi[(i*ngorkov+igorkov)*nc+1]) )+\
227 conj(x[(did*ngorkov+igorkov)*nc+1])*(\
228 conj(u11t[did*ndim+3])*(xi[(i*ngorkov+igork1)*nc+1]-xi[(i*ngorkov+igorkov)*nc+1])+\
229 conj(u12t[did*ndim+3])*(xi[(i*ngorkov+igorkov)*nc]-xi[(i*ngorkov+igork1)*nc])));
230 }
231#ifndef __NVCC__
232#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xd)
233#endif
234 for(int igorkov=0; igorkov<4; igorkov++){
235 int igork1=gamin[3*ndirac+igorkov];
236 xd+=dk4m[i]*(conj(x[(uid*ngorkov+igorkov)*nc])*(\
237 conj(u11t[i*ndim+3])*(xi[(i*ngorkov+igork1)*nc]+xi[(i*ngorkov+igorkov)*nc])-\
238 u12t[i*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]+xi[(i*ngorkov+igorkov)*nc+1]) )+\
239 conj(x[(uid*ngorkov+igorkov)*nc+1])*(\
240 u11t[i*ndim+3]*(xi[(i*ngorkov+igork1)*nc+1]+xi[(i*ngorkov+igorkov)*nc+1])+\
241 conj(u12t[i*ndim+3])*(xi[(i*ngorkov+igorkov)*nc]+xi[(i*ngorkov+igork1)*nc]) ) );
242 }
243#ifndef __NVCC__
244#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xuu)
245#endif
246 for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
247 int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
248 xuu-=dk4m[did]*(conj(x[(did*ngorkov+igorkovPP)*nc])*(\
249 u11t[did*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc]-xi[(i*ngorkov+igorkovPP)*nc])+\
250 u12t[did*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]-xi[(i*ngorkov+igorkovPP)*nc+1]) )+\
251 conj(x[(did*ngorkov+igorkovPP)*nc+1])*(\
252 conj(u11t[did*ndim+3])*(xi[(i*ngorkov+igork1PP)*nc+1]-xi[(i*ngorkov+igorkovPP)*nc+1])+\
253 conj(u12t[did*ndim+3])*(xi[(i*ngorkov+igorkovPP)*nc]-xi[(i*ngorkov+igork1PP)*nc]) ) );
254 }
255#ifndef __NVCC__
256#pragma omp simd aligned(u11t,u12t,xi,x,dk4m,dk4p:AVX) reduction(+:xdd)
257#endif
258 for(int igorkovPP=4; igorkovPP<8; igorkovPP++){
259 int igork1PP=4+gamin[3*ndirac+igorkovPP-4];
260 xdd-=dk4p[i]*(conj(x[(uid*ngorkov+igorkovPP)*nc])*(\
261 conj(u11t[i*ndim+3])*(xi[(i*ngorkov+igork1PP)*nc]+xi[(i*ngorkov+igorkovPP)*nc])-\
262 u12t[i*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]+xi[(i*ngorkov+igorkovPP)*nc+1]) )+\
263 conj(x[(uid*ngorkov+igorkovPP)*nc+1])*(\
264 u11t[i*ndim+3]*(xi[(i*ngorkov+igork1PP)*nc+1]+xi[(i*ngorkov+igorkovPP)*nc+1])+\
265 conj(u12t[i*ndim+3])*(xi[(i*ngorkov+igorkovPP)*nc]+xi[(i*ngorkov+igork1PP)*nc]) ) );
266 }
267 }
268 *endenf=creal(xu-xd-xuu+xdd);
269 *denf=creal(xu+xd+xuu+xdd);
270
271#if(nproc>1)
272 Par_dsum(endenf); Par_dsum(denf);
273#endif
274 *endenf/=2*gvol; *denf/=2*gvol;
275 //Future task. Chiral susceptibility measurements
276#ifdef __NVCC__
277 cudaFree(x); cudaFree(xi);
278#else
279 free(x); free(xi);
280#endif
281 return 0;
282}
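
Editorial note on the estimator: with Gaussian noise \(\xi\) and \(R_1=M^\dagger\xi\), the inversion yields \(M^{-1}\xi=(M^\dagger M)^{-1}M^\dagger\xi\), so the condensate computed above is the standard noisy estimate

\[
\langle\bar{\Psi}\Psi\rangle \approx \frac{1}{4V}\,\mathrm{Re}\!\left(\xi^\dagger M^{-1}\xi\right),
\]

with the normalisation \(4V\) (V being gvol) used by the code; at the point the dot product is taken, x holds the original noise and xi holds the solution.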
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single precision complex numbers using the Box-Mull...
Definition random.c:260
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves The matrix multipl...
Definition congrad.c:262

References AVX, Complex, Complex_f, Congradp(), DOWN, Dslashd_f(), Gauss_c(), gvol, kferm, kfermHalo, kvol, nc, ndim, ndirac, ngorkov, Par_dsum(), UP, and ZHalo_swap_dir().


◆ Polyakov()

double Polyakov ( Complex_f * u11t,
Complex_f * u12t )

Calculate the Polyakov loop (no prizes for guessing that one...)

Parameters
u11t,u12t: The gauge fields
See also
Par_tmul, Par_dsum
Returns
Double corresponding to the Polyakov loop

Definition at line 105 of file bosonic.c.

105 {
106 /*
107 * Calculate the Polyakov loop (no prizes for guessing that one...)
108 *
109 * Parameters:
110 * =========
111 * u11t, u12t The gauge fields
112 *
113 * Calls:
114 * ======
115 * Par_tmul, Par_dsum
116 *
117 * Return:
118 * ======
119 * Double corresponding to the polyakov loop
120 */
121 const char *funcname = "Polyakov";
122 double poly = 0;
123#ifdef __NVCC__
124 int device=-1;
125 cudaGetDevice(&device);
126 Complex_f *Sigma11,*Sigma12;
127 cudaMallocManaged((void **)&Sigma11,kvol3*sizeof(Complex_f),cudaMemAttachGlobal);
128#ifdef _DEBUG
129 cudaMallocManaged((void **)&Sigma12,kvol3*sizeof(Complex_f),cudaMemAttachGlobal);
130#else
131 cudaMallocAsync((void **)&Sigma12,kvol3*sizeof(Complex_f),streams[0]);
132#endif
133#else
134 Complex_f *Sigma11 = aligned_alloc(AVX,kvol3*sizeof(Complex_f));
135 Complex_f *Sigma12 = aligned_alloc(AVX,kvol3*sizeof(Complex_f));
136#endif
137
138 //Extract the time component from each site and save in corresponding Sigma
139#ifdef __NVCC__
140 cublasCcopy(cublas_handle,kvol3, (cuComplex *)(u11t+3), ndim, (cuComplex *)Sigma11, 1);
141 cublasCcopy(cublas_handle,kvol3, (cuComplex *)(u12t+3), ndim, (cuComplex *)Sigma12, 1);
142#elif defined USE_BLAS
143 cblas_ccopy(kvol3, u11t+3, ndim, Sigma11, 1);
144 cblas_ccopy(kvol3, u12t+3, ndim, Sigma12, 1);
145#else
146 for(int i=0; i<kvol3; i++){
147 Sigma11[i]=u11t[i*ndim+3];
148 Sigma12[i]=u12t[i*ndim+3];
149 }
150#endif
151 /* Some Fortran commentary
152 Changed this routine.
153 u11t and u12t now defined as normal ie (kvol+halo,4).
154 Copy of Sigma11 and Sigma12 is changed so that it copies
155 in blocks of ksizet.
156 Variable indexu also used to select correct element of u11t and u12t
157 in loop 10 below.
158
159 Change the order of multiplication so that it can
160 be done in parallel. Start at t=1 and go up to t=T:
161 previously started at t+T and looped back to 1, 2, ... T-1
162 Buffers
163 There is a dependency. Can only parallelise the inner loop
164 */
165#ifdef __NVCC__
166 cudaDeviceSynchronise();
167 cuPolyakov(Sigma11,Sigma12,u11t,u12t,dimGrid,dimBlock);
168 cudaMemPrefetchAsync(Sigma11,kvol3*sizeof(Complex_f),cudaCpuDeviceId,NULL);
169#else
170#pragma unroll
171 for(int it=1;it<ksizet;it++)
172#pragma omp parallel for simd aligned(u11t,u12t,Sigma11,Sigma12:AVX)
173 for(int i=0;i<kvol3;i++){
174 //Seems a bit more efficient to increment indexu instead of reassigning
175 //it every single loop
176 int indexu=it*kvol3+i;
177 Complex_f a11=Sigma11[i]*u11t[indexu*ndim+3]-Sigma12[i]*conj(u12t[indexu*ndim+3]);
178 //Instead of having to store a second buffer just assign it directly
179 Sigma12[i]=Sigma11[i]*u12t[indexu*ndim+3]+Sigma12[i]*conj(u11t[indexu*ndim+3]);
180 Sigma11[i]=a11;
181 }
182 //Multiply this partial loop with the contributions of the other cores in the
183 //Time-like dimension
184#endif
185 //End of CUDA-CPU pre-processor for evaluating Polyakov
186 //
187 //Par_tmul does nothing if there is only a single processor in the time direction. So we only compile
188 //its call if it is required
189#if (npt>1)
 190#ifdef __NVCC__
 191#error Par_tmul is not yet implemented in CUDA as Sigma12 is device-only memory
192#endif
193#ifdef _DEBUG
194 printf("Multiplying with MPI\n");
195#endif
196 Par_tmul(Sigma11, Sigma12);
197 //end of #if(npt>1)
198#endif
 199 /*Now all cores have the value for the complete Polyakov line at all spatial sites
 200 We need to globally sum over spatial processors but not across time as these
 201 are duplicates. So we zero the value for all but t=0
202 This is (according to the FORTRAN code) a bit of a hack
203 I will expand on this hack and completely avoid any work
204 for this case rather than calculating everything just to set it to zero
205 */
 206 if(!pcoord[3+rank*ndim]){ //Braced so the loop, not just the synchronise, is guarded under CUDA
207#ifdef __NVCC__
208 cudaDeviceSynchronise();
209#pragma omp parallel for simd reduction(+:poly)
210#else
211#pragma omp parallel for simd reduction(+:poly) aligned(Sigma11:AVX)
212#endif
213 for(int i=0;i<kvol3;i++)
 214 poly+=creal(Sigma11[i]); }
215#ifdef __NVCC__
216 cudaFree(Sigma11);
217#ifdef _DEBUG
218 cudaFree(Sigma12);
219#else
220 cudaFreeAsync(Sigma12,streams[0]);
221#endif
222#else
223 free(Sigma11); free(Sigma12);
224#endif
225
226#if(nproc>1)
227 Par_dsum(&poly);
228#endif
229 poly/=gvol3;
230 return poly;
231}
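
Editorial note: up to the SU(2) trace normalisation (for a link stored as the pair (u11, u12), \(\mathrm{Re}\,u_{11}\) is half the trace), the returned value is the volume-averaged Polyakov loop

\[
L = \frac{1}{V_3}\sum_{\vec{x}}\,\mathrm{Re}\,\mathrm{tr}\prod_{t=0}^{N_t-1} U_4(\vec{x},t).
\]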
#define gvol3
Lattice spatial volume.
Definition sizes.h:94

References AVX, Complex_f, gvol3, ksizet, kvol3, ndim, Par_dsum(), pcoord, and rank.


◆ Reunitarise()

int Reunitarise ( Complex * u11t,
Complex * u12t )
inline

Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.

If you're looking at the FORTRAN code be careful. There are two header files for the /trial/ header. One with u11 u12 (which was included here originally) and the other with u11t and u12t.

Parameters
u11t,u12t: Trial fields to be reunitarised
Returns
Zero on success, integer error code otherwise

Definition at line 904 of file matrices.c.

904 {
905 /*
906 * @brief Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1
907 *
908 * If you're looking at the FORTRAN code be careful. There are two header files
909 * for the /trial/ header. One with u11 u12 (which was included here originally)
910 * and the other with u11t and u12t.
911 *
912 * @see cuReunitarise (CUDA Wrapper)
913 *
914 * @param u11t, u12t Trial fields to be reunitarised
915 *
916 * @return Zero on success, integer error code otherwise
917 */
918 const char *funcname = "Reunitarise";
919#ifdef __NVCC__
920 cuReunitarise(u11t,u12t,dimGrid,dimBlock);
921#else
922#pragma omp parallel for simd aligned(u11t,u12t:AVX)
923 for(int i=0; i<kvol*ndim; i++){
924 //Declaring anorm inside the loop will hopefully let the compiler know it
925 //is safe to vectorise aggressively
926 double anorm=sqrt(conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]);
927 // Exception handling code. May be faster to leave out as the exit prevents vectorisation.
928 // if(anorm==0){
929 // fprintf(stderr, "Error %i in %s on rank %i: anorm = 0 for μ=%i and i=%i.\nExiting...\n\n",
930 // DIVZERO, funcname, rank, mu, i);
931 // MPI_Finalise();
932 // exit(DIVZERO);
933 // }
934 u11t[i]/=anorm;
935 u12t[i]/=anorm;
936 }
937#endif
938 return 0;
939}
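
Editorial note: each link is stored as the pair (u11, u12) representing

\[
U = \begin{pmatrix} u_{11} & u_{12} \\ -u_{12}^* & u_{11}^* \end{pmatrix},\qquad \det U = |u_{11}|^2+|u_{12}|^2,
\]

so dividing both components by \(\sqrt{|u_{11}|^2+|u_{12}|^2}\) restores \(\det U=1\) and hence \(U\in SU(2)\).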

References kvol, and ndim.


◆ SU2plaq()

float SU2plaq ( Complex_f * u11t,
Complex_f * u12t,
unsigned int * iu,
int i,
int mu,
int nu )
inline

Calculates the plaquette at site i in the \(\mu\)-\(\nu\) direction.

Parameters
u11t,u12t: Trial fields
i: Lattice site
iu: Upper halo indices
mu,nu: Plaquette direction. Note that mu and nu can be negative to facilitate calculating plaquettes for Clover terms. No sanity checks are conducted on them in this routine.
Returns
float corresponding to the plaquette value

Definition at line 72 of file bosonic.c.

72 {
73 /*
74 * Calculates the plaquette at site i in the μ-ν direction
75 *
76 * Parameters:
77 * ==========
78 * u11t, u12t: Trial fields
79 * int *iu: Upper halo indices
80 * mu, nu: Plaquette direction. Note that mu and nu can be negative
81 * to facilitate calculating plaquettes for Clover terms. No
82 * sanity checks are conducted on them in this routine.
83 *
84 * Return:
85 * =======
 86 * float corresponding to the plaquette value
87 *
88 */
89 const char *funcname = "SU2plaq";
90 int uidm = iu[mu+ndim*i];
91
92 Complex_f Sigma11=u11t[i*ndim+mu]*u11t[uidm*ndim+nu]-u12t[i*ndim+mu]*conj(u12t[uidm*ndim+nu]);
93 Complex_f Sigma12=u11t[i*ndim+mu]*u12t[uidm*ndim+nu]+u12t[i*ndim+mu]*conj(u11t[uidm*ndim+nu]);
94
95 int uidn = iu[nu+ndim*i];
96 Complex_f a11=Sigma11*conj(u11t[uidn*ndim+mu])+Sigma12*conj(u12t[uidn*ndim+mu]);
97 Complex_f a12=-Sigma11*u12t[uidn*ndim+mu]+Sigma12*u11t[uidn*ndim+mu];
98
99 Sigma11=a11*conj(u11t[i*ndim+nu])+a12*conj(u12t[i*ndim+nu]);
100 // Sigma12[i]=-a11[i]*u12t[i*ndim+nu]+a12*u11t[i*ndim+mu];
101 // Not needed in final result as it traces out
102 return creal(Sigma11);
103}
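
Editorial note: the three multiplications above build the anticlockwise product around the plaquette, and the returned real part is half its trace,

\[
P_{\mu\nu}(i) = \tfrac{1}{2}\,\mathrm{tr}\!\left[U_\mu(i)\,U_\nu(i+\hat{\mu})\,U^\dagger_\mu(i+\hat{\nu})\,U^\dagger_\nu(i)\right] = \mathrm{Re}\,\Sigma_{11},
\]

which is why the final Sigma12 is never needed.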

References Complex_f, and ndim.


◆ UpDownPart()

int UpDownPart ( const int na,
Complex * X0,
Complex * R1 )
inline

Definition at line 380 of file su2hmc.c.

380 {
381#ifdef __NVCC__
382 cuUpDownPart(na,X0,R1,dimBlock,dimGrid);
383 cudaDeviceSynchronise();
384#else
385 //The only reason this was removed from the original function is for diagnostics
386#pragma omp parallel for simd collapse(2) aligned(X0,R1:AVX)
387 for(int i=0; i<kvol; i++)
388 for(int idirac = 0; idirac < ndirac; idirac++){
389 X0[((na*kvol+i)*ndirac+idirac)*nc]=R1[(i*ngorkov+idirac)*nc];
390 X0[((na*kvol+i)*ndirac+idirac)*nc+1]=R1[(i*ngorkov+idirac)*nc+1];
391 }
392#endif
393 return 0;
394}
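
Editorial note: the loop above copies the first ndirac Dirac components of each Gor'kov spinor in R1 into flavour na of X0, i.e. the up/down partitioning used by Congradq(). A minimal usage sketch (hypothetical, but it mirrors how the routine is invoked once per flavour elsewhere in the code):

 //Partition each flavour's spinor into the pseudofermion field X0
 for(int na=0; na<nf; na++)
     UpDownPart(na, X0, R1);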

◆ Z_gather()

int Z_gather ( Complex * x,
Complex * y,
int n,
unsigned int * table,
unsigned int mu )
inline

Extracts all the double precision gauge links in the \(\mu\) direction only.

Parameters
x: The output
y: The gauge field for a particular colour
n: Number of sites in the gauge field. This is typically kvol
table: Table containing information on nearest neighbours. Usually id or iu
mu: Direction we're interested in extracting
Returns
Zero on success, integer error code otherwise

Definition at line 335 of file su2hmc.c.

336{
337 const char *funcname = "Z_gather";
338 //FORTRAN had a second parameter m giving the size of y (kvol+halo) normally
339 //Pointers mean that's not an issue for us so I'm leaving it out
340#ifdef __NVCC__
341 cuZ_gather(x,y,n,table,mu,dimBlock,dimGrid);
342#else
343#pragma omp parallel for simd aligned (x,y,table:AVX)
344 for(int i=0; i<n; i++)
345 x[i]=y[table[i*ndim+mu]*ndim+mu];
346#endif
347 return 0;
348}
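
Editorial note: a sketch of the usual gather-then-exchange pattern, assuming a buffer u11sh of size kvol+halo. This mirrors the single precision C_gather()/CHalo_swap_dir() usage visible in Gauge_force(); the exact call site below is illustrative, not taken from the source:

 //Pull the mu-direction links into a contiguous buffer, then
 //exchange the halo so neighbouring staples can be built locally
 Z_gather(u11sh, u11t, kvol, id, mu);
#if(nproc>1)
 ZHalo_swap_dir(u11sh, 1, mu, DOWN);
#endif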

References ndim.