su2hmc
Loading...
Searching...
No Matches
matrices.c File Reference

Matrix multiplication and related routines. More...

#include <assert.h>
#include <complex.h>
#include <matrices.h>
#include <string.h>
#include <stdalign.h>
Include dependency graph for matrices.c:

Go to the source code of this file.

Functions

int Dslash (Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa)
 Evaluates \(\Phi=M r\) in double precision.
 
int Dslashd (Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa)
 Evaluates \(\Phi=M^\dagger r\) in double precision.
 
int Hdslash (Complex *phi, Complex *r, Complex *ut[2], unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk[2], float akappa)
 Evaluates \(\Phi=M r\) in double precision.
 
int Hdslashd (Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
 Evaluates \(\Phi=M^\dagger r\) in double precision.
 
int Dslash_f (Complex_f *phi, Complex_f *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk_f[2], Complex_f jqq, float akappa)
 Evaluates \(\Phi=M r\) in single precision.
 
int Dslashd_f (Complex_f *phi, Complex_f *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk_f[2], Complex_f jqq, float akappa)
 Evaluates \(\Phi=M^\dagger r\) in single precision.
 
int Hdslash_f (Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
 Evaluates \(\Phi=M r\) in single precision.
 
int Hdslashd_f (Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
 Evaluates \(\Phi=M^\dagger r\) in single precision.
 
int Reunitarise (Complex *ut[2])
 Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
 
void Transpose_c (Complex_f *out, const int fast_in, const int fast_out)
 
void Transpose_z (Complex *out, const int fast_in, const int fast_out)
 
void Transpose_f (float *out, const int fast_in, const int fast_out)
 
void Transpose_d (double *out, const int fast_in, const int fast_out)
 
void Transpose_I (int *out, const int fast_in, const int fast_out)
 
void Transpose_U (unsigned int *out, const int fast_in, const int fast_out)
 

Detailed Description

Matrix multiplication and related routines.

There are two four matrix mutiplication routines, and each had a double and single (_f) version The Hdslash? routines are called when acting on half of the fermions (up/down flavour partitioning) The Dslash routines act on everything

Any routine ending in a d is the daggered multiplication

Definition in file matrices.c.

Function Documentation

◆ Dslash()

int Dslash ( Complex * phi,
Complex * r,
Complex * u11t,
Complex * u12t,
unsigned int * iu,
unsigned int * id,
Complex * gamval,
int * gamin,
double * dk4m,
double * dk4p,
Complex_f jqq,
float akappa )

Evaluates \(\Phi=M r\) in double precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
jqqDiquark source
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 19 of file matrices.c.

20 {
21 /*
22 * @brief Evaluates @f(\Phi=M r@f) in double precision.
23 *
24 * @param phi: The product
25 * @param r: The array being acted on by M
26 * @param u11t: First colour trial field
27 * @param u12t: Second colour trial field
28 * @param iu: Upper halo indices
29 * @param id: Lower halo indices
30 * @param gamval: Gamma matrices
31 * @param gamin: Indices for dirac terms
32 * @param dk4m:
33 * @param dk4p:
34 * @param jqq: Diquark source
35 * @param akappa: Hopping parameter
36 *
37 * @see ZHalo_swap_all (MPI only)
38 *
39 * @return Zero on success, integer error code otherwise
40 */
41 const char *funcname = "Dslash";
42 //Get the halos in order
43#if(nproc>1)
44 ZHalo_swap_all(r, 16);
45#endif
46
47 //Mass term
48 //Diquark Term (antihermitian)
49#ifdef __NVCC__
50 cuDslash(phi,r,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa,dimGrid,dimBlock);
51#else
52 memcpy(phi, r, kferm*sizeof(Complex));
53#pragma omp parallel for
54 for(int i=0;i<kvol;i++){
55#pragma omp simd aligned(phi,r,gamval:AVX)
56 for(int idirac = 0; idirac<ndirac; idirac++){
57 int igork = idirac+4;
58 Complex a_1, a_2;
59 a_1=conj(jqq)*gamval[4*ndirac+idirac];
60 //We subtract a_2, hence the minus
61 a_2=-jqq*gamval[4*ndirac+idirac];
62 phi[(i*ngorkov+idirac)*nc]+=a_1*r[(i*ngorkov+igork)*nc+0];
63 phi[(i*ngorkov+idirac)*nc+1]+=a_1*r[(i*ngorkov+igork)*nc+1];
64 phi[(i*ngorkov+igork)*nc+0]+=a_2*r[(i*ngorkov+idirac)*nc];
65 phi[(i*ngorkov+igork)*nc+1]+=a_2*r[(i*ngorkov+idirac)*nc+1];
66 }
67
68 //Spacelike terms. Here's hoping I haven't put time as the zeroth component somewhere!
69#ifndef NO_SPACE
70 for(int mu = 0; mu <3; mu++){
71 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
72#pragma omp simd aligned(phi,r,u11t,u12t,gamval:AVX)
73 for(int igorkov=0; igorkov<ngorkov; igorkov++){
74 //FORTRAN had mod((igorkov-1),4)+1 to prevent issues with non-zero indexing in the dirac term.
75 int idirac=igorkov%4;
76 int igork1 = (igorkov<4) ? gamin[mu*ndirac+idirac] : gamin[mu*ndirac+idirac]+4;
77 //Can manually vectorise with a pragma?
78 //Wilson + Dirac term in that order. Definitely easier
79 //to read when split into different loops, but should be faster this way
80 phi[(i*ngorkov+igorkov)*nc]+=-akappa*(u11t[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc]+\
81 u12t[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc+1]+\
82 conj(u11t[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]-\
83 u12t[did*ndim+mu]*r[(did*ngorkov+igorkov)*nc+1])+\
84 //Dirac term. Reminder! gamval was rescaled by kappa when we defined it
85 gamval[mu*ndirac+idirac]*(u11t[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc]+\
86 u12t[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc+1]-\
87 conj(u11t[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]+\
88 u12t[did*ndim+mu]*r[(did*ngorkov+igork1)*nc+1]);
89
90 phi[(i*ngorkov+igorkov)*nc+1]+=-akappa*(-conj(u12t[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc]+\
91 conj(u11t[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc+1]+\
92 conj(u12t[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]+\
93 u11t[did*ndim+mu]*r[(did*ngorkov+igorkov)*nc+1])+\
94 //Dirac term
95 gamval[mu*ndirac+idirac]*(-conj(u12t[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc]+\
96 conj(u11t[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc+1]-\
97 conj(u12t[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]-\
98 u11t[did*ndim+mu]*r[(did*ngorkov+igork1)*nc+1]);
99 }
100 }
101 //Timelike terms next. These run from igorkov=0..3 and 4..7 with slightly different rules for each
102 //We can fit it into a single loop by declaring igorkovPP=igorkov+4 instead of looping igorkov=4..7 separately
103 //Note that for the igorkov 4..7 loop idirac=igorkov-4, so we don't need to declare idiracPP separately
104#endif
105 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
106#ifndef NO_TIME
107#pragma omp simd aligned(phi,r,u11t,u12t,dk4m,dk4p:AVX)
108 for(int igorkov=0; igorkov<4; igorkov++){
109 int igorkovPP=igorkov+4; //idirac = igorkov; It is a bit redundant but I'll mention it as that's how
110 //the FORTRAN code did it.
111 int igork1 = gamin[3*ndirac+igorkov]; int igork1PP = igork1+4;
112
113 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
114 phi[(i*ngorkov+igorkov)*nc]+=
115 -dk4p[i]*(u11t[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc]-r[(uid*ngorkov+igork1)*nc])
116 +u12t[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc+1]-r[(uid*ngorkov+igork1)*nc+1]))
117 -dk4m[did]*(conj(u11t[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]+r[(did*ngorkov+igork1)*nc])
118 -u12t[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]+r[(did*ngorkov+igork1)*nc+1]));
119 phi[(i*ngorkov+igorkov)*nc+1]+=
120 -dk4p[i]*(-conj(u12t[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc]-r[(uid*ngorkov+igork1)*nc])
121 +conj(u11t[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc+1]-r[(uid*ngorkov+igork1)*nc+1]))
122 -dk4m[did]*(conj(u12t[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]+r[(did*ngorkov+igork1)*nc])
123 +u11t[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]+r[(did*ngorkov+igork1)*nc+1]));
124
125 //And the +4 terms. Note that dk4p and dk4m swap positions compared to the above
126 phi[(i*ngorkov+igorkovPP)*nc]+=-dk4m[i]*(u11t[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc]-r[(uid*ngorkov+igork1PP)*nc])+\
127 u12t[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc+1]-r[(uid*ngorkov+igork1PP)*nc+1]))-\
128 dk4p[did]*(conj(u11t[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]+r[(did*ngorkov+igork1PP)*nc])-\
129 u12t[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]+r[(did*ngorkov+igork1PP)*nc+1]));
130
131 phi[(i*ngorkov+igorkovPP)*nc+1]+=-dk4m[i]*(conj(-u12t[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc]-r[(uid*ngorkov+igork1PP)*nc])+\
132 conj(u11t[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc+1]-r[(uid*ngorkov+igork1PP)*nc+1]))-\
133 dk4p[did]*(conj(u12t[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]+r[(did*ngorkov+igork1PP)*nc])+\
134 u11t[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]+r[(did*ngorkov+igork1PP)*nc+1]));
135 }
136#endif
137 }
138#endif
139 return 0;
140}
int ZHalo_swap_all(Complex *z, int ncpt)
Calls the functions to send data to both the up and down halos.
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define ndirac
Dirac indices.
Definition sizes.h:177
#define ndim
Dimensions.
Definition sizes.h:179

References Complex, Complex_f, kferm, kvol, nc, ndim, ndirac, ngorkov, and ZHalo_swap_all().

Here is the call graph for this function:

◆ Dslash_f()

int Dslash_f ( Complex_f * phi,
Complex_f * r,
Complex_f * u11t,
Complex_f * u12t,
unsigned int * iu,
unsigned int * id,
Complex_f * gamval,
int * gamin,
float * dk[2],
Complex_f jqq,
float akappa )

Evaluates \(\Phi=M r\) in single precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
jqqDiquark source
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 463 of file matrices.c.

464 {
465 /*
466 * @brief Evaluates @f(\Phi=M r@f) in single precision.
467 *
468 * @param phi: The product
469 * @param r: The array being acted on by M
470 * @param u11t_f: First colour trial field
471 * @param u12t_f: Second colour trial field
472 * @param iu: Upper halo indices
473 * @param id: Lower halo indices
474 * @param gamval_f: Gamma matrices
475 * @param gamin: Indices for dirac terms
476 * @param dk_f[0]:
477 * @param dk_f[1]:
478 * @param jqq: Diquark source
479 * @param akappa: Hopping parameter
480 *
481 * @see CHalo_swap_all (MPI only)
482 *
483 * @return Zero on success, integer error code otherwise
484 */
485 const char *funcname = "Dslash_f";
486 //Get the halos in order
487#if(nproc>1)
488 CHalo_swap_all(r, 16);
489#endif
490
491 //Mass term
492 //Diquark Term (antihermitian)
493#ifdef __NVCC__
494 cuDslash_f(phi,r,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk_f[0],dk_f[1],jqq,akappa,dimGrid,dimBlock);
495#else
496 memcpy(phi, r, kferm*sizeof(Complex_f));
497#pragma omp parallel for
498 for(unsigned int i=0;i<kvol;i++){
499#pragma omp simd aligned(phi,r,gamval_f:AVX)
500 for(unsigned short idirac = 0; idirac<ndirac; idirac++){
501 unsigned short igork = idirac+4;
502 Complex_f a_1, a_2;
503 a_1=conj(jqq)*gamval_f[4*ndirac+idirac];
504 //We subtract a_2, hence the minus
505 a_2=-jqq*gamval_f[4*ndirac+idirac];
506 phi[(i*ngorkov+idirac)*nc]+=a_1*r[(i*ngorkov+igork)*nc+0];
507 phi[(i*ngorkov+idirac)*nc+1]+=a_1*r[(i*ngorkov+igork)*nc+1];
508 phi[(i*ngorkov+igork)*nc+0]+=a_2*r[(i*ngorkov+idirac)*nc];
509 phi[(i*ngorkov+igork)*nc+1]+=a_2*r[(i*ngorkov+idirac)*nc+1];
510 }
511
512 //Spacelike terms. Here's hoping I haven't put time as the zeroth component somewhere!
513#ifndef NO_SPACE
514 for(unsigned short mu = 0; mu <3; mu++){
515 unsigned int did=id[mu+ndim*i]; unsigned int uid = iu[mu+ndim*i];
516#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamval_f,gamin:AVX)
517 for(unsigned short igorkov=0; igorkov<ngorkov; igorkov++){
518 //FORTRAN had mod((igorkov-1),4)+1 to prevent issues with non-zero indexing in the dirac term.
519 unsigned short idirac=igorkov%4;
520 unsigned short igork1 = (igorkov<4) ? gamin[mu*ndirac+idirac] : gamin[mu*ndirac+idirac]+4;
521 //Can manually vectorise with a pragma?
522 //Wilson + Dirac term in that order. Definitely easier
523 //to read when split into different loops, but should be faster this way
524 phi[(i*ngorkov+igorkov)*nc]+=-akappa*(u11t_f[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc]+\
525 u12t_f[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc+1]+\
526 conj(u11t_f[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]-\
527 u12t_f[did*ndim+mu]*r[(did*ngorkov+igorkov)*nc+1])+\
528 //Dirac term. Reminder! gamval was rescaled by kappa when we defined it
529 gamval_f[mu*ndirac+idirac]*(u11t_f[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc]+\
530 u12t_f[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc+1]-\
531 conj(u11t_f[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]+\
532 u12t_f[did*ndim+mu]*r[(did*ngorkov+igork1)*nc+1]);
533
534 phi[(i*ngorkov+igorkov)*nc+1]+=-akappa*(-conj(u12t_f[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc]+\
535 conj(u11t_f[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc+1]+\
536 conj(u12t_f[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]+\
537 u11t_f[did*ndim+mu]*r[(did*ngorkov+igorkov)*nc+1])+\
538 //Dirac term
539 gamval_f[mu*ndirac+idirac]*(-conj(u12t_f[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc]+\
540 conj(u11t_f[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc+1]-\
541 conj(u12t_f[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]-\
542 u11t_f[did*ndim+mu]*r[(did*ngorkov+igork1)*nc+1]);
543 }
544 }
545 //Timelike terms next. These run from igorkov=0..3 and 4..7 with slightly different rules for each
546 //We can fit it into a single loop by declaring igorkovPP=igorkov+4 instead of looping igorkov=4..7 separately
547 //Note that for the igorkov 4..7 loop idirac=igorkov-4, so we don't need to declare idiracPP separately
548#endif
549 unsigned int did=id[3+ndim*i]; unsigned int uid = iu[3+ndim*i];
550#ifndef NO_TIME
551#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamin:AVX)
552 for(unsigned short igorkov=0; igorkov<4; igorkov++){
553 unsigned short igorkovPP=igorkov+4; //idirac = igorkov; It is a bit redundant but I'll mention it as that's how
554 //the FORTRAN code did it.
555 unsigned short igork1 = gamin[3*ndirac+igorkov]; unsigned short igork1PP = igork1+4;
556
557 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
558 phi[(i*ngorkov+igorkov)*nc]+=
559 -dk_f[1][i]*(u11t_f[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc]-r[(uid*ngorkov+igork1)*nc])
560 +u12t_f[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc+1]-r[(uid*ngorkov+igork1)*nc+1]))
561 -dk_f[0][did]*(conj(u11t_f[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]+r[(did*ngorkov+igork1)*nc])
562 -u12t_f[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]+r[(did*ngorkov+igork1)*nc+1]));
563 phi[(i*ngorkov+igorkov)*nc+1]+=
564 -dk_f[1][i]*(-conj(u12t_f[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc]-r[(uid*ngorkov+igork1)*nc])
565 +conj(u11t_f[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc+1]-r[(uid*ngorkov+igork1)*nc+1]))
566 -dk_f[0][did]*(conj(u12t_f[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]+r[(did*ngorkov+igork1)*nc])
567 +u11t_f[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]+r[(did*ngorkov+igork1)*nc+1]));
568
569 //And the +4 terms. Note that dk_f[1] and dk_f[0] swap positions compared to the above
570 phi[(i*ngorkov+igorkovPP)*nc]+=-dk_f[0][i]*(u11t_f[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc]-r[(uid*ngorkov+igork1PP)*nc])
571 +u12t_f[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc+1]-r[(uid*ngorkov+igork1PP)*nc+1]))
572 -dk_f[1][did]*(conj(u11t_f[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]+r[(did*ngorkov+igork1PP)*nc])
573 -u12t_f[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]+r[(did*ngorkov+igork1PP)*nc+1]));
574
575 phi[(i*ngorkov+igorkovPP)*nc+1]+=-dk_f[0][i]*(conj(-u12t_f[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc]-r[(uid*ngorkov+igork1PP)*nc])
576 +conj(u11t_f[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc+1]-r[(uid*ngorkov+igork1PP)*nc+1]))
577 -dk_f[1][did]*(conj(u12t_f[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]+r[(did*ngorkov+igork1PP)*nc])
578 +u11t_f[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]+r[(did*ngorkov+igork1PP)*nc+1]));
579 }
580#endif
581 }
582#endif
583 return 0;
584}
int CHalo_swap_all(Complex_f *c, int ncpt)
Calls the functions to send data to both the up and down halos.
#define Complex_f
Single precision complex number.
Definition sizes.h:56

References CHalo_swap_all(), Complex_f, kferm, kvol, nc, ndim, ndirac, and ngorkov.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Dslashd()

int Dslashd ( Complex * phi,
Complex * r,
Complex * u11t,
Complex * u12t,
unsigned int * iu,
unsigned int * id,
Complex * gamval,
int * gamin,
double * dk4m,
double * dk4p,
Complex_f jqq,
float akappa )

Evaluates \(\Phi=M^\dagger r\) in double precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
jqqDiquark source
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 141 of file matrices.c.

142 {
143 /*
144 * @brief Evaluates @f(\Phi=M^\dagger r@f) in double precision.
145 *
146 * @param phi: The product
147 * @param r: The array being acted on by M
148 * @param u11t: First colour trial field
149 * @param u12t: Second colour trial field
150 * @param iu: Upper halo indices
151 * @param id: Lower halo indices
152 * @param gamval: Gamma matrices
153 * @param gamin: Indices for dirac terms
154 * @param dk4m:
155 * @param dk4p:
156 * @param jqq: Diquark source
157 * @param akappa: Hopping parameter
158 *
159 * @see ZHalo_swap_all (MPI only)
160 *
161 * @return Zero on success, integer error code otherwise
162 */
163 const char *funcname = "Dslashd";
164 //Get the halos in order
165#if(nproc>1)
166 ZHalo_swap_all(r, 16);
167#endif
168
169 //Mass term
170#ifdef __NVCC__
171 cuDslashd(phi,r,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa,dimGrid,dimBlock);
172#else
173 memcpy(phi, r, kferm*sizeof(Complex));
174#pragma omp parallel for
175 for(int i=0;i<kvol;i++){
176#pragma omp simd aligned(phi,r,gamval:AVX)
177 //Diquark Term (antihermitian) The signs of a_1 and a_2 below flip under dagger
178 for(int idirac = 0; idirac<ndirac; idirac++){
179 int igork = idirac+4;
180 Complex a_1, a_2;
181 //We subtract a_1, hence the minus
182 a_1=-conj(jqq)*gamval[4*ndirac+idirac];
183 a_2=jqq*gamval[4*ndirac+idirac];
184 phi[(i*ngorkov+idirac)*nc]+=a_1*r[(i*ngorkov+igork)*nc];
185 phi[(i*ngorkov+idirac)*nc+1]+=a_1*r[(i*ngorkov+igork)*nc+1];
186 phi[(i*ngorkov+igork)*nc]+=a_2*r[(i*ngorkov+idirac)*nc];
187 phi[(i*ngorkov+igork)*nc+1]+=a_2*r[(i*ngorkov+idirac)*nc+1];
188 }
189
190 //Spacelike terms. Here's hoping I haven't put time as the zeroth component somewhere!
191#ifndef NO_SPACE
192 for(int mu = 0; mu <3; mu++){
193 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
194#pragma omp simd aligned(phi,r,u11t,u12t,gamval:AVX)
195 for(int igorkov=0; igorkov<ngorkov; igorkov++){
196 //FORTRAN had mod((igorkov-1),4)+1 to prevent issues with non-zero indexing.
197 int idirac=igorkov%4;
198 int igork1 = (igorkov<4) ? gamin[mu*ndirac+idirac] : gamin[mu*ndirac+idirac]+4;
199 //Wilson + Dirac term in that order. Definitely easier
200 //to read when split into different loops, but should be faster this way
201 //Reminder! gamval was rescaled by kappa when we defined it
202 phi[(i*ngorkov+igorkov)*nc]+=
203 -akappa*( u11t[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc]
204 +u12t[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc+1]
205 +conj(u11t[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]
206 -u12t[did*ndim+mu] *r[(did*ngorkov+igorkov)*nc+1])
207 -gamval[mu*ndirac+idirac]*
208 ( u11t[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc]
209 +u12t[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc+1]
210 -conj(u11t[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]
211 +u12t[did*ndim+mu] *r[(did*ngorkov+igork1)*nc+1]);
212
213 phi[(i*ngorkov+igorkov)*nc+1]+=
214 -akappa*(-conj(u12t[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc]
215 +conj(u11t[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc+1]
216 +conj(u12t[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]
217 +u11t[did*ndim+mu] *r[(did*ngorkov+igorkov)*nc+1])
218 -gamval[mu*ndirac+idirac]*
219 (-conj(u12t[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc]
220 +conj(u11t[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc+1]
221 -conj(u12t[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]
222 -u11t[did*ndim+mu] *r[(did*ngorkov+igork1)*nc+1]);
223 }
224 }
225#endif
226 //Timelike terms next. These run from igorkov=0..3 and 4..7 with slightly different rules for each
227 //We can fit it into a single loop by declaring igorkovPP=igorkov+4 instead of looping igorkov=4..7 separately
228 //Note that for the igorkov 4..7 loop idirac=igorkov-4, so we don't need to declare idiracPP separately
229 //Under dagger, dk4p and dk4m get swapped and the dirac component flips sign.
230 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
231#ifndef NO_TIME
232#pragma omp simd aligned(phi,r,u11t,u12t,dk4m,dk4p:AVX)
233 for(int igorkov=0; igorkov<4; igorkov++){
234 //the FORTRAN code did it.
235 int igork1 = gamin[3*ndirac+igorkov];
236 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
237 phi[(i*ngorkov+igorkov)*nc]+=
238 -dk4m[i]*(u11t[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc]+r[(uid*ngorkov+igork1)*nc])
239 +u12t[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc+1]+r[(uid*ngorkov+igork1)*nc+1]))
240 -dk4p[did]*(conj(u11t[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]-r[(did*ngorkov+igork1)*nc])
241 -u12t[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]-r[(did*ngorkov+igork1)*nc+1]));
242 phi[(i*ngorkov+igorkov)*nc+1]+=
243 -dk4m[i]*(-conj(u12t[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc]+r[(uid*ngorkov+igork1)*nc])
244 +conj(u11t[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc+1]+r[(uid*ngorkov+igork1)*nc+1]))
245 -dk4p[did]*(conj(u12t[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]-r[(did*ngorkov+igork1)*nc])
246 +u11t[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]-r[(did*ngorkov+igork1)*nc+1]));
247
248
249 int igorkovPP=igorkov+4; //idirac = igorkov; It is a bit redundant but I'll mention it as that's how
250 int igork1PP = igork1+4;
251 //And the +4 terms. Note that dk4p and dk4m swap positions compared to the above
252 phi[(i*ngorkov+igorkovPP)*nc]+=-dk4p[i]*(u11t[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc]+r[(uid*ngorkov+igork1PP)*nc])+\
253 u12t[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc+1]+r[(uid*ngorkov+igork1PP)*nc+1]))-\
254 dk4m[did]*(conj(u11t[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]-r[(did*ngorkov+igork1PP)*nc])-\
255 u12t[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]-r[(did*ngorkov+igork1PP)*nc+1]));
256
257 phi[(i*ngorkov+igorkovPP)*nc+1]+=dk4p[i]*(conj(u12t[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc]+r[(uid*ngorkov+igork1PP)*nc])-\
258 conj(u11t[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc+1]+r[(uid*ngorkov+igork1PP)*nc+1]))-\
259 dk4m[did]*(conj(u12t[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]-r[(did*ngorkov+igork1PP)*nc])+
260 u11t[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]-r[(did*ngorkov+igork1PP)*nc+1]));
261
262 }
263#endif
264 }
265#endif
266 return 0;
267}

References Complex, Complex_f, kferm, kvol, nc, ndim, ndirac, ngorkov, and ZHalo_swap_all().

Here is the call graph for this function:

◆ Dslashd_f()

int Dslashd_f ( Complex_f * phi,
Complex_f * r,
Complex_f * u11t,
Complex_f * u12t,
unsigned int * iu,
unsigned int * id,
Complex_f * gamval,
int * gamin,
float * dk[2],
Complex_f jqq,
float akappa )

Evaluates \(\Phi=M^\dagger r\) in single precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
jqqDiquark source
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 585 of file matrices.c.

586 {
587 /*
588 * @brief Evaluates @f(\Phi=M^\dagger r@f) in single precision.
589 *
590 * @param phi: The product
591 * @param r: The array being acted on by M
592 * @param u11t_f: First colour trial field
593 * @param u12t_f: Second colour trial field
594 * @param iu: Upper halo indices
595 * @param id: Lower halo indices
596 * @param gamval_f: Gamma matrices
597 * @param gamin: Indices for dirac terms
598 * @param dk_f[0]:
599 * @param dk_f[1]:
600 * @param jqq: Diquark source
601 * @param akappa: Hopping parameter
602 *
603 * @see CHalo_swap_all (MPI only)
604 *
605 * @return Zero on success, integer error code otherwise
606 */
607 const char *funcname = "Dslashd_f";
608 //Get the halos in order
609#if(nproc>1)
610 CHalo_swap_all(r, 16);
611#endif
612
613 //Mass term
614#ifdef __NVCC__
615 cuDslashd_f(phi,r,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk_f[0],dk_f[1],jqq,akappa,dimGrid,dimBlock);
616#else
617 memcpy(phi, r, kferm*sizeof(Complex_f));
618#pragma omp parallel for
619 for(unsigned int i=0;i<kvol;i++){
620#pragma omp simd aligned(phi,r,gamval_f:AVX)
621 //Diquark Term (antihermitian) The signs of a_1 and a_2 below flip under dagger
622 for(unsigned short idirac = 0; idirac<ndirac; idirac++){
623 unsigned short igork = idirac+4;
624 Complex_f a_1, a_2;
625 //We subtract a_1, hence the minus
626 a_1=-conj(jqq)*gamval_f[4*ndirac+idirac];
627 a_2=jqq*gamval_f[4*ndirac+idirac];
628 phi[(i*ngorkov+idirac)*nc]+=a_1*r[(i*ngorkov+igork)*nc];
629 phi[(i*ngorkov+idirac)*nc+1]+=a_1*r[(i*ngorkov+igork)*nc+1];
630 phi[(i*ngorkov+igork)*nc]+=a_2*r[(i*ngorkov+idirac)*nc];
631 phi[(i*ngorkov+igork)*nc+1]+=a_2*r[(i*ngorkov+idirac)*nc+1];
632 }
633
634 //Spacelike terms. Here's hoping I haven't put time as the zeroth component somewhere!
635#ifndef NO_SPACE
636 for(unsigned short mu = 0; mu <3; mu++){
637 unsigned int did=id[mu+ndim*i]; unsigned int uid = iu[mu+ndim*i];
638#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamval_f:AVX)
639 for(unsigned short igorkov=0; igorkov<ngorkov; igorkov++){
640 //FORTRAN had mod((igorkov-1),4)+1 to prevent issues with non-zero indexing.
641 unsigned short idirac=igorkov%4;
642 unsigned short igork1 = (igorkov<4) ? gamin[mu*ndirac+idirac] : gamin[mu*ndirac+idirac]+4;
643 //Wilson + Dirac term in that order. Definitely easier
644 //to read when split into different loops, but should be faster this way
645 //Reminder! gamval was rescaled by kappa when we defined it
646 phi[(i*ngorkov+igorkov)*nc]+=
647 -akappa*( u11t_f[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc]
648 +u12t_f[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc+1]
649 +conj(u11t_f[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]
650 -u12t_f[did*ndim+mu] *r[(did*ngorkov+igorkov)*nc+1])
651 -gamval_f[mu*ndirac+idirac]*
652 ( u11t_f[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc]
653 +u12t_f[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc+1]
654 -conj(u11t_f[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]
655 +u12t_f[did*ndim+mu] *r[(did*ngorkov+igork1)*nc+1]);
656
657 phi[(i*ngorkov+igorkov)*nc+1]+=
658 -akappa*(-conj(u12t_f[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc]
659 +conj(u11t_f[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc+1]
660 +conj(u12t_f[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]
661 +u11t_f[did*ndim+mu] *r[(did*ngorkov+igorkov)*nc+1])
662 -gamval_f[mu*ndirac+idirac]*
663 (-conj(u12t_f[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc]
664 +conj(u11t_f[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc+1]
665 -conj(u12t_f[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]
666 -u11t_f[did*ndim+mu] *r[(did*ngorkov+igork1)*nc+1]);
667 }
668 }
669#endif
670 //Timelike terms next. These run from igorkov=0..3 and 4..7 with slightly different rules for each
671 //We can fit it into a single loop by declaring igorkovPP=igorkov+4 instead of looping igorkov=4..7 separately
672 //Note that for the igorkov 4..7 loop idirac=igorkov-4, so we don't need to declare idiracPP separately
673 //Under dagger, dk_f[1] and dk_f[0] get swapped and the dirac component flips sign.
674 unsigned int did=id[3+ndim*i]; unsigned int uid = iu[3+ndim*i];
675#ifndef NO_TIME
676#pragma omp simd aligned(phi,r,u11t_f,u12t_f:AVX)
677 for(unsigned short igorkov=0; igorkov<4; igorkov++){
678 //the FORTRAN code did it.
679 unsigned short igork1 = gamin[3*ndirac+igorkov];
680 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
681 phi[(i*ngorkov+igorkov)*nc]+=
682 -dk_f[0][i]*(u11t_f[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc]+r[(uid*ngorkov+igork1)*nc])
683 +u12t_f[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc+1]+r[(uid*ngorkov+igork1)*nc+1]))
684 -dk_f[1][did]*(conj(u11t_f[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]-r[(did*ngorkov+igork1)*nc])
685 -u12t_f[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]-r[(did*ngorkov+igork1)*nc+1]));
686 phi[(i*ngorkov+igorkov)*nc+1]+=
687 -dk_f[0][i]*(-conj(u12t_f[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc]+r[(uid*ngorkov+igork1)*nc])
688 +conj(u11t_f[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc+1]+r[(uid*ngorkov+igork1)*nc+1]))
689 -dk_f[1][did]*(conj(u12t_f[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]-r[(did*ngorkov+igork1)*nc])
690 +u11t_f[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]-r[(did*ngorkov+igork1)*nc+1]));
691
692
693 unsigned short igorkovPP=igorkov+4; //idirac = igorkov; It is a bit redundant but I'll mention it as that's how
694 unsigned short igork1PP = igork1+4;
695 //And the +4 terms. Note that dk_f[1] and dk_f[0] swap positions compared to the above
696 phi[(i*ngorkov+igorkovPP)*nc]+=-dk_f[1][i]*(u11t_f[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc]+r[(uid*ngorkov+igork1PP)*nc])
697 +u12t_f[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc+1]+r[(uid*ngorkov+igork1PP)*nc+1]))
698 -dk_f[0][did]*(conj(u11t_f[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]-r[(did*ngorkov+igork1PP)*nc])
699 -u12t_f[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]-r[(did*ngorkov+igork1PP)*nc+1]));
700
701 phi[(i*ngorkov+igorkovPP)*nc+1]+=dk_f[1][i]*(conj(u12t_f[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc]+r[(uid*ngorkov+igork1PP)*nc])
702 -conj(u11t_f[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc+1]+r[(uid*ngorkov+igork1PP)*nc+1]))
703 -dk_f[0][did]*(conj(u12t_f[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]-r[(did*ngorkov+igork1PP)*nc])
704 +u11t_f[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]-r[(did*ngorkov+igork1PP)*nc+1]));
705
706 }
707#endif
708 }
709#endif
710 return 0;
711}

References CHalo_swap_all(), Complex_f, kferm, kvol, nc, ndim, ndirac, and ngorkov.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Hdslash()

int Hdslash ( Complex * phi,
Complex * r,
Complex * u11t[2],
unsigned int * iu,
unsigned int * id,
Complex * gamval,
int * gamin,
double * dk[2],
float akappa )

Evaluates \(\Phi=M r\) in double precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 268 of file matrices.c.

269 {
270 /*
271 * @brief Evaluates @f(\Phi=M r@f) in double precision
272 *
273 * @param phi: The product
274 * @param r: The array being acted on by M
275 * @param ut[0]: First colour trial field
276 * @param ut[1]: Second colour trial field
277 * @param iu: Upper halo indices
278 * @param id: Lower halo indices
279 * @param gamval: Gamma matrices
280 * @param gamin: Indices for dirac terms
281 * @param dk[0]:
282 * @param dk[1]:
283 * @param akappa: Hopping parameter
284 *
285 * @see ZHalo_swap_all (MPI only)
286 *
287 * @return Zero on success, integer error code otherwise
288 */
289 const char *funcname = "Hdslash";
290 //Get the halos in order
291#if(nproc>1)
292 ZHalo_swap_all(r, 8);
293#endif
294
295 //Mass term
296 //Spacelike term
297#ifdef __NVCC__
298 cuHdslash(phi,r,ut[0],ut[1],iu,id,gamval,gamin,dk[0],dk[1],akappa,dimGrid,dimBlock);
299#else
300 memcpy(phi, r, kferm2*sizeof(Complex));
301#pragma omp parallel for
302 for(int i=0;i<kvol;i++){
303#ifndef NO_SPACE
304 for(int mu = 0; mu <3; mu++){
305 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
306#pragma omp simd aligned(phi,r,gamval:AVX)
307 for(int idirac=0; idirac<ndirac; idirac++){
308 //FORTRAN had mod((idirac-1),4)+1 to prevent issues with non-zero indexing.
309 int igork1 = gamin[mu*ndirac+idirac];
310 //Can manually vectorise with a pragma?
311 //Wilson + Dirac term in that order. Definitely easier
312 //to read when split into different loops, but should be faster this way
313 phi[(i*ndirac+idirac)*nc]+=-akappa*(ut[0][i*ndim+mu]*r[(uid*ndirac+idirac)*nc]+\
314 ut[1][i*ndim+mu]*r[(uid*ndirac+idirac)*nc+1]+\
315 conj(ut[0][did*ndim+mu])*r[(did*ndirac+idirac)*nc]-\
316 ut[1][did*ndim+mu]*r[(did*ndirac+idirac)*nc+1])+\
317 //Dirac term
318 gamval[mu*ndirac+idirac]*(ut[0][i*ndim+mu]*r[(uid*ndirac+igork1)*nc]+\
319 ut[1][i*ndim+mu]*r[(uid*ndirac+igork1)*nc+1]-\
320 conj(ut[0][did*ndim+mu])*r[(did*ndirac+igork1)*nc]+\
321 ut[1][did*ndim+mu]*r[(did*ndirac+igork1)*nc+1]);
322
323 phi[(i*ndirac+idirac)*nc+1]+=-akappa*(-conj(ut[1][i*ndim+mu])*r[(uid*ndirac+idirac)*nc]+\
324 conj(ut[0][i*ndim+mu])*r[(uid*ndirac+idirac)*nc+1]+\
325 conj(ut[1][did*ndim+mu])*r[(did*ndirac+idirac)*nc]+\
326 ut[0][did*ndim+mu]*r[(did*ndirac+idirac)*nc+1])+\
327 //Dirac term
328 gamval[mu*ndirac+idirac]*(-conj(ut[1][i*ndim+mu])*r[(uid*ndirac+igork1)*nc]+\
329 conj(ut[0][i*ndim+mu])*r[(uid*ndirac+igork1)*nc+1]-\
330 conj(ut[1][did*ndim+mu])*r[(did*ndirac+igork1)*nc]-\
331 ut[0][did*ndim+mu]*r[(did*ndirac+igork1)*nc+1]);
332 }
333 }
334#endif
335 //Timelike terms
336 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
337#ifndef NO_TIME
338#pragma omp simd aligned(phi,r:AVX)
339 for(int idirac=0; idirac<ndirac; idirac++){
340 int igork1 = gamin[3*ndirac+idirac];
341 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
342 //Reminder! gamval was rescaled by kappa when we defined it
343 phi[(i*ndirac+idirac)*nc]+=
344 -dk[1][i]*(ut[0][i*ndim+3]*(r[(uid*ndirac+idirac)*nc]-r[(uid*ndirac+igork1)*nc])
345 +ut[1][i*ndim+3]*(r[(uid*ndirac+idirac)*nc+1]-r[(uid*ndirac+igork1)*nc+1]))
346 -dk[0][did]*(conj(ut[0][did*ndim+3])*(r[(did*ndirac+idirac)*nc]+r[(did*ndirac+igork1)*nc])
347 -ut[1][did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]+r[(did*ndirac+igork1)*nc+1]));
348 phi[(i*ndirac+idirac)*nc+1]+=
349 -dk[1][i]*(-conj(ut[1][i*ndim+3])*(r[(uid*ndirac+idirac)*nc]-r[(uid*ndirac+igork1)*nc])
350 +conj(ut[0][i*ndim+3])*(r[(uid*ndirac+idirac)*nc+1]-r[(uid*ndirac+igork1)*nc+1]))
351 -dk[0][did]*(conj(ut[1][did*ndim+3])*(r[(did*ndirac+idirac)*nc]+r[(did*ndirac+igork1)*nc])
352 +ut[0][did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]+r[(did*ndirac+igork1)*nc+1]));
353 }
354#endif
355 }
356#endif
357 return 0;
358}
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188

References Complex, kferm2, kvol, nc, ndim, ndirac, and ZHalo_swap_all().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Hdslash_f()

int Hdslash_f ( Complex_f * phi,
Complex_f * r,
Complex_f * ut[2],
unsigned int * iu,
unsigned int * id,
Complex_f * gamval,
int * gamin,
float * dk[2],
float akappa )

Evaluates \(\Phi=M r\) in single precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 712 of file matrices.c.

713 {
714 /*
715 * @brief Evaluates @f(\Phi=M r@f) in single precision.
716 *
717 * @param phi: The product
718 * @param r: The array being acted on by M
719 * @param ut[0]_f: First colour trial field
720 * @param ut[1]_f: Second colour trial field
721 * @param iu: Upper halo indices
722 * @param id: Lower halo indices
723 * @param gamval_f: Gamma matrices
724 * @param gamin: Indices for dirac terms
725 * @param dk4m_f:
726 * @param dk4p_f:
727 * @param akappa: Hopping parameter
728 *
729 * @see CHalo_swap_all (MPI only)
730 *
731 * @return Zero on success, integer error code otherwise
732 */
733 const char *funcname = "Hdslash_f";
734 //Get the halos in order
735#if(nproc>1)
736 CHalo_swap_all(r, 8);
737#endif
738#ifdef __NVCC__
739 cuHdslash_f(phi,r,ut,iu,id,gamval,gamin,dk,akappa,dimGrid,dimBlock);
740#else
741 //Mass term
742 memcpy(phi, r, kferm2*sizeof(Complex_f));
743#pragma omp parallel for
744 for(unsigned int i=0;i<kvol;i+=AVX){
745 alignas(AVX) Complex_f u11s[AVX]; alignas(AVX) Complex_f u12s[AVX];
746 alignas(AVX) Complex_f u11sd[AVX]; alignas(AVX) Complex_f u12sd[AVX];
747 alignas(AVX) Complex_f ru[2][AVX]; alignas(AVX) Complex_f rd[2][AVX];
748 alignas(AVX) Complex_f rgu[2][AVX]; alignas(AVX) Complex_f rgd[2][AVX];
749 alignas(AVX) Complex_f phi_s[ndirac*nc][AVX];
750 //Do we need to sync threads if each thread only accesses the value it put in shared memory?
751#pragma unroll(2)
752 for(unsigned short idirac=0; idirac<ndirac; idirac++)
753 for(unsigned short c=0; c<nc; c++)
754#pragma omp simd aligned(phi_s,phi:AVX)
755 for(unsigned short j=0;j<AVX;j++)
756 phi_s[idirac*nc+c][j]=phi[((i+j)*ndirac+idirac)*nc+c];
757 alignas(AVX) unsigned int did[AVX], uid[AVX];
758#pragma unroll
759 for(unsigned short mu = 0; mu <3; mu++){
760#pragma omp simd aligned(u11s,u12s,did,uid,id,iu,u11sd,u12sd:AVX)
761 for(unsigned short j =0;j<AVX;j++){
762 did[j]=id[(i+j)*ndim+mu]; uid[j] = iu[(i+j)*ndim+mu];
763 u11s[j]=ut[0][(i+j)*ndim+mu]; u12s[j]=ut[1][(i+j)*ndim+mu];
764 u11sd[j]=ut[0][did[j]*ndim+mu]; u12sd[j]=ut[1][did[j]*ndim+mu];
765 }
766#pragma unroll
767 for(unsigned short idirac=0; idirac<ndirac; idirac++){
768 unsigned short igork1 = gamin[mu*ndirac+idirac];
769#pragma unroll
770 for(unsigned short c=0; c<nc; c++)
771#pragma omp simd aligned(ru,rd,rgu,rgd,r,uid,did:AVX)
772 for(unsigned short j =0;j<AVX;j++){
773 ru[c][j]=r[(uid[j]*ndirac+idirac)*nc+c];
774 rd[c][j]=r[(did[j]*ndirac+idirac)*nc+c];
775 rgu[c][j]=r[(uid[j]*ndirac+igork1)*nc+c];
776 rgd[c][j]=r[(did[j]*ndirac+igork1)*nc+c];
777 }
778 //FORTRAN had mod((idirac-1),4)+1 to prevent issues with non-zero indexing.
779 //Can manually vectorise with a pragma?
780 //Wilson + Dirac term in that order. Definitely easier
781 //to read when split into different loops, but should be faster this way
782#pragma omp simd aligned(phi_s,u11s,u12s,u11sd,u12sd,ru,rd,rgu,rgd:AVX)
783 for(unsigned short j =0;j<AVX;j++){
784 phi_s[idirac*nc][j]+=-akappa*(u11s[j]*ru[0][j]+\
785 u12s[j]*ru[1][j]+\
786 conj(u11sd[j])*rd[0][j]-\
787 u12sd[j]*rd[1][j]);
788 //Dirac term
789 phi_s[idirac*nc][j]+=gamval[mu*ndirac+idirac]*(u11s[j]*rgu[0][j]+\
790 u12s[j]*rgu[1][j]-\
791 conj(u11sd[j])*rgd[0][j]+\
792 u12sd[j]*rgd[1][j]);
793
794 phi_s[idirac*nc+1][j]+=-akappa*(-conj(u12s[j])*ru[0][j]+\
795 conj(u11s[j])*ru[1][j]+\
796 conj(u12sd[j])*rd[0][j]+\
797 u11sd[j]*rd[1][j]);
798 //Dirac term
799 phi_s[idirac*nc+1][j]+=gamval[mu*ndirac+idirac]*(-conj(u12s[j])*rgu[0][j]+\
800 conj(u11s[j])*rgu[1][j]-\
801 conj(u12sd[j])*rgd[0][j]-\
802 u11sd[j]*rgd[1][j]);
803 }
804 }
805 }
806#ifndef NO_TIME
807 //Timelike terms
808 alignas(AVX) float dk4ms[AVX],dk4ps[AVX];
809#pragma omp simd
810 for(unsigned short j=0;j<AVX;j++){
811 u11s[j]=ut[0][(i+j)*ndim+3]; u12s[j]=ut[1][(i+j)*ndim+3];
812 did[j]=id[(i+j)*ndim+3];uid[j]= iu[(i+j)*ndim+3];
813 u11sd[j]=ut[0][did[j]*ndim+3]; u12sd[j]=ut[1][did[j]*ndim+3];
814 dk4ms[j]=dk[0][did[j]]; dk4ps[j]=dk[1][i+j];
815 }
816
817#pragma unroll
818 for(unsigned short idirac=0; idirac<ndirac; idirac++){
819 unsigned short igork1 = gamin[3*ndirac+idirac];
820#pragma unroll
821 for(unsigned short c=0; c<nc; c++)
822#pragma omp simd aligned(ru,rd,rgu,rgd,r,uid,did:AVX)
823 for(unsigned short j =0;j<AVX;j++){
824 ru[c][j]=r[(uid[j]*ndirac+idirac)*nc+c];
825 rd[c][j]=r[(did[j]*ndirac+idirac)*nc+c];
826 rgu[c][j]=r[(uid[j]*ndirac+igork1)*nc+c];
827 rgd[c][j]=r[(did[j]*ndirac+igork1)*nc+c];
828 }
829 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
830
831#pragma omp simd aligned(phi_s,u11s,u12s,u11sd,u12sd,ru,rd,rgu,rgd,dk4ms,dk4ps,phi:AVX)
832 for(unsigned short j =0;j<AVX;j++){
833 phi_s[idirac*nc+0][j]-=
834 dk4ps[j]*(u11s[j]*(ru[0][j]-rgu[0][j])
835 +u12s[j]*(ru[1][j]-rgu[1][j]));
836 phi_s[idirac*nc+0][j]-=
837 dk4ms[j]*(conj(u11sd[j])*(rd[0][j]+rgd[0][j])
838 -u12sd[j]*(rd[1][j]+rgd[1][j]));
839 phi[((i+j)*ndirac+idirac)*nc]=phi_s[idirac*nc][j];
840
841 phi_s[idirac*nc+1][j]-=
842 dk4ps[j]*(-conj(u12s[j])*(ru[0][j]-rgu[0][j])
843 +conj(u11s[j])*(ru[1][j]-rgu[1][j]));
844 phi_s[idirac*nc+1][j]-=
845 dk4ms[j]*(conj(u12sd[j])*(rd[0][j]+rgd[0][j])
846 +u11sd[j]*(rd[1][j]+rgd[1][j]));
847 phi[((i+j)*ndirac+idirac)*nc+1]=phi_s[idirac*nc+1][j];
848 }
849 }
850#endif
851 }
852#endif
853 return 0;
854}
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268

References AVX, CHalo_swap_all(), Complex_f, kferm2, kvol, nc, ndim, and ndirac.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Hdslashd()

int Hdslashd ( Complex * phi,
Complex * r,
Complex * u11t,
Complex * u12t,
unsigned int * iu,
unsigned int * id,
Complex * gamval,
int * gamin,
double * dk4m,
double * dk4p,
float akappa )

Evaluates \(\Phi=M^\dagger r\) in double precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 359 of file matrices.c.

360 {
361 /*
362 * @brief Evaluates @f(\Phi=M^\dagger r@f) in double precision
363 *
364 * @param phi: The product
365 * @param r: The array being acted on by M
366 * @param u11t: First colour trial field
367 * @param u12t: Second colour trial field
368 * @param iu: Upper halo indices
369 * @param id: Lower halo indices
370 * @param gamval: Gamma matrices
371 * @param gamin: Indices for dirac terms
372 * @param dk4m:
373 * @param dk4p:
374 * @param akappa: Hopping parameter
375 *
376 * @see ZHalo_swap_all (MPI only)
377 *
378 * @return Zero on success, integer error code otherwise
379 */
380 const char *funcname = "Hdslashd";
381 //Get the halos in order. Because C is row major, we need to extract the correct
382 //terms for each halo first. Changing the indices was considered but that caused
383 //issues with the BLAS routines.
384#if(nproc>1)
385 ZHalo_swap_all(r, 8);
386#endif
387
388 //Looks like flipping the array ordering for C has meant a lot
389 //of for loops. Sense we're jumping around quite a bit the cache is probably getting refreshed
390 //anyways so memory access patterns mightn't be as big of an limiting factor here anyway
391
392 //Mass term
393#ifdef __NVCC__
394 cuHdslashd(phi,r,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,akappa,dimGrid,dimBlock);
395#else
396 memcpy(phi, r, kferm2*sizeof(Complex));
397 //Spacelike term
398#pragma omp parallel for
399 for(int i=0;i<kvol;i++){
400#ifndef NO_SPACE
401 for(int mu = 0; mu <ndim-1; mu++){
402 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
403#pragma omp simd aligned(phi,r,u11t,u12t,gamval:AVX)
404 for(int idirac=0; idirac<ndirac; idirac++){
405 //FORTRAN had mod((idirac-1),4)+1 to prevent issues with non-zero indexing.
406 int igork1 = gamin[mu*ndirac+idirac];
407 //Can manually vectorise with a pragma?
408 //Wilson + Dirac term in that order. Definitely easier
409 //to read when split into different loops, but should be faster this way
410
411 //Reminder! gamval was rescaled by kappa when we defined it
412 phi[(i*ndirac+idirac)*nc]+=
413 -akappa*(u11t[i*ndim+mu]*r[(uid*ndirac+idirac)*nc]
414 +u12t[i*ndim+mu]*r[(uid*ndirac+idirac)*nc+1]
415 +conj(u11t[did*ndim+mu])*r[(did*ndirac+idirac)*nc]
416 -u12t[did*ndim+mu] *r[(did*ndirac+idirac)*nc+1])
417 -gamval[mu*ndirac+idirac]*
418 ( u11t[i*ndim+mu]*r[(uid*ndirac+igork1)*nc]
419 +u12t[i*ndim+mu]*r[(uid*ndirac+igork1)*nc+1]
420 -conj(u11t[did*ndim+mu])*r[(did*ndirac+igork1)*nc]
421 +u12t[did*ndim+mu] *r[(did*ndirac+igork1)*nc+1]);
422
423 phi[(i*ndirac+idirac)*nc+1]+=
424 -akappa*(-conj(u12t[i*ndim+mu])*r[(uid*ndirac+idirac)*nc]
425 +conj(u11t[i*ndim+mu])*r[(uid*ndirac+idirac)*nc+1]
426 +conj(u12t[did*ndim+mu])*r[(did*ndirac+idirac)*nc]
427 +u11t[did*ndim+mu] *r[(did*ndirac+idirac)*nc+1])
428 -gamval[mu*ndirac+idirac]*
429 (-conj(u12t[i*ndim+mu])*r[(uid*ndirac+igork1)*nc]
430 +conj(u11t[i*ndim+mu])*r[(uid*ndirac+igork1)*nc+1]
431 -conj(u12t[did*ndim+mu])*r[(did*ndirac+igork1)*nc]
432 -u11t[did*ndim+mu] *r[(did*ndirac+igork1)*nc+1]);
433 }
434 }
435#endif
436 //Timelike terms
437 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
438#ifndef NO_TIME
439#pragma omp simd aligned(phi,r,u11t,u12t,dk4m,dk4p:AVX)
440 for(int idirac=0; idirac<ndirac; idirac++){
441 int igork1 = gamin[3*ndirac+idirac];
442 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
443 //dk4m and dk4p swap under dagger
444 phi[(i*ndirac+idirac)*nc]+=
445 -dk4m[i]*(u11t[i*ndim+3]*(r[(uid*ndirac+idirac)*nc]+r[(uid*ndirac+igork1)*nc])
446 +u12t[i*ndim+3]*(r[(uid*ndirac+idirac)*nc+1]+r[(uid*ndirac+igork1)*nc+1]))
447 -dk4p[did]*(conj(u11t[did*ndim+3])*(r[(did*ndirac+idirac)*nc]-r[(did*ndirac+igork1)*nc])
448 -u12t[did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]-r[(did*ndirac+igork1)*nc+1]));
449
450 phi[(i*ndirac+idirac)*nc+1]+=
451 -dk4m[i]*(-conj(u12t[i*ndim+3])*(r[(uid*ndirac+idirac)*nc]+r[(uid*ndirac+igork1)*nc])
452 +conj(u11t[i*ndim+3])*(r[(uid*ndirac+idirac)*nc+1]+r[(uid*ndirac+igork1)*nc+1]))
453 -dk4p[did]*(conj(u12t[did*ndim+3])*(r[(did*ndirac+idirac)*nc]-r[(did*ndirac+igork1)*nc])
454 +u11t[did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]-r[(did*ndirac+igork1)*nc+1]));
455 }
456#endif
457 }
458#endif
459 return 0;
460}

References Complex, kferm2, kvol, nc, ndim, ndirac, and ZHalo_swap_all().

Here is the call graph for this function:

◆ Hdslashd_f()

int Hdslashd_f ( Complex_f * phi,
Complex_f * r,
Complex_f * ut[2],
unsigned int * iu,
unsigned int * id,
Complex_f * gamval,
int * gamin,
float * dk[2],
float akappa )

Evaluates \(\Phi=M^\dagger r\) in single precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 855 of file matrices.c.

856 {
857 /*
858 * @brief Evaluates @f(\Phi=M^\dagger r@f) in single precision
859 *
860 * @param phi: The product
861 * @param r: The array being acted on by M
862 * @param ut[0]_f: First colour trial field
863 * @param ut[1]_f: Second colour trial field
864 * @param iu: Upper halo indices
865 * @param id: Lower halo indices
866 * @param gamval_f: Gamma matrices
867 * @param gamin: Indices for dirac terms
868 * @param dk4m_f:
869 * @param dk4p_f:
870 * @param akappa: Hopping parameter
871 *
872 * @see CHalo_swap_all (MPI only)
873 *
874 * @return Zero on success, integer error code otherwise
875 */
876 const char *funcname = "Hdslashd_f";
877 //Get the halos in order. Because C is row major, we need to extract the correct
878 //terms for each halo first. Changing the indices was considered but that caused
879 //issues with the BLAS routines.
880#if(nproc>1)
881 CHalo_swap_all(r, 8);
882#endif
883
884 //Looks like flipping the array ordering for C has meant a lot
885 //of for loops. Sense we're jumping around quite a bit the cache is probably getting refreshed
886 //anyways so memory access patterns mightn't be as big of an limiting factor here anyway
887
888 //Mass term
889#ifdef __NVCC__
890 cuHdslashd_f(phi,r,ut,iu,id,gamval,gamin,dk,akappa,dimGrid,dimBlock);
891#else
892 memcpy(phi, r, kferm2*sizeof(Complex_f));
893
894 //Spacelike term
895 //Enough room on L1 data cache for Zen 2 to hold 160 elements at a time
896 //Vectorise with 128 maybe?
897#pragma omp parallel for
898 for(unsigned int i=0;i<kvol;i+=AVX){
899 //Right. Time to prefetch
900 alignas(AVX) Complex_f u11s[AVX]; alignas(AVX) Complex_f u12s[AVX];
901 alignas(AVX) Complex_f u11sd[AVX]; alignas(AVX) Complex_f u12sd[AVX];
902 alignas(AVX) Complex_f ru[2][AVX]; alignas(AVX) Complex_f rd[2][AVX];
903 alignas(AVX) Complex_f rgu[2][AVX]; alignas(AVX) Complex_f rgd[2][AVX];
904 alignas(AVX) Complex_f phi_s[ndirac*nc][AVX];
905#pragma unroll
906 for(unsigned short idirac=0; idirac<ndirac; idirac++)
907#pragma unroll
908 for(unsigned short c=0; c<nc; c++)
909#pragma omp simd aligned(phi_s,phi:AVX)
910 for(unsigned short j=0;j<AVX;j++)
911 phi_s[idirac*nc+c][j]=phi[((i+j)*ndirac+idirac)*nc+c];
912 alignas(AVX) unsigned int did[AVX], uid[AVX];
913#ifndef NO_SPACE
914#pragma unroll
915 for(unsigned short mu = 0; mu <ndim-1; mu++){
916 //FORTRAN had mod((idirac-1),4)+1 to prevent issues with non-zero indexing.
917#pragma omp simd aligned(u11s,u12s,did,uid,id,iu,u11sd,u12sd:AVX)
918 for(unsigned short j =0;j<AVX;j++){
919 did[j]=id[(i+j)*ndim+mu]; uid[j] = iu[(i+j)*ndim+mu];
920 u11s[j]=ut[0][(i+j)*ndim+mu]; u12s[j]=ut[1][(i+j)*ndim+mu];
921 u11sd[j]=ut[0][did[j]*ndim+mu]; u12sd[j]=ut[1][did[j]*ndim+mu];
922 }
923#pragma unroll
924 for(unsigned short idirac=0; idirac<ndirac; idirac++){
925 unsigned short igork1 = gamin[mu*ndirac+idirac];
926#pragma unroll
927 for(unsigned short c=0; c<nc; c++)
928#pragma omp simd aligned(ru,rd,rgu,rgd,r,uid,did:AVX)
929 for(unsigned short j =0;j<AVX;j++){
930 ru[c][j]=r[(uid[j]*ndirac+idirac)*nc+c];
931 rd[c][j]=r[(did[j]*ndirac+idirac)*nc+c];
932 rgu[c][j]=r[(uid[j]*ndirac+igork1)*nc+c];
933 rgd[c][j]=r[(did[j]*ndirac+igork1)*nc+c];
934 }
935 //Can manually vectorise with a pragma?
936 //Wilson + Dirac term in that order. Definitely easier
937 //to read when split into different loops, but should be faster this way
938#pragma omp simd aligned(phi_s,u11s,u12s,u11sd,u12sd,ru,rd,rgu,rgd:AVX)
939 for(unsigned short j =0;j<AVX;j++){
940 phi_s[idirac*nc][j]-=akappa*(u11s[j]*ru[0][j]
941 +u12s[j]*ru[1][j]
942 +conj(u11sd[j])*rd[0][j]
943 -u12sd[j] *rd[1][j]);
944 //Dirac term
945 phi_s[idirac*nc][j]-=gamval[mu*ndirac+idirac]*
946 (u11s[j]*rgu[0][j]
947 +u12s[j]*rgu[1][j]
948 -conj(u11sd[j])*rgd[0][j]
949 +u12sd[j] *rgd[1][j]);
950
951 phi_s[idirac*nc+1][j]-=akappa*(-conj(u12s[j])*ru[0][j]
952 +conj(u11s[j])*ru[1][j]
953 +conj(u12sd[j])*rd[0][j]
954 +u11sd[j] *rd[1][j]);
955 //Dirac term
956 phi_s[idirac*nc+1][j]-=gamval[mu*ndirac+idirac]*(-conj(u12s[j])*rgu[0][j]
957 +conj(u11s[j])*rgu[1][j]
958 -conj(u12sd[j])*rgd[0][j]
959 -u11sd[j] *rgd[1][j]);
960 }
961 }
962 }
963#endif
964#ifndef NO_TIME
965 //Timelike terms
966 alignas(AVX) float dk4ms[AVX],dk4ps[AVX];
967#pragma omp simd aligned(u11s,u12s,did,uid,id,iu,u11sd,u12sd,dk4ms,dk4ps:AVX)
968 for(unsigned short j=0;j<AVX;j++){
969 u11s[j]=ut[0][(i+j)*ndim+3]; u12s[j]=ut[1][(i+j)*ndim+3];
970 did[j]=id[(i+j)*ndim+3]; uid[j]= iu[(i+j)*ndim+3];
971 u11sd[j]=ut[0][did[j]*ndim+3]; u12sd[j]=ut[1][did[j]*ndim+3];
972 dk4ms[j]=dk[0][i+j]; dk4ps[j]=dk[1][did[j]];
973 }
974#pragma unroll
975 for(unsigned short idirac=0; idirac<ndirac; idirac++){
976 unsigned short igork1 = gamin[3*ndirac+idirac];
977#pragma unroll
978 for(unsigned short c=0; c<nc; c++)
979#pragma omp simd aligned(ru,rd,rgu,rgd,r,uid,did:AVX)
980 for(unsigned short j =0;j<AVX;j++){
981 ru[c][j]=r[(uid[j]*ndirac+idirac)*nc+c];
982 rd[c][j]=r[(did[j]*ndirac+idirac)*nc+c];
983 rgu[c][j]=r[(uid[j]*ndirac+igork1)*nc+c];
984 rgd[c][j]=r[(did[j]*ndirac+igork1)*nc+c];
985 }
986 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
987 //dk4m and dk4p swap under dagger
988#pragma omp simd aligned(phi_s,u11s,u12s,u11sd,u12sd,ru,rd,rgu,rgd,dk4ms,dk4ps,phi:AVX)
989 for(unsigned short j =0;j<AVX;j++){
990 phi_s[idirac*nc][j]+=
991 -dk4ms[j]*(u11s[j]*(ru[0][j]+rgu[0][j])
992 +u12s[j]*(ru[1][j]+rgu[1][j]));
993 phi_s[idirac*nc][j]+=
994 -dk4ps[j]*(conj(u11sd[j])*(rd[0][j]-rgd[0][j])
995 -u12sd[j] *(rd[1][j]-rgd[1][j]));
996 phi[((i+j)*ndirac+idirac)*nc]=phi_s[idirac*nc][j];
997
998 phi_s[idirac*nc+1][j]-=
999 dk4ms[j]*(-conj(u12s[j])*(ru[0][j]+rgu[0][j])
1000 +conj(u11s[j])*(ru[1][j]+rgu[1][j]));
1001 phi_s[idirac*nc+1][j]-=
1002 +dk4ps[j]*(conj(u12sd[j])*(rd[0][j]-rgd[0][j])
1003 +u11sd[j] *(rd[1][j]-rgd[1][j]));
1004 phi[((i+j)*ndirac+idirac)*nc+1]=phi_s[idirac*nc+1][j];
1005 }
1006 }
1007#endif
1008 }
1009#endif
1010 return 0;
1011}

References AVX, CHalo_swap_all(), Complex_f, kferm2, kvol, nc, ndim, and ndirac.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Reunitarise()

int Reunitarise ( Complex * ut[2])
inline

Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.

If you're looking at the FORTRAN code be careful. There are two header files for the /trial/ header. One with u11 u12 (which was included here originally) and the other with u11t and u12t.

Parameters
utTrial fields to be reunitarised
Returns
Zero on success, integer error code otherwise

Definition at line 1012 of file matrices.c.

1012 {
1013 /*
1014 * @brief Reunitarises ut[0] and ut[1] as in conj(ut[0][i])*ut[0][i]+conj(ut[1][i])*ut[1][i]=1
1015 *
1016 * If you're looking at the FORTRAN code be careful. There are two header files
1017 * for the /trial/ header. One with u11 u12 (which was included here originally)
1018 * and the other with ut[0] and ut[1].
1019 *
1020 * @see cuReunitarise (CUDA Wrapper)
1021 *
1022 * @param ut[0], ut[1] Trial fields to be reunitarised
1023 *
1024 * @return Zero on success, integer error code otherwise
1025 */
1026 const char *funcname = "Reunitarise";
1027#ifdef __NVCC__
1028 cuReunitarise(ut[0],ut[1],dimGrid,dimBlock);
1029#else
1030#pragma omp parallel for simd
1031 for(int i=0; i<kvol*ndim; i++){
1032 //Declaring anorm inside the loop will hopefully let the compiler know it
1033 //is safe to vectorise aggressively
1034 double anorm=sqrt(conj(ut[0][i])*ut[0][i]+conj(ut[1][i])*ut[1][i]);
1035 // Exception handling code. May be faster to leave out as the exit prevents vectorisation.
1036 // if(anorm==0){
1037 // fprintf(stderr, "Error %i in %s on rank %i: anorm = 0 for μ=%i and i=%i.\nExiting...\n\n",
1038 // DIVZERO, funcname, rank, mu, i);
1039 // MPI_Finalise();
1040 // exit(DIVZERO);
1041 // }
1042 ut[0][i]/=anorm;
1043 ut[1][i]/=anorm;
1044 }
1045#endif
1046 return 0;
1047}

References Complex, kvol, and ndim.

Here is the caller graph for this function:

◆ Transpose_c()

void Transpose_c ( Complex_f * out,
const int fast_in,
const int fast_out )
inline

Definition at line 1050 of file matrices.c.

1050 {
1051 const volatile char *funcname="Transpose_c";
1052
1053#ifdef __NVCC__
1054 cuTranspose_c(out,fast_in,fast_out,dimGrid,dimBlock);
1055#else
1056 Complex_f *in = (Complex_f *)aligned_alloc(AVX,fast_in*fast_out*sizeof(Complex_f));
1057 memcpy(in,out,fast_in*fast_out*sizeof(Complex_f));
1058 //Typically this is used to write back to the AoS/Coalseced format
1059 if(fast_out>fast_in){
1060 for(int x=0;x<fast_out;x++)
1061 for(int y=0; y<fast_in;y++)
1062 out[y*fast_out+x]=in[x*fast_in+y];
1063 }
1064 //Typically this is used to write back to the SoA/saved config format
1065 else{
1066 for(int x=0; x<fast_out;x++)
1067 for(int y=0;y<fast_in;y++)
1068 out[y*fast_out+x]=in[x*fast_in+y];
1069 }
1070 free(in);
1071#endif
1072}

◆ Transpose_d()

void Transpose_d ( double * out,
const int fast_in,
const int fast_out )
inline

Definition at line 1119 of file matrices.c.

1119 {
1120 const char *funcname="Transpose_f";
1121
1122#ifdef __NVCC__
1123 cuTranspose_d(out,fast_in,fast_out,dimGrid,dimBlock);
1124#else
1125 double *in = (double *)aligned_alloc(AVX,fast_in*fast_out*sizeof(double));
1126 memcpy(in,out,fast_in*fast_out*sizeof(double));
1127 //Typically this is used to write back to the AoS/Coalseced format
1128 if(fast_out>fast_in){
1129 for(int x=0;x<fast_out;x++)
1130 for(int y=0; y<fast_in;y++)
1131 out[y*fast_out+x]=in[x*fast_in+y];
1132 }
1133 //Typically this is used to write back to the SoA/saved config format
1134 else{
1135 for(int x=0; x<fast_out;x++)
1136 for(int y=0;y<fast_in;y++)
1137 out[y*fast_out+x]=in[x*fast_in+y];
1138 }
1139 free(in);
1140#endif
1141}

◆ Transpose_f()

void Transpose_f ( float * out,
const int fast_in,
const int fast_out )
inline

Definition at line 1096 of file matrices.c.

1096 {
1097 const char *funcname="Transpose_f";
1098
1099#ifdef __NVCC__
1100 cuTranspose_f(out,fast_in,fast_out,dimGrid,dimBlock);
1101#else
1102 float *in = (float *)aligned_alloc(AVX,fast_in*fast_out*sizeof(float));
1103 memcpy(in,out,fast_in*fast_out*sizeof(float));
1104 //Typically this is used to write back to the AoS/Coalseced format
1105 if(fast_out>fast_in){
1106 for(int x=0;x<fast_out;x++)
1107 for(int y=0; y<fast_in;y++)
1108 out[y*fast_out+x]=in[x*fast_in+y];
1109 }
1110 //Typically this is used to write back to the SoA/saved config format
1111 else{
1112 for(int x=0; x<fast_out;x++)
1113 for(int y=0;y<fast_in;y++)
1114 out[y*fast_out+x]=in[x*fast_in+y];
1115 }
1116 free(in);
1117#endif
1118}

◆ Transpose_I()

void Transpose_I ( int * out,
const int fast_in,
const int fast_out )
inline

Definition at line 1142 of file matrices.c.

1142 {
1143 const char *funcname="Transpose_I";
1144
1145#ifdef __NVCC__
1146 cuTranspose_I(out,fast_in,fast_out,dimGrid,dimBlock);
1147#else
1148 int *in = (int *)aligned_alloc(AVX,fast_in*fast_out*sizeof(int));
1149 memcpy(in,out,fast_in*fast_out*sizeof(int));
1150 //Typically this is used to write back to the AoS/Coalseced format
1151 if(fast_out>fast_in){
1152 for(int x=0;x<fast_out;x++)
1153 for(int y=0; y<fast_in;y++)
1154 out[y*fast_out+x]=in[x*fast_in+y];
1155 }
1156 //Typically this is used to write back to the SoA/saved config format
1157 else{
1158 for(int x=0; x<fast_out;x++)
1159 for(int y=0;y<fast_in;y++)
1160 out[y*fast_out+x]=in[x*fast_in+y];
1161 }
1162 free(in);
1163#endif
1164}

◆ Transpose_U()

void Transpose_U ( unsigned int * out,
const int fast_in,
const int fast_out )
inline

Definition at line 1165 of file matrices.c.

1165 {
1166 const char *funcname="Transpose_I";
1167
1168#ifdef __NVCC__
1169 cuTranspose_U(out,fast_in,fast_out,dimGrid,dimBlock);
1170#else
1171 unsigned int *in = (unsigned int *)aligned_alloc(AVX,fast_in*fast_out*sizeof(unsigned int));
1172 memcpy(in,out,fast_in*fast_out*sizeof(unsigned int));
1173 //Typically this is used to write back to the AoS/Coalseced format
1174 if(fast_out>fast_in){
1175 for(unsigned int x=0;x<fast_out;x++)
1176 for(unsigned int y=0; y<fast_in;y++)
1177 out[y*fast_out+x]=in[x*fast_in+y];
1178 }
1179 //Typically this is used to write back to the SoA/saved config format
1180 else{
1181 for(unsigned int x=0; x<fast_out;x++)
1182 for(unsigned int y=0;y<fast_in;y++)
1183 out[y*fast_out+x]=in[x*fast_in+y];
1184 }
1185 free(in);
1186#endif
1187}

◆ Transpose_z()

void Transpose_z ( Complex * out,
const int fast_in,
const int fast_out )
inline

Definition at line 1073 of file matrices.c.

1073 {
1074 const volatile char *funcname="Transpose_c";
1075
1076#ifdef __NVCC__
1077 cuTranspose_z(out,fast_in,fast_out,dimGrid,dimBlock);
1078#else
1079 Complex *in = (Complex *)aligned_alloc(AVX,fast_in*fast_out*sizeof(Complex));
1080 memcpy(in,out,fast_in*fast_out*sizeof(Complex));
1081 //Typically this is used to write back to the AoS/Coalseced format
1082 if(fast_out>fast_in){
1083 for(int x=0;x<fast_out;x++)
1084 for(int y=0; y<fast_in;y++)
1085 out[y*fast_out+x]=in[x*fast_in+y];
1086 }
1087 //Typically this is used to write back to the SoA/saved config format
1088 else{
1089 for(int x=0; x<fast_out;x++)
1090 for(int y=0;y<fast_in;y++)
1091 out[y*fast_out+x]=in[x*fast_in+y];
1092 }
1093 free(in);
1094#endif
1095}