su2hmc
Loading...
Searching...
No Matches
matrices.h File Reference

Matrix multiplication and related declarations. More...

#include <par_mpi.h>
#include <su2hmc.h>
Include dependency graph for matrices.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Functions

int Dslash (Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa)
 Evaluates \(\Phi=M r\) in double precision.
 
int Dslashd (Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa)
 Evaluates \(\Phi=M^\dagger r\) in double precision.
 
int Hdslash (Complex *phi, Complex *r, Complex *u11t[2], unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk[2], float akappa)
 Evaluates \(\Phi=M r\) in double precision.
 
int Hdslashd (Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
 Evaluates \(\Phi=M^\dagger r\) in double precision.
 
int Dslash_f (Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
 Evaluates \(\Phi=M r\) in single precision.
 
int Dslashd_f (Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], Complex_f jqq, float akappa)
 Evaluates \(\Phi=M^\dagger r\) in single precision.
 
int Hdslash_f (Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
 Evaluates \(\Phi=M r\) in single precision.
 
int Hdslashd_f (Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
 Evaluates \(\Phi=M^\dagger r\) in single precision.
 
void Transpose_z (Complex *out, const int, const int)
 
void Transpose_c (Complex_f *out, const int, const int)
 
void Transpose_d (double *out, const int, const int)
 
void Transpose_f (float *out, const int, const int)
 
void Transpose_I (int *out, const int, const int)
 
void Transpose_U (unsigned int *out, const int, const int)
 

Detailed Description

Matrix multiplication and related declarations.

Definition in file matrices.h.

Function Documentation

◆ Dslash()

int Dslash ( Complex * phi,
Complex * r,
Complex * u11t,
Complex * u12t,
unsigned int * iu,
unsigned int * id,
Complex * gamval,
int * gamin,
double * dk4m,
double * dk4p,
Complex_f jqq,
float akappa )

Evaluates \(\Phi=M r\) in double precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
jqqDiquark source
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 19 of file matrices.c.

20 {
21 /*
22 * @brief Evaluates @f(\Phi=M r@f) in double precision.
23 *
24 * @param phi: The product
25 * @param r: The array being acted on by M
26 * @param u11t: First colour trial field
27 * @param u12t: Second colour trial field
28 * @param iu: Upper halo indices
29 * @param id: Lower halo indices
30 * @param gamval: Gamma matrices
31 * @param gamin: Indices for dirac terms
32 * @param dk4m:
33 * @param dk4p:
34 * @param jqq: Diquark source
35 * @param akappa: Hopping parameter
36 *
37 * @see ZHalo_swap_all (MPI only)
38 *
39 * @return Zero on success, integer error code otherwise
40 */
41 const char *funcname = "Dslash";
42 //Get the halos in order
43#if(nproc>1)
44 ZHalo_swap_all(r, 16);
45#endif
46
47 //Mass term
48 //Diquark Term (antihermitian)
49#ifdef __NVCC__
50 cuDslash(phi,r,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa,dimGrid,dimBlock);
51#else
52 memcpy(phi, r, kferm*sizeof(Complex));
53#pragma omp parallel for
54 for(int i=0;i<kvol;i++){
55#pragma omp simd aligned(phi,r,gamval:AVX)
56 for(int idirac = 0; idirac<ndirac; idirac++){
57 int igork = idirac+4;
58 Complex a_1, a_2;
59 a_1=conj(jqq)*gamval[4*ndirac+idirac];
60 //We subtract a_2, hence the minus
61 a_2=-jqq*gamval[4*ndirac+idirac];
62 phi[(i*ngorkov+idirac)*nc]+=a_1*r[(i*ngorkov+igork)*nc+0];
63 phi[(i*ngorkov+idirac)*nc+1]+=a_1*r[(i*ngorkov+igork)*nc+1];
64 phi[(i*ngorkov+igork)*nc+0]+=a_2*r[(i*ngorkov+idirac)*nc];
65 phi[(i*ngorkov+igork)*nc+1]+=a_2*r[(i*ngorkov+idirac)*nc+1];
66 }
67
68 //Spacelike terms. Here's hoping I haven't put time as the zeroth component somewhere!
69#ifndef NO_SPACE
70 for(int mu = 0; mu <3; mu++){
71 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
72#pragma omp simd aligned(phi,r,u11t,u12t,gamval:AVX)
73 for(int igorkov=0; igorkov<ngorkov; igorkov++){
74 //FORTRAN had mod((igorkov-1),4)+1 to prevent issues with non-zero indexing in the dirac term.
75 int idirac=igorkov%4;
76 int igork1 = (igorkov<4) ? gamin[mu*ndirac+idirac] : gamin[mu*ndirac+idirac]+4;
77 //Can manually vectorise with a pragma?
78 //Wilson + Dirac term in that order. Definitely easier
79 //to read when split into different loops, but should be faster this way
80 phi[(i*ngorkov+igorkov)*nc]+=-akappa*(u11t[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc]+\
81 u12t[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc+1]+\
82 conj(u11t[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]-\
83 u12t[did*ndim+mu]*r[(did*ngorkov+igorkov)*nc+1])+\
84 //Dirac term. Reminder! gamval was rescaled by kappa when we defined it
85 gamval[mu*ndirac+idirac]*(u11t[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc]+\
86 u12t[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc+1]-\
87 conj(u11t[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]+\
88 u12t[did*ndim+mu]*r[(did*ngorkov+igork1)*nc+1]);
89
90 phi[(i*ngorkov+igorkov)*nc+1]+=-akappa*(-conj(u12t[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc]+\
91 conj(u11t[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc+1]+\
92 conj(u12t[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]+\
93 u11t[did*ndim+mu]*r[(did*ngorkov+igorkov)*nc+1])+\
94 //Dirac term
95 gamval[mu*ndirac+idirac]*(-conj(u12t[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc]+\
96 conj(u11t[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc+1]-\
97 conj(u12t[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]-\
98 u11t[did*ndim+mu]*r[(did*ngorkov+igork1)*nc+1]);
99 }
100 }
101 //Timelike terms next. These run from igorkov=0..3 and 4..7 with slightly different rules for each
102 //We can fit it into a single loop by declaring igorkovPP=igorkov+4 instead of looping igorkov=4..7 separately
103 //Note that for the igorkov 4..7 loop idirac=igorkov-4, so we don't need to declare idiracPP separately
104#endif
105 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
106#ifndef NO_TIME
107#pragma omp simd aligned(phi,r,u11t,u12t,dk4m,dk4p:AVX)
108 for(int igorkov=0; igorkov<4; igorkov++){
109 int igorkovPP=igorkov+4; //idirac = igorkov; It is a bit redundant but I'll mention it as that's how
110 //the FORTRAN code did it.
111 int igork1 = gamin[3*ndirac+igorkov]; int igork1PP = igork1+4;
112
113 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
114 phi[(i*ngorkov+igorkov)*nc]+=
115 -dk4p[i]*(u11t[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc]-r[(uid*ngorkov+igork1)*nc])
116 +u12t[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc+1]-r[(uid*ngorkov+igork1)*nc+1]))
117 -dk4m[did]*(conj(u11t[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]+r[(did*ngorkov+igork1)*nc])
118 -u12t[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]+r[(did*ngorkov+igork1)*nc+1]));
119 phi[(i*ngorkov+igorkov)*nc+1]+=
120 -dk4p[i]*(-conj(u12t[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc]-r[(uid*ngorkov+igork1)*nc])
121 +conj(u11t[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc+1]-r[(uid*ngorkov+igork1)*nc+1]))
122 -dk4m[did]*(conj(u12t[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]+r[(did*ngorkov+igork1)*nc])
123 +u11t[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]+r[(did*ngorkov+igork1)*nc+1]));
124
125 //And the +4 terms. Note that dk4p and dk4m swap positions compared to the above
126 phi[(i*ngorkov+igorkovPP)*nc]+=-dk4m[i]*(u11t[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc]-r[(uid*ngorkov+igork1PP)*nc])+\
127 u12t[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc+1]-r[(uid*ngorkov+igork1PP)*nc+1]))-\
128 dk4p[did]*(conj(u11t[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]+r[(did*ngorkov+igork1PP)*nc])-\
129 u12t[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]+r[(did*ngorkov+igork1PP)*nc+1]));
130
131 phi[(i*ngorkov+igorkovPP)*nc+1]+=-dk4m[i]*(conj(-u12t[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc]-r[(uid*ngorkov+igork1PP)*nc])+\
132 conj(u11t[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc+1]-r[(uid*ngorkov+igork1PP)*nc+1]))-\
133 dk4p[did]*(conj(u12t[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]+r[(did*ngorkov+igork1PP)*nc])+\
134 u11t[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]+r[(did*ngorkov+igork1PP)*nc+1]));
135 }
136#endif
137 }
138#endif
139 return 0;
140}
int ZHalo_swap_all(Complex *z, int ncpt)
Calls the functions to send data to both the up and down halos.
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define ndirac
Dirac indices.
Definition sizes.h:177
#define ndim
Dimensions.
Definition sizes.h:179

References Complex, Complex_f, kferm, kvol, nc, ndim, ndirac, ngorkov, and ZHalo_swap_all().

Here is the call graph for this function:

◆ Dslash_f()

int Dslash_f ( Complex_f * phi,
Complex_f * r,
Complex_f * u11t,
Complex_f * u12t,
unsigned int * iu,
unsigned int * id,
Complex_f * gamval,
int * gamin,
float * dk[2],
Complex_f jqq,
float akappa )

Evaluates \(\Phi=M r\) in single precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
jqqDiquark source
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 463 of file matrices.c.

464 {
465 /*
466 * @brief Evaluates @f(\Phi=M r@f) in single precision.
467 *
468 * @param phi: The product
469 * @param r: The array being acted on by M
470 * @param u11t_f: First colour trial field
471 * @param u12t_f: Second colour trial field
472 * @param iu: Upper halo indices
473 * @param id: Lower halo indices
474 * @param gamval_f: Gamma matrices
475 * @param gamin: Indices for dirac terms
476 * @param dk_f[0]:
477 * @param dk_f[1]:
478 * @param jqq: Diquark source
479 * @param akappa: Hopping parameter
480 *
481 * @see CHalo_swap_all (MPI only)
482 *
483 * @return Zero on success, integer error code otherwise
484 */
485 const char *funcname = "Dslash_f";
486 //Get the halos in order
487#if(nproc>1)
488 CHalo_swap_all(r, 16);
489#endif
490
491 //Mass term
492 //Diquark Term (antihermitian)
493#ifdef __NVCC__
494 cuDslash_f(phi,r,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk_f[0],dk_f[1],jqq,akappa,dimGrid,dimBlock);
495#else
496 memcpy(phi, r, kferm*sizeof(Complex_f));
497#pragma omp parallel for
498 for(unsigned int i=0;i<kvol;i++){
499#pragma omp simd aligned(phi,r,gamval_f:AVX)
500 for(unsigned short idirac = 0; idirac<ndirac; idirac++){
501 unsigned short igork = idirac+4;
502 Complex_f a_1, a_2;
503 a_1=conj(jqq)*gamval_f[4*ndirac+idirac];
504 //We subtract a_2, hence the minus
505 a_2=-jqq*gamval_f[4*ndirac+idirac];
506 phi[(i*ngorkov+idirac)*nc]+=a_1*r[(i*ngorkov+igork)*nc+0];
507 phi[(i*ngorkov+idirac)*nc+1]+=a_1*r[(i*ngorkov+igork)*nc+1];
508 phi[(i*ngorkov+igork)*nc+0]+=a_2*r[(i*ngorkov+idirac)*nc];
509 phi[(i*ngorkov+igork)*nc+1]+=a_2*r[(i*ngorkov+idirac)*nc+1];
510 }
511
512 //Spacelike terms. Here's hoping I haven't put time as the zeroth component somewhere!
513#ifndef NO_SPACE
514 for(unsigned short mu = 0; mu <3; mu++){
515 unsigned int did=id[mu+ndim*i]; unsigned int uid = iu[mu+ndim*i];
516#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamval_f,gamin:AVX)
517 for(unsigned short igorkov=0; igorkov<ngorkov; igorkov++){
518 //FORTRAN had mod((igorkov-1),4)+1 to prevent issues with non-zero indexing in the dirac term.
519 unsigned short idirac=igorkov%4;
520 unsigned short igork1 = (igorkov<4) ? gamin[mu*ndirac+idirac] : gamin[mu*ndirac+idirac]+4;
521 //Can manually vectorise with a pragma?
522 //Wilson + Dirac term in that order. Definitely easier
523 //to read when split into different loops, but should be faster this way
524 phi[(i*ngorkov+igorkov)*nc]+=-akappa*(u11t_f[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc]+\
525 u12t_f[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc+1]+\
526 conj(u11t_f[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]-\
527 u12t_f[did*ndim+mu]*r[(did*ngorkov+igorkov)*nc+1])+\
528 //Dirac term. Reminder! gamval was rescaled by kappa when we defined it
529 gamval_f[mu*ndirac+idirac]*(u11t_f[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc]+\
530 u12t_f[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc+1]-\
531 conj(u11t_f[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]+\
532 u12t_f[did*ndim+mu]*r[(did*ngorkov+igork1)*nc+1]);
533
534 phi[(i*ngorkov+igorkov)*nc+1]+=-akappa*(-conj(u12t_f[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc]+\
535 conj(u11t_f[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc+1]+\
536 conj(u12t_f[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]+\
537 u11t_f[did*ndim+mu]*r[(did*ngorkov+igorkov)*nc+1])+\
538 //Dirac term
539 gamval_f[mu*ndirac+idirac]*(-conj(u12t_f[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc]+\
540 conj(u11t_f[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc+1]-\
541 conj(u12t_f[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]-\
542 u11t_f[did*ndim+mu]*r[(did*ngorkov+igork1)*nc+1]);
543 }
544 }
545 //Timelike terms next. These run from igorkov=0..3 and 4..7 with slightly different rules for each
546 //We can fit it into a single loop by declaring igorkovPP=igorkov+4 instead of looping igorkov=4..7 separately
547 //Note that for the igorkov 4..7 loop idirac=igorkov-4, so we don't need to declare idiracPP separately
548#endif
549 unsigned int did=id[3+ndim*i]; unsigned int uid = iu[3+ndim*i];
550#ifndef NO_TIME
551#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamin:AVX)
552 for(unsigned short igorkov=0; igorkov<4; igorkov++){
553 unsigned short igorkovPP=igorkov+4; //idirac = igorkov; It is a bit redundant but I'll mention it as that's how
554 //the FORTRAN code did it.
555 unsigned short igork1 = gamin[3*ndirac+igorkov]; unsigned short igork1PP = igork1+4;
556
557 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
558 phi[(i*ngorkov+igorkov)*nc]+=
559 -dk_f[1][i]*(u11t_f[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc]-r[(uid*ngorkov+igork1)*nc])
560 +u12t_f[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc+1]-r[(uid*ngorkov+igork1)*nc+1]))
561 -dk_f[0][did]*(conj(u11t_f[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]+r[(did*ngorkov+igork1)*nc])
562 -u12t_f[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]+r[(did*ngorkov+igork1)*nc+1]));
563 phi[(i*ngorkov+igorkov)*nc+1]+=
564 -dk_f[1][i]*(-conj(u12t_f[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc]-r[(uid*ngorkov+igork1)*nc])
565 +conj(u11t_f[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc+1]-r[(uid*ngorkov+igork1)*nc+1]))
566 -dk_f[0][did]*(conj(u12t_f[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]+r[(did*ngorkov+igork1)*nc])
567 +u11t_f[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]+r[(did*ngorkov+igork1)*nc+1]));
568
569 //And the +4 terms. Note that dk_f[1] and dk_f[0] swap positions compared to the above
570 phi[(i*ngorkov+igorkovPP)*nc]+=-dk_f[0][i]*(u11t_f[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc]-r[(uid*ngorkov+igork1PP)*nc])
571 +u12t_f[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc+1]-r[(uid*ngorkov+igork1PP)*nc+1]))
572 -dk_f[1][did]*(conj(u11t_f[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]+r[(did*ngorkov+igork1PP)*nc])
573 -u12t_f[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]+r[(did*ngorkov+igork1PP)*nc+1]));
574
575 phi[(i*ngorkov+igorkovPP)*nc+1]+=-dk_f[0][i]*(conj(-u12t_f[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc]-r[(uid*ngorkov+igork1PP)*nc])
576 +conj(u11t_f[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc+1]-r[(uid*ngorkov+igork1PP)*nc+1]))
577 -dk_f[1][did]*(conj(u12t_f[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]+r[(did*ngorkov+igork1PP)*nc])
578 +u11t_f[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]+r[(did*ngorkov+igork1PP)*nc+1]));
579 }
580#endif
581 }
582#endif
583 return 0;
584}
int CHalo_swap_all(Complex_f *c, int ncpt)
Calls the functions to send data to both the up and down halos.
#define Complex_f
Single precision complex number.
Definition sizes.h:56

References CHalo_swap_all(), Complex_f, kferm, kvol, nc, ndim, ndirac, and ngorkov.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Dslashd()

int Dslashd ( Complex * phi,
Complex * r,
Complex * u11t,
Complex * u12t,
unsigned int * iu,
unsigned int * id,
Complex * gamval,
int * gamin,
double * dk4m,
double * dk4p,
Complex_f jqq,
float akappa )

Evaluates \(\Phi=M^\dagger r\) in double precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
jqqDiquark source
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 141 of file matrices.c.

142 {
143 /*
144 * @brief Evaluates @f(\Phi=M^\dagger r@f) in double precision.
145 *
146 * @param phi: The product
147 * @param r: The array being acted on by M
148 * @param u11t: First colour trial field
149 * @param u12t: Second colour trial field
150 * @param iu: Upper halo indices
151 * @param id: Lower halo indices
152 * @param gamval: Gamma matrices
153 * @param gamin: Indices for dirac terms
154 * @param dk4m:
155 * @param dk4p:
156 * @param jqq: Diquark source
157 * @param akappa: Hopping parameter
158 *
159 * @see ZHalo_swap_all (MPI only)
160 *
161 * @return Zero on success, integer error code otherwise
162 */
163 const char *funcname = "Dslashd";
164 //Get the halos in order
165#if(nproc>1)
166 ZHalo_swap_all(r, 16);
167#endif
168
169 //Mass term
170#ifdef __NVCC__
171 cuDslashd(phi,r,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa,dimGrid,dimBlock);
172#else
173 memcpy(phi, r, kferm*sizeof(Complex));
174#pragma omp parallel for
175 for(int i=0;i<kvol;i++){
176#pragma omp simd aligned(phi,r,gamval:AVX)
177 //Diquark Term (antihermitian) The signs of a_1 and a_2 below flip under dagger
178 for(int idirac = 0; idirac<ndirac; idirac++){
179 int igork = idirac+4;
180 Complex a_1, a_2;
181 //We subtract a_1, hence the minus
182 a_1=-conj(jqq)*gamval[4*ndirac+idirac];
183 a_2=jqq*gamval[4*ndirac+idirac];
184 phi[(i*ngorkov+idirac)*nc]+=a_1*r[(i*ngorkov+igork)*nc];
185 phi[(i*ngorkov+idirac)*nc+1]+=a_1*r[(i*ngorkov+igork)*nc+1];
186 phi[(i*ngorkov+igork)*nc]+=a_2*r[(i*ngorkov+idirac)*nc];
187 phi[(i*ngorkov+igork)*nc+1]+=a_2*r[(i*ngorkov+idirac)*nc+1];
188 }
189
190 //Spacelike terms. Here's hoping I haven't put time as the zeroth component somewhere!
191#ifndef NO_SPACE
192 for(int mu = 0; mu <3; mu++){
193 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
194#pragma omp simd aligned(phi,r,u11t,u12t,gamval:AVX)
195 for(int igorkov=0; igorkov<ngorkov; igorkov++){
196 //FORTRAN had mod((igorkov-1),4)+1 to prevent issues with non-zero indexing.
197 int idirac=igorkov%4;
198 int igork1 = (igorkov<4) ? gamin[mu*ndirac+idirac] : gamin[mu*ndirac+idirac]+4;
199 //Wilson + Dirac term in that order. Definitely easier
200 //to read when split into different loops, but should be faster this way
201 //Reminder! gamval was rescaled by kappa when we defined it
202 phi[(i*ngorkov+igorkov)*nc]+=
203 -akappa*( u11t[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc]
204 +u12t[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc+1]
205 +conj(u11t[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]
206 -u12t[did*ndim+mu] *r[(did*ngorkov+igorkov)*nc+1])
207 -gamval[mu*ndirac+idirac]*
208 ( u11t[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc]
209 +u12t[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc+1]
210 -conj(u11t[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]
211 +u12t[did*ndim+mu] *r[(did*ngorkov+igork1)*nc+1]);
212
213 phi[(i*ngorkov+igorkov)*nc+1]+=
214 -akappa*(-conj(u12t[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc]
215 +conj(u11t[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc+1]
216 +conj(u12t[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]
217 +u11t[did*ndim+mu] *r[(did*ngorkov+igorkov)*nc+1])
218 -gamval[mu*ndirac+idirac]*
219 (-conj(u12t[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc]
220 +conj(u11t[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc+1]
221 -conj(u12t[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]
222 -u11t[did*ndim+mu] *r[(did*ngorkov+igork1)*nc+1]);
223 }
224 }
225#endif
226 //Timelike terms next. These run from igorkov=0..3 and 4..7 with slightly different rules for each
227 //We can fit it into a single loop by declaring igorkovPP=igorkov+4 instead of looping igorkov=4..7 separately
228 //Note that for the igorkov 4..7 loop idirac=igorkov-4, so we don't need to declare idiracPP separately
229 //Under dagger, dk4p and dk4m get swapped and the dirac component flips sign.
230 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
231#ifndef NO_TIME
232#pragma omp simd aligned(phi,r,u11t,u12t,dk4m,dk4p:AVX)
233 for(int igorkov=0; igorkov<4; igorkov++){
234 //the FORTRAN code did it.
235 int igork1 = gamin[3*ndirac+igorkov];
236 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
237 phi[(i*ngorkov+igorkov)*nc]+=
238 -dk4m[i]*(u11t[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc]+r[(uid*ngorkov+igork1)*nc])
239 +u12t[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc+1]+r[(uid*ngorkov+igork1)*nc+1]))
240 -dk4p[did]*(conj(u11t[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]-r[(did*ngorkov+igork1)*nc])
241 -u12t[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]-r[(did*ngorkov+igork1)*nc+1]));
242 phi[(i*ngorkov+igorkov)*nc+1]+=
243 -dk4m[i]*(-conj(u12t[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc]+r[(uid*ngorkov+igork1)*nc])
244 +conj(u11t[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc+1]+r[(uid*ngorkov+igork1)*nc+1]))
245 -dk4p[did]*(conj(u12t[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]-r[(did*ngorkov+igork1)*nc])
246 +u11t[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]-r[(did*ngorkov+igork1)*nc+1]));
247
248
249 int igorkovPP=igorkov+4; //idirac = igorkov; It is a bit redundant but I'll mention it as that's how
250 int igork1PP = igork1+4;
251 //And the +4 terms. Note that dk4p and dk4m swap positions compared to the above
252 phi[(i*ngorkov+igorkovPP)*nc]+=-dk4p[i]*(u11t[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc]+r[(uid*ngorkov+igork1PP)*nc])+\
253 u12t[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc+1]+r[(uid*ngorkov+igork1PP)*nc+1]))-\
254 dk4m[did]*(conj(u11t[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]-r[(did*ngorkov+igork1PP)*nc])-\
255 u12t[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]-r[(did*ngorkov+igork1PP)*nc+1]));
256
257 phi[(i*ngorkov+igorkovPP)*nc+1]+=dk4p[i]*(conj(u12t[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc]+r[(uid*ngorkov+igork1PP)*nc])-\
258 conj(u11t[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc+1]+r[(uid*ngorkov+igork1PP)*nc+1]))-\
259 dk4m[did]*(conj(u12t[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]-r[(did*ngorkov+igork1PP)*nc])+
260 u11t[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]-r[(did*ngorkov+igork1PP)*nc+1]));
261
262 }
263#endif
264 }
265#endif
266 return 0;
267}

References Complex, Complex_f, kferm, kvol, nc, ndim, ndirac, ngorkov, and ZHalo_swap_all().

Here is the call graph for this function:

◆ Dslashd_f()

int Dslashd_f ( Complex_f * phi,
Complex_f * r,
Complex_f * u11t,
Complex_f * u12t,
unsigned int * iu,
unsigned int * id,
Complex_f * gamval,
int * gamin,
float * dk[2],
Complex_f jqq,
float akappa )

Evaluates \(\Phi=M^\dagger r\) in single precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
jqqDiquark source
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 585 of file matrices.c.

586 {
587 /*
588 * @brief Evaluates @f(\Phi=M^\dagger r@f) in single precision.
589 *
590 * @param phi: The product
591 * @param r: The array being acted on by M
592 * @param u11t_f: First colour trial field
593 * @param u12t_f: Second colour trial field
594 * @param iu: Upper halo indices
595 * @param id: Lower halo indices
596 * @param gamval_f: Gamma matrices
597 * @param gamin: Indices for dirac terms
598 * @param dk_f[0]:
599 * @param dk_f[1]:
600 * @param jqq: Diquark source
601 * @param akappa: Hopping parameter
602 *
603 * @see CHalo_swap_all (MPI only)
604 *
605 * @return Zero on success, integer error code otherwise
606 */
607 const char *funcname = "Dslashd_f";
608 //Get the halos in order
609#if(nproc>1)
610 CHalo_swap_all(r, 16);
611#endif
612
613 //Mass term
614#ifdef __NVCC__
615 cuDslashd_f(phi,r,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk_f[0],dk_f[1],jqq,akappa,dimGrid,dimBlock);
616#else
617 memcpy(phi, r, kferm*sizeof(Complex_f));
618#pragma omp parallel for
619 for(unsigned int i=0;i<kvol;i++){
620#pragma omp simd aligned(phi,r,gamval_f:AVX)
621 //Diquark Term (antihermitian) The signs of a_1 and a_2 below flip under dagger
622 for(unsigned short idirac = 0; idirac<ndirac; idirac++){
623 unsigned short igork = idirac+4;
624 Complex_f a_1, a_2;
625 //We subtract a_1, hence the minus
626 a_1=-conj(jqq)*gamval_f[4*ndirac+idirac];
627 a_2=jqq*gamval_f[4*ndirac+idirac];
628 phi[(i*ngorkov+idirac)*nc]+=a_1*r[(i*ngorkov+igork)*nc];
629 phi[(i*ngorkov+idirac)*nc+1]+=a_1*r[(i*ngorkov+igork)*nc+1];
630 phi[(i*ngorkov+igork)*nc]+=a_2*r[(i*ngorkov+idirac)*nc];
631 phi[(i*ngorkov+igork)*nc+1]+=a_2*r[(i*ngorkov+idirac)*nc+1];
632 }
633
634 //Spacelike terms. Here's hoping I haven't put time as the zeroth component somewhere!
635#ifndef NO_SPACE
636 for(unsigned short mu = 0; mu <3; mu++){
637 unsigned int did=id[mu+ndim*i]; unsigned int uid = iu[mu+ndim*i];
638#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamval_f:AVX)
639 for(unsigned short igorkov=0; igorkov<ngorkov; igorkov++){
640 //FORTRAN had mod((igorkov-1),4)+1 to prevent issues with non-zero indexing.
641 unsigned short idirac=igorkov%4;
642 unsigned short igork1 = (igorkov<4) ? gamin[mu*ndirac+idirac] : gamin[mu*ndirac+idirac]+4;
643 //Wilson + Dirac term in that order. Definitely easier
644 //to read when split into different loops, but should be faster this way
645 //Reminder! gamval was rescaled by kappa when we defined it
646 phi[(i*ngorkov+igorkov)*nc]+=
647 -akappa*( u11t_f[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc]
648 +u12t_f[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc+1]
649 +conj(u11t_f[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]
650 -u12t_f[did*ndim+mu] *r[(did*ngorkov+igorkov)*nc+1])
651 -gamval_f[mu*ndirac+idirac]*
652 ( u11t_f[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc]
653 +u12t_f[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc+1]
654 -conj(u11t_f[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]
655 +u12t_f[did*ndim+mu] *r[(did*ngorkov+igork1)*nc+1]);
656
657 phi[(i*ngorkov+igorkov)*nc+1]+=
658 -akappa*(-conj(u12t_f[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc]
659 +conj(u11t_f[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc+1]
660 +conj(u12t_f[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]
661 +u11t_f[did*ndim+mu] *r[(did*ngorkov+igorkov)*nc+1])
662 -gamval_f[mu*ndirac+idirac]*
663 (-conj(u12t_f[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc]
664 +conj(u11t_f[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc+1]
665 -conj(u12t_f[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]
666 -u11t_f[did*ndim+mu] *r[(did*ngorkov+igork1)*nc+1]);
667 }
668 }
669#endif
670 //Timelike terms next. These run from igorkov=0..3 and 4..7 with slightly different rules for each
671 //We can fit it into a single loop by declaring igorkovPP=igorkov+4 instead of looping igorkov=4..7 separately
672 //Note that for the igorkov 4..7 loop idirac=igorkov-4, so we don't need to declare idiracPP separately
673 //Under dagger, dk_f[1] and dk_f[0] get swapped and the dirac component flips sign.
674 unsigned int did=id[3+ndim*i]; unsigned int uid = iu[3+ndim*i];
675#ifndef NO_TIME
676#pragma omp simd aligned(phi,r,u11t_f,u12t_f:AVX)
677 for(unsigned short igorkov=0; igorkov<4; igorkov++){
678 //the FORTRAN code did it.
679 unsigned short igork1 = gamin[3*ndirac+igorkov];
680 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
681 phi[(i*ngorkov+igorkov)*nc]+=
682 -dk_f[0][i]*(u11t_f[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc]+r[(uid*ngorkov+igork1)*nc])
683 +u12t_f[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc+1]+r[(uid*ngorkov+igork1)*nc+1]))
684 -dk_f[1][did]*(conj(u11t_f[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]-r[(did*ngorkov+igork1)*nc])
685 -u12t_f[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]-r[(did*ngorkov+igork1)*nc+1]));
686 phi[(i*ngorkov+igorkov)*nc+1]+=
687 -dk_f[0][i]*(-conj(u12t_f[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc]+r[(uid*ngorkov+igork1)*nc])
688 +conj(u11t_f[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc+1]+r[(uid*ngorkov+igork1)*nc+1]))
689 -dk_f[1][did]*(conj(u12t_f[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]-r[(did*ngorkov+igork1)*nc])
690 +u11t_f[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]-r[(did*ngorkov+igork1)*nc+1]));
691
692
693 unsigned short igorkovPP=igorkov+4; //idirac = igorkov; It is a bit redundant but I'll mention it as that's how
694 unsigned short igork1PP = igork1+4;
695 //And the +4 terms. Note that dk_f[1] and dk_f[0] swap positions compared to the above
696 phi[(i*ngorkov+igorkovPP)*nc]+=-dk_f[1][i]*(u11t_f[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc]+r[(uid*ngorkov+igork1PP)*nc])
697 +u12t_f[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc+1]+r[(uid*ngorkov+igork1PP)*nc+1]))
698 -dk_f[0][did]*(conj(u11t_f[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]-r[(did*ngorkov+igork1PP)*nc])
699 -u12t_f[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]-r[(did*ngorkov+igork1PP)*nc+1]));
700
701 phi[(i*ngorkov+igorkovPP)*nc+1]+=dk_f[1][i]*(conj(u12t_f[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc]+r[(uid*ngorkov+igork1PP)*nc])
702 -conj(u11t_f[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc+1]+r[(uid*ngorkov+igork1PP)*nc+1]))
703 -dk_f[0][did]*(conj(u12t_f[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]-r[(did*ngorkov+igork1PP)*nc])
704 +u11t_f[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]-r[(did*ngorkov+igork1PP)*nc+1]));
705
706 }
707#endif
708 }
709#endif
710 return 0;
711}

References CHalo_swap_all(), Complex_f, kferm, kvol, nc, ndim, ndirac, and ngorkov.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Hdslash()

int Hdslash ( Complex * phi,
Complex * r,
Complex * u11t[2],
unsigned int * iu,
unsigned int * id,
Complex * gamval,
int * gamin,
double * dk[2],
float akappa )

Evaluates \(\Phi=M r\) in double precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 268 of file matrices.c.

269 {
270 /*
271 * @brief Evaluates @f(\Phi=M r@f) in double precision
272 *
273 * @param phi: The product
274 * @param r: The array being acted on by M
275 * @param ut[0]: First colour trial field
276 * @param ut[1]: Second colour trial field
277 * @param iu: Upper halo indices
278 * @param id: Lower halo indices
279 * @param gamval: Gamma matrices
280 * @param gamin: Indices for dirac terms
281 * @param dk[0]:
282 * @param dk[1]:
283 * @param akappa: Hopping parameter
284 *
285 * @see ZHalo_swap_all (MPI only)
286 *
287 * @return Zero on success, integer error code otherwise
288 */
289 const char *funcname = "Hdslash";
290 //Get the halos in order
291#if(nproc>1)
292 ZHalo_swap_all(r, 8);
293#endif
294
295 //Mass term
296 //Spacelike term
297#ifdef __NVCC__
298 cuHdslash(phi,r,ut[0],ut[1],iu,id,gamval,gamin,dk[0],dk[1],akappa,dimGrid,dimBlock);
299#else
300 memcpy(phi, r, kferm2*sizeof(Complex));
301#pragma omp parallel for
302 for(int i=0;i<kvol;i++){
303#ifndef NO_SPACE
304 for(int mu = 0; mu <3; mu++){
305 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
306#pragma omp simd aligned(phi,r,gamval:AVX)
307 for(int idirac=0; idirac<ndirac; idirac++){
308 //FORTRAN had mod((idirac-1),4)+1 to prevent issues with non-zero indexing.
309 int igork1 = gamin[mu*ndirac+idirac];
310 //Can manually vectorise with a pragma?
311 //Wilson + Dirac term in that order. Definitely easier
312 //to read when split into different loops, but should be faster this way
313 phi[(i*ndirac+idirac)*nc]+=-akappa*(ut[0][i*ndim+mu]*r[(uid*ndirac+idirac)*nc]+\
314 ut[1][i*ndim+mu]*r[(uid*ndirac+idirac)*nc+1]+\
315 conj(ut[0][did*ndim+mu])*r[(did*ndirac+idirac)*nc]-\
316 ut[1][did*ndim+mu]*r[(did*ndirac+idirac)*nc+1])+\
317 //Dirac term
318 gamval[mu*ndirac+idirac]*(ut[0][i*ndim+mu]*r[(uid*ndirac+igork1)*nc]+\
319 ut[1][i*ndim+mu]*r[(uid*ndirac+igork1)*nc+1]-\
320 conj(ut[0][did*ndim+mu])*r[(did*ndirac+igork1)*nc]+\
321 ut[1][did*ndim+mu]*r[(did*ndirac+igork1)*nc+1]);
322
323 phi[(i*ndirac+idirac)*nc+1]+=-akappa*(-conj(ut[1][i*ndim+mu])*r[(uid*ndirac+idirac)*nc]+\
324 conj(ut[0][i*ndim+mu])*r[(uid*ndirac+idirac)*nc+1]+\
325 conj(ut[1][did*ndim+mu])*r[(did*ndirac+idirac)*nc]+\
326 ut[0][did*ndim+mu]*r[(did*ndirac+idirac)*nc+1])+\
327 //Dirac term
328 gamval[mu*ndirac+idirac]*(-conj(ut[1][i*ndim+mu])*r[(uid*ndirac+igork1)*nc]+\
329 conj(ut[0][i*ndim+mu])*r[(uid*ndirac+igork1)*nc+1]-\
330 conj(ut[1][did*ndim+mu])*r[(did*ndirac+igork1)*nc]-\
331 ut[0][did*ndim+mu]*r[(did*ndirac+igork1)*nc+1]);
332 }
333 }
334#endif
335 //Timelike terms
336 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
337#ifndef NO_TIME
338#pragma omp simd aligned(phi,r:AVX)
339 for(int idirac=0; idirac<ndirac; idirac++){
340 int igork1 = gamin[3*ndirac+idirac];
341 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
342 //Reminder! gamval was rescaled by kappa when we defined it
343 phi[(i*ndirac+idirac)*nc]+=
344 -dk[1][i]*(ut[0][i*ndim+3]*(r[(uid*ndirac+idirac)*nc]-r[(uid*ndirac+igork1)*nc])
345 +ut[1][i*ndim+3]*(r[(uid*ndirac+idirac)*nc+1]-r[(uid*ndirac+igork1)*nc+1]))
346 -dk[0][did]*(conj(ut[0][did*ndim+3])*(r[(did*ndirac+idirac)*nc]+r[(did*ndirac+igork1)*nc])
347 -ut[1][did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]+r[(did*ndirac+igork1)*nc+1]));
348 phi[(i*ndirac+idirac)*nc+1]+=
349 -dk[1][i]*(-conj(ut[1][i*ndim+3])*(r[(uid*ndirac+idirac)*nc]-r[(uid*ndirac+igork1)*nc])
350 +conj(ut[0][i*ndim+3])*(r[(uid*ndirac+idirac)*nc+1]-r[(uid*ndirac+igork1)*nc+1]))
351 -dk[0][did]*(conj(ut[1][did*ndim+3])*(r[(did*ndirac+idirac)*nc]+r[(did*ndirac+igork1)*nc])
352 +ut[0][did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]+r[(did*ndirac+igork1)*nc+1]));
353 }
354#endif
355 }
356#endif
357 return 0;
358}
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188

References Complex, kferm2, kvol, nc, ndim, ndirac, and ZHalo_swap_all().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Hdslash_f()

int Hdslash_f ( Complex_f * phi,
Complex_f * r,
Complex_f * ut[2],
unsigned int * iu,
unsigned int * id,
Complex_f * gamval,
int * gamin,
float * dk[2],
float akappa )

Evaluates \(\Phi=M r\) in single precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 712 of file matrices.c.

713 {
714 /*
715 * @brief Evaluates @f(\Phi=M r@f) in single precision.
716 *
717 * @param phi: The product
718 * @param r: The array being acted on by M
719 * @param ut[0]_f: First colour trial field
720 * @param ut[1]_f: Second colour trial field
721 * @param iu: Upper halo indices
722 * @param id: Lower halo indices
723 * @param gamval_f: Gamma matrices
724 * @param gamin: Indices for dirac terms
725 * @param dk4m_f:
726 * @param dk4p_f:
727 * @param akappa: Hopping parameter
728 *
729 * @see CHalo_swap_all (MPI only)
730 *
731 * @return Zero on success, integer error code otherwise
732 */
733 const char *funcname = "Hdslash_f";
734 //Get the halos in order
735#if(nproc>1)
736 CHalo_swap_all(r, 8);
737#endif
738#ifdef __NVCC__
739 cuHdslash_f(phi,r,ut,iu,id,gamval,gamin,dk,akappa,dimGrid,dimBlock);
740#else
741 //Mass term
742 memcpy(phi, r, kferm2*sizeof(Complex_f));
743#pragma omp parallel for
744 for(unsigned int i=0;i<kvol;i+=AVX){
745 alignas(AVX) Complex_f u11s[AVX]; alignas(AVX) Complex_f u12s[AVX];
746 alignas(AVX) Complex_f u11sd[AVX]; alignas(AVX) Complex_f u12sd[AVX];
747 alignas(AVX) Complex_f ru[2][AVX]; alignas(AVX) Complex_f rd[2][AVX];
748 alignas(AVX) Complex_f rgu[2][AVX]; alignas(AVX) Complex_f rgd[2][AVX];
749 alignas(AVX) Complex_f phi_s[ndirac*nc][AVX];
750 //Do we need to sync threads if each thread only accesses the value it put in shared memory?
751#pragma unroll(2)
752 for(unsigned short idirac=0; idirac<ndirac; idirac++)
753 for(unsigned short c=0; c<nc; c++)
754#pragma omp simd aligned(phi_s,phi:AVX)
755 for(unsigned short j=0;j<AVX;j++)
756 phi_s[idirac*nc+c][j]=phi[((i+j)*ndirac+idirac)*nc+c];
757 alignas(AVX) unsigned int did[AVX], uid[AVX];
758#pragma unroll
759 for(unsigned short mu = 0; mu <3; mu++){
760#pragma omp simd aligned(u11s,u12s,did,uid,id,iu,u11sd,u12sd:AVX)
761 for(unsigned short j =0;j<AVX;j++){
762 did[j]=id[(i+j)*ndim+mu]; uid[j] = iu[(i+j)*ndim+mu];
763 u11s[j]=ut[0][(i+j)*ndim+mu]; u12s[j]=ut[1][(i+j)*ndim+mu];
764 u11sd[j]=ut[0][did[j]*ndim+mu]; u12sd[j]=ut[1][did[j]*ndim+mu];
765 }
766#pragma unroll
767 for(unsigned short idirac=0; idirac<ndirac; idirac++){
768 unsigned short igork1 = gamin[mu*ndirac+idirac];
769#pragma unroll
770 for(unsigned short c=0; c<nc; c++)
771#pragma omp simd aligned(ru,rd,rgu,rgd,r,uid,did:AVX)
772 for(unsigned short j =0;j<AVX;j++){
773 ru[c][j]=r[(uid[j]*ndirac+idirac)*nc+c];
774 rd[c][j]=r[(did[j]*ndirac+idirac)*nc+c];
775 rgu[c][j]=r[(uid[j]*ndirac+igork1)*nc+c];
776 rgd[c][j]=r[(did[j]*ndirac+igork1)*nc+c];
777 }
778 //FORTRAN had mod((idirac-1),4)+1 to prevent issues with non-zero indexing.
779 //Can manually vectorise with a pragma?
780 //Wilson + Dirac term in that order. Definitely easier
781 //to read when split into different loops, but should be faster this way
782#pragma omp simd aligned(phi_s,u11s,u12s,u11sd,u12sd,ru,rd,rgu,rgd:AVX)
783 for(unsigned short j =0;j<AVX;j++){
784 phi_s[idirac*nc][j]+=-akappa*(u11s[j]*ru[0][j]+\
785 u12s[j]*ru[1][j]+\
786 conj(u11sd[j])*rd[0][j]-\
787 u12sd[j]*rd[1][j]);
788 //Dirac term
789 phi_s[idirac*nc][j]+=gamval[mu*ndirac+idirac]*(u11s[j]*rgu[0][j]+\
790 u12s[j]*rgu[1][j]-\
791 conj(u11sd[j])*rgd[0][j]+\
792 u12sd[j]*rgd[1][j]);
793
794 phi_s[idirac*nc+1][j]+=-akappa*(-conj(u12s[j])*ru[0][j]+\
795 conj(u11s[j])*ru[1][j]+\
796 conj(u12sd[j])*rd[0][j]+\
797 u11sd[j]*rd[1][j]);
798 //Dirac term
799 phi_s[idirac*nc+1][j]+=gamval[mu*ndirac+idirac]*(-conj(u12s[j])*rgu[0][j]+\
800 conj(u11s[j])*rgu[1][j]-\
801 conj(u12sd[j])*rgd[0][j]-\
802 u11sd[j]*rgd[1][j]);
803 }
804 }
805 }
806#ifndef NO_TIME
807 //Timelike terms
808 alignas(AVX) float dk4ms[AVX],dk4ps[AVX];
809#pragma omp simd
810 for(unsigned short j=0;j<AVX;j++){
811 u11s[j]=ut[0][(i+j)*ndim+3]; u12s[j]=ut[1][(i+j)*ndim+3];
812 did[j]=id[(i+j)*ndim+3];uid[j]= iu[(i+j)*ndim+3];
813 u11sd[j]=ut[0][did[j]*ndim+3]; u12sd[j]=ut[1][did[j]*ndim+3];
814 dk4ms[j]=dk[0][did[j]]; dk4ps[j]=dk[1][i+j];
815 }
816
817#pragma unroll
818 for(unsigned short idirac=0; idirac<ndirac; idirac++){
819 unsigned short igork1 = gamin[3*ndirac+idirac];
820#pragma unroll
821 for(unsigned short c=0; c<nc; c++)
822#pragma omp simd aligned(ru,rd,rgu,rgd,r,uid,did:AVX)
823 for(unsigned short j =0;j<AVX;j++){
824 ru[c][j]=r[(uid[j]*ndirac+idirac)*nc+c];
825 rd[c][j]=r[(did[j]*ndirac+idirac)*nc+c];
826 rgu[c][j]=r[(uid[j]*ndirac+igork1)*nc+c];
827 rgd[c][j]=r[(did[j]*ndirac+igork1)*nc+c];
828 }
829 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
830
831#pragma omp simd aligned(phi_s,u11s,u12s,u11sd,u12sd,ru,rd,rgu,rgd,dk4ms,dk4ps,phi:AVX)
832 for(unsigned short j =0;j<AVX;j++){
833 phi_s[idirac*nc+0][j]-=
834 dk4ps[j]*(u11s[j]*(ru[0][j]-rgu[0][j])
835 +u12s[j]*(ru[1][j]-rgu[1][j]));
836 phi_s[idirac*nc+0][j]-=
837 dk4ms[j]*(conj(u11sd[j])*(rd[0][j]+rgd[0][j])
838 -u12sd[j]*(rd[1][j]+rgd[1][j]));
839 phi[((i+j)*ndirac+idirac)*nc]=phi_s[idirac*nc][j];
840
841 phi_s[idirac*nc+1][j]-=
842 dk4ps[j]*(-conj(u12s[j])*(ru[0][j]-rgu[0][j])
843 +conj(u11s[j])*(ru[1][j]-rgu[1][j]));
844 phi_s[idirac*nc+1][j]-=
845 dk4ms[j]*(conj(u12sd[j])*(rd[0][j]+rgd[0][j])
846 +u11sd[j]*(rd[1][j]+rgd[1][j]));
847 phi[((i+j)*ndirac+idirac)*nc+1]=phi_s[idirac*nc+1][j];
848 }
849 }
850#endif
851 }
852#endif
853 return 0;
854}
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268

References AVX, CHalo_swap_all(), Complex_f, kferm2, kvol, nc, ndim, and ndirac.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Hdslashd()

int Hdslashd ( Complex * phi,
Complex * r,
Complex * u11t,
Complex * u12t,
unsigned int * iu,
unsigned int * id,
Complex * gamval,
int * gamin,
double * dk4m,
double * dk4p,
float akappa )

Evaluates \(\Phi=M^\dagger r\) in double precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 359 of file matrices.c.

360 {
361 /*
362 * @brief Evaluates @f(\Phi=M^\dagger r@f) in double precision
363 *
364 * @param phi: The product
365 * @param r: The array being acted on by M
366 * @param u11t: First colour trial field
367 * @param u12t: Second colour trial field
368 * @param iu: Upper halo indices
369 * @param id: Lower halo indices
370 * @param gamval: Gamma matrices
371 * @param gamin: Indices for dirac terms
372 * @param dk4m:
373 * @param dk4p:
374 * @param akappa: Hopping parameter
375 *
376 * @see ZHalo_swap_all (MPI only)
377 *
378 * @return Zero on success, integer error code otherwise
379 */
380 const char *funcname = "Hdslashd";
381 //Get the halos in order. Because C is row major, we need to extract the correct
382 //terms for each halo first. Changing the indices was considered but that caused
383 //issues with the BLAS routines.
384#if(nproc>1)
385 ZHalo_swap_all(r, 8);
386#endif
387
388 //Looks like flipping the array ordering for C has meant a lot
389 //of for loops. Sense we're jumping around quite a bit the cache is probably getting refreshed
390 //anyways so memory access patterns mightn't be as big of an limiting factor here anyway
391
392 //Mass term
393#ifdef __NVCC__
394 cuHdslashd(phi,r,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,akappa,dimGrid,dimBlock);
395#else
396 memcpy(phi, r, kferm2*sizeof(Complex));
397 //Spacelike term
398#pragma omp parallel for
399 for(int i=0;i<kvol;i++){
400#ifndef NO_SPACE
401 for(int mu = 0; mu <ndim-1; mu++){
402 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
403#pragma omp simd aligned(phi,r,u11t,u12t,gamval:AVX)
404 for(int idirac=0; idirac<ndirac; idirac++){
405 //FORTRAN had mod((idirac-1),4)+1 to prevent issues with non-zero indexing.
406 int igork1 = gamin[mu*ndirac+idirac];
407 //Can manually vectorise with a pragma?
408 //Wilson + Dirac term in that order. Definitely easier
409 //to read when split into different loops, but should be faster this way
410
411 //Reminder! gamval was rescaled by kappa when we defined it
412 phi[(i*ndirac+idirac)*nc]+=
413 -akappa*(u11t[i*ndim+mu]*r[(uid*ndirac+idirac)*nc]
414 +u12t[i*ndim+mu]*r[(uid*ndirac+idirac)*nc+1]
415 +conj(u11t[did*ndim+mu])*r[(did*ndirac+idirac)*nc]
416 -u12t[did*ndim+mu] *r[(did*ndirac+idirac)*nc+1])
417 -gamval[mu*ndirac+idirac]*
418 ( u11t[i*ndim+mu]*r[(uid*ndirac+igork1)*nc]
419 +u12t[i*ndim+mu]*r[(uid*ndirac+igork1)*nc+1]
420 -conj(u11t[did*ndim+mu])*r[(did*ndirac+igork1)*nc]
421 +u12t[did*ndim+mu] *r[(did*ndirac+igork1)*nc+1]);
422
423 phi[(i*ndirac+idirac)*nc+1]+=
424 -akappa*(-conj(u12t[i*ndim+mu])*r[(uid*ndirac+idirac)*nc]
425 +conj(u11t[i*ndim+mu])*r[(uid*ndirac+idirac)*nc+1]
426 +conj(u12t[did*ndim+mu])*r[(did*ndirac+idirac)*nc]
427 +u11t[did*ndim+mu] *r[(did*ndirac+idirac)*nc+1])
428 -gamval[mu*ndirac+idirac]*
429 (-conj(u12t[i*ndim+mu])*r[(uid*ndirac+igork1)*nc]
430 +conj(u11t[i*ndim+mu])*r[(uid*ndirac+igork1)*nc+1]
431 -conj(u12t[did*ndim+mu])*r[(did*ndirac+igork1)*nc]
432 -u11t[did*ndim+mu] *r[(did*ndirac+igork1)*nc+1]);
433 }
434 }
435#endif
436 //Timelike terms
437 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
438#ifndef NO_TIME
439#pragma omp simd aligned(phi,r,u11t,u12t,dk4m,dk4p:AVX)
440 for(int idirac=0; idirac<ndirac; idirac++){
441 int igork1 = gamin[3*ndirac+idirac];
442 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
443 //dk4m and dk4p swap under dagger
444 phi[(i*ndirac+idirac)*nc]+=
445 -dk4m[i]*(u11t[i*ndim+3]*(r[(uid*ndirac+idirac)*nc]+r[(uid*ndirac+igork1)*nc])
446 +u12t[i*ndim+3]*(r[(uid*ndirac+idirac)*nc+1]+r[(uid*ndirac+igork1)*nc+1]))
447 -dk4p[did]*(conj(u11t[did*ndim+3])*(r[(did*ndirac+idirac)*nc]-r[(did*ndirac+igork1)*nc])
448 -u12t[did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]-r[(did*ndirac+igork1)*nc+1]));
449
450 phi[(i*ndirac+idirac)*nc+1]+=
451 -dk4m[i]*(-conj(u12t[i*ndim+3])*(r[(uid*ndirac+idirac)*nc]+r[(uid*ndirac+igork1)*nc])
452 +conj(u11t[i*ndim+3])*(r[(uid*ndirac+idirac)*nc+1]+r[(uid*ndirac+igork1)*nc+1]))
453 -dk4p[did]*(conj(u12t[did*ndim+3])*(r[(did*ndirac+idirac)*nc]-r[(did*ndirac+igork1)*nc])
454 +u11t[did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]-r[(did*ndirac+igork1)*nc+1]));
455 }
456#endif
457 }
458#endif
459 return 0;
460}

References Complex, kferm2, kvol, nc, ndim, ndirac, and ZHalo_swap_all().

Here is the call graph for this function:

◆ Hdslashd_f()

int Hdslashd_f ( Complex_f * phi,
Complex_f * r,
Complex_f * ut[2],
unsigned int * iu,
unsigned int * id,
Complex_f * gamval,
int * gamin,
float * dk[2],
float akappa )

Evaluates \(\Phi=M^\dagger r\) in single precision.

Parameters
phiThe product
rThe array being acted on by M
u11tFirst colour trial field
u12tSecond colour trial field
iuUpper halo indices
idLower halo indices
gamvalGamma matrices rescaled by kappa
gaminIndices for dirac terms
dk4m\(\left(1+\gamma_0\right)e^{-\mu}\)
dk4p\(\left(1-\gamma_0\right)e^\mu\)
akappaHopping parameter
Returns
Zero on success, integer error code otherwise

Definition at line 855 of file matrices.c.

856 {
857 /*
858 * @brief Evaluates @f(\Phi=M^\dagger r@f) in single precision
859 *
860 * @param phi: The product
861 * @param r: The array being acted on by M
862 * @param ut[0]_f: First colour trial field
863 * @param ut[1]_f: Second colour trial field
864 * @param iu: Upper halo indices
865 * @param id: Lower halo indices
866 * @param gamval_f: Gamma matrices
867 * @param gamin: Indices for dirac terms
868 * @param dk4m_f:
869 * @param dk4p_f:
870 * @param akappa: Hopping parameter
871 *
872 * @see CHalo_swap_all (MPI only)
873 *
874 * @return Zero on success, integer error code otherwise
875 */
876 const char *funcname = "Hdslashd_f";
877 //Get the halos in order. Because C is row major, we need to extract the correct
878 //terms for each halo first. Changing the indices was considered but that caused
879 //issues with the BLAS routines.
880#if(nproc>1)
881 CHalo_swap_all(r, 8);
882#endif
883
884 //Looks like flipping the array ordering for C has meant a lot
885 //of for loops. Sense we're jumping around quite a bit the cache is probably getting refreshed
886 //anyways so memory access patterns mightn't be as big of an limiting factor here anyway
887
888 //Mass term
889#ifdef __NVCC__
890 cuHdslashd_f(phi,r,ut,iu,id,gamval,gamin,dk,akappa,dimGrid,dimBlock);
891#else
892 memcpy(phi, r, kferm2*sizeof(Complex_f));
893
894 //Spacelike term
895 //Enough room on L1 data cache for Zen 2 to hold 160 elements at a time
896 //Vectorise with 128 maybe?
897#pragma omp parallel for
898 for(unsigned int i=0;i<kvol;i+=AVX){
899 //Right. Time to prefetch
900 alignas(AVX) Complex_f u11s[AVX]; alignas(AVX) Complex_f u12s[AVX];
901 alignas(AVX) Complex_f u11sd[AVX]; alignas(AVX) Complex_f u12sd[AVX];
902 alignas(AVX) Complex_f ru[2][AVX]; alignas(AVX) Complex_f rd[2][AVX];
903 alignas(AVX) Complex_f rgu[2][AVX]; alignas(AVX) Complex_f rgd[2][AVX];
904 alignas(AVX) Complex_f phi_s[ndirac*nc][AVX];
905#pragma unroll
906 for(unsigned short idirac=0; idirac<ndirac; idirac++)
907#pragma unroll
908 for(unsigned short c=0; c<nc; c++)
909#pragma omp simd aligned(phi_s,phi:AVX)
910 for(unsigned short j=0;j<AVX;j++)
911 phi_s[idirac*nc+c][j]=phi[((i+j)*ndirac+idirac)*nc+c];
912 alignas(AVX) unsigned int did[AVX], uid[AVX];
913#ifndef NO_SPACE
914#pragma unroll
915 for(unsigned short mu = 0; mu <ndim-1; mu++){
916 //FORTRAN had mod((idirac-1),4)+1 to prevent issues with non-zero indexing.
917#pragma omp simd aligned(u11s,u12s,did,uid,id,iu,u11sd,u12sd:AVX)
918 for(unsigned short j =0;j<AVX;j++){
919 did[j]=id[(i+j)*ndim+mu]; uid[j] = iu[(i+j)*ndim+mu];
920 u11s[j]=ut[0][(i+j)*ndim+mu]; u12s[j]=ut[1][(i+j)*ndim+mu];
921 u11sd[j]=ut[0][did[j]*ndim+mu]; u12sd[j]=ut[1][did[j]*ndim+mu];
922 }
923#pragma unroll
924 for(unsigned short idirac=0; idirac<ndirac; idirac++){
925 unsigned short igork1 = gamin[mu*ndirac+idirac];
926#pragma unroll
927 for(unsigned short c=0; c<nc; c++)
928#pragma omp simd aligned(ru,rd,rgu,rgd,r,uid,did:AVX)
929 for(unsigned short j =0;j<AVX;j++){
930 ru[c][j]=r[(uid[j]*ndirac+idirac)*nc+c];
931 rd[c][j]=r[(did[j]*ndirac+idirac)*nc+c];
932 rgu[c][j]=r[(uid[j]*ndirac+igork1)*nc+c];
933 rgd[c][j]=r[(did[j]*ndirac+igork1)*nc+c];
934 }
935 //Can manually vectorise with a pragma?
936 //Wilson + Dirac term in that order. Definitely easier
937 //to read when split into different loops, but should be faster this way
938#pragma omp simd aligned(phi_s,u11s,u12s,u11sd,u12sd,ru,rd,rgu,rgd:AVX)
939 for(unsigned short j =0;j<AVX;j++){
940 phi_s[idirac*nc][j]-=akappa*(u11s[j]*ru[0][j]
941 +u12s[j]*ru[1][j]
942 +conj(u11sd[j])*rd[0][j]
943 -u12sd[j] *rd[1][j]);
944 //Dirac term
945 phi_s[idirac*nc][j]-=gamval[mu*ndirac+idirac]*
946 (u11s[j]*rgu[0][j]
947 +u12s[j]*rgu[1][j]
948 -conj(u11sd[j])*rgd[0][j]
949 +u12sd[j] *rgd[1][j]);
950
951 phi_s[idirac*nc+1][j]-=akappa*(-conj(u12s[j])*ru[0][j]
952 +conj(u11s[j])*ru[1][j]
953 +conj(u12sd[j])*rd[0][j]
954 +u11sd[j] *rd[1][j]);
955 //Dirac term
956 phi_s[idirac*nc+1][j]-=gamval[mu*ndirac+idirac]*(-conj(u12s[j])*rgu[0][j]
957 +conj(u11s[j])*rgu[1][j]
958 -conj(u12sd[j])*rgd[0][j]
959 -u11sd[j] *rgd[1][j]);
960 }
961 }
962 }
963#endif
964#ifndef NO_TIME
965 //Timelike terms
966 alignas(AVX) float dk4ms[AVX],dk4ps[AVX];
967#pragma omp simd aligned(u11s,u12s,did,uid,id,iu,u11sd,u12sd,dk4ms,dk4ps:AVX)
968 for(unsigned short j=0;j<AVX;j++){
969 u11s[j]=ut[0][(i+j)*ndim+3]; u12s[j]=ut[1][(i+j)*ndim+3];
970 did[j]=id[(i+j)*ndim+3]; uid[j]= iu[(i+j)*ndim+3];
971 u11sd[j]=ut[0][did[j]*ndim+3]; u12sd[j]=ut[1][did[j]*ndim+3];
972 dk4ms[j]=dk[0][i+j]; dk4ps[j]=dk[1][did[j]];
973 }
974#pragma unroll
975 for(unsigned short idirac=0; idirac<ndirac; idirac++){
976 unsigned short igork1 = gamin[3*ndirac+idirac];
977#pragma unroll
978 for(unsigned short c=0; c<nc; c++)
979#pragma omp simd aligned(ru,rd,rgu,rgd,r,uid,did:AVX)
980 for(unsigned short j =0;j<AVX;j++){
981 ru[c][j]=r[(uid[j]*ndirac+idirac)*nc+c];
982 rd[c][j]=r[(did[j]*ndirac+idirac)*nc+c];
983 rgu[c][j]=r[(uid[j]*ndirac+igork1)*nc+c];
984 rgd[c][j]=r[(did[j]*ndirac+igork1)*nc+c];
985 }
986 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
987 //dk4m and dk4p swap under dagger
988#pragma omp simd aligned(phi_s,u11s,u12s,u11sd,u12sd,ru,rd,rgu,rgd,dk4ms,dk4ps,phi:AVX)
989 for(unsigned short j =0;j<AVX;j++){
990 phi_s[idirac*nc][j]+=
991 -dk4ms[j]*(u11s[j]*(ru[0][j]+rgu[0][j])
992 +u12s[j]*(ru[1][j]+rgu[1][j]));
993 phi_s[idirac*nc][j]+=
994 -dk4ps[j]*(conj(u11sd[j])*(rd[0][j]-rgd[0][j])
995 -u12sd[j] *(rd[1][j]-rgd[1][j]));
996 phi[((i+j)*ndirac+idirac)*nc]=phi_s[idirac*nc][j];
997
998 phi_s[idirac*nc+1][j]-=
999 dk4ms[j]*(-conj(u12s[j])*(ru[0][j]+rgu[0][j])
1000 +conj(u11s[j])*(ru[1][j]+rgu[1][j]));
1001 phi_s[idirac*nc+1][j]-=
1002 +dk4ps[j]*(conj(u12sd[j])*(rd[0][j]-rgd[0][j])
1003 +u11sd[j] *(rd[1][j]-rgd[1][j]));
1004 phi[((i+j)*ndirac+idirac)*nc+1]=phi_s[idirac*nc+1][j];
1005 }
1006 }
1007#endif
1008 }
1009#endif
1010 return 0;
1011}

References AVX, CHalo_swap_all(), Complex_f, kferm2, kvol, nc, ndim, and ndirac.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Transpose_c()

void Transpose_c ( Complex_f * out,
const int fast_in,
const int fast_out )
inline

Definition at line 1050 of file matrices.c.

1050 {
1051 const volatile char *funcname="Transpose_c";
1052
1053#ifdef __NVCC__
1054 cuTranspose_c(out,fast_in,fast_out,dimGrid,dimBlock);
1055#else
1056 Complex_f *in = (Complex_f *)aligned_alloc(AVX,fast_in*fast_out*sizeof(Complex_f));
1057 memcpy(in,out,fast_in*fast_out*sizeof(Complex_f));
1058 //Typically this is used to write back to the AoS/Coalseced format
1059 if(fast_out>fast_in){
1060 for(int x=0;x<fast_out;x++)
1061 for(int y=0; y<fast_in;y++)
1062 out[y*fast_out+x]=in[x*fast_in+y];
1063 }
1064 //Typically this is used to write back to the SoA/saved config format
1065 else{
1066 for(int x=0; x<fast_out;x++)
1067 for(int y=0;y<fast_in;y++)
1068 out[y*fast_out+x]=in[x*fast_in+y];
1069 }
1070 free(in);
1071#endif
1072}

◆ Transpose_d()

void Transpose_d ( double * out,
const int fast_in,
const int fast_out )
inline

Definition at line 1119 of file matrices.c.

1119 {
1120 const char *funcname="Transpose_f";
1121
1122#ifdef __NVCC__
1123 cuTranspose_d(out,fast_in,fast_out,dimGrid,dimBlock);
1124#else
1125 double *in = (double *)aligned_alloc(AVX,fast_in*fast_out*sizeof(double));
1126 memcpy(in,out,fast_in*fast_out*sizeof(double));
1127 //Typically this is used to write back to the AoS/Coalseced format
1128 if(fast_out>fast_in){
1129 for(int x=0;x<fast_out;x++)
1130 for(int y=0; y<fast_in;y++)
1131 out[y*fast_out+x]=in[x*fast_in+y];
1132 }
1133 //Typically this is used to write back to the SoA/saved config format
1134 else{
1135 for(int x=0; x<fast_out;x++)
1136 for(int y=0;y<fast_in;y++)
1137 out[y*fast_out+x]=in[x*fast_in+y];
1138 }
1139 free(in);
1140#endif
1141}

◆ Transpose_f()

void Transpose_f ( float * out,
const int fast_in,
const int fast_out )
inline

Definition at line 1096 of file matrices.c.

1096 {
1097 const char *funcname="Transpose_f";
1098
1099#ifdef __NVCC__
1100 cuTranspose_f(out,fast_in,fast_out,dimGrid,dimBlock);
1101#else
1102 float *in = (float *)aligned_alloc(AVX,fast_in*fast_out*sizeof(float));
1103 memcpy(in,out,fast_in*fast_out*sizeof(float));
1104 //Typically this is used to write back to the AoS/Coalseced format
1105 if(fast_out>fast_in){
1106 for(int x=0;x<fast_out;x++)
1107 for(int y=0; y<fast_in;y++)
1108 out[y*fast_out+x]=in[x*fast_in+y];
1109 }
1110 //Typically this is used to write back to the SoA/saved config format
1111 else{
1112 for(int x=0; x<fast_out;x++)
1113 for(int y=0;y<fast_in;y++)
1114 out[y*fast_out+x]=in[x*fast_in+y];
1115 }
1116 free(in);
1117#endif
1118}

◆ Transpose_I()

void Transpose_I ( int * out,
const int fast_in,
const int fast_out )
inline

Definition at line 1142 of file matrices.c.

1142 {
1143 const char *funcname="Transpose_I";
1144
1145#ifdef __NVCC__
1146 cuTranspose_I(out,fast_in,fast_out,dimGrid,dimBlock);
1147#else
1148 int *in = (int *)aligned_alloc(AVX,fast_in*fast_out*sizeof(int));
1149 memcpy(in,out,fast_in*fast_out*sizeof(int));
1150 //Typically this is used to write back to the AoS/Coalseced format
1151 if(fast_out>fast_in){
1152 for(int x=0;x<fast_out;x++)
1153 for(int y=0; y<fast_in;y++)
1154 out[y*fast_out+x]=in[x*fast_in+y];
1155 }
1156 //Typically this is used to write back to the SoA/saved config format
1157 else{
1158 for(int x=0; x<fast_out;x++)
1159 for(int y=0;y<fast_in;y++)
1160 out[y*fast_out+x]=in[x*fast_in+y];
1161 }
1162 free(in);
1163#endif
1164}

◆ Transpose_U()

void Transpose_U ( unsigned int * out,
const int fast_in,
const int fast_out )
inline

Definition at line 1165 of file matrices.c.

1165 {
1166 const char *funcname="Transpose_I";
1167
1168#ifdef __NVCC__
1169 cuTranspose_U(out,fast_in,fast_out,dimGrid,dimBlock);
1170#else
1171 unsigned int *in = (unsigned int *)aligned_alloc(AVX,fast_in*fast_out*sizeof(unsigned int));
1172 memcpy(in,out,fast_in*fast_out*sizeof(unsigned int));
1173 //Typically this is used to write back to the AoS/Coalseced format
1174 if(fast_out>fast_in){
1175 for(unsigned int x=0;x<fast_out;x++)
1176 for(unsigned int y=0; y<fast_in;y++)
1177 out[y*fast_out+x]=in[x*fast_in+y];
1178 }
1179 //Typically this is used to write back to the SoA/saved config format
1180 else{
1181 for(unsigned int x=0; x<fast_out;x++)
1182 for(unsigned int y=0;y<fast_in;y++)
1183 out[y*fast_out+x]=in[x*fast_in+y];
1184 }
1185 free(in);
1186#endif
1187}

◆ Transpose_z()

void Transpose_z ( Complex * out,
const int fast_in,
const int fast_out )
inline

Definition at line 1073 of file matrices.c.

1073 {
1074 const volatile char *funcname="Transpose_c";
1075
1076#ifdef __NVCC__
1077 cuTranspose_z(out,fast_in,fast_out,dimGrid,dimBlock);
1078#else
1079 Complex *in = (Complex *)aligned_alloc(AVX,fast_in*fast_out*sizeof(Complex));
1080 memcpy(in,out,fast_in*fast_out*sizeof(Complex));
1081 //Typically this is used to write back to the AoS/Coalseced format
1082 if(fast_out>fast_in){
1083 for(int x=0;x<fast_out;x++)
1084 for(int y=0; y<fast_in;y++)
1085 out[y*fast_out+x]=in[x*fast_in+y];
1086 }
1087 //Typically this is used to write back to the SoA/saved config format
1088 else{
1089 for(int x=0; x<fast_out;x++)
1090 for(int y=0;y<fast_in;y++)
1091 out[y*fast_out+x]=in[x*fast_in+y];
1092 }
1093 free(in);
1094#endif
1095}