20 Complex *gamval,
int *gamin,
double *dk4m,
double *dk4p,
Complex_f jqq,
float akappa){
41 const char *funcname =
"Dslash";
50 cuDslash(phi,r,u11t,u12t,iu,
id,gamval,gamin,dk4m,dk4p,jqq,akappa,dimGrid,dimBlock);
53#pragma omp parallel for
54 for(
int i=0;i<
kvol;i++){
55#pragma omp simd aligned(phi,r,gamval:AVX)
56 for(
int idirac = 0; idirac<
ndirac; idirac++){
59 a_1=conj(jqq)*gamval[4*
ndirac+idirac];
61 a_2=-jqq*gamval[4*
ndirac+idirac];
70 for(
int mu = 0; mu <3; mu++){
71 int did=
id[mu+
ndim*i];
int uid = iu[mu+
ndim*i];
72#pragma omp simd aligned(phi,r,u11t,u12t,gamval:AVX)
73 for(
int igorkov=0; igorkov<
ngorkov; igorkov++){
76 int igork1 = (igorkov<4) ? gamin[mu*
ndirac+idirac] : gamin[mu*
ndirac+idirac]+4;
105 int did=
id[3+
ndim*i];
int uid = iu[3+
ndim*i];
107#pragma omp simd aligned(phi,r,u11t,u12t,dk4m,dk4p:AVX)
108 for(
int igorkov=0; igorkov<4; igorkov++){
109 int igorkovPP=igorkov+4;
111 int igork1 = gamin[3*
ndirac+igorkov];
int igork1PP = igork1+4;
142 Complex *gamval,
int *gamin,
double *dk4m,
double *dk4p,
Complex_f jqq,
float akappa){
163 const char *funcname =
"Dslashd";
171 cuDslashd(phi,r,u11t,u12t,iu,
id,gamval,gamin,dk4m,dk4p,jqq,akappa,dimGrid,dimBlock);
174#pragma omp parallel for
175 for(
int i=0;i<
kvol;i++){
176#pragma omp simd aligned(phi,r,gamval:AVX)
178 for(
int idirac = 0; idirac<
ndirac; idirac++){
179 int igork = idirac+4;
182 a_1=-conj(jqq)*gamval[4*
ndirac+idirac];
183 a_2=jqq*gamval[4*
ndirac+idirac];
192 for(
int mu = 0; mu <3; mu++){
193 int did=
id[mu+
ndim*i];
int uid = iu[mu+
ndim*i];
194#pragma omp simd aligned(phi,r,u11t,u12t,gamval:AVX)
195 for(
int igorkov=0; igorkov<
ngorkov; igorkov++){
197 int idirac=igorkov%4;
198 int igork1 = (igorkov<4) ? gamin[mu*
ndirac+idirac] : gamin[mu*
ndirac+idirac]+4;
207 -gamval[mu*
ndirac+idirac]*
218 -gamval[mu*
ndirac+idirac]*
230 int did=
id[3+
ndim*i];
int uid = iu[3+
ndim*i];
232#pragma omp simd aligned(phi,r,u11t,u12t,dk4m,dk4p:AVX)
233 for(
int igorkov=0; igorkov<4; igorkov++){
235 int igork1 = gamin[3*
ndirac+igorkov];
249 int igorkovPP=igorkov+4;
250 int igork1PP = igork1+4;
269 Complex *gamval,
int *gamin,
double *dk[2],
float akappa){
289 const char *funcname =
"Hdslash";
298 cuHdslash(phi,r,ut[0],ut[1],iu,
id,gamval,gamin,dk[0],dk[1],akappa,dimGrid,dimBlock);
301#pragma omp parallel for
302 for(
int i=0;i<
kvol;i++){
304 for(
int mu = 0; mu <3; mu++){
305 int did=
id[mu+
ndim*i];
int uid = iu[mu+
ndim*i];
306#pragma omp simd aligned(phi,r,gamval:AVX)
307 for(
int idirac=0; idirac<
ndirac; idirac++){
309 int igork1 = gamin[mu*
ndirac+idirac];
336 int did=
id[3+
ndim*i];
int uid = iu[3+
ndim*i];
338#pragma omp simd aligned(phi,r:AVX)
339 for(
int idirac=0; idirac<
ndirac; idirac++){
340 int igork1 = gamin[3*
ndirac+idirac];
360 Complex *gamval,
int *gamin,
double *dk4m,
double *dk4p,
float akappa){
380 const char *funcname =
"Hdslashd";
394 cuHdslashd(phi,r,u11t,u12t,iu,
id,gamval,gamin,dk4m,dk4p,akappa,dimGrid,dimBlock);
398#pragma omp parallel for
399 for(
int i=0;i<
kvol;i++){
401 for(
int mu = 0; mu <
ndim-1; mu++){
402 int did=
id[mu+
ndim*i];
int uid = iu[mu+
ndim*i];
403#pragma omp simd aligned(phi,r,u11t,u12t,gamval:AVX)
404 for(
int idirac=0; idirac<
ndirac; idirac++){
406 int igork1 = gamin[mu*
ndirac+idirac];
417 -gamval[mu*
ndirac+idirac]*
424 -akappa*(-conj(u12t[i*
ndim+mu])*r[(uid*
ndirac+idirac)*
nc]
428 -gamval[mu*
ndirac+idirac]*
437 int did=
id[3+
ndim*i];
int uid = iu[3+
ndim*i];
439#pragma omp simd aligned(phi,r,u11t,u12t,dk4m,dk4p:AVX)
440 for(
int idirac=0; idirac<
ndirac; idirac++){
441 int igork1 = gamin[3*
ndirac+idirac];
485 const char *funcname =
"Dslash_f";
494 cuDslash_f(phi,r,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk_f[0],dk_f[1],jqq,akappa,dimGrid,dimBlock);
497#pragma omp parallel for
498 for(
unsigned int i=0;i<
kvol;i++){
499#pragma omp simd aligned(phi,r,gamval_f:AVX)
500 for(
unsigned short idirac = 0; idirac<
ndirac; idirac++){
501 unsigned short igork = idirac+4;
503 a_1=conj(jqq)*gamval_f[4*
ndirac+idirac];
505 a_2=-jqq*gamval_f[4*
ndirac+idirac];
514 for(
unsigned short mu = 0; mu <3; mu++){
515 unsigned int did=
id[mu+
ndim*i];
unsigned int uid = iu[mu+
ndim*i];
516#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamval_f,gamin:AVX)
517 for(
unsigned short igorkov=0; igorkov<
ngorkov; igorkov++){
519 unsigned short idirac=igorkov%4;
520 unsigned short igork1 = (igorkov<4) ? gamin[mu*
ndirac+idirac] : gamin[mu*
ndirac+idirac]+4;
549 unsigned int did=
id[3+
ndim*i];
unsigned int uid = iu[3+
ndim*i];
551#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamin:AVX)
552 for(
unsigned short igorkov=0; igorkov<4; igorkov++){
553 unsigned short igorkovPP=igorkov+4;
555 unsigned short igork1 = gamin[3*
ndirac+igorkov];
unsigned short igork1PP = igork1+4;
607 const char *funcname =
"Dslashd_f";
615 cuDslashd_f(phi,r,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk_f[0],dk_f[1],jqq,akappa,dimGrid,dimBlock);
618#pragma omp parallel for
619 for(
unsigned int i=0;i<
kvol;i++){
620#pragma omp simd aligned(phi,r,gamval_f:AVX)
622 for(
unsigned short idirac = 0; idirac<
ndirac; idirac++){
623 unsigned short igork = idirac+4;
626 a_1=-conj(jqq)*gamval_f[4*
ndirac+idirac];
627 a_2=jqq*gamval_f[4*
ndirac+idirac];
636 for(
unsigned short mu = 0; mu <3; mu++){
637 unsigned int did=
id[mu+
ndim*i];
unsigned int uid = iu[mu+
ndim*i];
638#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamval_f:AVX)
639 for(
unsigned short igorkov=0; igorkov<
ngorkov; igorkov++){
641 unsigned short idirac=igorkov%4;
642 unsigned short igork1 = (igorkov<4) ? gamin[mu*
ndirac+idirac] : gamin[mu*
ndirac+idirac]+4;
651 -gamval_f[mu*
ndirac+idirac]*
658 -akappa*(-conj(u12t_f[i*
ndim+mu])*r[(uid*
ngorkov+igorkov)*
nc]
662 -gamval_f[mu*
ndirac+idirac]*
674 unsigned int did=
id[3+
ndim*i];
unsigned int uid = iu[3+
ndim*i];
676#pragma omp simd aligned(phi,r,u11t_f,u12t_f:AVX)
677 for(
unsigned short igorkov=0; igorkov<4; igorkov++){
679 unsigned short igork1 = gamin[3*
ndirac+igorkov];
693 unsigned short igorkovPP=igorkov+4;
694 unsigned short igork1PP = igork1+4;
713 Complex_f *gamval,
int *gamin,
float *dk[2],
float akappa){
733 const char *funcname =
"Hdslash_f";
739 cuHdslash_f(phi,r,ut,iu,
id,gamval,gamin,dk,akappa,dimGrid,dimBlock);
743#pragma omp parallel for
744 for(
unsigned int i=0;i<
kvol;i+=
AVX){
752 for(
unsigned short idirac=0; idirac<
ndirac; idirac++)
753 for(
unsigned short c=0; c<
nc; c++)
754#pragma omp simd aligned(phi_s,phi:
AVX)
755 for(
unsigned short j=0;j<
AVX;j++)
756 phi_s[idirac*
nc+c][j]=phi[((i+j)*
ndirac+idirac)*
nc+c];
757 alignas(
AVX)
unsigned int did[
AVX], uid[
AVX];
759 for(
unsigned short mu = 0; mu <3; mu++){
760#pragma omp simd aligned(u11s,u12s,did,uid,id,iu,u11sd,u12sd:AVX)
761 for(
unsigned short j =0;j<
AVX;j++){
762 did[j]=
id[(i+j)*
ndim+mu]; uid[j] = iu[(i+j)*
ndim+mu];
763 u11s[j]=ut[0][(i+j)*
ndim+mu]; u12s[j]=ut[1][(i+j)*
ndim+mu];
764 u11sd[j]=ut[0][did[j]*
ndim+mu]; u12sd[j]=ut[1][did[j]*
ndim+mu];
767 for(
unsigned short idirac=0; idirac<
ndirac; idirac++){
768 unsigned short igork1 = gamin[mu*
ndirac+idirac];
770 for(
unsigned short c=0; c<
nc; c++)
771#pragma omp simd aligned(ru,rd,rgu,rgd,r,uid,did:
AVX)
772 for(
unsigned short j =0;j<
AVX;j++){
773 ru[c][j]=r[(uid[j]*
ndirac+idirac)*
nc+c];
774 rd[c][j]=r[(did[j]*
ndirac+idirac)*
nc+c];
775 rgu[c][j]=r[(uid[j]*
ndirac+igork1)*
nc+c];
776 rgd[c][j]=r[(did[j]*
ndirac+igork1)*
nc+c];
782#pragma omp simd aligned(phi_s,u11s,u12s,u11sd,u12sd,ru,rd,rgu,rgd:AVX)
783 for(
unsigned short j =0;j<
AVX;j++){
784 phi_s[idirac*
nc][j]+=-akappa*(u11s[j]*ru[0][j]+\
786 conj(u11sd[j])*rd[0][j]-\
789 phi_s[idirac*
nc][j]+=gamval[mu*
ndirac+idirac]*(u11s[j]*rgu[0][j]+\
791 conj(u11sd[j])*rgd[0][j]+\
794 phi_s[idirac*
nc+1][j]+=-akappa*(-conj(u12s[j])*ru[0][j]+\
795 conj(u11s[j])*ru[1][j]+\
796 conj(u12sd[j])*rd[0][j]+\
799 phi_s[idirac*
nc+1][j]+=gamval[mu*
ndirac+idirac]*(-conj(u12s[j])*rgu[0][j]+\
800 conj(u11s[j])*rgu[1][j]-\
801 conj(u12sd[j])*rgd[0][j]-\
808 alignas(
AVX)
float dk4ms[
AVX],dk4ps[
AVX];
810 for(
unsigned short j=0;j<
AVX;j++){
811 u11s[j]=ut[0][(i+j)*
ndim+3]; u12s[j]=ut[1][(i+j)*
ndim+3];
812 did[j]=
id[(i+j)*
ndim+3];uid[j]= iu[(i+j)*
ndim+3];
813 u11sd[j]=ut[0][did[j]*
ndim+3]; u12sd[j]=ut[1][did[j]*
ndim+3];
814 dk4ms[j]=dk[0][did[j]]; dk4ps[j]=dk[1][i+j];
818 for(
unsigned short idirac=0; idirac<
ndirac; idirac++){
819 unsigned short igork1 = gamin[3*
ndirac+idirac];
821 for(
unsigned short c=0; c<
nc; c++)
822#pragma omp simd aligned(ru,rd,rgu,rgd,r,uid,did:
AVX)
823 for(
unsigned short j =0;j<
AVX;j++){
824 ru[c][j]=r[(uid[j]*
ndirac+idirac)*
nc+c];
825 rd[c][j]=r[(did[j]*
ndirac+idirac)*
nc+c];
826 rgu[c][j]=r[(uid[j]*
ndirac+igork1)*
nc+c];
827 rgd[c][j]=r[(did[j]*
ndirac+igork1)*
nc+c];
831#pragma omp simd aligned(phi_s,u11s,u12s,u11sd,u12sd,ru,rd,rgu,rgd,dk4ms,dk4ps,phi:AVX)
832 for(
unsigned short j =0;j<
AVX;j++){
833 phi_s[idirac*
nc+0][j]-=
834 dk4ps[j]*(u11s[j]*(ru[0][j]-rgu[0][j])
835 +u12s[j]*(ru[1][j]-rgu[1][j]));
836 phi_s[idirac*
nc+0][j]-=
837 dk4ms[j]*(conj(u11sd[j])*(rd[0][j]+rgd[0][j])
838 -u12sd[j]*(rd[1][j]+rgd[1][j]));
839 phi[((i+j)*
ndirac+idirac)*
nc]=phi_s[idirac*
nc][j];
841 phi_s[idirac*
nc+1][j]-=
842 dk4ps[j]*(-conj(u12s[j])*(ru[0][j]-rgu[0][j])
843 +conj(u11s[j])*(ru[1][j]-rgu[1][j]));
844 phi_s[idirac*
nc+1][j]-=
845 dk4ms[j]*(conj(u12sd[j])*(rd[0][j]+rgd[0][j])
846 +u11sd[j]*(rd[1][j]+rgd[1][j]));
847 phi[((i+j)*
ndirac+idirac)*
nc+1]=phi_s[idirac*
nc+1][j];
856 Complex_f *gamval,
int *gamin,
float *dk[2],
float akappa){
876 const char *funcname =
"Hdslashd_f";
890 cuHdslashd_f(phi,r,ut,iu,
id,gamval,gamin,dk,akappa,dimGrid,dimBlock);
897#pragma omp parallel for
898 for(
unsigned int i=0;i<
kvol;i+=
AVX){
906 for(
unsigned short idirac=0; idirac<
ndirac; idirac++)
908 for(
unsigned short c=0; c<
nc; c++)
909#pragma omp simd aligned(phi_s,phi:
AVX)
910 for(
unsigned short j=0;j<
AVX;j++)
911 phi_s[idirac*
nc+c][j]=phi[((i+j)*
ndirac+idirac)*
nc+c];
912 alignas(
AVX)
unsigned int did[
AVX], uid[
AVX];
915 for(
unsigned short mu = 0; mu <
ndim-1; mu++){
917#pragma omp simd aligned(u11s,u12s,did,uid,id,iu,u11sd,u12sd:AVX)
918 for(
unsigned short j =0;j<
AVX;j++){
919 did[j]=
id[(i+j)*
ndim+mu]; uid[j] = iu[(i+j)*
ndim+mu];
920 u11s[j]=ut[0][(i+j)*
ndim+mu]; u12s[j]=ut[1][(i+j)*
ndim+mu];
921 u11sd[j]=ut[0][did[j]*
ndim+mu]; u12sd[j]=ut[1][did[j]*
ndim+mu];
924 for(
unsigned short idirac=0; idirac<
ndirac; idirac++){
925 unsigned short igork1 = gamin[mu*
ndirac+idirac];
927 for(
unsigned short c=0; c<
nc; c++)
928#pragma omp simd aligned(ru,rd,rgu,rgd,r,uid,did:
AVX)
929 for(
unsigned short j =0;j<
AVX;j++){
930 ru[c][j]=r[(uid[j]*
ndirac+idirac)*
nc+c];
931 rd[c][j]=r[(did[j]*
ndirac+idirac)*
nc+c];
932 rgu[c][j]=r[(uid[j]*
ndirac+igork1)*
nc+c];
933 rgd[c][j]=r[(did[j]*
ndirac+igork1)*
nc+c];
938#pragma omp simd aligned(phi_s,u11s,u12s,u11sd,u12sd,ru,rd,rgu,rgd:AVX)
939 for(
unsigned short j =0;j<
AVX;j++){
940 phi_s[idirac*
nc][j]-=akappa*(u11s[j]*ru[0][j]
942 +conj(u11sd[j])*rd[0][j]
943 -u12sd[j] *rd[1][j]);
945 phi_s[idirac*
nc][j]-=gamval[mu*
ndirac+idirac]*
948 -conj(u11sd[j])*rgd[0][j]
949 +u12sd[j] *rgd[1][j]);
951 phi_s[idirac*
nc+1][j]-=akappa*(-conj(u12s[j])*ru[0][j]
952 +conj(u11s[j])*ru[1][j]
953 +conj(u12sd[j])*rd[0][j]
954 +u11sd[j] *rd[1][j]);
956 phi_s[idirac*
nc+1][j]-=gamval[mu*
ndirac+idirac]*(-conj(u12s[j])*rgu[0][j]
957 +conj(u11s[j])*rgu[1][j]
958 -conj(u12sd[j])*rgd[0][j]
959 -u11sd[j] *rgd[1][j]);
966 alignas(
AVX)
float dk4ms[
AVX],dk4ps[
AVX];
967#pragma omp simd aligned(u11s,u12s,did,uid,id,iu,u11sd,u12sd,dk4ms,dk4ps:AVX)
968 for(
unsigned short j=0;j<
AVX;j++){
969 u11s[j]=ut[0][(i+j)*
ndim+3]; u12s[j]=ut[1][(i+j)*
ndim+3];
970 did[j]=
id[(i+j)*
ndim+3]; uid[j]= iu[(i+j)*
ndim+3];
971 u11sd[j]=ut[0][did[j]*
ndim+3]; u12sd[j]=ut[1][did[j]*
ndim+3];
972 dk4ms[j]=dk[0][i+j]; dk4ps[j]=dk[1][did[j]];
975 for(
unsigned short idirac=0; idirac<
ndirac; idirac++){
976 unsigned short igork1 = gamin[3*
ndirac+idirac];
978 for(
unsigned short c=0; c<
nc; c++)
979#pragma omp simd aligned(ru,rd,rgu,rgd,r,uid,did:
AVX)
980 for(
unsigned short j =0;j<
AVX;j++){
981 ru[c][j]=r[(uid[j]*
ndirac+idirac)*
nc+c];
982 rd[c][j]=r[(did[j]*
ndirac+idirac)*
nc+c];
983 rgu[c][j]=r[(uid[j]*
ndirac+igork1)*
nc+c];
984 rgd[c][j]=r[(did[j]*
ndirac+igork1)*
nc+c];
988#pragma omp simd aligned(phi_s,u11s,u12s,u11sd,u12sd,ru,rd,rgu,rgd,dk4ms,dk4ps,phi:AVX)
989 for(
unsigned short j =0;j<
AVX;j++){
990 phi_s[idirac*
nc][j]+=
991 -dk4ms[j]*(u11s[j]*(ru[0][j]+rgu[0][j])
992 +u12s[j]*(ru[1][j]+rgu[1][j]));
993 phi_s[idirac*
nc][j]+=
994 -dk4ps[j]*(conj(u11sd[j])*(rd[0][j]-rgd[0][j])
995 -u12sd[j] *(rd[1][j]-rgd[1][j]));
996 phi[((i+j)*
ndirac+idirac)*
nc]=phi_s[idirac*
nc][j];
998 phi_s[idirac*
nc+1][j]-=
999 dk4ms[j]*(-conj(u12s[j])*(ru[0][j]+rgu[0][j])
1000 +conj(u11s[j])*(ru[1][j]+rgu[1][j]));
1001 phi_s[idirac*
nc+1][j]-=
1002 +dk4ps[j]*(conj(u12sd[j])*(rd[0][j]-rgd[0][j])
1003 +u11sd[j] *(rd[1][j]-rgd[1][j]));
1004 phi[((i+j)*
ndirac+idirac)*
nc+1]=phi_s[idirac*
nc+1][j];
1026 const char *funcname =
"Reunitarise";
1028 cuReunitarise(ut[0],ut[1],dimGrid,dimBlock);
1030#pragma omp parallel for simd
1034 double anorm=sqrt(conj(ut[0][i])*ut[0][i]+conj(ut[1][i])*ut[1][i]);
1050inline void Transpose_c(
Complex_f *out,
const int fast_in,
const int fast_out){
1051 const volatile char *funcname=
"Transpose_c";
1054 cuTranspose_c(out,fast_in,fast_out,dimGrid,dimBlock);
1057 memcpy(in,out,fast_in*fast_out*
sizeof(
Complex_f));
1059 if(fast_out>fast_in){
1060 for(
int x=0;x<fast_out;x++)
1061 for(
int y=0; y<fast_in;y++)
1062 out[y*fast_out+x]=in[x*fast_in+y];
1066 for(
int x=0; x<fast_out;x++)
1067 for(
int y=0;y<fast_in;y++)
1068 out[y*fast_out+x]=in[x*fast_in+y];
1073inline void Transpose_z(
Complex *out,
const int fast_in,
const int fast_out){
/* Label naming the enclosing routine (Transpose_z); it previously read "Transpose_c", copied from the Complex_f variant. */
1074 const volatile char *funcname=
"Transpose_z";
1077 cuTranspose_z(out,fast_in,fast_out,dimGrid,dimBlock);
1080 memcpy(in,out,fast_in*fast_out*
sizeof(
Complex));
1082 if(fast_out>fast_in){
1083 for(
int x=0;x<fast_out;x++)
1084 for(
int y=0; y<fast_in;y++)
1085 out[y*fast_out+x]=in[x*fast_in+y];
1089 for(
int x=0; x<fast_out;x++)
1090 for(
int y=0;y<fast_in;y++)
1091 out[y*fast_out+x]=in[x*fast_in+y];
1096inline void Transpose_f(
float *out,
const int fast_in,
const int fast_out){
1097 const char *funcname=
"Transpose_f";
1100 cuTranspose_f(out,fast_in,fast_out,dimGrid,dimBlock);
1102 float *in = (
float *)aligned_alloc(
AVX,fast_in*fast_out*
sizeof(
float));
1103 memcpy(in,out,fast_in*fast_out*
sizeof(
float));
1105 if(fast_out>fast_in){
1106 for(
int x=0;x<fast_out;x++)
1107 for(
int y=0; y<fast_in;y++)
1108 out[y*fast_out+x]=in[x*fast_in+y];
1112 for(
int x=0; x<fast_out;x++)
1113 for(
int y=0;y<fast_in;y++)
1114 out[y*fast_out+x]=in[x*fast_in+y];
1119inline void Transpose_d(
double *out,
const int fast_in,
const int fast_out){
/* Label naming the enclosing routine (Transpose_d); it previously read "Transpose_f", copied from the float variant. */
1120 const char *funcname=
"Transpose_d";
1123 cuTranspose_d(out,fast_in,fast_out,dimGrid,dimBlock);
1125 double *in = (
double *)aligned_alloc(
AVX,fast_in*fast_out*
sizeof(
double));
1126 memcpy(in,out,fast_in*fast_out*
sizeof(
double));
1128 if(fast_out>fast_in){
1129 for(
int x=0;x<fast_out;x++)
1130 for(
int y=0; y<fast_in;y++)
1131 out[y*fast_out+x]=in[x*fast_in+y];
1135 for(
int x=0; x<fast_out;x++)
1136 for(
int y=0;y<fast_in;y++)
1137 out[y*fast_out+x]=in[x*fast_in+y];
1142inline void Transpose_I(
int *out,
const int fast_in,
const int fast_out){
1143 const char *funcname=
"Transpose_I";
1146 cuTranspose_I(out,fast_in,fast_out,dimGrid,dimBlock);
1148 int *in = (
int *)aligned_alloc(
AVX,fast_in*fast_out*
sizeof(
int));
1149 memcpy(in,out,fast_in*fast_out*
sizeof(
int));
1151 if(fast_out>fast_in){
1152 for(
int x=0;x<fast_out;x++)
1153 for(
int y=0; y<fast_in;y++)
1154 out[y*fast_out+x]=in[x*fast_in+y];
1158 for(
int x=0; x<fast_out;x++)
1159 for(
int y=0;y<fast_in;y++)
1160 out[y*fast_out+x]=in[x*fast_in+y];
1165inline void Transpose_U(
unsigned int *out,
const int fast_in,
const int fast_out){
/* Label naming the enclosing routine (Transpose_U); it previously read "Transpose_I", copied from the int variant. */
1166 const char *funcname=
"Transpose_U";
1169 cuTranspose_U(out,fast_in,fast_out,dimGrid,dimBlock);
1171 unsigned int *in = (
unsigned int *)aligned_alloc(
AVX,fast_in*fast_out*
sizeof(
unsigned int));
1172 memcpy(in,out,fast_in*fast_out*
sizeof(
unsigned int));
1174 if(fast_out>fast_in){
1175 for(
unsigned int x=0;x<fast_out;x++)
1176 for(
unsigned int y=0; y<fast_in;y++)
1177 out[y*fast_out+x]=in[x*fast_in+y];
1181 for(
unsigned int x=0; x<fast_out;x++)
1182 for(
unsigned int y=0;y<fast_in;y++)
1183 out[y*fast_out+x]=in[x*fast_in+y];
int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates in single precision.
int Dslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa)
Evaluates in double precision.
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk_f[2], Complex_f jqq, float akappa)
Evaluates in single precision.
int Hdslash(Complex *phi, Complex *r, Complex *ut[2], unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk[2], float akappa)
Evaluates in double precision.
int Dslashd(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa)
Evaluates in double precision.
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates in single precision.
int Reunitarise(Complex *ut[2])
Reunitarises ut[0] and ut[1] (u11t and u12t) so that conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1 at every index i.
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk_f[2], Complex_f jqq, float akappa)
Evaluates in single precision.
int Hdslashd(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
Evaluates in double precision.
Matrix multiplication and related declarations.
int CHalo_swap_all(Complex_f *c, int ncpt)
Calls the functions to send data to both the up and down halos.
int ZHalo_swap_all(Complex *z, int ncpt)
Calls the functions to send data to both the up and down halos.
#define AVX
Alignment of arrays, in bytes: 64 for AVX-512, 32 for AVX/AVX2, 16 for SSE. Since AVX is standard on modern x86...
#define ngorkov
Gor'kov indices.
#define kvol
Sublattice volume.
#define Complex
Double precision complex number.
#define kferm
Sublattice size including Gor'kov indices.
#define ndirac
Dirac indices.
#define Complex_f
Single precision complex number.
#define kferm2
Sublattice size including Dirac indices.