su2hmc
Loading...
Searching...
No Matches
matrices.c
Go to the documentation of this file.
1
12#include <assert.h>
13#include <complex.h>
14#include <matrices.h>
15#include <string.h>
16//TO DO: Check and see are there any terms we are evaluating twice in the same loop
17//and use a variable to hold them instead to reduce the number of evaluations.
18int Dslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu,unsigned int *id,\
19 Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa){
20 /*
21 * @brief Evaluates @f(\Phi=M r@f) in double precision.
22 *
23 * @param phi: The product
24 * @param r: The array being acted on by M
25 * @param u11t: First colour trial field
26 * @param u12t: Second colour trial field
27 * @param iu: Upper halo indices
28 * @param id: Lower halo indices
29 * @param gamval: Gamma matrices
30 * @param gamin: Indices for dirac terms
31 * @param dk4m:
32 * @param dk4p:
33 * @param jqq: Diquark source
34 * @param akappa: Hopping parameter
35 *
36 * @see ZHalo_swap_all (MPI only)
37 *
38 * @return Zero on success, integer error code otherwise
39 */
40 const char *funcname = "Dslash";
41 //Get the halos in order
42#if(nproc>1)
43 ZHalo_swap_all(r, 16);
44#endif
45
46 //Mass term
47 //Diquark Term (antihermitian)
48#ifdef __NVCC__
49 cuDslash(phi,r,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa,dimGrid,dimBlock);
50#else
51 memcpy(phi, r, kferm*sizeof(Complex));
52#pragma omp parallel for
53 for(int i=0;i<kvol;i++){
54#pragma omp simd aligned(phi,r,gamval:AVX)
55 for(int idirac = 0; idirac<ndirac; idirac++){
56 int igork = idirac+4;
57 Complex a_1, a_2;
58 a_1=conj(jqq)*gamval[4*ndirac+idirac];
59 //We subtract a_2, hence the minus
60 a_2=-jqq*gamval[4*ndirac+idirac];
61 phi[(i*ngorkov+idirac)*nc]+=a_1*r[(i*ngorkov+igork)*nc+0];
62 phi[(i*ngorkov+idirac)*nc+1]+=a_1*r[(i*ngorkov+igork)*nc+1];
63 phi[(i*ngorkov+igork)*nc+0]+=a_2*r[(i*ngorkov+idirac)*nc];
64 phi[(i*ngorkov+igork)*nc+1]+=a_2*r[(i*ngorkov+idirac)*nc+1];
65 }
66
67 //Spacelike terms. Here's hoping I haven't put time as the zeroth component somewhere!
68#ifndef NO_SPACE
69 for(int mu = 0; mu <3; mu++){
70 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
71#pragma omp simd aligned(phi,r,u11t,u12t,gamval:AVX)
72 for(int igorkov=0; igorkov<ngorkov; igorkov++){
73 //FORTRAN had mod((igorkov-1),4)+1 to prevent issues with non-zero indexing in the dirac term.
74 int idirac=igorkov%4;
75 int igork1 = (igorkov<4) ? gamin[mu*ndirac+idirac] : gamin[mu*ndirac+idirac]+4;
76 //Can manually vectorise with a pragma?
77 //Wilson + Dirac term in that order. Definitely easier
78 //to read when split into different loops, but should be faster this way
79 phi[(i*ngorkov+igorkov)*nc]+=-akappa*(u11t[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc]+\
80 u12t[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc+1]+\
81 conj(u11t[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]-\
82 u12t[did*ndim+mu]*r[(did*ngorkov+igorkov)*nc+1])+\
83 //Dirac term. Reminder! gamval was rescaled by kappa when we defined it
84 gamval[mu*ndirac+idirac]*(u11t[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc]+\
85 u12t[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc+1]-\
86 conj(u11t[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]+\
87 u12t[did*ndim+mu]*r[(did*ngorkov+igork1)*nc+1]);
88
89 phi[(i*ngorkov+igorkov)*nc+1]+=-akappa*(-conj(u12t[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc]+\
90 conj(u11t[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc+1]+\
91 conj(u12t[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]+\
92 u11t[did*ndim+mu]*r[(did*ngorkov+igorkov)*nc+1])+\
93 //Dirac term
94 gamval[mu*ndirac+idirac]*(-conj(u12t[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc]+\
95 conj(u11t[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc+1]-\
96 conj(u12t[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]-\
97 u11t[did*ndim+mu]*r[(did*ngorkov+igork1)*nc+1]);
98 }
99 }
100 //Timelike terms next. These run from igorkov=0..3 and 4..7 with slightly different rules for each
101 //We can fit it into a single loop by declaring igorkovPP=igorkov+4 instead of looping igorkov=4..7 separately
102 //Note that for the igorkov 4..7 loop idirac=igorkov-4, so we don't need to declare idiracPP separately
103#endif
104 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
105#ifndef NO_TIME
106#pragma omp simd aligned(phi,r,u11t,u12t,dk4m,dk4p:AVX)
107 for(int igorkov=0; igorkov<4; igorkov++){
108 int igorkovPP=igorkov+4; //idirac = igorkov; It is a bit redundant but I'll mention it as that's how
109 //the FORTRAN code did it.
110 int igork1 = gamin[3*ndirac+igorkov]; int igork1PP = igork1+4;
111
112 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
113 phi[(i*ngorkov+igorkov)*nc]+=
114 -dk4p[i]*(u11t[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc]-r[(uid*ngorkov+igork1)*nc])
115 +u12t[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc+1]-r[(uid*ngorkov+igork1)*nc+1]))
116 -dk4m[did]*(conj(u11t[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]+r[(did*ngorkov+igork1)*nc])
117 -u12t[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]+r[(did*ngorkov+igork1)*nc+1]));
118 phi[(i*ngorkov+igorkov)*nc+1]+=
119 -dk4p[i]*(-conj(u12t[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc]-r[(uid*ngorkov+igork1)*nc])
120 +conj(u11t[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc+1]-r[(uid*ngorkov+igork1)*nc+1]))
121 -dk4m[did]*(conj(u12t[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]+r[(did*ngorkov+igork1)*nc])
122 +u11t[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]+r[(did*ngorkov+igork1)*nc+1]));
123
124 //And the +4 terms. Note that dk4p and dk4m swap positions compared to the above
125 phi[(i*ngorkov+igorkovPP)*nc]+=-dk4m[i]*(u11t[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc]-r[(uid*ngorkov+igork1PP)*nc])+\
126 u12t[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc+1]-r[(uid*ngorkov+igork1PP)*nc+1]))-\
127 dk4p[did]*(conj(u11t[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]+r[(did*ngorkov+igork1PP)*nc])-\
128 u12t[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]+r[(did*ngorkov+igork1PP)*nc+1]));
129
130 phi[(i*ngorkov+igorkovPP)*nc+1]+=-dk4m[i]*(conj(-u12t[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc]-r[(uid*ngorkov+igork1PP)*nc])+\
131 conj(u11t[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc+1]-r[(uid*ngorkov+igork1PP)*nc+1]))-\
132 dk4p[did]*(conj(u12t[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]+r[(did*ngorkov+igork1PP)*nc])+\
133 u11t[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]+r[(did*ngorkov+igork1PP)*nc+1]));
134 }
135#endif
136 }
137#endif
138 return 0;
139}
140int Dslashd(Complex *phi, Complex *r, Complex *u11t, Complex *u12t,unsigned int *iu,unsigned int *id,\
141 Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa){
142 /*
143 * @brief Evaluates @f(\Phi=M^\dagger r@f) in double precision.
144 *
145 * @param phi: The product
146 * @param r: The array being acted on by M
147 * @param u11t: First colour trial field
148 * @param u12t: Second colour trial field
149 * @param iu: Upper halo indices
150 * @param id: Lower halo indices
151 * @param gamval: Gamma matrices
152 * @param gamin: Indices for dirac terms
153 * @param dk4m:
154 * @param dk4p:
155 * @param jqq: Diquark source
156 * @param akappa: Hopping parameter
157 *
158 * @see ZHalo_swap_all (MPI only)
159 *
160 * @return Zero on success, integer error code otherwise
161 */
162 const char *funcname = "Dslashd";
163 //Get the halos in order
164#if(nproc>1)
165 ZHalo_swap_all(r, 16);
166#endif
167
168 //Mass term
169#ifdef __NVCC__
170 cuDslashd(phi,r,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa,dimGrid,dimBlock);
171#else
172 memcpy(phi, r, kferm*sizeof(Complex));
173#pragma omp parallel for
174 for(int i=0;i<kvol;i++){
175#pragma omp simd aligned(phi,r,gamval:AVX)
176 //Diquark Term (antihermitian) The signs of a_1 and a_2 below flip under dagger
177 for(int idirac = 0; idirac<ndirac; idirac++){
178 int igork = idirac+4;
179 Complex a_1, a_2;
180 //We subtract a_1, hence the minus
181 a_1=-conj(jqq)*gamval[4*ndirac+idirac];
182 a_2=jqq*gamval[4*ndirac+idirac];
183 phi[(i*ngorkov+idirac)*nc]+=a_1*r[(i*ngorkov+igork)*nc];
184 phi[(i*ngorkov+idirac)*nc+1]+=a_1*r[(i*ngorkov+igork)*nc+1];
185 phi[(i*ngorkov+igork)*nc]+=a_2*r[(i*ngorkov+idirac)*nc];
186 phi[(i*ngorkov+igork)*nc+1]+=a_2*r[(i*ngorkov+idirac)*nc+1];
187 }
188
189 //Spacelike terms. Here's hoping I haven't put time as the zeroth component somewhere!
190#ifndef NO_SPACE
191 for(int mu = 0; mu <3; mu++){
192 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
193#pragma omp simd aligned(phi,r,u11t,u12t,gamval:AVX)
194 for(int igorkov=0; igorkov<ngorkov; igorkov++){
195 //FORTRAN had mod((igorkov-1),4)+1 to prevent issues with non-zero indexing.
196 int idirac=igorkov%4;
197 int igork1 = (igorkov<4) ? gamin[mu*ndirac+idirac] : gamin[mu*ndirac+idirac]+4;
198 //Wilson + Dirac term in that order. Definitely easier
199 //to read when split into different loops, but should be faster this way
200 //Reminder! gamval was rescaled by kappa when we defined it
201 phi[(i*ngorkov+igorkov)*nc]+=
202 -akappa*( u11t[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc]
203 +u12t[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc+1]
204 +conj(u11t[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]
205 -u12t[did*ndim+mu] *r[(did*ngorkov+igorkov)*nc+1])
206 -gamval[mu*ndirac+idirac]*
207 ( u11t[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc]
208 +u12t[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc+1]
209 -conj(u11t[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]
210 +u12t[did*ndim+mu] *r[(did*ngorkov+igork1)*nc+1]);
211
212 phi[(i*ngorkov+igorkov)*nc+1]+=
213 -akappa*(-conj(u12t[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc]
214 +conj(u11t[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc+1]
215 +conj(u12t[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]
216 +u11t[did*ndim+mu] *r[(did*ngorkov+igorkov)*nc+1])
217 -gamval[mu*ndirac+idirac]*
218 (-conj(u12t[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc]
219 +conj(u11t[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc+1]
220 -conj(u12t[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]
221 -u11t[did*ndim+mu] *r[(did*ngorkov+igork1)*nc+1]);
222 }
223 }
224#endif
225 //Timelike terms next. These run from igorkov=0..3 and 4..7 with slightly different rules for each
226 //We can fit it into a single loop by declaring igorkovPP=igorkov+4 instead of looping igorkov=4..7 separately
227 //Note that for the igorkov 4..7 loop idirac=igorkov-4, so we don't need to declare idiracPP separately
228 //Under dagger, dk4p and dk4m get swapped and the dirac component flips sign.
229 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
230#ifndef NO_TIME
231#pragma omp simd aligned(phi,r,u11t,u12t,dk4m,dk4p:AVX)
232 for(int igorkov=0; igorkov<4; igorkov++){
233 //the FORTRAN code did it.
234 int igork1 = gamin[3*ndirac+igorkov];
235 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
236 phi[(i*ngorkov+igorkov)*nc]+=
237 -dk4m[i]*(u11t[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc]+r[(uid*ngorkov+igork1)*nc])
238 +u12t[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc+1]+r[(uid*ngorkov+igork1)*nc+1]))
239 -dk4p[did]*(conj(u11t[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]-r[(did*ngorkov+igork1)*nc])
240 -u12t[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]-r[(did*ngorkov+igork1)*nc+1]));
241 phi[(i*ngorkov+igorkov)*nc+1]+=
242 -dk4m[i]*(-conj(u12t[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc]+r[(uid*ngorkov+igork1)*nc])
243 +conj(u11t[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc+1]+r[(uid*ngorkov+igork1)*nc+1]))
244 -dk4p[did]*(conj(u12t[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]-r[(did*ngorkov+igork1)*nc])
245 +u11t[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]-r[(did*ngorkov+igork1)*nc+1]));
246
247
248 int igorkovPP=igorkov+4; //idirac = igorkov; It is a bit redundant but I'll mention it as that's how
249 int igork1PP = igork1+4;
250 //And the +4 terms. Note that dk4p and dk4m swap positions compared to the above
251 phi[(i*ngorkov+igorkovPP)*nc]+=-dk4p[i]*(u11t[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc]+r[(uid*ngorkov+igork1PP)*nc])+\
252 u12t[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc+1]+r[(uid*ngorkov+igork1PP)*nc+1]))-\
253 dk4m[did]*(conj(u11t[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]-r[(did*ngorkov+igork1PP)*nc])-\
254 u12t[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]-r[(did*ngorkov+igork1PP)*nc+1]));
255
256 phi[(i*ngorkov+igorkovPP)*nc+1]+=dk4p[i]*(conj(u12t[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc]+r[(uid*ngorkov+igork1PP)*nc])-\
257 conj(u11t[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc+1]+r[(uid*ngorkov+igork1PP)*nc+1]))-\
258 dk4m[did]*(conj(u12t[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]-r[(did*ngorkov+igork1PP)*nc])+
259 u11t[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]-r[(did*ngorkov+igork1PP)*nc+1]));
260
261 }
262#endif
263 }
264#endif
265 return 0;
266}
267int Hdslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t,unsigned int *iu,unsigned int *id,\
268 Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa){
269 /*
270 * @brief Evaluates @f(\Phi=M r@f) in double precision
271 *
272 * @param phi: The product
273 * @param r: The array being acted on by M
274 * @param u11t: First colour trial field
275 * @param u12t: Second colour trial field
276 * @param iu: Upper halo indices
277 * @param id: Lower halo indices
278 * @param gamval: Gamma matrices
279 * @param gamin: Indices for dirac terms
280 * @param dk4m:
281 * @param dk4p:
282 * @param akappa: Hopping parameter
283 *
284 * @see ZHalo_swap_all (MPI only)
285 *
286 * @return Zero on success, integer error code otherwise
287 */
288 const char *funcname = "Hdslash";
289 //Get the halos in order
290#if(nproc>1)
291 ZHalo_swap_all(r, 8);
292#endif
293
294 //Mass term
295 //Spacelike term
296#ifdef __NVCC__
297 cuHdslash(phi,r,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,akappa,dimGrid,dimBlock);
298#else
299 memcpy(phi, r, kferm2*sizeof(Complex));
300#pragma omp parallel for
301 for(int i=0;i<kvol;i++){
302#ifndef NO_SPACE
303 for(int mu = 0; mu <3; mu++){
304 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
305#pragma omp simd aligned(phi,r,u11t,u12t,gamval:AVX)
306 for(int idirac=0; idirac<ndirac; idirac++){
307 //FORTRAN had mod((idirac-1),4)+1 to prevent issues with non-zero indexing.
308 int igork1 = gamin[mu*ndirac+idirac];
309 //Can manually vectorise with a pragma?
310 //Wilson + Dirac term in that order. Definitely easier
311 //to read when split into different loops, but should be faster this way
312 phi[(i*ndirac+idirac)*nc]+=-akappa*(u11t[i*ndim+mu]*r[(uid*ndirac+idirac)*nc]+\
313 u12t[i*ndim+mu]*r[(uid*ndirac+idirac)*nc+1]+\
314 conj(u11t[did*ndim+mu])*r[(did*ndirac+idirac)*nc]-\
315 u12t[did*ndim+mu]*r[(did*ndirac+idirac)*nc+1])+\
316 //Dirac term
317 gamval[mu*ndirac+idirac]*(u11t[i*ndim+mu]*r[(uid*ndirac+igork1)*nc]+\
318 u12t[i*ndim+mu]*r[(uid*ndirac+igork1)*nc+1]-\
319 conj(u11t[did*ndim+mu])*r[(did*ndirac+igork1)*nc]+\
320 u12t[did*ndim+mu]*r[(did*ndirac+igork1)*nc+1]);
321
322 phi[(i*ndirac+idirac)*nc+1]+=-akappa*(-conj(u12t[i*ndim+mu])*r[(uid*ndirac+idirac)*nc]+\
323 conj(u11t[i*ndim+mu])*r[(uid*ndirac+idirac)*nc+1]+\
324 conj(u12t[did*ndim+mu])*r[(did*ndirac+idirac)*nc]+\
325 u11t[did*ndim+mu]*r[(did*ndirac+idirac)*nc+1])+\
326 //Dirac term
327 gamval[mu*ndirac+idirac]*(-conj(u12t[i*ndim+mu])*r[(uid*ndirac+igork1)*nc]+\
328 conj(u11t[i*ndim+mu])*r[(uid*ndirac+igork1)*nc+1]-\
329 conj(u12t[did*ndim+mu])*r[(did*ndirac+igork1)*nc]-\
330 u11t[did*ndim+mu]*r[(did*ndirac+igork1)*nc+1]);
331 }
332 }
333#endif
334 //Timelike terms
335 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
336#ifndef NO_TIME
337#pragma omp simd aligned(phi,r,u11t,u12t,dk4m,dk4p:AVX)
338 for(int idirac=0; idirac<ndirac; idirac++){
339 int igork1 = gamin[3*ndirac+idirac];
340 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
341 //Reminder! gamval was rescaled by kappa when we defined it
342 phi[(i*ndirac+idirac)*nc]+=
343 -dk4p[i]*(u11t[i*ndim+3]*(r[(uid*ndirac+idirac)*nc]-r[(uid*ndirac+igork1)*nc])
344 +u12t[i*ndim+3]*(r[(uid*ndirac+idirac)*nc+1]-r[(uid*ndirac+igork1)*nc+1]))
345 -dk4m[did]*(conj(u11t[did*ndim+3])*(r[(did*ndirac+idirac)*nc]+r[(did*ndirac+igork1)*nc])
346 -u12t[did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]+r[(did*ndirac+igork1)*nc+1]));
347 phi[(i*ndirac+idirac)*nc+1]+=
348 -dk4p[i]*(-conj(u12t[i*ndim+3])*(r[(uid*ndirac+idirac)*nc]-r[(uid*ndirac+igork1)*nc])
349 +conj(u11t[i*ndim+3])*(r[(uid*ndirac+idirac)*nc+1]-r[(uid*ndirac+igork1)*nc+1]))
350 -dk4m[did]*(conj(u12t[did*ndim+3])*(r[(did*ndirac+idirac)*nc]+r[(did*ndirac+igork1)*nc])
351 +u11t[did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]+r[(did*ndirac+igork1)*nc+1]));
352 }
353#endif
354 }
355#endif
356 return 0;
357}
358int Hdslashd(Complex *phi, Complex *r, Complex *u11t, Complex *u12t,unsigned int *iu,unsigned int *id,\
359 Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa){
360 /*
361 * @brief Evaluates @f(\Phi=M^\dagger r@f) in double precision
362 *
363 * @param phi: The product
364 * @param r: The array being acted on by M
365 * @param u11t: First colour trial field
366 * @param u12t: Second colour trial field
367 * @param iu: Upper halo indices
368 * @param id: Lower halo indices
369 * @param gamval: Gamma matrices
370 * @param gamin: Indices for dirac terms
371 * @param dk4m:
372 * @param dk4p:
373 * @param akappa: Hopping parameter
374 *
375 * @see ZHalo_swap_all (MPI only)
376 *
377 * @return Zero on success, integer error code otherwise
378 */
379 const char *funcname = "Hdslashd";
380 //Get the halos in order. Because C is row major, we need to extract the correct
381 //terms for each halo first. Changing the indices was considered but that caused
382 //issues with the BLAS routines.
383#if(nproc>1)
384 ZHalo_swap_all(r, 8);
385#endif
386
387 //Looks like flipping the array ordering for C has meant a lot
388 //of for loops. Sense we're jumping around quite a bit the cache is probably getting refreshed
389 //anyways so memory access patterns mightn't be as big of an limiting factor here anyway
390
391 //Mass term
392#ifdef __NVCC__
393 cuHdslashd(phi,r,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,akappa,dimGrid,dimBlock);
394#else
395 memcpy(phi, r, kferm2*sizeof(Complex));
396 //Spacelike term
397#pragma omp parallel for
398 for(int i=0;i<kvol;i++){
399#ifndef NO_SPACE
400 for(int mu = 0; mu <ndim-1; mu++){
401 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
402#pragma omp simd aligned(phi,r,u11t,u12t,gamval:AVX)
403 for(int idirac=0; idirac<ndirac; idirac++){
404 //FORTRAN had mod((idirac-1),4)+1 to prevent issues with non-zero indexing.
405 int igork1 = gamin[mu*ndirac+idirac];
406 //Can manually vectorise with a pragma?
407 //Wilson + Dirac term in that order. Definitely easier
408 //to read when split into different loops, but should be faster this way
409
410 //Reminder! gamval was rescaled by kappa when we defined it
411 phi[(i*ndirac+idirac)*nc]+=
412 -akappa*(u11t[i*ndim+mu]*r[(uid*ndirac+idirac)*nc]
413 +u12t[i*ndim+mu]*r[(uid*ndirac+idirac)*nc+1]
414 +conj(u11t[did*ndim+mu])*r[(did*ndirac+idirac)*nc]
415 -u12t[did*ndim+mu] *r[(did*ndirac+idirac)*nc+1])
416 -gamval[mu*ndirac+idirac]*
417 ( u11t[i*ndim+mu]*r[(uid*ndirac+igork1)*nc]
418 +u12t[i*ndim+mu]*r[(uid*ndirac+igork1)*nc+1]
419 -conj(u11t[did*ndim+mu])*r[(did*ndirac+igork1)*nc]
420 +u12t[did*ndim+mu] *r[(did*ndirac+igork1)*nc+1]);
421
422 phi[(i*ndirac+idirac)*nc+1]+=
423 -akappa*(-conj(u12t[i*ndim+mu])*r[(uid*ndirac+idirac)*nc]
424 +conj(u11t[i*ndim+mu])*r[(uid*ndirac+idirac)*nc+1]
425 +conj(u12t[did*ndim+mu])*r[(did*ndirac+idirac)*nc]
426 +u11t[did*ndim+mu] *r[(did*ndirac+idirac)*nc+1])
427 -gamval[mu*ndirac+idirac]*
428 (-conj(u12t[i*ndim+mu])*r[(uid*ndirac+igork1)*nc]
429 +conj(u11t[i*ndim+mu])*r[(uid*ndirac+igork1)*nc+1]
430 -conj(u12t[did*ndim+mu])*r[(did*ndirac+igork1)*nc]
431 -u11t[did*ndim+mu] *r[(did*ndirac+igork1)*nc+1]);
432 }
433 }
434#endif
435 //Timelike terms
436 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
437#ifndef NO_TIME
438#pragma omp simd aligned(phi,r,u11t,u12t,dk4m,dk4p:AVX)
439 for(int idirac=0; idirac<ndirac; idirac++){
440 int igork1 = gamin[3*ndirac+idirac];
441 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
442 //dk4m and dk4p swap under dagger
443 phi[(i*ndirac+idirac)*nc]+=
444 -dk4m[i]*(u11t[i*ndim+3]*(r[(uid*ndirac+idirac)*nc]+r[(uid*ndirac+igork1)*nc])
445 +u12t[i*ndim+3]*(r[(uid*ndirac+idirac)*nc+1]+r[(uid*ndirac+igork1)*nc+1]))
446 -dk4p[did]*(conj(u11t[did*ndim+3])*(r[(did*ndirac+idirac)*nc]-r[(did*ndirac+igork1)*nc])
447 -u12t[did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]-r[(did*ndirac+igork1)*nc+1]));
448
449 phi[(i*ndirac+idirac)*nc+1]+=
450 -dk4m[i]*(-conj(u12t[i*ndim+3])*(r[(uid*ndirac+idirac)*nc]+r[(uid*ndirac+igork1)*nc])
451 +conj(u11t[i*ndim+3])*(r[(uid*ndirac+idirac)*nc+1]+r[(uid*ndirac+igork1)*nc+1]))
452 -dk4p[did]*(conj(u12t[did*ndim+3])*(r[(did*ndirac+idirac)*nc]-r[(did*ndirac+igork1)*nc])
453 +u11t[did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]-r[(did*ndirac+igork1)*nc+1]));
454 }
455#endif
456 }
457#endif
458 return 0;
459}
460//Float Versions
461//int Dslash_f(Complex_f *phi, Complex_f *r){
462int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t_f, Complex_f *u12t_f,unsigned int *iu, unsigned int *id,\
463 Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa){
464 /*
465 * @brief Evaluates @f(\Phi=M r@f) in single precision.
466 *
467 * @param phi: The product
468 * @param r: The array being acted on by M
469 * @param u11t_f: First colour trial field
470 * @param u12t_f: Second colour trial field
471 * @param iu: Upper halo indices
472 * @param id: Lower halo indices
473 * @param gamval_f: Gamma matrices
474 * @param gamin: Indices for dirac terms
475 * @param dk4m_f:
476 * @param dk4p_f:
477 * @param jqq: Diquark source
478 * @param akappa: Hopping parameter
479 *
480 * @see CHalo_swap_all (MPI only)
481 *
482 * @return Zero on success, integer error code otherwise
483 */
484 const char *funcname = "Dslash_f";
485 //Get the halos in order
486#if(nproc>1)
487 CHalo_swap_all(r, 16);
488#endif
489
490 //Mass term
491 //Diquark Term (antihermitian)
492#ifdef __NVCC__
493 cuDslash_f(phi,r,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,dimGrid,dimBlock);
494#else
495 memcpy(phi, r, kferm*sizeof(Complex_f));
496#pragma omp parallel for
497 for(int i=0;i<kvol;i++){
498#pragma omp simd aligned(phi,r,gamval_f:AVX)
499 for(int idirac = 0; idirac<ndirac; idirac++){
500 int igork = idirac+4;
501 Complex_f a_1, a_2;
502 a_1=conj(jqq)*gamval_f[4*ndirac+idirac];
503 //We subtract a_2, hence the minus
504 a_2=-jqq*gamval_f[4*ndirac+idirac];
505 phi[(i*ngorkov+idirac)*nc]+=a_1*r[(i*ngorkov+igork)*nc+0];
506 phi[(i*ngorkov+idirac)*nc+1]+=a_1*r[(i*ngorkov+igork)*nc+1];
507 phi[(i*ngorkov+igork)*nc+0]+=a_2*r[(i*ngorkov+idirac)*nc];
508 phi[(i*ngorkov+igork)*nc+1]+=a_2*r[(i*ngorkov+idirac)*nc+1];
509 }
510
511 //Spacelike terms. Here's hoping I haven't put time as the zeroth component somewhere!
512#ifndef NO_SPACE
513 for(int mu = 0; mu <3; mu++){
514 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
515#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamval_f,gamin:AVX)
516 for(int igorkov=0; igorkov<ngorkov; igorkov++){
517 //FORTRAN had mod((igorkov-1),4)+1 to prevent issues with non-zero indexing in the dirac term.
518 int idirac=igorkov%4;
519 int igork1 = (igorkov<4) ? gamin[mu*ndirac+idirac] : gamin[mu*ndirac+idirac]+4;
520 //Can manually vectorise with a pragma?
521 //Wilson + Dirac term in that order. Definitely easier
522 //to read when split into different loops, but should be faster this way
523 phi[(i*ngorkov+igorkov)*nc]+=-akappa*(u11t_f[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc]+\
524 u12t_f[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc+1]+\
525 conj(u11t_f[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]-\
526 u12t_f[did*ndim+mu]*r[(did*ngorkov+igorkov)*nc+1])+\
527 //Dirac term. Reminder! gamval was rescaled by kappa when we defined it
528 gamval_f[mu*ndirac+idirac]*(u11t_f[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc]+\
529 u12t_f[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc+1]-\
530 conj(u11t_f[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]+\
531 u12t_f[did*ndim+mu]*r[(did*ngorkov+igork1)*nc+1]);
532
533 phi[(i*ngorkov+igorkov)*nc+1]+=-akappa*(-conj(u12t_f[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc]+\
534 conj(u11t_f[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc+1]+\
535 conj(u12t_f[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]+\
536 u11t_f[did*ndim+mu]*r[(did*ngorkov+igorkov)*nc+1])+\
537 //Dirac term
538 gamval_f[mu*ndirac+idirac]*(-conj(u12t_f[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc]+\
539 conj(u11t_f[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc+1]-\
540 conj(u12t_f[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]-\
541 u11t_f[did*ndim+mu]*r[(did*ngorkov+igork1)*nc+1]);
542 }
543 }
544 //Timelike terms next. These run from igorkov=0..3 and 4..7 with slightly different rules for each
545 //We can fit it into a single loop by declaring igorkovPP=igorkov+4 instead of looping igorkov=4..7 separately
546 //Note that for the igorkov 4..7 loop idirac=igorkov-4, so we don't need to declare idiracPP separately
547#endif
548 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
549#ifndef NO_TIME
550#pragma omp simd aligned(phi,r,u11t_f,u12t_f,dk4m_f,dk4p_f,gamin:AVX)
551 for(int igorkov=0; igorkov<4; igorkov++){
552 int igorkovPP=igorkov+4; //idirac = igorkov; It is a bit redundant but I'll mention it as that's how
553 //the FORTRAN code did it.
554 int igork1 = gamin[3*ndirac+igorkov]; int igork1PP = igork1+4;
555
556 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
557 phi[(i*ngorkov+igorkov)*nc]+=
558 -dk4p_f[i]*(u11t_f[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc]-r[(uid*ngorkov+igork1)*nc])
559 +u12t_f[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc+1]-r[(uid*ngorkov+igork1)*nc+1]))
560 -dk4m_f[did]*(conj(u11t_f[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]+r[(did*ngorkov+igork1)*nc])
561 -u12t_f[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]+r[(did*ngorkov+igork1)*nc+1]));
562 phi[(i*ngorkov+igorkov)*nc+1]+=
563 -dk4p_f[i]*(-conj(u12t_f[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc]-r[(uid*ngorkov+igork1)*nc])
564 +conj(u11t_f[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc+1]-r[(uid*ngorkov+igork1)*nc+1]))
565 -dk4m_f[did]*(conj(u12t_f[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]+r[(did*ngorkov+igork1)*nc])
566 +u11t_f[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]+r[(did*ngorkov+igork1)*nc+1]));
567
568 //And the +4 terms. Note that dk4p_f and dk4m_f swap positions compared to the above
569 phi[(i*ngorkov+igorkovPP)*nc]+=-dk4m_f[i]*(u11t_f[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc]-r[(uid*ngorkov+igork1PP)*nc])
570 +u12t_f[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc+1]-r[(uid*ngorkov+igork1PP)*nc+1]))
571 -dk4p_f[did]*(conj(u11t_f[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]+r[(did*ngorkov+igork1PP)*nc])
572 -u12t_f[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]+r[(did*ngorkov+igork1PP)*nc+1]));
573
574 phi[(i*ngorkov+igorkovPP)*nc+1]+=-dk4m_f[i]*(conj(-u12t_f[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc]-r[(uid*ngorkov+igork1PP)*nc])
575 +conj(u11t_f[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc+1]-r[(uid*ngorkov+igork1PP)*nc+1]))
576 -dk4p_f[did]*(conj(u12t_f[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]+r[(did*ngorkov+igork1PP)*nc])
577 +u11t_f[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]+r[(did*ngorkov+igork1PP)*nc+1]));
578 }
579#endif
580 }
581#endif
582 return 0;
583}
584int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t_f, Complex_f *u12t_f,unsigned int *iu,unsigned int *id,\
585 Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa){
586 /*
587 * @brief Evaluates @f(\Phi=M^\dagger r@f) in single precision.
588 *
589 * @param phi: The product
590 * @param r: The array being acted on by M
591 * @param u11t_f: First colour trial field
592 * @param u12t_f: Second colour trial field
593 * @param iu: Upper halo indices
594 * @param id: Lower halo indices
595 * @param gamval_f: Gamma matrices
596 * @param gamin: Indices for dirac terms
597 * @param dk4m_f:
598 * @param dk4p_f:
599 * @param jqq: Diquark source
600 * @param akappa: Hopping parameter
601 *
602 * @see CHalo_swap_all (MPI only)
603 *
604 * @return Zero on success, integer error code otherwise
605 */
606 const char *funcname = "Dslashd_f";
607 //Get the halos in order
608#if(nproc>1)
609 CHalo_swap_all(r, 16);
610#endif
611
612 //Mass term
613#ifdef __NVCC__
614 cuDslashd_f(phi,r,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,dimGrid,dimBlock);
615#else
616 memcpy(phi, r, kferm*sizeof(Complex_f));
617#pragma omp parallel for
618 for(int i=0;i<kvol;i++){
619#pragma omp simd aligned(phi,r,gamval_f:AVX)
620 //Diquark Term (antihermitian) The signs of a_1 and a_2 below flip under dagger
621 for(int idirac = 0; idirac<ndirac; idirac++){
622 int igork = idirac+4;
623 Complex_f a_1, a_2;
624 //We subtract a_1, hence the minus
625 a_1=-conj(jqq)*gamval_f[4*ndirac+idirac];
626 a_2=jqq*gamval_f[4*ndirac+idirac];
627 phi[(i*ngorkov+idirac)*nc]+=a_1*r[(i*ngorkov+igork)*nc];
628 phi[(i*ngorkov+idirac)*nc+1]+=a_1*r[(i*ngorkov+igork)*nc+1];
629 phi[(i*ngorkov+igork)*nc]+=a_2*r[(i*ngorkov+idirac)*nc];
630 phi[(i*ngorkov+igork)*nc+1]+=a_2*r[(i*ngorkov+idirac)*nc+1];
631 }
632
633 //Spacelike terms. Here's hoping I haven't put time as the zeroth component somewhere!
634#ifndef NO_SPACE
635 for(int mu = 0; mu <3; mu++){
636 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
637#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamval_f:AVX)
638 for(int igorkov=0; igorkov<ngorkov; igorkov++){
639 //FORTRAN had mod((igorkov-1),4)+1 to prevent issues with non-zero indexing.
640 int idirac=igorkov%4;
641 int igork1 = (igorkov<4) ? gamin[mu*ndirac+idirac] : gamin[mu*ndirac+idirac]+4;
642 //Wilson + Dirac term in that order. Definitely easier
643 //to read when split into different loops, but should be faster this way
644 //Reminder! gamval was rescaled by kappa when we defined it
645 phi[(i*ngorkov+igorkov)*nc]+=
646 -akappa*( u11t_f[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc]
647 +u12t_f[i*ndim+mu]*r[(uid*ngorkov+igorkov)*nc+1]
648 +conj(u11t_f[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]
649 -u12t_f[did*ndim+mu] *r[(did*ngorkov+igorkov)*nc+1])
650 -gamval_f[mu*ndirac+idirac]*
651 ( u11t_f[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc]
652 +u12t_f[i*ndim+mu]*r[(uid*ngorkov+igork1)*nc+1]
653 -conj(u11t_f[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]
654 +u12t_f[did*ndim+mu] *r[(did*ngorkov+igork1)*nc+1]);
655
656 phi[(i*ngorkov+igorkov)*nc+1]+=
657 -akappa*(-conj(u12t_f[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc]
658 +conj(u11t_f[i*ndim+mu])*r[(uid*ngorkov+igorkov)*nc+1]
659 +conj(u12t_f[did*ndim+mu])*r[(did*ngorkov+igorkov)*nc]
660 +u11t_f[did*ndim+mu] *r[(did*ngorkov+igorkov)*nc+1])
661 -gamval_f[mu*ndirac+idirac]*
662 (-conj(u12t_f[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc]
663 +conj(u11t_f[i*ndim+mu])*r[(uid*ngorkov+igork1)*nc+1]
664 -conj(u12t_f[did*ndim+mu])*r[(did*ngorkov+igork1)*nc]
665 -u11t_f[did*ndim+mu] *r[(did*ngorkov+igork1)*nc+1]);
666 }
667 }
668#endif
669 //Timelike terms next. These run from igorkov=0..3 and 4..7 with slightly different rules for each
670 //We can fit it into a single loop by declaring igorkovPP=igorkov+4 instead of looping igorkov=4..7 separately
671 //Note that for the igorkov 4..7 loop idirac=igorkov-4, so we don't need to declare idiracPP separately
672 //Under dagger, dk4p_f and dk4m_f get swapped and the dirac component flips sign.
673 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
674#ifndef NO_TIME
675#pragma omp simd aligned(phi,r,u11t_f,u12t_f,dk4m_f,dk4p_f:AVX)
676 for(int igorkov=0; igorkov<4; igorkov++){
677 //the FORTRAN code did it.
678 int igork1 = gamin[3*ndirac+igorkov];
679 //Factorising for performance, we get dk4?*u1?*(+/-r_wilson -/+ r_dirac)
680 phi[(i*ngorkov+igorkov)*nc]+=
681 -dk4m_f[i]*(u11t_f[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc]+r[(uid*ngorkov+igork1)*nc])
682 +u12t_f[i*ndim+3]*(r[(uid*ngorkov+igorkov)*nc+1]+r[(uid*ngorkov+igork1)*nc+1]))
683 -dk4p_f[did]*(conj(u11t_f[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]-r[(did*ngorkov+igork1)*nc])
684 -u12t_f[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]-r[(did*ngorkov+igork1)*nc+1]));
685 phi[(i*ngorkov+igorkov)*nc+1]+=
686 -dk4m_f[i]*(-conj(u12t_f[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc]+r[(uid*ngorkov+igork1)*nc])
687 +conj(u11t_f[i*ndim+3])*(r[(uid*ngorkov+igorkov)*nc+1]+r[(uid*ngorkov+igork1)*nc+1]))
688 -dk4p_f[did]*(conj(u12t_f[did*ndim+3])*(r[(did*ngorkov+igorkov)*nc]-r[(did*ngorkov+igork1)*nc])
689 +u11t_f[did*ndim+3] *(r[(did*ngorkov+igorkov)*nc+1]-r[(did*ngorkov+igork1)*nc+1]));
690
691
692 int igorkovPP=igorkov+4; //idirac = igorkov; It is a bit redundant but I'll mention it as that's how
693 int igork1PP = igork1+4;
694 //And the +4 terms. Note that dk4p_f and dk4m_f swap positions compared to the above
695 phi[(i*ngorkov+igorkovPP)*nc]+=-dk4p_f[i]*(u11t_f[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc]+r[(uid*ngorkov+igork1PP)*nc])
696 +u12t_f[i*ndim+3]*(r[(uid*ngorkov+igorkovPP)*nc+1]+r[(uid*ngorkov+igork1PP)*nc+1]))
697 -dk4m_f[did]*(conj(u11t_f[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]-r[(did*ngorkov+igork1PP)*nc])
698 -u12t_f[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]-r[(did*ngorkov+igork1PP)*nc+1]));
699
700 phi[(i*ngorkov+igorkovPP)*nc+1]+=dk4p_f[i]*(conj(u12t_f[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc]+r[(uid*ngorkov+igork1PP)*nc])
701 -conj(u11t_f[i*ndim+3])*(r[(uid*ngorkov+igorkovPP)*nc+1]+r[(uid*ngorkov+igork1PP)*nc+1]))
702 -dk4m_f[did]*(conj(u12t_f[did*ndim+3])*(r[(did*ngorkov+igorkovPP)*nc]-r[(did*ngorkov+igork1PP)*nc])
703 +u11t_f[did*ndim+3]*(r[(did*ngorkov+igorkovPP)*nc+1]-r[(did*ngorkov+igork1PP)*nc+1]));
704
705 }
706#endif
707 }
708#endif
709 return 0;
710}
711int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t_f, Complex_f *u12t_f,unsigned int *iu,unsigned int *id,\
712 Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, float akappa){
713 /*
714 * @brief Evaluates @f(\Phi=M r@f) in single precision.
715 *
716 * @param phi: The product
717 * @param r: The array being acted on by M
718 * @param u11t_f: First colour trial field
719 * @param u12t_f: Second colour trial field
720 * @param iu: Upper halo indices
721 * @param id: Lower halo indices
722 * @param gamval_f: Gamma matrices
723 * @param gamin: Indices for dirac terms
724 * @param dk4m_f:
725 * @param dk4p_f:
726 * @param akappa: Hopping parameter
727 *
728 * @see CHalo_swap_all (MPI only)
729 *
730 * @return Zero on success, integer error code otherwise
731 */
732 const char *funcname = "Hdslash_f";
733 //Get the halos in order
734#if(nproc>1)
735 CHalo_swap_all(r, 8);
736#endif
737#ifdef __NVCC__
738 cuHdslash_f(phi,r,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,akappa,dimGrid,dimBlock);
739#else
740 //Mass term
741 memcpy(phi, r, kferm2*sizeof(Complex_f));
742#pragma omp parallel for
743 for(int i=0;i<kvol;i++){
744 //Spacelike term
745#ifndef NO_SPACE
746 for(int mu = 0; mu <3; mu++){
747 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
748#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamval_f:AVX)
749 for(int idirac=0; idirac<ndirac; idirac++){
750 //FORTRAN had mod((idirac-1),4)+1 to prevent issues with non-zero indexing.
751 int igork1 = gamin[mu*ndirac+idirac];
752 //Can manually vectorise with a pragma?
753 //Wilson + Dirac term in that order. Definitely easier
754 //to read when split into different loops, but should be faster this way
755 phi[(i*ndirac+idirac)*nc]+=-akappa*(u11t_f[i*ndim+mu]*r[(uid*ndirac+idirac)*nc]+\
756 u12t_f[i*ndim+mu]*r[(uid*ndirac+idirac)*nc+1]+\
757 conjf(u11t_f[did*ndim+mu])*r[(did*ndirac+idirac)*nc]-\
758 u12t_f[did*ndim+mu]*r[(did*ndirac+idirac)*nc+1]);
759 //Dirac term
760 //Reminder! gamval was rescaled by kappa when we defined it
761 phi[(i*ndirac+idirac)*nc]+=gamval_f[mu*ndirac+idirac]*(u11t_f[i*ndim+mu]*r[(uid*ndirac+igork1)*nc]+\
762 u12t_f[i*ndim+mu]*r[(uid*ndirac+igork1)*nc+1]-\
763 conjf(u11t_f[did*ndim+mu])*r[(did*ndirac+igork1)*nc]+\
764 u12t_f[did*ndim+mu]*r[(did*ndirac+igork1)*nc+1]);
765
766 phi[(i*ndirac+idirac)*nc+1]+=-akappa*(-conjf(u12t_f[i*ndim+mu])*r[(uid*ndirac+idirac)*nc]+\
767 conjf(u11t_f[i*ndim+mu])*r[(uid*ndirac+idirac)*nc+1]+\
768 conjf(u12t_f[did*ndim+mu])*r[(did*ndirac+idirac)*nc]+\
769 u11t_f[did*ndim+mu]*r[(did*ndirac+idirac)*nc+1]);
770 //Dirac term
771 phi[(i*ndirac+idirac)*nc+1]+=gamval_f[mu*ndirac+idirac]*(-conjf(u12t_f[i*ndim+mu])*r[(uid*ndirac+igork1)*nc]+\
772 conjf(u11t_f[i*ndim+mu])*r[(uid*ndirac+igork1)*nc+1]-\
773 conjf(u12t_f[did*ndim+mu])*r[(did*ndirac+igork1)*nc]-\
774 u11t_f[did*ndim+mu]*r[(did*ndirac+igork1)*nc+1]);
775 }
776 }
777#endif
778 //Timelike terms
779 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
780#ifndef NO_TIME
781#pragma omp simd aligned(phi,r,u11t_f,u12t_f,dk4m_f,dk4p_f:AVX)
782 for(int idirac=0; idirac<ndirac; idirac++){
783 int igork1 = gamin[3*ndirac+idirac];
784 //Factorising for performance, we get dk4?*(float)u1?*(+/-r_wilson -/+ r_dirac)
785 phi[(i*ndirac+idirac)*nc]+=
786 -dk4p_f[i]*(u11t_f[i*ndim+3]*(r[(uid*ndirac+idirac)*nc]-r[(uid*ndirac+igork1)*nc])
787 +u12t_f[i*ndim+3]*(r[(uid*ndirac+idirac)*nc+1]-r[(uid*ndirac+igork1)*nc+1]))
788 -dk4m_f[did]*(conjf(u11t_f[did*ndim+3])*(r[(did*ndirac+idirac)*nc]+r[(did*ndirac+igork1)*nc])
789 -u12t_f[did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]+r[(did*ndirac+igork1)*nc+1]));
790
791 phi[(i*ndirac+idirac)*nc+1]+=
792 -dk4p_f[i]*(-conjf(u12t_f[i*ndim+3])*(r[(uid*ndirac+idirac)*nc]-r[(uid*ndirac+igork1)*nc])
793 +conjf(u11t_f[i*ndim+3])*(r[(uid*ndirac+idirac)*nc+1]-r[(uid*ndirac+igork1)*nc+1]))
794 -dk4m_f[did]*(conjf(u12t_f[did*ndim+3])*(r[(did*ndirac+idirac)*nc]+r[(did*ndirac+igork1)*nc])
795 +u11t_f[did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]+r[(did*ndirac+igork1)*nc+1]));
796 }
797#endif
798 }
799#endif
800 return 0;
801}
802int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t_f, Complex_f *u12t_f,unsigned int *iu,unsigned int *id,\
803 Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, float akappa){
804 /*
805 * @brief Evaluates @f(\Phi=M^\dagger r@f) in single precision
806 *
807 * @param phi: The product
808 * @param r: The array being acted on by M
809 * @param u11t_f: First colour trial field
810 * @param u12t_f: Second colour trial field
811 * @param iu: Upper halo indices
812 * @param id: Lower halo indices
813 * @param gamval_f: Gamma matrices
814 * @param gamin: Indices for dirac terms
815 * @param dk4m_f:
816 * @param dk4p_f:
817 * @param akappa: Hopping parameter
818 *
819 * @see CHalo_swap_all (MPI only)
820 *
821 * @return Zero on success, integer error code otherwise
822 */
823 const char *funcname = "Hdslashd_f";
824 //Get the halos in order. Because C is row major, we need to extract the correct
825 //terms for each halo first. Changing the indices was considered but that caused
826 //issues with the BLAS routines.
827#if(nproc>1)
828 CHalo_swap_all(r, 8);
829#endif
830
831 //Looks like flipping the array ordering for C has meant a lot
832 //of for loops. Sense we're jumping around quite a bit the cache is probably getting refreshed
833 //anyways so memory access patterns mightn't be as big of an limiting factor here anyway
834
835 //Mass term
836#ifdef __NVCC__
837 cuHdslashd_f(phi,r,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,akappa,dimGrid,dimBlock);
838#else
839 memcpy(phi, r, kferm2*sizeof(Complex_f));
840 //Spacelike term
841#pragma omp parallel for
842 for(int i=0;i<kvol;i++){
843#ifndef NO_SPACE
844 for(int mu = 0; mu <ndim-1; mu++){
845 int did=id[mu+ndim*i]; int uid = iu[mu+ndim*i];
846#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamval_f:AVX)
847 for(int idirac=0; idirac<ndirac; idirac++){
848 //FORTRAN had mod((idirac-1),4)+1 to prevent issues with non-zero indexing.
849 int igork1 = gamin[mu*ndirac+idirac];
850 //Can manually vectorise with a pragma?
851 //Wilson + Dirac term in that order. Definitely easier
852 //to read when split into different loops, but should be faster this way
853
854 //Reminder! gamval was rescaled by kappa when we defined it
855 phi[(i*ndirac+idirac)*nc]+=
856 -akappa*(u11t_f[i*ndim+mu]*r[(uid*ndirac+idirac)*nc]
857 +u12t_f[i*ndim+mu]*r[(uid*ndirac+idirac)*nc+1]
858 +conjf(u11t_f[did*ndim+mu])*r[(did*ndirac+idirac)*nc]
859 -u12t_f[did*ndim+mu] *r[(did*ndirac+idirac)*nc+1])
860 -gamval_f[mu*ndirac+idirac]*
861 ( u11t_f[i*ndim+mu]*r[(uid*ndirac+igork1)*nc]
862 +u12t_f[i*ndim+mu]*r[(uid*ndirac+igork1)*nc+1]
863 -conjf(u11t_f[did*ndim+mu])*r[(did*ndirac+igork1)*nc]
864 +u12t_f[did*ndim+mu] *r[(did*ndirac+igork1)*nc+1]);
865
866 phi[(i*ndirac+idirac)*nc+1]+=
867 -akappa*(-conjf(u12t_f[i*ndim+mu])*r[(uid*ndirac+idirac)*nc]
868 +conjf(u11t_f[i*ndim+mu])*r[(uid*ndirac+idirac)*nc+1]
869 +conjf(u12t_f[did*ndim+mu])*r[(did*ndirac+idirac)*nc]
870 +u11t_f[did*ndim+mu] *r[(did*ndirac+idirac)*nc+1])
871 -gamval_f[mu*ndirac+idirac]*
872 (-conjf(u12t_f[i*ndim+mu])*r[(uid*ndirac+igork1)*nc]
873 +conjf(u11t_f[i*ndim+mu])*r[(uid*ndirac+igork1)*nc+1]
874 -conjf(u12t_f[did*ndim+mu])*r[(did*ndirac+igork1)*nc]
875 -u11t_f[did*ndim+mu] *r[(did*ndirac+igork1)*nc+1]);
876 }
877 }
878#endif
879 //Timelike terms
880 int did=id[3+ndim*i]; int uid = iu[3+ndim*i];
881#ifndef NO_TIME
882#pragma omp simd aligned(phi,r,u11t_f,u12t_f,gamval_f:AVX)
883 for(int idirac=0; idirac<ndirac; idirac++){
884 int igork1 = gamin[3*ndirac+idirac];
885 //Factorising for performance, we get (float)dk4?*(float)u1?*(+/-r_wilson -/+ r_dirac)
886 //(float)dk4m and dk4p_f swap under dagger
887 phi[(i*ndirac+idirac)*nc]+=
888 -dk4m_f[i]*(u11t_f[i*ndim+3]*(r[(uid*ndirac+idirac)*nc]+r[(uid*ndirac+igork1)*nc])
889 +u12t_f[i*ndim+3]*(r[(uid*ndirac+idirac)*nc+1]+r[(uid*ndirac+igork1)*nc+1]))
890 -dk4p_f[did]*(conjf(u11t_f[did*ndim+3])*(r[(did*ndirac+idirac)*nc]-r[(did*ndirac+igork1)*nc])
891 -u12t_f[did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]-r[(did*ndirac+igork1)*nc+1]));
892
893 phi[(i*ndirac+idirac)*nc+1]+=
894 -dk4m_f[i]*(-conjf(u12t_f[i*ndim+3])*(r[(uid*ndirac+idirac)*nc]+r[(uid*ndirac+igork1)*nc])
895 +conjf(u11t_f[i*ndim+3])*(r[(uid*ndirac+idirac)*nc+1]+r[(uid*ndirac+igork1)*nc+1]))
896 -dk4p_f[did]*(conjf(u12t_f[did*ndim+3])*(r[(did*ndirac+idirac)*nc]-r[(did*ndirac+igork1)*nc])
897 +u11t_f[did*ndim+3] *(r[(did*ndirac+idirac)*nc+1]-r[(did*ndirac+igork1)*nc+1]));
898 }
899#endif
900 }
901#endif
902 return 0;
903}
904inline int Reunitarise(Complex *u11t, Complex *u12t){
905 /*
906 * @brief Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1
907 *
908 * If you're looking at the FORTRAN code be careful. There are two header files
909 * for the /trial/ header. One with u11 u12 (which was included here originally)
910 * and the other with u11t and u12t.
911 *
912 * @see cuReunitarise (CUDA Wrapper)
913 *
914 * @param u11t, u12t Trial fields to be reunitarised
915 *
916 * @return Zero on success, integer error code otherwise
917 */
918 const char *funcname = "Reunitarise";
919#ifdef __NVCC__
920 cuReunitarise(u11t,u12t,dimGrid,dimBlock);
921#else
922#pragma omp parallel for simd aligned(u11t,u12t:AVX)
923 for(int i=0; i<kvol*ndim; i++){
924 //Declaring anorm inside the loop will hopefully let the compiler know it
925 //is safe to vectorise aggressively
926 double anorm=sqrt(conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]);
927 // Exception handling code. May be faster to leave out as the exit prevents vectorisation.
928 // if(anorm==0){
929 // fprintf(stderr, "Error %i in %s on rank %i: anorm = 0 for μ=%i and i=%i.\nExiting...\n\n",
930 // DIVZERO, funcname, rank, mu, i);
931 // MPI_Finalise();
932 // exit(DIVZERO);
933 // }
934 u11t[i]/=anorm;
935 u12t[i]/=anorm;
936 }
937#endif
938 return 0;
939}
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, float akappa)
Evaluates in single precision.
Definition matrices.c:711
int Dslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa)
Evaluates in double precision.
Definition matrices.c:18
int Dslashd(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa)
Evaluates in double precision.
Definition matrices.c:140
int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, float akappa)
Evaluates in single precision.
Definition matrices.c:802
int Reunitarise(Complex *u11t, Complex *u12t)
Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
Definition matrices.c:904
int Hdslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
Evaluates in double precision.
Definition matrices.c:267
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:584
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:462
int Hdslashd(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
Evaluates in double precision.
Definition matrices.c:358
Matrix multiplication and related declarations.
int CHalo_swap_all(Complex_f *c, int ncpt)
Calls the functions to send data to both the up and down halos.
int ZHalo_swap_all(Complex *z, int ncpt)
Calls the functions to send data to both the up and down halos.
#define nc
Colours.
Definition sizes.h:173
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define ndirac
Dirac indices.
Definition sizes.h:177
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188