su2hmc
Loading...
Searching...
No Matches
force.c File Reference

Code for force calculations. More...

#include <matrices.h>
Include dependency graph for force.c:

Go to the source code of this file.

Functions

int Gauge_force (double *dSdpi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, float beta)
 Calculates the gauge force due to the Wilson Action at each intermediate time.
 
int Force (double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi, Complex *ut[2], Complex_f *ut_f[2], unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk[2], float *dk_f[2], Complex_f jqq, float akappa, float beta, double *ancg)
 Calculates the force \(\frac{dS}{d\pi}\) at each intermediate time.
 

Detailed Description

Code for force calculations.

Definition in file force.c.

Function Documentation

◆ Force()

int Force ( double * dSdpi,
int iflag,
double res1,
Complex * X0,
Complex * X1,
Complex * Phi,
Complex * ut[2],
Complex_f * ut_f[2],
unsigned int * iu,
unsigned int * id,
Complex * gamval,
Complex_f * gamval_f,
int * gamin,
double * dk[2],
float * dk_f[2],
Complex_f jqq,
float akappa,
float beta,
double * ancg )

Calculates the force \(\frac{dS}{d\pi}\) at each intermediate time.

Parameters
dSdpiThe force
iflagInvert before evaluating the force. 0 to invert, one not to. Blame FORTRAN...
res1Conjugate gradient residule
X0Up/down partitioned pseudofermion field
X1Holder for the partitioned fermion field, then the conjugate gradient output
PhiPseudofermion field
utDouble precision colour fields
u2t_fSingle precision colour fields
iu,idLattice indices
gaminGamma indices
gamvalDouble precision gamma matrices rescaled by kappa
gamval_fSingle precision gamma matrices rescaled by kappa
dk\(e^{-\mu}\) and \(e^\mu\)
dk_f\(e^{-\mu}\) and \(e^\mu\) float
jqqDiquark source
akappaHopping parameter
betaInverse gauge coupling
ancgCounter for conjugate gradient iterations
Returns
Zero on success, integer error code otherwise

Definition at line 94 of file force.c.

96 {
97 /*
98 * @brief Calculates the force @f$\frac{dS}{d\pi}@f$ at each intermediate time
99 *
100 * @param dSdpi: The force
101 * @param iflag: Invert before evaluating the force?
102 * @param res1: Conjugate gradient residule
103 * @param X0: Up/down partitioned pseudofermion field
104 * @param X1: Holder for the partitioned fermion field, then the conjugate gradient output
105 * @param Phi: Pseudofermion field
106 * @param ut[0],ut[1] Double precision colour fields
107 * @param ut_f[0],ut_f[1]: Single precision colour fields
108 * @param iu,id: Lattice indices
109 * @param gamin: Gamma indices
110 * @param gamval: Double precision gamma matrices
111 * @param gamval_f: Single precision gamma matrices
112 * @param dk[0]: @f$\left(1+\gamma_0\right)e^{-\mu}@f$
113 * @param dk[1]: @f$\left(1-\gamma_0\right)e^\mu@f$
114 * @param dk_f[0]: @f$\left(1+\gamma_0\right)e^{-\mu}@f$ float
115 * @param dk_f[1]: @f$\left(1-\gamma_0\right)e^\mu@f$ float
116 * @param jqq: Diquark source
117 * @param akappa: Hopping parameter
118 * @param beta: Inverse gauge coupling
119 * @param ancg: Counter for conjugate gradient iterations
120 *
121 * @return Zero on success, integer error code otherwise
122 */
123 const char *funcname = "Force";
124#ifdef __NVCC__
125 int device=-1;
126 cudaGetDevice(&device);
127#endif
128#ifndef NO_GAUGE
129 Gauge_force(dSdpi,ut_f,iu,id,beta);
130#endif
131 if(!akappa)
132 return 0;
133 //X1=(M†M)^{1} Phi
134 int itercg=1;
135#ifdef __NVCC__
136 Complex_f *X1_f, *X2_f;
137 cudaMallocAsync((void **)&X2_f,kferm2*sizeof(Complex_f),streams[0]);
138 cudaMallocAsync((void **)&X1_f,kferm2*sizeof(Complex_f),NULL);
139 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
140#else
141 Complex *X2= (Complex *)aligned_alloc(AVX,kferm2Halo*sizeof(Complex));
142#endif
143 for(int na = 0; na<nf; na++){
144#ifdef __NVCC__
145 cudaMemcpyAsync(X1,X0+na*kferm2,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,NULL);
146#else
147 memcpy(X1,X0+na*kferm2,kferm2*sizeof(Complex));
148#endif
149 if(!iflag){
150#ifdef __NVCC__
151 Complex *smallPhi;
152 cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),streams[0]);
153#else
154 Complex *smallPhi = (Complex *)aligned_alloc(AVX,kferm2*sizeof(Complex));
155#endif
156 Fill_Small_Phi(na, smallPhi, Phi);
157 // Congradq(na, res1,smallPhi, &itercg );
158 Congradq(na,res1,X1,smallPhi,ut_f,iu,id,gamval_f,gamin,dk_f,jqq,akappa,&itercg);
159#ifdef __NVCC__
160 cudaFreeAsync(smallPhi,streams[0]);
161#else
162 free(smallPhi);
163#endif
164 *ancg+=itercg;
165#ifdef __NVCC__
166 Complex blasa=2.0; double blasb=-1.0;
167 cublasZdscal(cublas_handle,kferm2,&blasb,(cuDoubleComplex *)(X0+na*kferm2),1);
168 cublasZaxpy(cublas_handle,kferm2,(cuDoubleComplex *)&blasa,(cuDoubleComplex *)X1,1,(cuDoubleComplex *)(X0+na*kferm2),1);
169 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
170 //HDslash launches a different stream so we need a barrieer
171 cudaDeviceSynchronise();
172#elif (defined __INTEL_MKL__)
173 Complex blasa=2.0; Complex blasb=-1.0;
174 //This is not a general BLAS Routine. BLIS and MKl support it
175 //CUDA and GSL does not support it
176 cblas_zaxpby(kferm2, &blasa, X1, 1, &blasb, X0+na*kferm2, 1);
177#elif defined USE_BLAS
178 Complex blasa=2.0; double blasb=-1.0;
179 cblas_zdscal(kferm2,blasb,X0+na*kferm2,1);
180 cblas_zaxpy(kferm2,&blasa,X1,1,X0+na*kferm2,1);
181#else
182#pragma omp parallel for simd collapse(2)
183 for(int i=0;i<kvol;i++)
184 for(int idirac=0;idirac<ndirac;idirac++){
185 X0[((na*kvol+i)*ndirac+idirac)*nc]=
186 2*X1[(i*ndirac+idirac)*nc]-X0[((na*kvol+i)*ndirac+idirac)*nc];
187 X0[((na*kvol+i)*ndirac+idirac)*nc+1]=
188 2*X1[(i*ndirac+idirac)*nc+1]-X0[((na*kvol+i)*ndirac+idirac)*nc+1];
189 }
190#endif
191 }
192 #ifdef __NVCC__
193 Hdslash_f(X2_f,X1_f,ut_f,iu,id,gamval_f,gamin,dk_f,akappa);
194 #else
195 Hdslash(X2,X1,ut,iu,id,gamval,gamin,dk,akappa);
196 #endif
197#ifdef __NVCC__
198 float blasd=2.0;
199 cudaDeviceSynchronise();
200 cublasCsscal(cublas_handle,kferm2, &blasd, (cuComplex *)X2_f, 1);
201#elif defined USE_BLAS
202 double blasd=2.0;
203 cblas_zdscal(kferm2, blasd, X2, 1);
204#else
205#pragma unroll
206 for(int i=0;i<kferm2;i++)
207 X2[i]*=2;
208#endif
209#if(npx>1)
210 ZHalo_swap_dir(X1,8,0,DOWN);
211 ZHalo_swap_dir(X2,8,0,DOWN);
212#endif
213#if(npy>1)
214 ZHalo_swap_dir(X1,8,1,DOWN);
215 ZHalo_swap_dir(X2,8,1,DOWN);
216#endif
217#if(npz>1)
218 ZHalo_swap_dir(X1,8,2,DOWN);
219 ZHalo_swap_dir(X2,8,2,DOWN);
220#endif
221#if(npt>1)
222 ZHalo_swap_dir(X1,8,3,DOWN);
223 ZHalo_swap_dir(X2,8,3,DOWN);
224#endif
225
226 // The original FORTRAN Comment:
227 // dSdpi=dSdpi-Re(X1*(d(Mdagger)dp)*X2) -- Yikes!
228 // we're gonna need drugs for this one......
229 //
230 // Makes references to X1(.,.,iu(i,mu)) AND X2(.,.,iu(i,mu))
231 // as a result, need to swap the DOWN halos in all dirs for
232 // both these arrays, each of which has 8 cpts
233 //
234#ifdef __NVCC__
235 cuForce(dSdpi,ut_f,X1_f,X2_f,gamval_f,dk_f,iu,gamin,akappa,dimGrid,dimBlock);
236 cudaDeviceSynchronise();
237#else
238#pragma omp parallel for
239 for(int i=0;i<kvol;i++)
240 for(int idirac=0;idirac<ndirac;idirac++){
241 int mu, uid, igork1;
242#ifndef NO_SPACE
243#pragma omp simd //aligned(dSdpi,X1,X2,ut[0],ut[1],iu:AVX)
244 for(mu=0; mu<3; mu++){
245 //Long term ambition. I used the diff command on the different
246 //spacial components of dSdpi and saw a lot of the values required
247 //for them are duplicates (u11(i,mu)*X2(1,idirac,i) is used again with
248 //a minus in front for example. Why not evaluate them first /and then plug
249 //them into the equation? Reduce the number of evaluations needed and look
250 //a bit neater (although harder to follow as a consequence).
251
252 //Up indices
253 uid = iu[mu+ndim*i];
254 igork1 = gamin[mu*ndirac+idirac];
255
256 //REMINDER. Gamma is already scaled by kappa when we defined them. So if yer trying to rederive this from
257 //Montvay and Munster and notice a missing kappa in the code, that is why.
258 dSdpi[(i*nadj)*ndim+mu]+=akappa*creal(I*
259 (conj(X1[(i*ndirac+idirac)*nc])*
260 (-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
261 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
262 +conj(X1[(uid*ndirac+idirac)*nc])*
263 ( ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
264 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
265 +conj(X1[(i*ndirac+idirac)*nc+1])*
266 (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
267 +ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
268 +conj(X1[(uid*ndirac+idirac)*nc+1])*
269 (-ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
270 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
271 dSdpi[(i*nadj)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
272 (conj(X1[(i*ndirac+idirac)*nc])*
273 (-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
274 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
275 +conj(X1[(uid*ndirac+idirac)*nc])*
276 (-ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
277 +conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
278 +conj(X1[(i*ndirac+idirac)*nc+1])*
279 (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
280 +ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
281 +conj(X1[(uid*ndirac+idirac)*nc+1])*
282 (ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
283 +conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));
284
285 dSdpi[(i*nadj+1)*ndim+mu]+=akappa*creal(
286 (conj(X1[(i*ndirac+idirac)*nc])*
287 (-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
288 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
289 +conj(X1[(uid*ndirac+idirac)*nc])*
290 (-ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
291 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
292 +conj(X1[(i*ndirac+idirac)*nc+1])*
293 (-ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
294 -ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
295 +conj(X1[(uid*ndirac+idirac)*nc+1])*
296 (ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
297 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
298 dSdpi[(i*nadj+1)*ndim+mu]+=creal(gamval[mu*ndirac+idirac]*
299 (conj(X1[(i*ndirac+idirac)*nc])*
300 (-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
301 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
302 +conj(X1[(uid*ndirac+idirac)*nc])*
303 (ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
304 +conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
305 +conj(X1[(i*ndirac+idirac)*nc+1])*
306 (-ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
307 -ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
308 +conj(X1[(uid*ndirac+idirac)*nc+1])*
309 (-ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
310 +conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));
311
312 dSdpi[(i*nadj+2)*ndim+mu]+=akappa*creal(I*
313 (conj(X1[(i*ndirac+idirac)*nc])*
314 (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
315 +ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
316 +conj(X1[(uid*ndirac+idirac)*nc])*
317 (-conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
318 -ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])
319 +conj(X1[(i*ndirac+idirac)*nc+1])*
320 (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
321 -conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
322 +conj(X1[(uid*ndirac+idirac)*nc+1])*
323 (-conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
324 +ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])));
325 dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
326 (conj(X1[(i*ndirac+idirac)*nc])*
327 (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
328 +ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
329 +conj(X1[(uid*ndirac+idirac)*nc])*
330 (conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
331 +ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])
332 +conj(X1[(i*ndirac+idirac)*nc+1])*
333 (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
334 -conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
335 +conj(X1[(uid*ndirac+idirac)*nc+1])*
336 (conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
337 -ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])));
338
339 }
340#endif
341 //We're not done tripping yet!! Time like term is different. dk4? shows up
342 //For consistency we'll leave mu in instead of hard coding.
343 mu=3;
344 uid = iu[mu+ndim*i];
345 igork1 = gamin[mu*ndirac+idirac];
346#ifndef NO_TIME
347 dSdpi[(i*nadj)*ndim+mu]+=creal(I*
348 (conj(X1[(i*ndirac+idirac)*nc])*
349 (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
350 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
351 +conj(X1[(uid*ndirac+idirac)*nc])*
352 (dk[1][i]* (+ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
353 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
354 +conj(X1[(i*ndirac+idirac)*nc+1])*
355 (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
356 +ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
357 +conj(X1[(uid*ndirac+idirac)*nc+1])*
358 (dk[1][i]* (-ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
359 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))))
360 +creal(I*
361 (conj(X1[(i*ndirac+idirac)*nc])*
362 (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
363 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
364 +conj(X1[(uid*ndirac+idirac)*nc])*
365 (-dk[1][i]* (ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
366 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
367 +conj(X1[(i*ndirac+idirac)*nc+1])*
368 (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
369 +ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
370 +conj(X1[(uid*ndirac+idirac)*nc+1])*
371 (-dk[1][i]* (-ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
372 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));
373
374 dSdpi[(i*nadj+1)*ndim+mu]+=creal(
375 conj(X1[(i*ndirac+idirac)*nc])*
376 (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
377 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
378 +conj(X1[(uid*ndirac+idirac)*nc])*
379 (dk[1][i]* (-ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
380 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
381 +conj(X1[(i*ndirac+idirac)*nc+1])*
382 (dk[0][i]* (-ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
383 -ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
384 +conj(X1[(uid*ndirac+idirac)*nc+1])*
385 (dk[1][i]* ( ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
386 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])))
387 +creal(
388 (conj(X1[(i*ndirac+idirac)*nc])*
389 (dk[0][i]*(-conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
390 +conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
391 +conj(X1[(uid*ndirac+idirac)*nc])*
392 (-dk[1][i]* (-ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
393 -conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
394 +conj(X1[(i*ndirac+idirac)*nc+1])*
395 (dk[0][i]* (-ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
396 -ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
397 +conj(X1[(uid*ndirac+idirac)*nc+1])*
398 (-dk[1][i]* (ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
399 -conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));
400
401 dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*
402 (conj(X1[(i*ndirac+idirac)*nc])*
403 (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
404 +ut[1][i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
405 +conj(X1[(uid*ndirac+idirac)*nc])*
406 (dk[1][i]*(-conj(ut[0][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
407 -ut[1][i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))
408 +conj(X1[(i*ndirac+idirac)*nc+1])*
409 (dk[0][i]* (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
410 -conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
411 +conj(X1[(uid*ndirac+idirac)*nc+1])*
412 (dk[1][i]*(-conj(ut[1][i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
413 +ut[0][i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))))
414 +creal(I*
415 (conj(X1[(i*ndirac+idirac)*nc])*
416 (dk[0][i]* (ut[0][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
417 +ut[1][i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
418 +conj(X1[(uid*ndirac+idirac)*nc])*
419 (-dk[1][i]*(-conj(ut[0][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
420 -ut[1][i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))
421 +conj(X1[(i*ndirac+idirac)*nc+1])*
422 (dk[0][i]* (conj(ut[1][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
423 -conj(ut[0][i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
424 +conj(X1[(uid*ndirac+idirac)*nc+1])*
425 (-dk[1][i]*(-conj(ut[1][i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
426 +ut[0][i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))));
427
428#endif
429 }
430#endif
431 }
432#ifdef __NVCC__
433 cudaFreeAsync(X1_f,streams[0]); cudaFreeAsync(X2_f,streams[1]);
434#else
435 free(X2);
436#endif
437 return 0;
438}
int Gauge_force(double *dSdpi, Complex_f *ut[2], unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
Definition force.c:6
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk[2], float akappa)
Evaluates in single precision.
Definition matrices.c:712
int Hdslash(Complex *phi, Complex *r, Complex *u11t[2], unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk[2], float akappa)
Evaluates in double precision.
Definition matrices.c:268
#define DOWN
Flag for send down.
Definition par_mpi.h:35
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define nadj
adjacent spatial indices
Definition sizes.h:175
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define nf
Fermion flavours (double it)
Definition sizes.h:151
#define ndirac
Dirac indices.
Definition sizes.h:177
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *ut[2], unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk[2], Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves Implements up/down pa...
Definition congrad.c:7
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
Definition su2hmc.c:385

References AVX, Complex, Complex_f, Congradq(), DOWN, Fill_Small_Phi(), Gauge_force(), Hdslash(), Hdslash_f(), kferm2, kferm2Halo, kvol, nadj, nc, ndim, ndirac, nf, and ZHalo_swap_dir().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Gauge_force()

int Gauge_force ( double * dSdpi,
Complex_f * ut[2],
unsigned int * iu,
unsigned int * id,
float beta )

Calculates the gauge force due to the Wilson Action at each intermediate time.

Parameters
dSdpiThe force
utGauge fields
iu,idLattice indices
betaInverse gauge coupling
Returns
Zero on success, integer error code otherwise

Definition at line 6 of file force.c.

6 {
7 /*
8 * Calculates dSdpi due to the Wilson Action at each intermediate time
9 *
10 * Calls:
11 * =====
12 * C_Halo_swap_all, C_gather, C_Halo_swap_dir
13 *
14 * Parameters:
15 * =======
16 * double *dSdpi
17 * Complex_f *ut[0]
18 * Complex_f *ut[1]
19 * unsigned int *iu
20 * unsigned int *id
21 * float beta
22 *
23 * Returns:
24 * =======
25 * Zero on success, integer error code otherwise
26 */
27 const char *funcname = "Gauge_force";
28
29 //We define zero halos for debugging
30 // #ifdef _DEBUG
31 // memset(ut[0][kvol], 0, ndim*halo*sizeof(Complex_f));
32 // memset(ut[1][kvol], 0, ndim*halo*sizeof(Complex_f));
33 // #endif
34 //Was a trial field halo exchange here at one point.
35#ifdef __NVCC__
36 cuGauge_force(ut,dSdpi,beta,iu,id,dimGrid,dimBlock);
37#else
38 Complex_f *Sigma[2], *ush[2];
39 Sigma[0] = (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
40 Sigma[1]= (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
41 ush[0] = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
42 ush[1] = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
43 //Holders for directions
44 for(int mu=0; mu<ndim; mu++){
45 memset(Sigma[0],0, kvol*sizeof(Complex_f));
46 memset(Sigma[1],0, kvol*sizeof(Complex_f));
47 for(int nu=0; nu<ndim; nu++)
48 if(nu!=mu){
49 //The +ν Staple
50#pragma omp parallel for simd //aligned(ut[0],ut[1],Sigma[0],Sigma[1],iu:AVX)
51 for(int i=0;i<kvol;i++){
52 int uidm = iu[mu+ndim*i];
53 int uidn = iu[nu+ndim*i];
54 Complex_f a11=ut[0][uidm*ndim+nu]*conj(ut[0][uidn*ndim+mu])+\
55 ut[1][uidm*ndim+nu]*conj(ut[1][uidn*ndim+mu]);
56 Complex_f a12=-ut[0][uidm*ndim+nu]*ut[1][uidn*ndim+mu]+\
57 ut[1][uidm*ndim+nu]*ut[0][uidn*ndim+mu];
58 Sigma[0][i]+=a11*conj(ut[0][i*ndim+nu])+a12*conj(ut[1][i*ndim+nu]);
59 Sigma[1][i]+=-a11*ut[1][i*ndim+nu]+a12*ut[0][i*ndim+nu];
60 }
61 C_gather(ush[0], ut[0], kvol, id, nu);
62 C_gather(ush[1], ut[1], kvol, id, nu);
63#if(nproc>1)
64 CHalo_swap_dir(ush[0], 1, mu, DOWN); CHalo_swap_dir(ush[1], 1, mu, DOWN);
65#endif
66 //Next up, the -ν staple
67#pragma omp parallel for simd //aligned(ut[0],ut[1],ush[0],ush[1],Sigma[0],Sigma[1],iu,id:AVX)
68 for(int i=0;i<kvol;i++){
69 int uidm = iu[mu+ndim*i];
70 int didn = id[nu+ndim*i];
71 //uidm is correct here
72 Complex_f a11=conj(ush[0][uidm])*conj(ut[0][didn*ndim+mu])-\
73 ush[1][uidm]*conj(ut[1][didn*ndim+mu]);
74 Complex_f a12=-conj(ush[0][uidm])*ut[1][didn*ndim+mu]-\
75 ush[1][uidm]*ut[0][didn*ndim+mu];
76 Sigma[0][i]+=a11*ut[0][didn*ndim+nu]-a12*conj(ut[1][didn*ndim+nu]);
77 Sigma[1][i]+=a11*ut[1][didn*ndim+nu]+a12*conj(ut[0][didn*ndim+nu]);
78 }
79 }
80#pragma omp parallel for simd //aligned(ut[0],ut[1],Sigma[0],Sigma[1],dSdpi:AVX)
81 for(int i=0;i<kvol;i++){
82 Complex_f a11 = ut[0][i*ndim+mu]*Sigma[1][i]+ut[1][i*ndim+mu]*conj(Sigma[0][i]);
83 Complex_f a12 = ut[0][i*ndim+mu]*Sigma[0][i]+conj(ut[1][i*ndim+mu])*Sigma[1][i];
84
85 dSdpi[(i*nadj)*ndim+mu]=(double)(beta*cimag(a11));
86 dSdpi[(i*nadj+1)*ndim+mu]=(double)(beta*creal(a11));
87 dSdpi[(i*nadj+2)*ndim+mu]=(double)(beta*cimag(a12));
88 }
89 }
90 free(ush[0]); free(ush[1]); free(Sigma[0]); free(Sigma[1]);
91#endif
92 return 0;
93}
int CHalo_swap_dir(Complex_f *c, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define halo
Total Halo size.
Definition sizes.h:222
int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
Extracts all the single precision gauge links in the direction only.
Definition su2hmc.c:357

References AVX, C_gather(), CHalo_swap_dir(), Complex_f, DOWN, halo, kvol, nadj, and ndim.

Here is the call graph for this function:
Here is the caller graph for this function: