su2hmc
Loading...
Searching...
No Matches
force.c File Reference

Code for force calculations. More...

#include <matrices.h>
Include dependency graph for force.c:

Go to the source code of this file.

Functions

int Gauge_force (double *dSdpi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, float beta)
 Calculates the gauge force due to the Wilson Action at each intermediate time.
 
int Force (double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancg)
 Calculates the force \(\frac{dS}{d\pi}\) at each intermediate time.
 

Detailed Description

Code for force calculations.

Definition in file force.c.

Function Documentation

◆ Force()

int Force ( double * dSdpi,
int iflag,
double res1,
Complex * X0,
Complex * X1,
Complex * Phi,
Complex * u11t,
Complex * u12t,
Complex_f * u11t_f,
Complex_f * u12t_f,
unsigned int * iu,
unsigned int * id,
Complex * gamval,
Complex_f * gamval_f,
int * gamin,
double * dk4m,
double * dk4p,
float * dk4m_f,
float * dk4p_f,
Complex_f jqq,
float akappa,
float beta,
double * ancg )

Calculates the force \(\frac{dS}{d\pi}\) at each intermediate time.

Parameters
dSdpiThe force
iflagInvert before evaluating the force. 0 to invert, 1 not to. Blame FORTRAN...
res1Conjugate gradient residual
X0Up/down partitioned pseudofermion field
X1Holder for the partitioned fermion field, then the conjugate gradient output
PhiPseudofermion field
u11t,u12tDouble precision colour fields
u11t_f,u12t_fSingle precision colour fields
iu,idLattice indices
gaminGamma indices
gamvalDouble precision gamma matrices rescaled by kappa
gamval_fSingle precision gamma matrices rescaled by kappa
dk4m\(e^{-\mu}\)
dk4p\(e^\mu\)
dk4m_f\(e^{-\mu}\) float
dk4p_f\(e^\mu\) float
jqqDiquark source
akappaHopping parameter
betaInverse gauge coupling
ancgCounter for conjugate gradient iterations
Returns
Zero on success, integer error code otherwise

Definition at line 131 of file force.c.

134 {
135 /*
136 * @brief Calculates the force @f$\frac{dS}{d\pi}@f$ at each intermediate time
137 *
138 * @param dSdpi: The force
139 * @param iflag: Invert before evaluating the force?
140 * @param res1: Conjugate gradient residual
141 * @param X0: Up/down partitioned pseudofermion field
142 * @param X1: Holder for the partitioned fermion field, then the conjugate gradient output
143 * @param Phi: Pseudofermion field
144 * @param u11t,u12t Double precision colour fields
145 * @param u11t_f,u12t_f: Single precision colour fields
146 * @param iu,id: Lattice indices
147 * @param gamin: Gamma indices
148 * @param gamval: Double precision gamma matrices
149 * @param gamval_f: Single precision gamma matrices
150 * @param dk4m: @f$\left(1+\gamma_0\right)e^{-\mu}@f$
151 * @param dk4p: @f$\left(1-\gamma_0\right)e^\mu@f$
152 * @param dk4m_f: @f$\left(1+\gamma_0\right)e^{-\mu}@f$ float
153 * @param dk4p_f: @f$\left(1-\gamma_0\right)e^\mu@f$ float
154 * @param jqq: Diquark source
155 * @param akappa: Hopping parameter
156 * @param beta: Inverse gauge coupling
157 * @param ancg: Counter for conjugate gradient iterations
158 *
159 * @return Zero on success, integer error code otherwise
160 */
161 const char *funcname = "Force";
162#ifdef __NVCC__
163 int device=-1;
164 cudaGetDevice(&device);
165#endif
166#ifndef NO_GAUGE
167 Gauge_force(dSdpi,u11t_f,u12t_f,iu,id,beta);
168#endif
169 //X1=(M†M)^{-1} Phi
170 int itercg=1;
171#ifdef __NVCC__
172 Complex *X2;
173 cudaMallocManaged((void **)&X2,kferm2Halo*sizeof(Complex),cudaMemAttachGlobal);
174#else
175 Complex *X2= (Complex *)aligned_alloc(AVX,kferm2Halo*sizeof(Complex));
176#endif
177 for(int na = 0; na<nf; na++){
178#ifdef __NVCC__
179 cudaMemcpyAsync(X1,X0+na*kferm2,kferm2*sizeof(Complex),cudaMemcpyDeviceToDevice,NULL);
180#else
181 memcpy(X1,X0+na*kferm2,kferm2*sizeof(Complex));
182#endif
183 if(!iflag){
184#ifdef __NVCC__
185 Complex *smallPhi;
186 cudaMallocAsync((void **)&smallPhi,kferm2*sizeof(Complex),streams[0]);
187#else
188 Complex *smallPhi = (Complex *)aligned_alloc(AVX,kferm2*sizeof(Complex));
189#endif
190 Fill_Small_Phi(na, smallPhi, Phi);
191 // Congradq(na, res1,smallPhi, &itercg );
192 Congradq(na,res1,X1,smallPhi,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,&itercg);
193#ifdef __NVCC__
194 cudaFreeAsync(smallPhi,streams[0]);
195#else
196 free(smallPhi);
197#endif
198 *ancg+=itercg;
199#ifdef __NVCC__
200 Complex blasa=2.0; double blasb=-1.0;
201 cublasZdscal(cublas_handle,kferm2,&blasb,(cuDoubleComplex *)(X0+na*kferm2),1);
202 cublasZaxpy(cublas_handle,kferm2,(cuDoubleComplex *)&blasa,(cuDoubleComplex *)X1,1,(cuDoubleComplex *)(X0+na*kferm2),1);
203 //HDslash launches a different stream so we need a barrier
204 cudaDeviceSynchronise();
205#elif (defined __INTEL_MKL__)
206 Complex blasa=2.0; Complex blasb=-1.0;
207 //This is not a general BLAS Routine. BLIS and MKL support it
208 //CUDA and GSL do not support it
209 cblas_zaxpby(kferm2, &blasa, X1, 1, &blasb, X0+na*kferm2, 1);
210#elif defined USE_BLAS
211 Complex blasa=2.0; double blasb=-1.0;
212 cblas_zdscal(kferm2,blasb,X0+na*kferm2,1);
213 cblas_zaxpy(kferm2,&blasa,X1,1,X0+na*kferm2,1);
214#else
215#pragma omp parallel for simd collapse(2)
216 for(int i=0;i<kvol;i++)
217 for(int idirac=0;idirac<ndirac;idirac++){
218 X0[((na*kvol+i)*ndirac+idirac)*nc]=
219 2*X1[(i*ndirac+idirac)*nc]-X0[((na*kvol+i)*ndirac+idirac)*nc];
220 X0[((na*kvol+i)*ndirac+idirac)*nc+1]=
221 2*X1[(i*ndirac+idirac)*nc+1]-X0[((na*kvol+i)*ndirac+idirac)*nc+1];
222 }
223#endif
224 }
225 Hdslash(X2,X1,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,akappa);
226#ifdef __NVCC__
227 double blasd=2.0;
228 cudaDeviceSynchronise();
229 cublasZdscal(cublas_handle,kferm2, &blasd, (cuDoubleComplex *)X2, 1);
230#elif defined USE_BLAS
231 double blasd=2.0;
232 cblas_zdscal(kferm2, blasd, X2, 1);
233#else
234#pragma unroll
235 for(int i=0;i<kferm2;i++)
236 X2[i]*=2;
237#endif
238#if(npx>1)
239 ZHalo_swap_dir(X1,8,0,DOWN);
240 ZHalo_swap_dir(X2,8,0,DOWN);
241#endif
242#if(npy>1)
243 ZHalo_swap_dir(X1,8,1,DOWN);
244 ZHalo_swap_dir(X2,8,1,DOWN);
245#endif
246#if(npz>1)
247 ZHalo_swap_dir(X1,8,2,DOWN);
248 ZHalo_swap_dir(X2,8,2,DOWN);
249#endif
250#if(npt>1)
251 ZHalo_swap_dir(X1,8,3,DOWN);
252 ZHalo_swap_dir(X2,8,3,DOWN);
253#endif
254
255 // The original FORTRAN Comment:
256 // dSdpi=dSdpi-Re(X1*(d(Mdagger)dp)*X2) -- Yikes!
257 // we're gonna need drugs for this one......
258 //
259 // Makes references to X1(.,.,iu(i,mu)) AND X2(.,.,iu(i,mu))
260 // as a result, need to swap the DOWN halos in all dirs for
261 // both these arrays, each of which has 8 cpts
262 //
263#ifdef __NVCC__
264 Complex_f *X1_f, *X2_f;
265 cudaMallocAsync((void **)&X1_f,kferm2*sizeof(Complex_f),NULL);
266 cuComplex_convert(X1_f,X1,kferm2,true,dimBlock,dimGrid);
267 Transpose_c(X1_f,ndirac*nc,kvol,dimGrid,dimBlock);
268
269 cudaMallocAsync((void **)&X2_f,kferm2*sizeof(Complex_f),NULL);
270 cuComplex_convert(X2_f,X2,kferm2,true,dimBlock,dimGrid);
271 Transpose_c(X2_f,ndirac*nc,kvol,dimGrid,dimBlock);
272// Transpose_z(X1,kvol,ndirac*nc,dimGrid,dimBlock); Transpose_z(X2,kvol,ndirac*nc,dimGrid,dimBlock);
273 cuForce(dSdpi,u11t_f,u12t_f,X1_f,X2_f,gamval_f,dk4m_f,dk4p_f,iu,gamin,akappa,dimGrid,dimBlock);
274 cudaDeviceSynchronise();
275 cudaFreeAsync(X1_f,NULL); cudaFreeAsync(X2_f,NULL);
276#else
277#pragma omp parallel for
278 for(int i=0;i<kvol;i++)
279 for(int idirac=0;idirac<ndirac;idirac++){
280 int mu, uid, igork1;
281#ifndef NO_SPACE
282#pragma omp simd aligned(dSdpi,X1,X2,u11t,u12t,iu:AVX)
283 for(mu=0; mu<3; mu++){
284 //Long term ambition. I used the diff command on the different
285 //spacial components of dSdpi and saw a lot of the values required
286 //for them are duplicates (u11(i,mu)*X2(1,idirac,i) is used again with
287 //a minus in front for example. Why not evaluate them first /and then plug
288 //them into the equation? Reduce the number of evaluations needed and look
289 //a bit neater (although harder to follow as a consequence).
290
291 //Up indices
292 uid = iu[mu+ndim*i];
293 igork1 = gamin[mu*ndirac+idirac];
294
295 //REMINDER. Gamma is already scaled by kappa when we defined them. So if yer trying to rederive this from
296 //Montvay and Munster and notice a missing kappa in the code, that is why.
297 dSdpi[(i*nadj)*ndim+mu]+=akappa*creal(I*
298 (conj(X1[(i*ndirac+idirac)*nc])*
299 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
300 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
301 +conj(X1[(uid*ndirac+idirac)*nc])*
302 ( u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
303 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
304 +conj(X1[(i*ndirac+idirac)*nc+1])*
305 (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
306 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
307 +conj(X1[(uid*ndirac+idirac)*nc+1])*
308 (-u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
309 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
310 dSdpi[(i*nadj)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
311 (conj(X1[(i*ndirac+idirac)*nc])*
312 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
313 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
314 +conj(X1[(uid*ndirac+idirac)*nc])*
315 (-u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
316 +conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
317 +conj(X1[(i*ndirac+idirac)*nc+1])*
318 (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
319 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
320 +conj(X1[(uid*ndirac+idirac)*nc+1])*
321 (u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
322 +conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));
323
324 dSdpi[(i*nadj+1)*ndim+mu]+=akappa*creal(
325 (conj(X1[(i*ndirac+idirac)*nc])*
326 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
327 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
328 +conj(X1[(uid*ndirac+idirac)*nc])*
329 (-u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
330 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])
331 +conj(X1[(i*ndirac+idirac)*nc+1])*
332 (-u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
333 -u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
334 +conj(X1[(uid*ndirac+idirac)*nc+1])*
335 (u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
336 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])));
337 dSdpi[(i*nadj+1)*ndim+mu]+=creal(gamval[mu*ndirac+idirac]*
338 (conj(X1[(i*ndirac+idirac)*nc])*
339 (-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
340 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
341 +conj(X1[(uid*ndirac+idirac)*nc])*
342 (u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
343 +conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])
344 +conj(X1[(i*ndirac+idirac)*nc+1])*
345 (-u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
346 -u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
347 +conj(X1[(uid*ndirac+idirac)*nc+1])*
348 (-u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
349 +conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1])));
350
351 dSdpi[(i*nadj+2)*ndim+mu]+=akappa*creal(I*
352 (conj(X1[(i*ndirac+idirac)*nc])*
353 (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
354 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1])
355 +conj(X1[(uid*ndirac+idirac)*nc])*
356 (-conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
357 -u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])
358 +conj(X1[(i*ndirac+idirac)*nc+1])*
359 (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
360 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1])
361 +conj(X1[(uid*ndirac+idirac)*nc+1])*
362 (-conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
363 +u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1])));
364 dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*gamval[mu*ndirac+idirac]*
365 (conj(X1[(i*ndirac+idirac)*nc])*
366 (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
367 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1])
368 +conj(X1[(uid*ndirac+idirac)*nc])*
369 (conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
370 +u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])
371 +conj(X1[(i*ndirac+idirac)*nc+1])*
372 (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
373 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1])
374 +conj(X1[(uid*ndirac+idirac)*nc+1])*
375 (conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
376 -u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1])));
377
378 }
379#endif
380 //We're not done tripping yet!! Time like term is different. dk4? shows up
381 //For consistency we'll leave mu in instead of hard coding.
382 mu=3;
383 uid = iu[mu+ndim*i];
384 igork1 = gamin[mu*ndirac+idirac];
385#ifndef NO_TIME
386 dSdpi[(i*nadj)*ndim+mu]+=creal(I*
387 (conj(X1[(i*ndirac+idirac)*nc])*
388 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
389 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
390 +conj(X1[(uid*ndirac+idirac)*nc])*
391 (dk4p[i]* (+u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
392 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
393 +conj(X1[(i*ndirac+idirac)*nc+1])*
394 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
395 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
396 +conj(X1[(uid*ndirac+idirac)*nc+1])*
397 (dk4p[i]* (-u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
398 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))))
399 +creal(I*
400 (conj(X1[(i*ndirac+idirac)*nc])*
401 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
402 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
403 +conj(X1[(uid*ndirac+idirac)*nc])*
404 (-dk4p[i]* (u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
405 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
406 +conj(X1[(i*ndirac+idirac)*nc+1])*
407 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
408 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
409 +conj(X1[(uid*ndirac+idirac)*nc+1])*
410 (-dk4p[i]* (-u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
411 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));
412
413 dSdpi[(i*nadj+1)*ndim+mu]+=creal(
414 conj(X1[(i*ndirac+idirac)*nc])*
415 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
416 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
417 +conj(X1[(uid*ndirac+idirac)*nc])*
418 (dk4p[i]* (-u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
419 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1]))
420 +conj(X1[(i*ndirac+idirac)*nc+1])*
421 (dk4m[i]* (-u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
422 -u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
423 +conj(X1[(uid*ndirac+idirac)*nc+1])*
424 (dk4p[i]* ( u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc]
425 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc+1])))
426 +creal(
427 (conj(X1[(i*ndirac+idirac)*nc])*
428 (dk4m[i]*(-conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
429 +conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
430 +conj(X1[(uid*ndirac+idirac)*nc])*
431 (-dk4p[i]* (-u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
432 -conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))
433 +conj(X1[(i*ndirac+idirac)*nc+1])*
434 (dk4m[i]* (-u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
435 -u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
436 +conj(X1[(uid*ndirac+idirac)*nc+1])*
437 (-dk4p[i]* (u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc]
438 -conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc+1]))));
439
440 dSdpi[(i*nadj+2)*ndim+mu]+=creal(I*
441 (conj(X1[(i*ndirac+idirac)*nc])*
442 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc]
443 +u12t[i*ndim+mu] *X2[(uid*ndirac+idirac)*nc+1]))
444 +conj(X1[(uid*ndirac+idirac)*nc])*
445 (dk4p[i]*(-conj(u11t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
446 -u12t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))
447 +conj(X1[(i*ndirac+idirac)*nc+1])*
448 (dk4m[i]* (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc]
449 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+idirac)*nc+1]))
450 +conj(X1[(uid*ndirac+idirac)*nc+1])*
451 (dk4p[i]*(-conj(u12t[i*ndim+mu])*X2[(i*ndirac+idirac)*nc]
452 +u11t[i*ndim+mu] *X2[(i*ndirac+idirac)*nc+1]))))
453 +creal(I*
454 (conj(X1[(i*ndirac+idirac)*nc])*
455 (dk4m[i]* (u11t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc]
456 +u12t[i*ndim+mu] *X2[(uid*ndirac+igork1)*nc+1]))
457 +conj(X1[(uid*ndirac+idirac)*nc])*
458 (-dk4p[i]*(-conj(u11t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
459 -u12t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))
460 +conj(X1[(i*ndirac+idirac)*nc+1])*
461 (dk4m[i]* (conj(u12t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc]
462 -conj(u11t[i*ndim+mu])*X2[(uid*ndirac+igork1)*nc+1]))
463 +conj(X1[(uid*ndirac+idirac)*nc+1])*
464 (-dk4p[i]*(-conj(u12t[i*ndim+mu])*X2[(i*ndirac+igork1)*nc]
465 +u11t[i*ndim+mu] *X2[(i*ndirac+igork1)*nc+1]))));
466
467#endif
468 }
469#endif
470 }
471#ifdef __NVCC__
472 cudaFree(X2);
473#else
474 free(X2);
475#endif
476 return 0;
477}
int Gauge_force(double *dSdpi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
Definition force.c:6
int Hdslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
Evaluates in double precision.
Definition matrices.c:267
#define DOWN
Flag for send down.
Definition par_mpi.h:35
int ZHalo_swap_dir(Complex *z, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define nadj
adjacent spatial indices
Definition sizes.h:175
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define nf
Fermion flavours (double it)
Definition sizes.h:151
#define ndirac
Dirac indices.
Definition sizes.h:177
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188
int Congradq(int na, double res, Complex *X1, Complex *r, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (up/down flavour partitioning). Solves Implements up/down pa...
Definition congrad.c:7
int Fill_Small_Phi(int na, Complex *smallPhi, Complex *Phi)
Definition su2hmc.c:349

References AVX, Complex, Complex_f, Congradq(), DOWN, Fill_Small_Phi(), Gauge_force(), Hdslash(), kferm2, kferm2Halo, kvol, nadj, nc, ndim, ndirac, nf, and ZHalo_swap_dir().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ Gauge_force()

int Gauge_force ( double * dSdpi,
Complex_f * u11t,
Complex_f * u12t,
unsigned int * iu,
unsigned int * id,
float beta )

Calculates the gauge force due to the Wilson Action at each intermediate time.

Parameters
dSdpiThe force
u11t,u12tGauge fields
iu,idLattice indices
betaInverse gauge coupling
Returns
Zero on success, integer error code otherwise

Definition at line 6 of file force.c.

6 {
7 /*
8 * Calculates dSdpi due to the Wilson Action at each intermediate time
9 *
10 * Calls:
11 * =====
12 * C_Halo_swap_all, C_gather, C_Halo_swap_dir
13 *
14 * Parameters:
15 * =======
16 * double *dSdpi
17 * Complex_f *u11t
18 * Complex_f *u12t
19 * unsigned int *iu
20 * unsigned int *id
21 * float beta
22 *
23 * Returns:
24 * =======
25 * Zero on success, integer error code otherwise
26 */
27 const char *funcname = "Gauge_force";
28
29 //We define zero halos for debugging
30 // #ifdef _DEBUG
31 // memset(u11t[kvol], 0, ndim*halo*sizeof(Complex_f));
32 // memset(u12t[kvol], 0, ndim*halo*sizeof(Complex_f));
33 // #endif
34 //Was a trial field halo exchange here at one point.
35#ifdef __NVCC__
36 int device=-1;
37 cudaGetDevice(&device);
38 Complex_f *Sigma11, *Sigma12, *u11sh, *u12sh;
39 cudaMallocAsync((void **)&Sigma11,kvol*sizeof(Complex_f),streams[0]);
40 cudaMallocAsync((void **)&Sigma12,kvol*sizeof(Complex_f),streams[1]);
41 cudaMallocManaged((void **)&u11sh,(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
42 cudaMallocManaged((void **)&u12sh,(kvol+halo)*sizeof(Complex_f),cudaMemAttachGlobal);
43#else
44 Complex_f *Sigma11 = (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
45 Complex_f *Sigma12= (Complex_f *)aligned_alloc(AVX,kvol*sizeof(Complex_f));
46 Complex_f *u11sh = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
47 Complex_f *u12sh = (Complex_f *)aligned_alloc(AVX,(kvol+halo)*sizeof(Complex_f));
48#endif
49 //Holders for directions
50 for(int mu=0; mu<ndim; mu++){
51#ifdef __NVCC__
52 cudaMemset(Sigma11,0, kvol*sizeof(Complex_f));
53 cudaMemset(Sigma12,0, kvol*sizeof(Complex_f));
54#else
55 memset(Sigma11,0, kvol*sizeof(Complex_f));
56 memset(Sigma12,0, kvol*sizeof(Complex_f));
57#endif
58 for(int nu=0; nu<ndim; nu++)
59 if(nu!=mu){
60 //The +ν Staple
61#ifdef __NVCC__
62 cuPlus_staple(mu,nu,iu,Sigma11,Sigma12,u11t,u12t,dimGrid,dimBlock);
63#else
64#pragma omp parallel for simd aligned(u11t,u12t,Sigma11,Sigma12,iu:AVX)
65 for(int i=0;i<kvol;i++){
66 int uidm = iu[mu+ndim*i];
67 int uidn = iu[nu+ndim*i];
68 Complex_f a11=u11t[uidm*ndim+nu]*conj(u11t[uidn*ndim+mu])+\
69 u12t[uidm*ndim+nu]*conj(u12t[uidn*ndim+mu]);
70 Complex_f a12=-u11t[uidm*ndim+nu]*u12t[uidn*ndim+mu]+\
71 u12t[uidm*ndim+nu]*u11t[uidn*ndim+mu];
72 Sigma11[i]+=a11*conj(u11t[i*ndim+nu])+a12*conj(u12t[i*ndim+nu]);
73 Sigma12[i]+=-a11*u12t[i*ndim+nu]+a12*u11t[i*ndim+nu];
74 }
75#endif
76 C_gather(u11sh, u11t, kvol, id, nu);
77 C_gather(u12sh, u12t, kvol, id, nu);
78#if(nproc>1)
79#ifdef __NVCC__
80 //Prefetch to the CPU for until we get NCCL working
81 cudaMemPrefetchAsync(u11sh, kvol*sizeof(Complex_f),cudaCpuDeviceId,streams[0]);
82 cudaMemPrefetchAsync(u12sh, kvol*sizeof(Complex_f),cudaCpuDeviceId,streams[1]);
83#endif
84 CHalo_swap_dir(u11sh, 1, mu, DOWN); CHalo_swap_dir(u12sh, 1, mu, DOWN);
85#ifdef __NVCC__
86 cudaMemPrefetchAsync(u11sh+kvol, halo*sizeof(Complex_f),device,streams[0]);
87 cudaMemPrefetchAsync(u12sh+kvol, halo*sizeof(Complex_f),device,streams[1]);
88#endif
89#endif
90 //Next up, the -ν staple
91#ifdef __NVCC__
92 cudaDeviceSynchronise();
93 cuMinus_staple(mu,nu,iu,id,Sigma11,Sigma12,u11sh,u12sh,u11t,u12t,dimGrid,dimBlock);
94#else
95#pragma omp parallel for simd aligned(u11t,u12t,u11sh,u12sh,Sigma11,Sigma12,iu,id:AVX)
96 for(int i=0;i<kvol;i++){
97 int uidm = iu[mu+ndim*i];
98 int didn = id[nu+ndim*i];
99 //uidm is correct here
100 Complex_f a11=conj(u11sh[uidm])*conj(u11t[didn*ndim+mu])-\
101 u12sh[uidm]*conj(u12t[didn*ndim+mu]);
102 Complex_f a12=-conj(u11sh[uidm])*u12t[didn*ndim+mu]-\
103 u12sh[uidm]*u11t[didn*ndim+mu];
104 Sigma11[i]+=a11*u11t[didn*ndim+nu]-a12*conj(u12t[didn*ndim+nu]);
105 Sigma12[i]+=a11*u12t[didn*ndim+nu]+a12*conj(u11t[didn*ndim+nu]);
106 }
107#endif
108 }
109#ifdef __NVCC__
110 cuGauge_force(mu,Sigma11,Sigma12,u11t,u12t,dSdpi,beta,dimGrid,dimBlock);
111#else
112#pragma omp parallel for simd aligned(u11t,u12t,Sigma11,Sigma12,dSdpi:AVX)
113 for(int i=0;i<kvol;i++){
114 Complex_f a11 = u11t[i*ndim+mu]*Sigma12[i]+u12t[i*ndim+mu]*conj(Sigma11[i]);
115 Complex_f a12 = u11t[i*ndim+mu]*Sigma11[i]+conj(u12t[i*ndim+mu])*Sigma12[i];
116
117 dSdpi[(i*nadj)*ndim+mu]=(double)(beta*cimag(a11));
118 dSdpi[(i*nadj+1)*ndim+mu]=(double)(beta*creal(a11));
119 dSdpi[(i*nadj+2)*ndim+mu]=(double)(beta*cimag(a12));
120 }
121#endif
122 }
123#ifdef __NVCC__
124 cudaDeviceSynchronise();
125 cudaFreeAsync(Sigma11,streams[0]); cudaFreeAsync(Sigma12,streams[1]); cudaFree(u11sh); cudaFree(u12sh);
126#else
127 free(u11sh); free(u12sh); free(Sigma11); free(Sigma12);
128#endif
129 return 0;
130}
int CHalo_swap_dir(Complex_f *c, int ncpt, int idir, int layer)
Swaps the halos along the axis given by idir in the direction given by layer.
#define halo
Total Halo size.
Definition sizes.h:222
int C_gather(Complex_f *x, Complex_f *y, int n, unsigned int *table, unsigned int mu)
Extracts all the single precision gauge links in the direction only.
Definition su2hmc.c:321

References AVX, C_gather(), CHalo_swap_dir(), Complex_f, DOWN, halo, kvol, nadj, and ndim.

Here is the call graph for this function:
Here is the caller graph for this function: