8 unsigned int *iu,
unsigned int *
id,
int *
hu,
int *
hd,
double *dk4m,
double *dk4p,\
9 float *dk4m_f,
float *dk4p_f,
int *gamin,
Complex *gamval,
Complex_f *gamval_f,\
10 Complex_f jqq,
float akappa,
float beta,
double ancg){
22 const char *funcname =
"Diagnostics";
29 printf(
"FLT_EVAL_METHOD is %i. Check online for what this means\n", FLT_EVAL_METHOD);
33 cudaGetDevice(&device);
35 Complex_f *X0_f, *X1_f, *xi_f, *R1_f, *Phi_f;
47 cudaMallocManaged(&pp,
kmomHalo*
sizeof(
double),cudaMemAttachGlobal);
48 cudaMallocManaged(&dSdpi,
kmom*
sizeof(
double),cudaMemAttachGlobal);
58 double *pp = aligned_alloc(
AVX,
kmomHalo*
sizeof(
double));
61 double *dSdpi = aligned_alloc(
AVX,
kmom*
sizeof(
double));
65#pragma omp parallel sections
69 FILE *dk4m_File = fopen(
"dk4m",
"w");
70 for(
int i=0;i<
kvol;i+=4)
71 fprintf(dk4m_File,
"%f\t%f\t%f\t%f\n",dk4m[i],dk4m[i+1],dk4m[i+2],dk4m[i+3]);
75 FILE *dk4p_File = fopen(
"dk4p",
"w");
76 for(
int i=0;i<
kvol;i+=4)
77 fprintf(dk4p_File,
"%f\t%f\t%f\t%f\n",dk4p[i],dk4p[i+1],dk4p[i+2],dk4p[i+3]);
80 for(
int test = 0; test<=9; test++){
84#pragma omp parallel sections num_threads(4)
88 FILE *trial_out = fopen(
"u11t",
"w");
90 fprintf(trial_out,
"%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\n",
91 creal(u11t[i]),cimag(u11t[i]),creal(u11t[i+1]),cimag(u11t[i+1]),
92 creal(u11t[2+i]),cimag(u11t[2+i]),creal(u11t[i+3]),cimag(u11t[i+3]));
97 FILE *trial_out = fopen(
"u12t",
"w");
99 fprintf(trial_out,
"%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\n",
100 creal(u12t[i]),cimag(u12t[i]),creal(u12t[i+1]),cimag(u12t[i+1]),
101 creal(u12t[2+i]),cimag(u12t[2+i]),creal(u12t[i+3]),cimag(u12t[i+3]));
106 FILE *trial_out = fopen(
"u11t_f",
"w");
108 fprintf(trial_out,
"%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\n",
109 creal(u11t_f[i]),cimag(u11t_f[i]),creal(u11t_f[i+1]),cimag(u11t_f[i+1]),
110 creal(u11t_f[2+i]),cimag(u11t_f[2+i]),creal(u11t_f[i+3]),cimag(u11t_f[i+3]));
115 FILE *trial_out = fopen(
"u12t_f",
"w");
117 fprintf(trial_out,
"%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\n",
118 creal(u12t_f[i]),cimag(u12t_f[i]),creal(u12t_f[i+1]),cimag(u12t_f[i+1]),
119 creal(u12t_f[2+i]),cimag(u12t_f[2+i]),creal(u12t_f[i+3]),cimag(u12t_f[i+3]));
125#pragma omp parallel for simd aligned(u11t,u12t:AVX)
127 u11t[i+0]=1+I; u12t[i]=1+I;
128 u11t[i+1]=1-I; u12t[i+1]=1+I;
129 u11t[i+2]=1+I; u12t[i+2]=1-I;
130 u11t[i+3]=1-I; u12t[i+3]=1-I;
131 u11t[i+4]=-1+I; u12t[i+4]=1+I;
132 u11t[i+5]=1+I; u12t[i+5]=-1+I;
133 u11t[i+6]=-1+I; u12t[i+6]=-1+I;
134 u11t[i+7]=-1-I; u12t[i+7]=-1-I;
149#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
158 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm, R1_f, 0, 1/sqrt(2));
159 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm, Phi_f, 0, 1/sqrt(2));
160 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm, xi_f, 0, 1/sqrt(2));
161 vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm, R1, 0, 1/sqrt(2));
162 vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm, Phi, 0, 1/sqrt(2));
163 vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm, xi, 0, 1/sqrt(2));
165#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
171 vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm2, X0, 0, 1/sqrt(2));
172 vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm2, X1, 0, 1/sqrt(2));
173 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm2, X0_f, 0, 1/sqrt(2));
174 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*
kferm2, X1_f, 0, 1/sqrt(2));
179#pragma omp for simd aligned(dSdpi:AVX) nowait
180 for(
int i=0; i<
kmom; i+=4){
181 double norm = sqrt(dSdpi[i]*dSdpi[i]+dSdpi[i+1]*dSdpi[i+1]+dSdpi[i+2]*dSdpi[i+2]+dSdpi[i+3]*dSdpi[i+3]);
182 dSdpi[i]/=norm; dSdpi[i+1]/=norm; dSdpi[i+2]/=norm;dSdpi[i+3]/=norm;
184 FILE *output_old, *output;
185 FILE *output_f_old, *output_f;
188#pragma omp parallel for simd
189 for(
int i = 0; i<
kferm; i++){
195 output_old = fopen(
"dslash_old",
"w");
196 output_f_old = fopen(
"dslash_f_old",
"w");
198 cudaMemPrefetchAsync(R1,
kferm*
sizeof(
Complex),device,NULL);
199 cudaMemPrefetchAsync(xi,
kferm*
sizeof(
Complex),device,NULL);
202 cudaDeviceSynchronise();
204 for(
int i = 0; i<
kferm; i+=8){
205 fprintf(output_old,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
206 creal(R1[i]),cimag(R1[i]),creal(R1[i+1]),cimag(R1[i+1]),
207 creal(R1[i+2]),cimag(R1[i+2]),creal(R1[i+3]),cimag(R1[i+3]),
208 creal(R1[i+4]),cimag(R1[i+4]),creal(R1[i+5]),cimag(R1[i+5]),
209 creal(R1[i+6]),cimag(R1[i+6]),creal(R1[i+7]),cimag(R1[i+7]) );
210 fprintf(output_f_old,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
211 creal(R1_f[i]),cimag(R1_f[i]),creal(R1_f[i+1]),cimag(R1_f[i+1]),
212 creal(R1_f[i+2]),cimag(R1_f[i+2]),creal(R1_f[i+3]),cimag(R1_f[i+3]),
213 creal(R1_f[i+4]),cimag(R1_f[i+4]),creal(R1_f[i+5]),cimag(R1_f[i+5]),
214 creal(R1_f[i+6]),cimag(R1_f[i+6]),creal(R1_f[i+7]),cimag(R1_f[i+7]));
215 printf(
"Difference in dslash double and float R1[%d] to R1[%d+7]:\n",i,i);
216 printf(
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
217 creal(R1[i]-R1_f[i]),cimag(R1[i]-R1_f[i]),creal(R1[i+1]-R1_f[i+1]),cimag(R1[i+1]-R1_f[i+1]),
218 creal(R1[i+2]-R1_f[i+2]),cimag(R1[i+2]-R1_f[i+2]),creal(R1[i+3]-R1_f[i+3]),cimag(R1[i+3]-R1_f[i+3]),
219 creal(R1[i+4]-R1_f[i+4]),cimag(R1[i+4]-R1_f[i+4]),creal(R1[i+5]-R1_f[i+5]),cimag(R1[i+5]-R1_f[i+5]),
220 creal(R1[i+6]-R1_f[i+6]),cimag(R1[i+6]-R1_f[i+6]),creal(R1[i+7]-R1_f[i+7]),cimag(R1[i+7]-R1_f[i+7]));
222 fclose(output_old); fclose(output_f_old);
223 Dslash(xi,R1,u11t,u12t,iu,
id,gamval,gamin,dk4m,dk4p,jqq,akappa);
225 cudaMemPrefetchAsync(xi,
kferm*
sizeof(
Complex),device,NULL);
230 Transpose_c(u11t_f,
ndim,
kvol,dimGrid,dimBlock);
231 Transpose_c(u12t_f,
ndim,
kvol,dimGrid,dimBlock);
232 cudaDeviceSynchronise();
234 Dslash_f(xi_f,R1_f,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa);
239 Transpose_c(u11t_f,
kvol,
ndim,dimGrid,dimBlock);
240 Transpose_c(u12t_f,
kvol,
ndim,dimGrid,dimBlock);
241 cudaDeviceSynchronise();
243 output = fopen(
"dslash",
"w");
244 output_f = fopen(
"dslash_f",
"w");
245 for(
int i = 0; i<
kferm; i+=8){
246 fprintf(output,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
247 creal(xi[i]),cimag(xi[i]),creal(xi[i+1]),cimag(xi[i+1]),
248 creal(xi[i+2]),cimag(xi[i+2]),creal(xi[i+3]),cimag(xi[i+3]),
249 creal(xi[i+4]),cimag(xi[i+4]),creal(xi[i+5]),cimag(xi[i+5]),
250 creal(xi[i+6]),cimag(xi[i+6]),creal(xi[i+7]),cimag(xi[i+7]) );
251 fprintf(output_f,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
252 creal(xi_f[i]),cimag(xi_f[i]),creal(xi_f[i+1]),cimag(xi_f[i+1]),
253 creal(xi_f[i+2]),cimag(xi_f[i+2]),creal(xi_f[i+3]),cimag(xi_f[i+3]),
254 creal(xi_f[i+4]),cimag(xi_f[i+4]),creal(xi_f[i+5]),cimag(xi_f[i+5]),
255 creal(xi_f[i+6]),cimag(xi_f[i+6]),creal(xi_f[i+7]),cimag(xi_f[i+7]));
256 printf(
"Difference in dslash double and float xi[%d] to xi[%d+7] after mult:\n",i,i);
257 printf(
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
258 creal(xi[i]-xi_f[i]),cimag(xi[i]-xi_f[i]),creal(xi[i+1]-xi_f[i+1]),cimag(xi[i+1]-xi_f[i+1]),
259 creal(xi[i+2]-xi_f[i+2]),cimag(xi[i+2]-xi_f[i+2]),creal(xi[i+3]-xi_f[i+3]),cimag(xi[i+3]-xi_f[i+3]),
260 creal(xi[i+4]-xi_f[i+4]),cimag(xi[i+4]-xi_f[i+4]),creal(xi[i+5]-xi_f[i+5]),cimag(xi[i+5]-xi_f[i+5]),
261 creal(xi[i+6]-xi_f[i+6]),cimag(xi[i+6]-xi_f[i+6]),creal(xi[i+7]-xi_f[i+7]),cimag(xi[i+7]-xi_f[i+7]));
263 fclose(output);fclose(output_f);
266#pragma omp parallel for simd
267 for(
int i = 0; i<
kferm; i++){
273 output_old = fopen(
"dslashd_old",
"w");
274 output_f_old = fopen(
"dslashd_f_old",
"w");
276 cudaMemPrefetchAsync(R1,
kferm*
sizeof(
Complex),device,NULL);
277 cudaMemPrefetchAsync(xi,
kferm*
sizeof(
Complex),device,NULL);
280 cudaDeviceSynchronise();
282 for(
int i = 0; i<
kferm; i+=8){
283 fprintf(output_old,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
284 creal(R1[i]),cimag(R1[i]),creal(R1[i+1]),cimag(R1[i+1]),
285 creal(R1[i+2]),cimag(R1[i+2]),creal(R1[i+3]),cimag(R1[i+3]),
286 creal(R1[i+4]),cimag(R1[i+4]),creal(R1[i+5]),cimag(R1[i+5]),
287 creal(R1[i+6]),cimag(R1[i+6]),creal(R1[i+7]),cimag(R1[i+7]) );
288 fprintf(output_f_old,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
289 creal(R1_f[i]),cimag(R1_f[i]),creal(R1_f[i+1]),cimag(R1_f[i+1]),
290 creal(R1_f[i+2]),cimag(R1_f[i+2]),creal(R1_f[i+3]),cimag(R1_f[i+3]),
291 creal(R1_f[i+4]),cimag(R1_f[i+4]),creal(R1_f[i+5]),cimag(R1_f[i+5]),
292 creal(R1_f[i+6]),cimag(R1_f[i+6]),creal(R1_f[i+7]),cimag(R1_f[i+7]));
293 printf(
"Difference in dslashd double and float R1[%d] to R1[%d+7]:\n",i,i);
294 printf(
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
295 creal(R1[i]-R1_f[i]),cimag(R1[i]-R1_f[i]),creal(R1[i+1]-R1_f[i+1]),cimag(R1[i+1]-R1_f[i+1]),
296 creal(R1[i+2]-R1_f[i+2]),cimag(R1[i+2]-R1_f[i+2]),creal(R1[i+3]-R1_f[i+3]),cimag(R1[i+3]-R1_f[i+3]),
297 creal(R1[i+4]-R1_f[i+4]),cimag(R1[i+4]-R1_f[i+4]),creal(R1[i+5]-R1_f[i+5]),cimag(R1[i+5]-R1_f[i+5]),
298 creal(R1[i+6]-R1_f[i+6]),cimag(R1[i+6]-R1_f[i+6]),creal(R1[i+7]-R1_f[i+7]),cimag(R1[i+7]-R1_f[i+7]));
303 fclose(output_old); fclose(output_f_old);
304 Dslashd(xi,R1,u11t,u12t,iu,
id,gamval,gamin,dk4m,dk4p,jqq,akappa);
306 cudaMemPrefetchAsync(xi,
kferm*
sizeof(
Complex),device,NULL);
311 Transpose_c(u11t_f,
ndim,
kvol,dimGrid,dimBlock);
312 Transpose_c(u12t_f,
ndim,
kvol,dimGrid,dimBlock);
313 cudaDeviceSynchronise();
315 Dslashd_f(xi_f,R1_f,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa);
320 Transpose_c(u11t_f,
kvol,
ndim,dimGrid,dimBlock);
321 Transpose_c(u12t_f,
kvol,
ndim,dimGrid,dimBlock);
322 cudaDeviceSynchronise();
324 output = fopen(
"dslashd",
"w");
325 output_f = fopen(
"dslashd_f",
"w");
326 for(
int i = 0; i<
kferm; i+=8){
327 fprintf(output,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
328 creal(xi[i]),cimag(xi[i]),creal(xi[i+1]),cimag(xi[i+1]),
329 creal(xi[i+2]),cimag(xi[i+2]),creal(xi[i+3]),cimag(xi[i+3]),
330 creal(xi[i+4]),cimag(xi[i+4]),creal(xi[i+5]),cimag(xi[i+5]),
331 creal(xi[i+6]),cimag(xi[i+6]),creal(xi[i+7]),cimag(xi[i+7]) );
332 fprintf(output_f,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
333 creal(xi_f[i]),cimag(xi_f[i]),creal(xi_f[i+1]),cimag(xi_f[i+1]),
334 creal(xi_f[i+2]),cimag(xi_f[i+2]),creal(xi_f[i+3]),cimag(xi_f[i+3]),
335 creal(xi_f[i+4]),cimag(xi_f[i+4]),creal(xi_f[i+5]),cimag(xi_f[i+5]),
336 creal(xi_f[i+6]),cimag(xi_f[i+6]),creal(xi_f[i+7]),cimag(xi_f[i+7]));
337 printf(
"Difference in dslashd double and float xi[%d] to xi[%d+7] after mult:\n",i,i);
338 printf(
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
339 creal(xi[i]-xi_f[i]),cimag(xi[i]-xi_f[i]),creal(xi[i+1]-xi_f[i+1]),cimag(xi[i+1]-xi_f[i+1]),
340 creal(xi[i+2]-xi_f[i+2]),cimag(xi[i+2]-xi_f[i+2]),creal(xi[i+3]-xi_f[i+3]),cimag(xi[i+3]-xi_f[i+3]),
341 creal(xi[i+4]-xi_f[i+4]),cimag(xi[i+4]-xi_f[i+4]),creal(xi[i+5]-xi_f[i+5]),cimag(xi[i+5]-xi_f[i+5]),
342 creal(xi[i+6]-xi_f[i+6]),cimag(xi[i+6]-xi_f[i+6]),creal(xi[i+7]-xi_f[i+7]),cimag(xi[i+7]-xi_f[i+7]));
344 fclose(output);fclose(output_f);
349#pragma omp parallel for simd
350 for(
int i = 0; i<
kferm2; i++){
360 output_old = fopen(
"hdslash_old",
"w");output_f_old = fopen(
"hdslash_f_old",
"w");
361 for(
int i = 0; i<
kferm2; i+=8){
362 fprintf(output_old,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
363 creal(X0[i]),cimag(X0[i]),creal(X0[i+1]),cimag(X0[i+1]),
364 creal(X0[i+2]),cimag(X0[i+2]),creal(X0[i+3]),cimag(X0[i+3]),
365 creal(X0[i+4]),cimag(X0[i+4]),creal(X0[i+5]),cimag(X0[i+5]),
366 creal(X0[i+6]),cimag(X0[i+6]),creal(X0[i+7]),cimag(X0[i+7]));
367 fprintf(output_f_old,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
368 creal(X0_f[i]),cimag(X0_f[i]),creal(X0_f[i+1]),cimag(X0_f[i+1]),
369 creal(X0_f[i+2]),cimag(X0_f[i+2]),creal(X0_f[i+3]),cimag(X0_f[i+3]),
370 creal(X0_f[i+4]),cimag(X0_f[i+4]),creal(X0_f[i+5]),cimag(X0_f[i+5]),
371 creal(X0_f[i+6]),cimag(X0_f[i+6]),creal(X0_f[i+7]),cimag(X0_f[i+7]));
372 printf(
"Difference in hdslash double and float X0[%d] to X0[%d+7]:\n",i,i);
373 printf(
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
374 creal(X0[i]-X0_f[i]),cimag(X0[i]-X0_f[i]),creal(X0[i+1]-X0_f[i+1]),cimag(X0[i+1]-X0_f[i+1]),
375 creal(X0[i+2]-X0_f[i+2]),cimag(X0[i+2]-X0_f[i+2]),creal(X0[i+3]-X0_f[i+3]),cimag(X0[i+3]-X0_f[i+3]),
376 creal(X0[i+4]-X0_f[i+4]),cimag(X0[i+4]-X0_f[i+4]),creal(X0[i+5]-X0_f[i+5]),cimag(X0[i+5]-X0_f[i+5]),
377 creal(X0[i+6]-X0_f[i+6]),cimag(X0[i+6]-X0_f[i+6]),creal(X0[i+7]-X0_f[i+7]),cimag(X0[i+7]-X0_f[i+7]));
379 fclose(output_old);fclose(output_f_old);
385 Transpose_c(u11t_f,
ndim,
kvol,dimGrid,dimBlock);
386 Transpose_c(u12t_f,
ndim,
kvol,dimGrid,dimBlock);
388 Hdslash(X1,X0,u11t,u12t,iu,
id,gamval,gamin,dk4m,dk4p,akappa);
389 Hdslash_f(X1_f,X0_f,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk4m_f,dk4p_f,akappa);
393 Transpose_c(u11t_f,
kvol,
ndim,dimGrid,dimBlock);
394 Transpose_c(u12t_f,
kvol,
ndim,dimGrid,dimBlock);
395 cudaDeviceSynchronise();
397 output = fopen(
"hdslash",
"w"); output_f = fopen(
"hdslash_f",
"w");
398 for(
int i = 0; i<
kferm2; i+=8){
399 fprintf(output,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
400 creal(X1[i]),cimag(X1[i]),creal(X1[i+1]),cimag(X1[i+1]),
401 creal(X1[i+2]),cimag(X1[i+2]),creal(X1[i+3]),cimag(X1[i+3]),
402 creal(X1[i+4]),cimag(X1[i+4]),creal(X1[i+5]),cimag(X1[i+5]),
403 creal(X1[i+6]),cimag(X1[i+6]),creal(X1[i+7]),cimag(X1[i+7]));
404 fprintf(output_f,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
405 creal(X1_f[i]),cimag(X1_f[i]),creal(X1_f[i+1]),cimag(X1_f[i+1]),
406 creal(X1_f[i+2]),cimag(X1_f[i+2]),creal(X1_f[i+3]),cimag(X1_f[i+3]),
407 creal(X1_f[i+4]),cimag(X1_f[i+4]),creal(X1_f[i+5]),cimag(X1_f[i+5]),
408 creal(X1_f[i+6]),cimag(X1_f[i+6]),creal(X1_f[i+7]),cimag(X1_f[i+7]));
409 printf(
"Difference in hdslash double and float X1[%d] to X1[%d+7] after mult.:\n",i,i);
410 printf(
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
411 creal(X1[i]-X1_f[i]),cimag(X1[i]-X1_f[i]),creal(X1[i+1]-X1_f[i+1]),cimag(X1[i+1]-X1_f[i+1]),
412 creal(X1[i+2]-X1_f[i+2]),cimag(X1[i+2]-X1_f[i+2]),creal(X1[i+3]-X1_f[i+3]),cimag(X1[i+3]-X1_f[i+3]),
413 creal(X1[i+4]-X1_f[i+4]),cimag(X1[i+4]-X1_f[i+4]),creal(X1[i+5]-X1_f[i+5]),cimag(X1[i+5]-X1_f[i+5]),
414 creal(X1[i+6]-X1_f[i+6]),cimag(X1[i+6]-X1_f[i+6]),creal(X1[i+7]-X1_f[i+7]),cimag(X1[i+7]-X1_f[i+7]));
416 fclose(output);fclose(output_f);
421 for(
int i = 0; i<
kferm2; i++){
431 output_old = fopen(
"hdslashd_old",
"w");output_f_old = fopen(
"hdslashd_f_old",
"w");
432 for(
int i = 0; i<
kferm2; i+=8){
433 fprintf(output_old,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
434 creal(X0[i]),cimag(X0[i]),creal(X0[i+1]),cimag(X0[i+1]),
435 creal(X0[i+2]),cimag(X0[i+2]),creal(X0[i+3]),cimag(X0[i+3]),
436 creal(X0[i+4]),cimag(X0[i+4]),creal(X0[i+5]),cimag(X0[i+5]),
437 creal(X0[i+6]),cimag(X0[i+6]),creal(X0[i+7]),cimag(X0[i+7]));
438 fprintf(output_f_old,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
439 creal(X0_f[i]),cimag(X0_f[i]),creal(X0_f[i+1]),cimag(X0_f[i+1]),
440 creal(X0_f[i+2]),cimag(X0_f[i+2]),creal(X0_f[i+3]),cimag(X0_f[i+3]),
441 creal(X0_f[i+4]),cimag(X0_f[i+4]),creal(X0_f[i+5]),cimag(X0_f[i+5]),
442 creal(X0_f[i+6]),cimag(X0_f[i+6]),creal(X0_f[i+7]),cimag(X0_f[i+7]));
443 printf(
"Difference in hdslashd double and float X0[%d] to X0[%d+7]:\n",i,i);
444 printf(
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
445 creal(X0[i]-X0_f[i]),cimag(X0[i]-X0_f[i]),creal(X0[i+1]-X0_f[i+1]),cimag(X0[i+1]-X0_f[i+1]),
446 creal(X0[i+2]-X0_f[i+2]),cimag(X0[i+2]-X0_f[i+2]),creal(X0[i+3]-X0_f[i+3]),cimag(X0[i+3]-X0_f[i+3]),
447 creal(X0[i+4]-X0_f[i+4]),cimag(X0[i+4]-X0_f[i+4]),creal(X0[i+5]-X0_f[i+5]),cimag(X0[i+5]-X0_f[i+5]),
448 creal(X0[i+6]-X0_f[i+6]),cimag(X0[i+6]-X0_f[i+6]),creal(X0[i+7]-X0_f[i+7]),cimag(X0[i+7]-X0_f[i+7]));
455 Transpose_c(u11t_f,
ndim,
kvol,dimGrid,dimBlock);
456 Transpose_c(u12t_f,
ndim,
kvol,dimGrid,dimBlock);
458 Hdslashd(X1,X0,u11t,u12t,iu,
id,gamval,gamin,dk4m,dk4p,akappa);
459 Hdslashd_f(X1_f,X0_f,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk4m_f,dk4p_f,akappa);
463 Transpose_c(u11t_f,
kvol,
ndim,dimGrid,dimBlock);
464 Transpose_c(u12t_f,
kvol,
ndim,dimGrid,dimBlock);
465 cudaDeviceSynchronise();
467 output = fopen(
"hdslashd",
"w"); output_f = fopen(
"hdslashd_f",
"w");
468 for(
int i = 0; i<
kferm2; i+=8){
469 fprintf(output,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
470 creal(X1[i]),cimag(X1[i]),creal(X1[i+1]),cimag(X1[i+1]),
471 creal(X1[i+2]),cimag(X1[i+2]),creal(X1[i+3]),cimag(X1[i+3]),
472 creal(X1[i+4]),cimag(X1[i+4]),creal(X1[i+5]),cimag(X1[i+5]),
473 creal(X1[i+6]),cimag(X1[i+6]),creal(X1[i+7]),cimag(X1[i+7]));
474 fprintf(output_f,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
475 creal(X1_f[i]),cimag(X1_f[i]),creal(X1_f[i+1]),cimag(X1_f[i+1]),
476 creal(X1_f[i+2]),cimag(X1_f[i+2]),creal(X1_f[i+3]),cimag(X1_f[i+3]),
477 creal(X1_f[i+4]),cimag(X1_f[i+4]),creal(X1_f[i+5]),cimag(X1_f[i+5]),
478 creal(X1_f[i+6]),cimag(X1_f[i+6]),creal(X1_f[i+7]),cimag(X1_f[i+7]));
479 printf(
"Difference in hdslashd double and float X1[%d] to X1[%d+7] after mult.:\n",i,i);
480 printf(
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
481 creal(X1[i]-X1_f[i]),cimag(X1[i]-X1_f[i]),creal(X1[i+1]-X1_f[i+1]),cimag(X1[i+1]-X1_f[i+1]),
482 creal(X1[i+2]-X1_f[i+2]),cimag(X1[i+2]-X1_f[i+2]),creal(X1[i+3]-X1_f[i+3]),cimag(X1[i+3]-X1_f[i+3]),
483 creal(X1[i+4]-X1_f[i+4]),cimag(X1[i+4]-X1_f[i+4]),creal(X1[i+5]-X1_f[i+5]),cimag(X1[i+5]-X1_f[i+5]),
484 creal(X1[i+6]-X1_f[i+6]),cimag(X1[i+6]-X1_f[i+6]),creal(X1[i+7]-X1_f[i+7]),cimag(X1[i+7]-X1_f[i+7]));
486 fclose(output);fclose(output_f);
489 output_old = fopen(
"hamiltonian_old",
"w");
490 for(
int i = 0; i<
kferm2; i+=8){
491 fprintf(output_old,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
492 creal(X1[i]),cimag(X1[i]),creal(X1[i+1]),cimag(X1[i+1]),
493 creal(X1[i+2]),cimag(X1[i+2]),creal(X1[i+3]),cimag(X1[i+3]),
494 creal(X1[i+4]),cimag(X1[i+4]),creal(X1[i+5]),cimag(X1[i+5]),
495 creal(X1[i+6]),cimag(X1[i+6]),creal(X1[i+7]),cimag(X1[i+7]));
498 output = fopen(
"hamiltonian",
"w");
499 double h,s,ancgh; h=s=ancgh=0;
500 Hamilton(&h,&s,
rescgg,pp,X0,X1,Phi,u11t,u12t,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,\
501 akappa,beta,&ancgh,0);
502 for(
int i = 0; i<
kferm2; i+=8){
503 fprintf(output,
"%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
504 creal(X1[i]),cimag(X1[i]),creal(X1[i+1]),cimag(X1[i+1]),
505 creal(X1[i+2]),cimag(X1[i+2]),creal(X1[i+3]),cimag(X1[i+3]),
506 creal(X1[i+4]),cimag(X1[i+4]),creal(X1[i+5]),cimag(X1[i+5]),
507 creal(X1[i+6]),cimag(X1[i+6]),creal(X1[i+7]),cimag(X1[i+7]));
513 output_old = fopen(
"force_0_old",
"w");
514 for(
int i = 0; i<
kmom; i+=4)
515 fprintf(output_old,
"%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i], dSdpi[i+1], dSdpi[i+2], dSdpi[i+3]);
517 Force(dSdpi, 1,
rescgg,X0,X1,Phi,u11t,u12t,u11t_f,u12t_f,iu,
id,gamval,gamval_f,gamin,dk4m,dk4p,\
518 dk4m_f,dk4p_f,jqq,akappa,beta,&ancg);
519 output = fopen(
"force_0",
"w");
520 for(
int i = 0; i<
kmom; i+=4)
521 fprintf(output,
"%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i], dSdpi[i+1], dSdpi[i+2], dSdpi[i+3]);
525 output_old = fopen(
"force_1_old",
"w");
526 for(
int i = 0; i<
kmom; i+=4)
527 fprintf(output_old,
"%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i], dSdpi[i+1], dSdpi[i+2], dSdpi[i+3]);
529 output = fopen(
"force_1",
"w");
530 Force(dSdpi, 0,
rescgg,X0,X1,Phi,u11t,u12t,u11t_f,u12t_f,iu,
id,gamval,gamval_f,gamin,dk4m,dk4p,\
531 dk4m_f,dk4p_f,jqq,akappa,beta,&ancg);
532 for(
int i = 0; i<
kmom; i+=4)
533 fprintf(output,
"%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i], dSdpi[i+1], dSdpi[i+2], dSdpi[i+3]);
537 output_old = fopen(
"Gauge_Force_old",
"w");
538 for(
int i = 0; i<
kmom; i+=12){
539 fprintf(output_old,
"%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i], dSdpi[i+1], dSdpi[i+2], dSdpi[i+3]);
540 fprintf(output_old,
"%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i+4], dSdpi[i+5], dSdpi[i+6], dSdpi[i+7]);
541 fprintf(output_old,
"%.5f\t%.5f\t%.5f\t%.5f\n\n", dSdpi[i+8], dSdpi[i+9], dSdpi[i+10], dSdpi[i+11]);
545 cudaMemPrefetchAsync(dSdpi,
kmom*
sizeof(
double),device,NULL);
549 cudaDeviceSynchronise();
551 output = fopen(
"Gauge_Force",
"w");
552 for(
int i = 0; i<
kmom; i+=12){
553 fprintf(output,
"%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i], dSdpi[i+1], dSdpi[i+2], dSdpi[i+3]);
554 fprintf(output,
"%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i+4], dSdpi[i+5], dSdpi[i+6], dSdpi[i+7]);
555 fprintf(output,
"%.5f\t%.5f\t%.5f\t%.5f\n\n", dSdpi[i+8], dSdpi[i+9], dSdpi[i+10], dSdpi[i+11]);
561 output_old = fopen(
"PreUpDownPart",
"w");
562 for(
int i=0; i<
kferm; i+=2)
563 fprintf(output_old,
"R1[%d]:\t%.5e+%.5ei\tR1[%d]:\t%.5e+%.5ei\n",\
564 i,creal(R1[i]),cimag(R1[i]),i+1,creal(R1[i+1]),cimag(R1[i+1]));
565 UpDownPart(na,X0,R1);
567 output = fopen(
"UpDownPart",
"w");
568 for(
int i=0; i<
kferm2; i+=2)
569 fprintf(output,
"X0[%d]:\t%.5e+%.5ei\tX0[%d]:\t%.5e+%.5ei\n",\
570 i,creal(X0[i]),cimag(X0[i]),i+1,creal(X0[i+1]),cimag(X0[i+1]));
576 Congradp(0,
respbp, Phi, R1,u11t_f,u12t_f,iu,
id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,&itercg);
583 cudaFree(dk4m); cudaFree(dk4p); cudaFree(R1); cudaFree(dSdpi); cudaFree(pp);
584 cudaFree(Phi); cudaFree(u11t); cudaFree(u12t);
585 cudaFree(X0); cudaFree(X1); cudaFree(u11); cudaFree(u12);
586 cudaFree(X0_f); cudaFree(X1_f); cudaFree(u11t_f); cudaFree(u12t_f);
587 cudaFree(
id); cudaFree(iu); cudaFree(
hd); cudaFree(
hu);
589 free(dk4m); free(dk4p); free(R1); free(dSdpi); free(pp);
590 free(Phi); free(u11t); free(u12t); free(xi);
591 free(X0); free(X1); free(u11); free(u12);
592 free(
id); free(iu); free(
hd); free(
hu);
unsigned int * hd
Down halo indices.
unsigned int * hu
Up halo indices.
Matrix multiplication and related declarations.
int Dslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa)
Evaluates in double precision.
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates in single precision.
int Dslashd(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa)
Evaluates in double precision.
int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates in single precision.
int Hdslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
Evaluates in double precision.
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates in single precision.
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates in single precision.
int Hdslashd(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
Evaluates in double precision.
int Trial_Exchange(Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f)
Exchanges the trial fields.
#define MPI_Finalise()
Avoid any accidents with US/UK spelling.
int * pcoord
The processor grid.
int Gauss_z(Complex *ps, unsigned int n, const Complex mu, const double sigma)
Generates a vector of normally distributed random double precision complex numbers using the Box-Mull...
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single precision complex numbers using the Box-Mull...
int Gauss_d(double *ps, unsigned int n, const double mu, const double sigma)
Generates a vector of normally distributed random double precision numbers using the Box-Muller Metho...
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
#define rescgg
Conjugate gradient residue for update.
#define kmom
sublattice momentum sites
#define ngorkov
Gor'kov indices.
#define kferm2Halo
Dirac lattice and halo.
#define kvol
Sublattice volume.
#define Complex
Double precision complex number.
#define kferm
sublattice size including Gor'kov indices
#define nf
Fermion flavours (double it)
#define ndirac
Dirac indices.
#define respbp
Conjugate gradient residue for .
#define kmomHalo
Momentum lattice and halo.
#define halo
Total Halo size.
#define Complex_f
Single precision complex number.
#define kferm2
sublattice size including Dirac indices
#define kfermHalo
Gor'kov lattice and halo.
int Force(double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancg)
Calculates the force at each intermediate time.
int Gauge_force(double *dSdpi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
int Hamilton(double *h, double *s, double res2, double *pp, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancgh, int traj)
Calculate the Hamiltonian.
int Reunitarise(Complex *u11t, Complex *u12t)
Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves The matrix multipl...