su2hmc
Loading...
Searching...
No Matches
diagnostics.c
1#ifdef DIAGNOSTIC
2#include <assert.h>
3#include <complex.h>
4#include <matrices.h>
5#include <string.h>
6
7int Diagnostics(int istart, Complex *u11, Complex *u12,Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f,\
8 unsigned int *iu, unsigned int *id, int *hu, int *hd, double *dk4m, double *dk4p,\
9 float *dk4m_f, float *dk4p_f, int *gamin, Complex *gamval, Complex_f *gamval_f,\
10 Complex_f jqq,float akappa,float beta, double ancg){
11 /*
12 * Routine to check if the multiplication routines are working or not
13 * How I hope this will work is that
14 * 1) Initialise the system
15 * 2) Just after the initialisation of the system but before anything
16 * else call this routine using the C Preprocessor.
17 * 3) Give dummy values for the fields and then do work with them
18 * Caveats? Well this could get messy if we call something we didn't
19 * realise was being called and hadn't initialised it properly (Congradq
20 * springs to mind straight away)
21 */
22 const char *funcname = "Diagnostics";
23
24 //Initialise the arrays being used. Just going to assume MKL is being
25 //used here will also assert the number of flavours for now to avoid issues
26 //later
27 assert(nf==1);
28#include<float.h>
29 printf("FLT_EVAL_METHOD is %i. Check online for what this means\n", FLT_EVAL_METHOD);
30
31#ifdef __NVCC__
32 int device=-1;
33 cudaGetDevice(&device);
34 Complex *xi,*R1,*Phi,*X0,*X1;
35 Complex_f *X0_f, *X1_f, *xi_f, *R1_f, *Phi_f;
36 double *dSdpi,*pp;
37 cudaMallocManaged(&R1,kfermHalo*sizeof(Complex),cudaMemAttachGlobal);
38 cudaMallocManaged(&xi,kfermHalo*sizeof(Complex),cudaMemAttachGlobal);
39 cudaMallocManaged(&R1_f,kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
40 cudaMallocManaged(&xi_f,kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
41 cudaMallocManaged(&Phi,kfermHalo*sizeof(Complex),cudaMemAttachGlobal);
42 cudaMallocManaged(&Phi_f,kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
43 cudaMallocManaged(&X0,kferm2Halo*sizeof(Complex),cudaMemAttachGlobal);
44 cudaMallocManaged(&X1,kferm2Halo*sizeof(Complex),cudaMemAttachGlobal);
45 cudaMallocManaged(&X0_f,kferm2Halo*sizeof(Complex_f),cudaMemAttachGlobal);
46 cudaMallocManaged(&X1_f,kfermHalo*sizeof(Complex_f),cudaMemAttachGlobal);
47 cudaMallocManaged(&pp,kmomHalo*sizeof(double),cudaMemAttachGlobal);
48 cudaMallocManaged(&dSdpi,kmom*sizeof(double),cudaMemAttachGlobal);
49#else
50 Complex *R1= aligned_alloc(AVX,kfermHalo*sizeof(Complex));
51 Complex *xi= aligned_alloc(AVX,kfermHalo*sizeof(Complex));
52 Complex_f *R1_f= aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
53 Complex_f *xi_f= aligned_alloc(AVX,kfermHalo*sizeof(Complex_f));
54 Complex *Phi= aligned_alloc(AVX,nf*kfermHalo*sizeof(Complex));
55 Complex_f *Phi_f= aligned_alloc(AVX,nf*kfermHalo*sizeof(Complex_f));
56 Complex *X0= aligned_alloc(AVX,nf*kferm2Halo*sizeof(Complex));
57 Complex *X1= aligned_alloc(AVX,kferm2Halo*sizeof(Complex));
58 double *pp = aligned_alloc(AVX,kmomHalo*sizeof(double));
59 Complex_f *X0_f= aligned_alloc(AVX,nf*kferm2Halo*sizeof(Complex_f));
60 Complex_f *X1_f= aligned_alloc(AVX,kferm2Halo*sizeof(Complex_f));
61 double *dSdpi = aligned_alloc(AVX,kmom*sizeof(double));
62#endif
63 //pp is the momentum field
64
65#pragma omp parallel sections
66 {
67#pragma omp section
68 {
69 FILE *dk4m_File = fopen("dk4m","w");
70 for(int i=0;i<kvol;i+=4)
71 fprintf(dk4m_File,"%f\t%f\t%f\t%f\n",dk4m[i],dk4m[i+1],dk4m[i+2],dk4m[i+3]);
72 }
73#pragma omp section
74 {
75 FILE *dk4p_File = fopen("dk4p","w");
76 for(int i=0;i<kvol;i+=4)
77 fprintf(dk4p_File,"%f\t%f\t%f\t%f\n",dk4p[i],dk4p[i+1],dk4p[i+2],dk4p[i+3]);
78 }
79 }
80 for(int test = 0; test<=9; test++){
81 //Trial fields shouldn't get modified so were previously set up outside
82 switch(istart){
83 case(1):
84#pragma omp parallel sections num_threads(4)
85 {
86#pragma omp section
87 {
88 FILE *trial_out = fopen("u11t", "w");
89 for(int i=0;i<ndim*(kvol+halo);i+=4)
90 fprintf(trial_out,"%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\n",
91 creal(u11t[i]),cimag(u11t[i]),creal(u11t[i+1]),cimag(u11t[i+1]),
92 creal(u11t[2+i]),cimag(u11t[2+i]),creal(u11t[i+3]),cimag(u11t[i+3]));
93 fclose(trial_out);
94 }
95#pragma omp section
96 {
97 FILE *trial_out = fopen("u12t", "w");
98 for(int i=0;i<ndim*(kvol+halo);i+=4)
99 fprintf(trial_out,"%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\n",
100 creal(u12t[i]),cimag(u12t[i]),creal(u12t[i+1]),cimag(u12t[i+1]),
101 creal(u12t[2+i]),cimag(u12t[2+i]),creal(u12t[i+3]),cimag(u12t[i+3]));
102 fclose(trial_out);
103 }
104#pragma omp section
105 {
106 FILE *trial_out = fopen("u11t_f", "w");
107 for(int i=0;i<ndim*(kvol+halo);i+=4)
108 fprintf(trial_out,"%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\n",
109 creal(u11t_f[i]),cimag(u11t_f[i]),creal(u11t_f[i+1]),cimag(u11t_f[i+1]),
110 creal(u11t_f[2+i]),cimag(u11t_f[2+i]),creal(u11t_f[i+3]),cimag(u11t_f[i+3]));
111 fclose(trial_out);
112 }
113#pragma omp section
114 {
115 FILE *trial_out = fopen("u12t_f", "w");
116 for(int i=0;i<ndim*(kvol+halo);i+=4)
117 fprintf(trial_out,"%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\t%.5f+%.5fI\n",
118 creal(u12t_f[i]),cimag(u12t_f[i]),creal(u12t_f[i+1]),cimag(u12t_f[i+1]),
119 creal(u12t_f[2+i]),cimag(u12t_f[2+i]),creal(u12t_f[i+3]),cimag(u12t_f[i+3]));
120 fclose(trial_out);
121 }
122 }
123 break;
124 case(2):
125#pragma omp parallel for simd aligned(u11t,u12t:AVX)
126 for(int i =0; i<ndim*kvol; i+=8){
127 u11t[i+0]=1+I; u12t[i]=1+I;
128 u11t[i+1]=1-I; u12t[i+1]=1+I;
129 u11t[i+2]=1+I; u12t[i+2]=1-I;
130 u11t[i+3]=1-I; u12t[i+3]=1-I;
131 u11t[i+4]=-1+I; u12t[i+4]=1+I;
132 u11t[i+5]=1+I; u12t[i+5]=-1+I;
133 u11t[i+6]=-1+I; u12t[i+6]=-1+I;
134 u11t[i+7]=-1-I; u12t[i+7]=-1-I;
135 }
136 break;
137 default:
138 //Cold start as a default
139 memcpy(u11t,u11,kvol*ndim*sizeof(Complex));
140 memcpy(u12t,u12,kvol*ndim*sizeof(Complex));
141 break;
142 }
143 Reunitarise(u11t,u12t);
144 Trial_Exchange(u11t,u12t,u11t_f,u12t_f);
145
146 //We reset all the random fields between each test. It's one way of ensuring that errors don't propegate from one
147 //test to another. Since we start from the same seed each time this should give the same results for each test. If
148 //it does not, there's a bug
149#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
150 Gauss_d(pp,kmomHalo,0,1);
151 Gauss_z(R1, kferm, 0, 1/sqrt(2));
152 Gauss_z(Phi, kferm, 0, 1/sqrt(2));
153 Gauss_z(xi, kferm, 0, 1/sqrt(2));
154 Gauss_c(R1_f, kferm, 0, 1/sqrt(2));
155 Gauss_c(Phi_f, kferm, 0, 1/sqrt(2));
156 Gauss_c(xi_f, kferm, 0, 1/sqrt(2));
157#else
158 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, R1_f, 0, 1/sqrt(2));
159 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, Phi_f, 0, 1/sqrt(2));
160 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, xi_f, 0, 1/sqrt(2));
161 vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, R1, 0, 1/sqrt(2));
162 vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, Phi, 0, 1/sqrt(2));
163 vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm, xi, 0, 1/sqrt(2));
164#endif
165#if (defined(USE_RAN2)||defined(__RANLUX__)||!defined(__INTEL_MKL__))
166 Gauss_z(X0, kferm2, 0, 1/sqrt(2));
167 Gauss_z(X1, kferm2, 0, 1/sqrt(2));
168 Gauss_c(X0_f, kferm2, 0, 1/sqrt(2));
169 Gauss_c(X1_f, kferm2, 0, 1/sqrt(2));
170#else
171 vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm2, X0, 0, 1/sqrt(2));
172 vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm2, X1, 0, 1/sqrt(2));
173 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm2, X0_f, 0, 1/sqrt(2));
174 vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*kferm2, X1_f, 0, 1/sqrt(2));
175#endif
176
177 //Random nomalised momentum field
178 Gauss_d(dSdpi,kmom,0,1/sqrt(2));
179#pragma omp for simd aligned(dSdpi:AVX) nowait
180 for(int i=0; i<kmom; i+=4){
181 double norm = sqrt(dSdpi[i]*dSdpi[i]+dSdpi[i+1]*dSdpi[i+1]+dSdpi[i+2]*dSdpi[i+2]+dSdpi[i+3]*dSdpi[i+3]);
182 dSdpi[i]/=norm; dSdpi[i+1]/=norm; dSdpi[i+2]/=norm;dSdpi[i+3]/=norm;
183 }
184 FILE *output_old, *output;
185 FILE *output_f_old, *output_f;
186 switch(test){
187 case(0):
188#pragma omp parallel for simd
189 for(int i = 0; i< kferm; i++){
190 R1_f[i]=(Complex_f)R1[i];
191 xi_f[i]=(Complex_f)xi[i];
192 }
193 //NOTE: Each line corresponds to one lattice direction, in the form of colour 0, colour 1.
194 //Each block to one lattice site
195 output_old = fopen("dslash_old", "w");
196 output_f_old = fopen("dslash_f_old", "w");
197#ifdef __NVCC__
198 cudaMemPrefetchAsync(R1,kferm*sizeof(Complex),device,NULL);
199 cudaMemPrefetchAsync(xi,kferm*sizeof(Complex),device,NULL);
200 cudaMemPrefetchAsync(R1_f,kferm*sizeof(Complex_f),device,NULL);
201 cudaMemPrefetchAsync(xi_f,kferm*sizeof(Complex_f),device,NULL);
202 cudaDeviceSynchronise();
203#endif
204 for(int i = 0; i< kferm; i+=8){
205 fprintf(output_old, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
206 creal(R1[i]),cimag(R1[i]),creal(R1[i+1]),cimag(R1[i+1]),
207 creal(R1[i+2]),cimag(R1[i+2]),creal(R1[i+3]),cimag(R1[i+3]),
208 creal(R1[i+4]),cimag(R1[i+4]),creal(R1[i+5]),cimag(R1[i+5]),
209 creal(R1[i+6]),cimag(R1[i+6]),creal(R1[i+7]),cimag(R1[i+7]) );
210 fprintf(output_f_old, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
211 creal(R1_f[i]),cimag(R1_f[i]),creal(R1_f[i+1]),cimag(R1_f[i+1]),
212 creal(R1_f[i+2]),cimag(R1_f[i+2]),creal(R1_f[i+3]),cimag(R1_f[i+3]),
213 creal(R1_f[i+4]),cimag(R1_f[i+4]),creal(R1_f[i+5]),cimag(R1_f[i+5]),
214 creal(R1_f[i+6]),cimag(R1_f[i+6]),creal(R1_f[i+7]),cimag(R1_f[i+7]));
215 printf("Difference in dslash double and float R1[%d] to R1[%d+7]:\n",i,i);
216 printf("%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
217 creal(R1[i]-R1_f[i]),cimag(R1[i]-R1_f[i]),creal(R1[i+1]-R1_f[i+1]),cimag(R1[i+1]-R1_f[i+1]),
218 creal(R1[i+2]-R1_f[i+2]),cimag(R1[i+2]-R1_f[i+2]),creal(R1[i+3]-R1_f[i+3]),cimag(R1[i+3]-R1_f[i+3]),
219 creal(R1[i+4]-R1_f[i+4]),cimag(R1[i+4]-R1_f[i+4]),creal(R1[i+5]-R1_f[i+5]),cimag(R1[i+5]-R1_f[i+5]),
220 creal(R1[i+6]-R1_f[i+6]),cimag(R1[i+6]-R1_f[i+6]),creal(R1[i+7]-R1_f[i+7]),cimag(R1[i+7]-R1_f[i+7]));
221 }
222 fclose(output_old); fclose(output_f_old);
223 Dslash(xi,R1,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa);
224#ifdef __NVCC__
225 cudaMemPrefetchAsync(xi,kferm*sizeof(Complex),device,NULL);
226 //Transpose needed here for Dslash_f
227 Transpose_c(xi_f,ngorkov*nc,kvol,dimGrid,dimBlock);
228 Transpose_c(R1_f,ngorkov*nc,kvol,dimGrid,dimBlock);
229 //Flip all the gauge fields around so memory is coalesced
230 Transpose_c(u11t_f,ndim,kvol,dimGrid,dimBlock);
231 Transpose_c(u12t_f,ndim,kvol,dimGrid,dimBlock);
232 cudaDeviceSynchronise();
233#endif
234 Dslash_f(xi_f,R1_f,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa);
235#ifdef __NVCC__
236 Transpose_c(xi_f,kvol,ngorkov*nc,dimGrid,dimBlock);
237 cudaMemPrefetchAsync(xi_f,kferm*sizeof(Complex_f),device,NULL);
238 Transpose_c(R1_f,kvol,ngorkov*nc,dimGrid,dimBlock);
239 Transpose_c(u11t_f,kvol,ndim,dimGrid,dimBlock);
240 Transpose_c(u12t_f,kvol,ndim,dimGrid,dimBlock);
241 cudaDeviceSynchronise();
242#endif
243 output = fopen("dslash", "w");
244 output_f = fopen("dslash_f", "w");
245 for(int i = 0; i< kferm; i+=8){
246 fprintf(output, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
247 creal(xi[i]),cimag(xi[i]),creal(xi[i+1]),cimag(xi[i+1]),
248 creal(xi[i+2]),cimag(xi[i+2]),creal(xi[i+3]),cimag(xi[i+3]),
249 creal(xi[i+4]),cimag(xi[i+4]),creal(xi[i+5]),cimag(xi[i+5]),
250 creal(xi[i+6]),cimag(xi[i+6]),creal(xi[i+7]),cimag(xi[i+7]) );
251 fprintf(output_f, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
252 creal(xi_f[i]),cimag(xi_f[i]),creal(xi_f[i+1]),cimag(xi_f[i+1]),
253 creal(xi_f[i+2]),cimag(xi_f[i+2]),creal(xi_f[i+3]),cimag(xi_f[i+3]),
254 creal(xi_f[i+4]),cimag(xi_f[i+4]),creal(xi_f[i+5]),cimag(xi_f[i+5]),
255 creal(xi_f[i+6]),cimag(xi_f[i+6]),creal(xi_f[i+7]),cimag(xi_f[i+7]));
256 printf("Difference in dslash double and float xi[%d] to xi[%d+7] after mult:\n",i,i);
257 printf("%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
258 creal(xi[i]-xi_f[i]),cimag(xi[i]-xi_f[i]),creal(xi[i+1]-xi_f[i+1]),cimag(xi[i+1]-xi_f[i+1]),
259 creal(xi[i+2]-xi_f[i+2]),cimag(xi[i+2]-xi_f[i+2]),creal(xi[i+3]-xi_f[i+3]),cimag(xi[i+3]-xi_f[i+3]),
260 creal(xi[i+4]-xi_f[i+4]),cimag(xi[i+4]-xi_f[i+4]),creal(xi[i+5]-xi_f[i+5]),cimag(xi[i+5]-xi_f[i+5]),
261 creal(xi[i+6]-xi_f[i+6]),cimag(xi[i+6]-xi_f[i+6]),creal(xi[i+7]-xi_f[i+7]),cimag(xi[i+7]-xi_f[i+7]));
262 }
263 fclose(output);fclose(output_f);
264 break;
265 case(1):
266#pragma omp parallel for simd
267 for(int i = 0; i< kferm; i++){
268 R1_f[i]=(Complex_f)R1[i];
269 xi_f[i]=(Complex_f)xi[i];
270 }
271 //NOTE: Each line corresponds to one lattice direction, in the form of colour 0, colour 1.
272 //Each block to one lattice site
273 output_old = fopen("dslashd_old", "w");
274 output_f_old = fopen("dslashd_f_old", "w");
275#ifdef __NVCC__
276 cudaMemPrefetchAsync(R1,kferm*sizeof(Complex),device,NULL);
277 cudaMemPrefetchAsync(xi,kferm*sizeof(Complex),device,NULL);
278 cudaMemPrefetchAsync(R1_f,kferm*sizeof(Complex_f),device,NULL);
279 cudaMemPrefetchAsync(xi_f,kferm*sizeof(Complex_f),device,NULL);
280 cudaDeviceSynchronise();
281#endif
282 for(int i = 0; i< kferm; i+=8){
283 fprintf(output_old, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
284 creal(R1[i]),cimag(R1[i]),creal(R1[i+1]),cimag(R1[i+1]),
285 creal(R1[i+2]),cimag(R1[i+2]),creal(R1[i+3]),cimag(R1[i+3]),
286 creal(R1[i+4]),cimag(R1[i+4]),creal(R1[i+5]),cimag(R1[i+5]),
287 creal(R1[i+6]),cimag(R1[i+6]),creal(R1[i+7]),cimag(R1[i+7]) );
288 fprintf(output_f_old, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
289 creal(R1_f[i]),cimag(R1_f[i]),creal(R1_f[i+1]),cimag(R1_f[i+1]),
290 creal(R1_f[i+2]),cimag(R1_f[i+2]),creal(R1_f[i+3]),cimag(R1_f[i+3]),
291 creal(R1_f[i+4]),cimag(R1_f[i+4]),creal(R1_f[i+5]),cimag(R1_f[i+5]),
292 creal(R1_f[i+6]),cimag(R1_f[i+6]),creal(R1_f[i+7]),cimag(R1_f[i+7]));
293 printf("Difference in dslashd double and float R1[%d] to R1[%d+7]:\n",i,i);
294 printf("%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
295 creal(R1[i]-R1_f[i]),cimag(R1[i]-R1_f[i]),creal(R1[i+1]-R1_f[i+1]),cimag(R1[i+1]-R1_f[i+1]),
296 creal(R1[i+2]-R1_f[i+2]),cimag(R1[i+2]-R1_f[i+2]),creal(R1[i+3]-R1_f[i+3]),cimag(R1[i+3]-R1_f[i+3]),
297 creal(R1[i+4]-R1_f[i+4]),cimag(R1[i+4]-R1_f[i+4]),creal(R1[i+5]-R1_f[i+5]),cimag(R1[i+5]-R1_f[i+5]),
298 creal(R1[i+6]-R1_f[i+6]),cimag(R1[i+6]-R1_f[i+6]),creal(R1[i+7]-R1_f[i+7]),cimag(R1[i+7]-R1_f[i+7]));
299 }
300#ifdef __NVCC__
301
302#endif
303 fclose(output_old); fclose(output_f_old);
304 Dslashd(xi,R1,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,jqq,akappa);
305#ifdef __NVCC__
306 cudaMemPrefetchAsync(xi,kferm*sizeof(Complex),device,NULL);
307 //Transpose needed here for Dslashd_f
308 Transpose_c(xi_f,ngorkov*nc,kvol,dimGrid,dimBlock);
309 Transpose_c(R1_f,ngorkov*nc,kvol,dimGrid,dimBlock);
310 //Flip all the gauge fields around so memory is coalesced
311 Transpose_c(u11t_f,ndim,kvol,dimGrid,dimBlock);
312 Transpose_c(u12t_f,ndim,kvol,dimGrid,dimBlock);
313 cudaDeviceSynchronise();
314#endif
315 Dslashd_f(xi_f,R1_f,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa);
316#ifdef __NVCC__
317 Transpose_c(xi_f,kvol,ngorkov*nc,dimGrid,dimBlock);
318 cudaMemPrefetchAsync(xi_f,kferm*sizeof(Complex_f),device,NULL);
319 Transpose_c(R1_f,kvol,ngorkov*nc,dimGrid,dimBlock);
320 Transpose_c(u11t_f,kvol,ndim,dimGrid,dimBlock);
321 Transpose_c(u12t_f,kvol,ndim,dimGrid,dimBlock);
322 cudaDeviceSynchronise();
323#endif
324 output = fopen("dslashd", "w");
325 output_f = fopen("dslashd_f", "w");
326 for(int i = 0; i< kferm; i+=8){
327 fprintf(output, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
328 creal(xi[i]),cimag(xi[i]),creal(xi[i+1]),cimag(xi[i+1]),
329 creal(xi[i+2]),cimag(xi[i+2]),creal(xi[i+3]),cimag(xi[i+3]),
330 creal(xi[i+4]),cimag(xi[i+4]),creal(xi[i+5]),cimag(xi[i+5]),
331 creal(xi[i+6]),cimag(xi[i+6]),creal(xi[i+7]),cimag(xi[i+7]) );
332 fprintf(output_f, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
333 creal(xi_f[i]),cimag(xi_f[i]),creal(xi_f[i+1]),cimag(xi_f[i+1]),
334 creal(xi_f[i+2]),cimag(xi_f[i+2]),creal(xi_f[i+3]),cimag(xi_f[i+3]),
335 creal(xi_f[i+4]),cimag(xi_f[i+4]),creal(xi_f[i+5]),cimag(xi_f[i+5]),
336 creal(xi_f[i+6]),cimag(xi_f[i+6]),creal(xi_f[i+7]),cimag(xi_f[i+7]));
337 printf("Difference in dslashd double and float xi[%d] to xi[%d+7] after mult:\n",i,i);
338 printf("%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
339 creal(xi[i]-xi_f[i]),cimag(xi[i]-xi_f[i]),creal(xi[i+1]-xi_f[i+1]),cimag(xi[i+1]-xi_f[i+1]),
340 creal(xi[i+2]-xi_f[i+2]),cimag(xi[i+2]-xi_f[i+2]),creal(xi[i+3]-xi_f[i+3]),cimag(xi[i+3]-xi_f[i+3]),
341 creal(xi[i+4]-xi_f[i+4]),cimag(xi[i+4]-xi_f[i+4]),creal(xi[i+5]-xi_f[i+5]),cimag(xi[i+5]-xi_f[i+5]),
342 creal(xi[i+6]-xi_f[i+6]),cimag(xi[i+6]-xi_f[i+6]),creal(xi[i+7]-xi_f[i+7]),cimag(xi[i+7]-xi_f[i+7]));
343 }
344 fclose(output);fclose(output_f);
345 break;
346 case(2):
347 //NOTE: Each line corresponds to one lattice direction, in the form of colour 0, colour 1.
348 //Each block to one lattice site
349#pragma omp parallel for simd
350 for(int i = 0; i< kferm2; i++){
351 X0_f[i]=(Complex_f)X0[i];
352 X1_f[i]=(Complex_f)X1[i];
353 }
354#ifdef __NVCC__
355 cudaMemPrefetchAsync(X0,kferm2*sizeof(Complex),device,NULL);
356 cudaMemPrefetchAsync(X1,kferm2*sizeof(Complex),device,NULL);
357 cudaMemPrefetchAsync(X0_f,kferm2*sizeof(Complex_f),device,NULL);
358 cudaMemPrefetchAsync(X1_f,kferm2*sizeof(Complex_f),device,NULL);
359#endif
360 output_old = fopen("hdslash_old", "w");output_f_old = fopen("hdslash_f_old", "w");
361 for(int i = 0; i< kferm2; i+=8){
362 fprintf(output_old, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
363 creal(X0[i]),cimag(X0[i]),creal(X0[i+1]),cimag(X0[i+1]),
364 creal(X0[i+2]),cimag(X0[i+2]),creal(X0[i+3]),cimag(X0[i+3]),
365 creal(X0[i+4]),cimag(X0[i+4]),creal(X0[i+5]),cimag(X0[i+5]),
366 creal(X0[i+6]),cimag(X0[i+6]),creal(X0[i+7]),cimag(X0[i+7]));
367 fprintf(output_f_old, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
368 creal(X0_f[i]),cimag(X0_f[i]),creal(X0_f[i+1]),cimag(X0_f[i+1]),
369 creal(X0_f[i+2]),cimag(X0_f[i+2]),creal(X0_f[i+3]),cimag(X0_f[i+3]),
370 creal(X0_f[i+4]),cimag(X0_f[i+4]),creal(X0_f[i+5]),cimag(X0_f[i+5]),
371 creal(X0_f[i+6]),cimag(X0_f[i+6]),creal(X0_f[i+7]),cimag(X0_f[i+7]));
372 printf("Difference in hdslash double and float X0[%d] to X0[%d+7]:\n",i,i);
373 printf("%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
374 creal(X0[i]-X0_f[i]),cimag(X0[i]-X0_f[i]),creal(X0[i+1]-X0_f[i+1]),cimag(X0[i+1]-X0_f[i+1]),
375 creal(X0[i+2]-X0_f[i+2]),cimag(X0[i+2]-X0_f[i+2]),creal(X0[i+3]-X0_f[i+3]),cimag(X0[i+3]-X0_f[i+3]),
376 creal(X0[i+4]-X0_f[i+4]),cimag(X0[i+4]-X0_f[i+4]),creal(X0[i+5]-X0_f[i+5]),cimag(X0[i+5]-X0_f[i+5]),
377 creal(X0[i+6]-X0_f[i+6]),cimag(X0[i+6]-X0_f[i+6]),creal(X0[i+7]-X0_f[i+7]),cimag(X0[i+7]-X0_f[i+7]));
378 }
379 fclose(output_old);fclose(output_f_old);
380 #ifdef __NVCC__
381 //Transpose needed here for Dslashd
382 Transpose_c(X0_f,ndirac*nc,kvol,dimGrid,dimBlock);
383 Transpose_c(X1_f,ndirac*nc,kvol,dimGrid,dimBlock);
384 //Flip all the gauge fields around so memory is coalesced
385 Transpose_c(u11t_f,ndim,kvol,dimGrid,dimBlock);
386 Transpose_c(u12t_f,ndim,kvol,dimGrid,dimBlock);
387 #endif
388 Hdslash(X1,X0,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,akappa);
389 Hdslash_f(X1_f,X0_f,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,akappa);
390#ifdef __NVCC__
391 Transpose_c(X0_f,kvol,ndirac*nc,dimGrid,dimBlock);
392 Transpose_c(X1_f,kvol,ndirac*nc,dimGrid,dimBlock);
393 Transpose_c(u11t_f,kvol,ndim,dimGrid,dimBlock);
394 Transpose_c(u12t_f,kvol,ndim,dimGrid,dimBlock);
395 cudaDeviceSynchronise();
396#endif
397 output = fopen("hdslash", "w"); output_f = fopen("hdslash_f", "w");
398 for(int i = 0; i< kferm2; i+=8){
399 fprintf(output, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
400 creal(X1[i]),cimag(X1[i]),creal(X1[i+1]),cimag(X1[i+1]),
401 creal(X1[i+2]),cimag(X1[i+2]),creal(X1[i+3]),cimag(X1[i+3]),
402 creal(X1[i+4]),cimag(X1[i+4]),creal(X1[i+5]),cimag(X1[i+5]),
403 creal(X1[i+6]),cimag(X1[i+6]),creal(X1[i+7]),cimag(X1[i+7]));
404 fprintf(output_f, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
405 creal(X1_f[i]),cimag(X1_f[i]),creal(X1_f[i+1]),cimag(X1_f[i+1]),
406 creal(X1_f[i+2]),cimag(X1_f[i+2]),creal(X1_f[i+3]),cimag(X1_f[i+3]),
407 creal(X1_f[i+4]),cimag(X1_f[i+4]),creal(X1_f[i+5]),cimag(X1_f[i+5]),
408 creal(X1_f[i+6]),cimag(X1_f[i+6]),creal(X1_f[i+7]),cimag(X1_f[i+7]));
409 printf("Difference in hdslash double and float X1[%d] to X1[%d+7] after mult.:\n",i,i);
410 printf("%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
411 creal(X1[i]-X1_f[i]),cimag(X1[i]-X1_f[i]),creal(X1[i+1]-X1_f[i+1]),cimag(X1[i+1]-X1_f[i+1]),
412 creal(X1[i+2]-X1_f[i+2]),cimag(X1[i+2]-X1_f[i+2]),creal(X1[i+3]-X1_f[i+3]),cimag(X1[i+3]-X1_f[i+3]),
413 creal(X1[i+4]-X1_f[i+4]),cimag(X1[i+4]-X1_f[i+4]),creal(X1[i+5]-X1_f[i+5]),cimag(X1[i+5]-X1_f[i+5]),
414 creal(X1[i+6]-X1_f[i+6]),cimag(X1[i+6]-X1_f[i+6]),creal(X1[i+7]-X1_f[i+7]),cimag(X1[i+7]-X1_f[i+7]));
415 }
416 fclose(output);fclose(output_f);
417 break;
418 case(3):
419 //NOTE: Each line corresponds to one lattice direction, in the form of colour 0, colour 1.
420 //Each block to one lattice site
421 for(int i = 0; i< kferm2; i++){
422 X0_f[i]=(Complex_f)X0[i];
423 X1_f[i]=(Complex_f)X1[i];
424 }
425#ifdef __NVCC__
426 cudaMemPrefetchAsync(X0,kferm2*sizeof(Complex),device,NULL);
427 cudaMemPrefetchAsync(X1,kferm2*sizeof(Complex),device,NULL);
428 cudaMemPrefetchAsync(X0_f,kferm2*sizeof(Complex_f),device,NULL);
429 cudaMemPrefetchAsync(X1_f,kferm2*sizeof(Complex_f),device,NULL);
430#endif
431 output_old = fopen("hdslashd_old", "w");output_f_old = fopen("hdslashd_f_old", "w");
432 for(int i = 0; i< kferm2; i+=8){
433 fprintf(output_old, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
434 creal(X0[i]),cimag(X0[i]),creal(X0[i+1]),cimag(X0[i+1]),
435 creal(X0[i+2]),cimag(X0[i+2]),creal(X0[i+3]),cimag(X0[i+3]),
436 creal(X0[i+4]),cimag(X0[i+4]),creal(X0[i+5]),cimag(X0[i+5]),
437 creal(X0[i+6]),cimag(X0[i+6]),creal(X0[i+7]),cimag(X0[i+7]));
438 fprintf(output_f_old, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
439 creal(X0_f[i]),cimag(X0_f[i]),creal(X0_f[i+1]),cimag(X0_f[i+1]),
440 creal(X0_f[i+2]),cimag(X0_f[i+2]),creal(X0_f[i+3]),cimag(X0_f[i+3]),
441 creal(X0_f[i+4]),cimag(X0_f[i+4]),creal(X0_f[i+5]),cimag(X0_f[i+5]),
442 creal(X0_f[i+6]),cimag(X0_f[i+6]),creal(X0_f[i+7]),cimag(X0_f[i+7]));
443 printf("Difference in hdslashd double and float X0[%d] to X0[%d+7]:\n",i,i);
444 printf("%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
445 creal(X0[i]-X0_f[i]),cimag(X0[i]-X0_f[i]),creal(X0[i+1]-X0_f[i+1]),cimag(X0[i+1]-X0_f[i+1]),
446 creal(X0[i+2]-X0_f[i+2]),cimag(X0[i+2]-X0_f[i+2]),creal(X0[i+3]-X0_f[i+3]),cimag(X0[i+3]-X0_f[i+3]),
447 creal(X0[i+4]-X0_f[i+4]),cimag(X0[i+4]-X0_f[i+4]),creal(X0[i+5]-X0_f[i+5]),cimag(X0[i+5]-X0_f[i+5]),
448 creal(X0[i+6]-X0_f[i+6]),cimag(X0[i+6]-X0_f[i+6]),creal(X0[i+7]-X0_f[i+7]),cimag(X0[i+7]-X0_f[i+7]));
449 }
450 #ifdef __NVCC__
451 //Transpose needed here for Dslashd
452 Transpose_c(X0_f,ndirac*nc,kvol,dimGrid,dimBlock);
453 Transpose_c(X1_f,ndirac*nc,kvol,dimGrid,dimBlock);
454 //Flip all the gauge fields around so memory is coalesced
455 Transpose_c(u11t_f,ndim,kvol,dimGrid,dimBlock);
456 Transpose_c(u12t_f,ndim,kvol,dimGrid,dimBlock);
457 #endif
458 Hdslashd(X1,X0,u11t,u12t,iu,id,gamval,gamin,dk4m,dk4p,akappa);
459 Hdslashd_f(X1_f,X0_f,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,akappa);
460#ifdef __NVCC__
461 Transpose_c(X0_f,kvol,ndirac*nc,dimGrid,dimBlock);
462 Transpose_c(X1_f,kvol,ndirac*nc,dimGrid,dimBlock);
463 Transpose_c(u11t_f,kvol,ndim,dimGrid,dimBlock);
464 Transpose_c(u12t_f,kvol,ndim,dimGrid,dimBlock);
465 cudaDeviceSynchronise();
466#endif
467 output = fopen("hdslashd", "w"); output_f = fopen("hdslashd_f", "w");
468 for(int i = 0; i< kferm2; i+=8){
469 fprintf(output, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
470 creal(X1[i]),cimag(X1[i]),creal(X1[i+1]),cimag(X1[i+1]),
471 creal(X1[i+2]),cimag(X1[i+2]),creal(X1[i+3]),cimag(X1[i+3]),
472 creal(X1[i+4]),cimag(X1[i+4]),creal(X1[i+5]),cimag(X1[i+5]),
473 creal(X1[i+6]),cimag(X1[i+6]),creal(X1[i+7]),cimag(X1[i+7]));
474 fprintf(output_f, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
475 creal(X1_f[i]),cimag(X1_f[i]),creal(X1_f[i+1]),cimag(X1_f[i+1]),
476 creal(X1_f[i+2]),cimag(X1_f[i+2]),creal(X1_f[i+3]),cimag(X1_f[i+3]),
477 creal(X1_f[i+4]),cimag(X1_f[i+4]),creal(X1_f[i+5]),cimag(X1_f[i+5]),
478 creal(X1_f[i+6]),cimag(X1_f[i+6]),creal(X1_f[i+7]),cimag(X1_f[i+7]));
479 printf("Difference in hdslashd double and float X1[%d] to X1[%d+7] after mult.:\n",i,i);
480 printf("%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
481 creal(X1[i]-X1_f[i]),cimag(X1[i]-X1_f[i]),creal(X1[i+1]-X1_f[i+1]),cimag(X1[i+1]-X1_f[i+1]),
482 creal(X1[i+2]-X1_f[i+2]),cimag(X1[i+2]-X1_f[i+2]),creal(X1[i+3]-X1_f[i+3]),cimag(X1[i+3]-X1_f[i+3]),
483 creal(X1[i+4]-X1_f[i+4]),cimag(X1[i+4]-X1_f[i+4]),creal(X1[i+5]-X1_f[i+5]),cimag(X1[i+5]-X1_f[i+5]),
484 creal(X1[i+6]-X1_f[i+6]),cimag(X1[i+6]-X1_f[i+6]),creal(X1[i+7]-X1_f[i+7]),cimag(X1[i+7]-X1_f[i+7]));
485 }
486 fclose(output);fclose(output_f);
487 break;
488 case(4):
489 output_old = fopen("hamiltonian_old", "w");
490 for(int i = 0; i< kferm2; i+=8){
491 fprintf(output_old, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
492 creal(X1[i]),cimag(X1[i]),creal(X1[i+1]),cimag(X1[i+1]),
493 creal(X1[i+2]),cimag(X1[i+2]),creal(X1[i+3]),cimag(X1[i+3]),
494 creal(X1[i+4]),cimag(X1[i+4]),creal(X1[i+5]),cimag(X1[i+5]),
495 creal(X1[i+6]),cimag(X1[i+6]),creal(X1[i+7]),cimag(X1[i+7]));
496 }
497 fclose(output_old);
498 output = fopen("hamiltonian", "w");
499 double h,s,ancgh; h=s=ancgh=0;
500 Hamilton(&h,&s,rescgg,pp,X0,X1,Phi,u11t,u12t,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,\
501 akappa,beta,&ancgh,0);
502 for(int i = 0; i< kferm2; i+=8){
503 fprintf(output, "%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n%.5f+%.5fI\t%.5f+%.5fI\n\n",
504 creal(X1[i]),cimag(X1[i]),creal(X1[i+1]),cimag(X1[i+1]),
505 creal(X1[i+2]),cimag(X1[i+2]),creal(X1[i+3]),cimag(X1[i+3]),
506 creal(X1[i+4]),cimag(X1[i+4]),creal(X1[i+5]),cimag(X1[i+5]),
507 creal(X1[i+6]),cimag(X1[i+6]),creal(X1[i+7]),cimag(X1[i+7]));
508 }
509 fclose(output);
510 break;
511 //Two force cases because of the flag. This also tests the conjugate gradient works okay
512 case(5):
513 output_old = fopen("force_0_old", "w");
514 for(int i = 0; i< kmom; i+=4)
515 fprintf(output_old, "%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i], dSdpi[i+1], dSdpi[i+2], dSdpi[i+3]);
516 fclose(output_old);
517 Force(dSdpi, 1, rescgg,X0,X1,Phi,u11t,u12t,u11t_f,u12t_f,iu,id,gamval,gamval_f,gamin,dk4m,dk4p,\
518 dk4m_f,dk4p_f,jqq,akappa,beta,&ancg);
519 output = fopen("force_0", "w");
520 for(int i = 0; i< kmom; i+=4)
521 fprintf(output, "%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i], dSdpi[i+1], dSdpi[i+2], dSdpi[i+3]);
522 fclose(output);
523 break;
524 case(6):
525 output_old = fopen("force_1_old", "w");
526 for(int i = 0; i< kmom; i+=4)
527 fprintf(output_old, "%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i], dSdpi[i+1], dSdpi[i+2], dSdpi[i+3]);
528 fclose(output_old);
529 output = fopen("force_1", "w");
530 Force(dSdpi, 0, rescgg,X0,X1,Phi,u11t,u12t,u11t_f,u12t_f,iu,id,gamval,gamval_f,gamin,dk4m,dk4p,\
531 dk4m_f,dk4p_f,jqq,akappa,beta,&ancg);
532 for(int i = 0; i< kmom; i+=4)
533 fprintf(output, "%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i], dSdpi[i+1], dSdpi[i+2], dSdpi[i+3]);
534 fclose(output);
535 break;
536 case(7):
537 output_old = fopen("Gauge_Force_old","w");
538 for(int i = 0; i< kmom; i+=12){
539 fprintf(output_old, "%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i], dSdpi[i+1], dSdpi[i+2], dSdpi[i+3]);
540 fprintf(output_old, "%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i+4], dSdpi[i+5], dSdpi[i+6], dSdpi[i+7]);
541 fprintf(output_old, "%.5f\t%.5f\t%.5f\t%.5f\n\n", dSdpi[i+8], dSdpi[i+9], dSdpi[i+10], dSdpi[i+11]);
542 }
543 fclose(output_old);
544#ifdef __NVCC__
545 cudaMemPrefetchAsync(dSdpi,kmom*sizeof(double),device,NULL);
546#endif
547 Gauge_force(dSdpi,u11t_f,u12t_f,iu,id,beta);
548#ifdef __NVCC__
549 cudaDeviceSynchronise();
550#endif
551 output = fopen("Gauge_Force","w");
552 for(int i = 0; i< kmom; i+=12){
553 fprintf(output, "%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i], dSdpi[i+1], dSdpi[i+2], dSdpi[i+3]);
554 fprintf(output, "%.5f\t%.5f\t%.5f\t%.5f\n", dSdpi[i+4], dSdpi[i+5], dSdpi[i+6], dSdpi[i+7]);
555 fprintf(output, "%.5f\t%.5f\t%.5f\t%.5f\n\n", dSdpi[i+8], dSdpi[i+9], dSdpi[i+10], dSdpi[i+11]);
556 }
557 fclose(output);
558 break;
559 case(8):
560 int na=0;
561 output_old = fopen("PreUpDownPart","w");
562 for(int i=0; i<kferm; i+=2)
563 fprintf(output_old,"R1[%d]:\t%.5e+%.5ei\tR1[%d]:\t%.5e+%.5ei\n",\
564 i,creal(R1[i]),cimag(R1[i]),i+1,creal(R1[i+1]),cimag(R1[i+1]));
565 UpDownPart(na,X0,R1);
566 fclose(output_old);
567 output = fopen("UpDownPart","w");
568 for(int i=0; i<kferm2; i+=2)
569 fprintf(output,"X0[%d]:\t%.5e+%.5ei\tX0[%d]:\t%.5e+%.5ei\n",\
570 i,creal(X0[i]),cimag(X0[i]),i+1,creal(X0[i+1]),cimag(X0[i+1]));
571
572 fclose(output);
573 break;
574 case(9):
575 int itercg=0;
576 Congradp(0, respbp, Phi, R1,u11t_f,u12t_f,iu,id,gamval_f,gamin,dk4m_f,dk4p_f,jqq,akappa,&itercg);
577
578 }
579 }
580 //George Michael's favourite bit of the code
581#ifdef __NVCC__
582 //Make a routine that does this for us
583 cudaFree(dk4m); cudaFree(dk4p); cudaFree(R1); cudaFree(dSdpi); cudaFree(pp);
584 cudaFree(Phi); cudaFree(u11t); cudaFree(u12t);
585 cudaFree(X0); cudaFree(X1); cudaFree(u11); cudaFree(u12);
586 cudaFree(X0_f); cudaFree(X1_f); cudaFree(u11t_f); cudaFree(u12t_f);
587 cudaFree(id); cudaFree(iu); cudaFree(hd); cudaFree(hu);
588#else
589 free(dk4m); free(dk4p); free(R1); free(dSdpi); free(pp);
590 free(Phi); free(u11t); free(u12t); free(xi);
591 free(X0); free(X1); free(u11); free(u12);
592 free(id); free(iu); free(hd); free(hu);
593 free(pcoord);
594#endif
595
596#if(nproc>1)
597 MPI_Finalise();
598#endif
599 exit(0);
600}
601#endif
unsigned int * hd
Down halo indices.
Definition coord.c:15
unsigned int * hu
Up halo indices.
Definition coord.c:15
Matrix multiplication and related declarations.
int Dslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa)
Evaluates in double precision.
Definition matrices.c:18
int Dslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:462
int Dslashd(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, Complex_f jqq, float akappa)
Evaluates in double precision.
Definition matrices.c:140
int Hdslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates in single precision.
Definition matrices.c:802
int Hdslash(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
Evaluates in double precision.
Definition matrices.c:267
int Dslashd_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa)
Evaluates in single precision.
Definition matrices.c:584
int Hdslash_f(Complex_f *phi, Complex_f *r, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, float akappa)
Evaluates in single precision.
Definition matrices.c:711
int Hdslashd(Complex *phi, Complex *r, Complex *u11t, Complex *u12t, unsigned int *iu, unsigned int *id, Complex *gamval, int *gamin, double *dk4m, double *dk4p, float akappa)
Evaluates in double precision.
Definition matrices.c:358
int Trial_Exchange(Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f)
Exchanges the trial fields.
Definition par_mpi.c:1178
#define MPI_Finalise()
Avoid any accidents with US/UK spelling.
Definition par_mpi.h:30
int * pcoord
The processor grid.
Definition par_mpi.c:19
int Gauss_z(Complex *ps, unsigned int n, const Complex mu, const double sigma)
Generates a vector of normally distributed random double precision complex numbers using the Box-Mull...
Definition random.c:214
int Gauss_c(Complex_f *ps, unsigned int n, const Complex_f mu, const float sigma)
Generates a vector of normally distributed random single precision complex numbers using the Box-Mull...
Definition random.c:260
int Gauss_d(double *ps, unsigned int n, const double mu, const double sigma)
Generates a vector of normally distributed random double precision numbers using the Box-Muller Metho...
Definition random.c:306
#define AVX
Alignment of arrays. 64 for AVX-512, 32 for AVX/AVX2. 16 for SSE. Since AVX is standard on modern x86...
Definition sizes.h:268
#define nc
Colours.
Definition sizes.h:173
#define rescgg
Conjugate gradient residue for update.
Definition sizes.h:240
#define kmom
sublattice momentum sites
Definition sizes.h:184
#define ngorkov
Gor'kov indices.
Definition sizes.h:181
#define kferm2Halo
Dirac lattice and halo.
Definition sizes.h:227
#define kvol
Sublattice volume.
Definition sizes.h:154
#define Complex
Double precision complex number.
Definition sizes.h:58
#define kferm
sublattice size including Gor'kov indices
Definition sizes.h:186
#define nf
Fermion flavours (double it)
Definition sizes.h:151
#define ndirac
Dirac indices.
Definition sizes.h:177
#define respbp
Conjugate gradient residue for .
Definition sizes.h:238
#define kmomHalo
Momentum lattice and halo.
Definition sizes.h:229
#define halo
Total Halo size.
Definition sizes.h:222
#define Complex_f
Single precision complex number.
Definition sizes.h:56
#define ndim
Dimensions.
Definition sizes.h:179
#define kferm2
sublattice size including Dirac indices
Definition sizes.h:188
#define kfermHalo
Gor'kov lattice and halo.
Definition sizes.h:225
int Force(double *dSdpi, int iflag, double res1, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex *gamval, Complex_f *gamval_f, int *gamin, double *dk4m, double *dk4p, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancg)
Calculates the force at each intermediate time.
Definition force.c:131
int Gauge_force(double *dSdpi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, float beta)
Calculates the gauge force due to the Wilson Action at each intermediate time.
Definition force.c:6
int Hamilton(double *h, double *s, double res2, double *pp, Complex *X0, Complex *X1, Complex *Phi, Complex *u11t, Complex *u12t, Complex_f *u11t_f, Complex_f *u12t_f, unsigned int *iu, unsigned int *id, Complex_f *gamval_f, int *gamin, float *dk4m_f, float *dk4p_f, Complex_f jqq, float akappa, float beta, double *ancgh, int traj)
Calculate the Hamiltonian.
Definition su2hmc.c:208
int Reunitarise(Complex *u11t, Complex *u12t)
Reunitarises u11t and u12t as in conj(u11t[i])*u11t[i]+conj(u12t[i])*u12t[i]=1.
Definition matrices.c:904
int Congradp(int na, double res, Complex *Phi, Complex *xi, Complex_f *u11t, Complex_f *u12t, unsigned int *iu, unsigned int *id, Complex_f *gamval, int *gamin, float *dk4m, float *dk4p, Complex_f jqq, float akappa, int *itercg)
Matrix Inversion via Conjugate Gradient (no up/down flavour partitioning). Solves The matrix multipl...
Definition congrad.c:262