#ifndef DPHI_GPU_KERNELS_HPP
#define DPHI_GPU_KERNELS_HPP

#include "libhr_core.h"
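/*
 * Hopping-term macros for the GPU Wilson-Dirac operator.  For each direction
 * and orientation (UP/DN) a macro reads the two relevant spin components of
 * the neighbouring spinor, applies the (1 -/+ gamma_mu) projector, multiplies
 * by the gauge link (the _suNf_theta_* wrappers also apply the theta
 * boundary-condition phase when enabled) and accumulates the result, scaled
 * by -1/2 either directly or in write_out, into the output spinor.
 */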
/* The original header selects between the per-site and the fused (per colour
   component) variants of these macros and kernels with a preprocessor
   conditional; the guard name DPHI_FUSED_KERNEL used here is an assumption. */
#ifndef DPHI_FUSED_KERNEL

#define DPHI_T_UP_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 0, UP); \
    _vector_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_T_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_mul_add_assign_f((r).c[2], -0.5, (sn).c[1]); \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    _vector_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_T_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_mul_add_assign_f((r).c[3], -0.5, (sn).c[1]);

#define DPHI_T_DN_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 0, DOWN); \
    _vector_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_T_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_mul_sub_assign_f((r).c[2], -0.5, (sn).c[1]); \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    _vector_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_T_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_mul_sub_assign_f((r).c[3], -0.5, (sn).c[1]);
#define DPHI_X_UP_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 1, UP); \
    _vector_i_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_X_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_i_mul_sub_assign_f((r).c[3], -0.5, (sn).c[1]); \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    _vector_i_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_X_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_i_mul_sub_assign_f((r).c[2], -0.5, (sn).c[1]);

#define DPHI_X_DN_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 1, DOWN); \
    _vector_i_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_X_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_i_mul_add_assign_f((r).c[3], -0.5, (sn).c[1]); \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    _vector_i_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_X_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_i_mul_add_assign_f((r).c[2], -0.5, (sn).c[1]);
#define DPHI_Y_UP_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 2, UP); \
    _vector_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Y_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_mul_add_assign_f((r).c[3], -0.5, (sn).c[1]); \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    _vector_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Y_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_mul_sub_assign_f((r).c[2], -0.5, (sn).c[1]);

#define DPHI_Y_DN_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 2, DOWN); \
    _vector_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Y_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_mul_sub_assign_f((r).c[3], -0.5, (sn).c[1]); \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    _vector_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Y_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_mul_add_assign_f((r).c[2], -0.5, (sn).c[1]);
#define DPHI_Z_UP_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 3, UP); \
    _vector_i_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Z_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_i_mul_sub_assign_f((r).c[2], -0.5, (sn).c[1]); \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    _vector_i_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Z_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_i_mul_add_assign_f((r).c[3], -0.5, (sn).c[1]);

#define DPHI_Z_DN_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 3, DOWN); \
    _vector_i_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Z_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_i_mul_add_assign_f((r).c[2], -0.5, (sn).c[1]); \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    _vector_i_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Z_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_i_mul_sub_assign_f((r).c[3], -0.5, (sn).c[1]);

#else
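/*
 * Fused variant of the hopping-term macros: the Dirac operator is evaluated
 * one colour component (tgt_comp) at a time.  prefetch_gauge loads, for a
 * given (tgt_comp, vcomp) pair, the eight link entries needed for the eight
 * directions into the register array u_cpx: entries 0, 2, 4, 6 come from the
 * links at site ix, entries 1, 3, 5, 7 from the down-neighbour sites with
 * row and column indices swapped (so that conjugation yields the adjoint link).
 */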
#define prefetch_gauge(ix, u_cpx, gauge, tgt_comp, vcomp, idn_gpu) \
    read_gpu<REAL>(0, &(u_cpx[0]), gauge, ix, NF * tgt_comp + vcomp, 4); \
    read_gpu<REAL>(0, &(u_cpx[2]), gauge, ix, NF * tgt_comp + vcomp + NF * NF, 4); \
    read_gpu<REAL>(0, &(u_cpx[4]), gauge, ix, NF * tgt_comp + vcomp + 2 * NF * NF, 4); \
    read_gpu<REAL>(0, &(u_cpx[6]), gauge, ix, NF * tgt_comp + vcomp + 3 * NF * NF, 4); \
    read_gpu<REAL>(0, &(u_cpx[1]), gauge, idn_gpu[4 * ix], NF * vcomp + tgt_comp, 4); \
    read_gpu<REAL>(0, &(u_cpx[3]), gauge, idn_gpu[4 * ix + 1], NF * vcomp + tgt_comp + NF * NF, 4); \
    read_gpu<REAL>(0, &(u_cpx[5]), gauge, idn_gpu[4 * ix + 2], NF * vcomp + tgt_comp + 2 * NF * NF, 4); \
    read_gpu<REAL>(0, &(u_cpx[7]), gauge, idn_gpu[4 * ix + 3], NF * vcomp + tgt_comp + 3 * NF * NF, 4);
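/* write_out scales the four accumulated spin components by -1/2 and stores
   them into colour component tgt_comp of the output spinor at site ix. */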
#define write_out(r0, r1, r2, r3, out, ix, tgt_comp) \
    _complex_mul(r0, -0.5, r0); \
    _complex_mul(r1, -0.5, r1); \
    _complex_mul(r2, -0.5, r2); \
    _complex_mul(r3, -0.5, r3); \
    write_gpu<REAL>(0, &r0, out, ix, tgt_comp, 1); \
    write_gpu<REAL>(0, &r1, out, ix, tgt_comp + NF, 1); \
    write_gpu<REAL>(0, &r2, out, ix, tgt_comp + 2 * NF, 1); \
    write_gpu<REAL>(0, &r3, out, ix, tgt_comp + 3 * NF, 1);
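/* Per-direction accumulation, one colour component at a time: each macro
   applies the spin projector to the neighbouring spinor components and
   multiplies by the prefetched link entry for that direction (u_cpx[0..7]),
   accumulating single complex components into r0..r3. */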
#define DPHI_T_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_mul_assign(r0, u_cpx[0], sn_cpx0); \
    _complex_mul_assign(r2, u_cpx[0], sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_mul_assign(r1, u_cpx[0], sn_cpx0); \
    _complex_mul_assign(r3, u_cpx[0], sn_cpx0);
/* The two variants of each DPHI_*_DN_GPU macro below differ only in whether the
   stored link is conjugated; the original header selects between them with a
   preprocessor conditional.  The guard REPR_IS_REAL (real representations need
   no conjugation) is an assumption. */
#ifdef REPR_IS_REAL
#define DPHI_T_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_mul_assign(r0, (u_cpx[1]), sn_cpx0); \
    _complex_mul_sub_assign(r2, (u_cpx[1]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_mul_assign(r1, (u_cpx[1]), sn_cpx0); \
    _complex_mul_sub_assign(r3, (u_cpx[1]), sn_cpx0);
#else
#define DPHI_T_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_mul_assign(r0, conj(u_cpx[1]), sn_cpx0); \
    _complex_mul_sub_assign(r2, conj(u_cpx[1]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_mul_assign(r1, conj(u_cpx[1]), sn_cpx0); \
    _complex_mul_sub_assign(r3, conj(u_cpx[1]), sn_cpx0);
#endif
#define DPHI_X_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    _complex_mul_assign(r0, u_cpx[2], sn_cpx0); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r3, u_cpx[2], sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    _complex_mul_assign(r1, u_cpx[2], sn_cpx0); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r2, u_cpx[2], sn_cpx0);
#ifdef REPR_IS_REAL
#define DPHI_X_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    _complex_mul_assign(r0, (u_cpx[3]), sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r3, (u_cpx[3]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    _complex_mul_assign(r1, (u_cpx[3]), sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r2, (u_cpx[3]), sn_cpx0);
#else
#define DPHI_X_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    _complex_mul_assign(r0, conj(u_cpx[3]), sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r3, conj(u_cpx[3]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    _complex_mul_assign(r1, conj(u_cpx[3]), sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r2, conj(u_cpx[3]), sn_cpx0);
#endif
#define DPHI_Y_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_mul_assign(r0, u_cpx[4], sn_cpx0); \
    _complex_mul_assign(r3, u_cpx[4], sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_mul_assign(r1, u_cpx[4], sn_cpx0); \
    _complex_mul_sub_assign(r2, u_cpx[4], sn_cpx0);
#ifdef REPR_IS_REAL
#define DPHI_Y_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_mul_assign(r0, (u_cpx[5]), sn_cpx0); \
    _complex_mul_sub_assign(r3, (u_cpx[5]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_mul_assign(r1, (u_cpx[5]), sn_cpx0); \
    _complex_mul_assign(r2, (u_cpx[5]), sn_cpx0);
#else
#define DPHI_Y_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_mul_assign(r0, conj(u_cpx[5]), sn_cpx0); \
    _complex_mul_sub_assign(r3, conj(u_cpx[5]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_mul_assign(r1, conj(u_cpx[5]), sn_cpx0); \
    _complex_mul_assign(r2, conj(u_cpx[5]), sn_cpx0);
#endif
#define DPHI_Z_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    _complex_mul_assign(r0, u_cpx[6], sn_cpx0); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r2, u_cpx[6], sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    _complex_mul_assign(r1, u_cpx[6], sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r3, u_cpx[6], sn_cpx0);
#ifdef REPR_IS_REAL
#define DPHI_Z_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    _complex_mul_assign(r0, (u_cpx[7]), sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r2, (u_cpx[7]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    _complex_mul_assign(r1, (u_cpx[7]), sn_cpx0); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r3, (u_cpx[7]), sn_cpx0);
#else
#define DPHI_Z_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    _complex_mul_assign(r0, conj(u_cpx[7]), sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r2, conj(u_cpx[7]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    _complex_mul_assign(r1, conj(u_cpx[7]), sn_cpx0); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r3, conj(u_cpx[7]), sn_cpx0);
#endif

#endif /* DPHI_FUSED_KERNEL (assumed guard) */
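/*
 * Boundary (halo) contributions: the receive buffers hold already
 * gauge-multiplied, spin-projected half spinors, so only the spin
 * reconstruction remains.  read_reduced fetches one half spinor from the
 * buffer piece starting at base_in, and the DPHI_RED_*_GPU macros accumulate
 * -1/2 times the reconstructed components into the output spinor r.
 */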
#define read_reduced(iy, in, sn, piece, base_in) \
    { \
        const int block_offset = base_in; \
        const HSPINOR_TYPE *in_offset = (HSPINOR_TYPE *)((in) + block_offset); \
        const int iy_loc = (iy) - block_offset; \
        read_gpu<REAL>(0, &(sn), in_offset, iy_loc, 0, 1); \
    }
#define DPHI_RED_T_UP_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[2], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_mul_add_assign_f(r.c[3], -0.5, sn.c[1]);

#define DPHI_RED_T_DN_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_mul_sub_assign_f(r.c[2], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_mul_sub_assign_f(r.c[3], -0.5, sn.c[1]);

#define DPHI_RED_X_UP_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_i_mul_sub_assign_f(r.c[3], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_i_mul_sub_assign_f(r.c[2], -0.5, sn.c[1]);

#define DPHI_RED_X_DN_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_i_mul_add_assign_f(r.c[3], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_i_mul_add_assign_f(r.c[2], -0.5, sn.c[1]);

#define DPHI_RED_Y_UP_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[3], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_mul_sub_assign_f(r.c[2], -0.5, sn.c[1]);

#define DPHI_RED_Y_DN_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_mul_sub_assign_f(r.c[3], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_mul_add_assign_f(r.c[2], -0.5, sn.c[1]);

#define DPHI_RED_Z_UP_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_i_mul_sub_assign_f(r.c[2], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_i_mul_add_assign_f(r.c[3], -0.5, sn.c[1]);

#define DPHI_RED_Z_DN_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_i_mul_add_assign_f(r.c[2], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_i_mul_sub_assign_f(r.c[3], -0.5, sn.c[1]);
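/*
 * Bulk kernel, per-site variant: one thread per output site accumulates the
 * hopping terms selected by the site mask imask_gpu and writes the resulting
 * spinor; contributions not selected here are added later by
 * Dphi_gpu_boundary_kernel.  A fused per-colour-component variant follows in
 * the #else branch.
 */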
#ifndef DPHI_FUSED_KERNEL

template <typename HSPINOR_TYPE, class REAL, typename COMPLEX, typename SITE_TYPE, typename GAUGE_TYPE>
__global__ void Dphi_gpu_inner_kernel(SITE_TYPE *in, SITE_TYPE *out, const GAUGE_TYPE *gauge, int vol_in, int vol_out,
                                      int base_in, int base_out, gd_type piece, char *imask_gpu, int *iup_gpu,
                                      int *idn_gpu) {
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < vol_out; id += gridDim.x * blockDim.x) {
        int ix = id + base_out;

        /* Site-local accumulators; declarations reconstructed from their use in the macros above. */
        SITE_TYPE r;
        HSPINOR_TYPE sn;
        GAUGE_TYPE u;
        _spinor_zero_f(r);

        if (imask_gpu[ix] & T_UP_MASK) {
            const int iy = iup_gpu[4 * ix];
            DPHI_T_UP_GPU(ix, iy, in, gauge, r, sn, u);
        }
        if (imask_gpu[ix] & T_DN_MASK) {
            const int iy = idn_gpu[4 * ix];
            DPHI_T_DN_GPU(ix, iy, in, gauge, r, sn, u);
        }
        if (imask_gpu[ix] & X_UP_MASK) {
            const int iy = iup_gpu[4 * ix + 1];
            DPHI_X_UP_GPU(ix, iy, in, gauge, r, sn, u);
        }
        if (imask_gpu[ix] & X_DN_MASK) {
            const int iy = idn_gpu[4 * ix + 1];
            DPHI_X_DN_GPU(ix, iy, in, gauge, r, sn, u);
        }
        if (imask_gpu[ix] & Y_UP_MASK) {
            const int iy = iup_gpu[4 * ix + 2];
            DPHI_Y_UP_GPU(ix, iy, in, gauge, r, sn, u);
        }
        if (imask_gpu[ix] & Y_DN_MASK) {
            const int iy = idn_gpu[4 * ix + 2];
            DPHI_Y_DN_GPU(ix, iy, in, gauge, r, sn, u);
        }
        if (imask_gpu[ix] & Z_UP_MASK) {
            const int iy = iup_gpu[4 * ix + 3];
            DPHI_Z_UP_GPU(ix, iy, in, gauge, r, sn, u);
        }
        if (imask_gpu[ix] & Z_DN_MASK) {
            const int iy = idn_gpu[4 * ix + 3];
            DPHI_Z_DN_GPU(ix, iy, in, gauge, r, sn, u);
        }
        write_out_spinor_field<REAL>(&r, out, ix);
    }
}
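/*
 * Bulk kernel, fused variant: threads are distributed over (site, colour
 * component) pairs.  SUBBLOCKS consecutive sites share the same colour
 * component, so consecutive threads touch consecutive sites; the gauge links
 * for the current (tgt_comp, vcomp) pair are prefetched into registers once
 * per vcomp iteration.
 */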
#else

template <typename HSPINOR_TYPE, class REAL, typename COMPLEX, typename SITE_TYPE, typename GAUGE_TYPE>
__global__ void Dphi_gpu_inner_kernel(SITE_TYPE *in, SITE_TYPE *out, const GAUGE_TYPE *gauge, int vol_in, int vol_out,
                                      int base_in, int base_out, gd_type piece, char *imask_gpu, int *iup_gpu,
                                      int *idn_gpu) {
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < vol_out * NF; id += gridDim.x * blockDim.x) {
        /* Largest block size <= SUBBLOCKS that divides the local volume
           (the decrement step is an assumed reconstruction). */
        int divider = SUBBLOCKS;
        while (vol_out % divider != 0) {
            divider--;
        }
        const int ix = id % divider + (id / (divider * NF)) * divider + base_out;
        const int tgt_comp = (id / divider) % NF;

        /* Register buffers and accumulators; declarations reconstructed from their use below. */
        COMPLEX sn_cpx0;
        COMPLEX u_cpx[8];
        COMPLEX r0, r1, r2, r3;
        r0 = 0;
        r1 = 0;
        r2 = 0;
        r3 = 0;

        for (int vcomp = 0; vcomp < NF; vcomp++) {
            prefetch_gauge(ix, u_cpx, gauge, tgt_comp, vcomp, idn_gpu);

            if (imask_gpu[ix] & T_UP_MASK) {
                const int iy = iup_gpu[4 * ix];
                DPHI_T_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }
            if (imask_gpu[ix] & T_DN_MASK) {
                const int iy = idn_gpu[4 * ix];
                DPHI_T_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }
            if (imask_gpu[ix] & X_UP_MASK) {
                const int iy = iup_gpu[4 * ix + 1];
                DPHI_X_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }
            if (imask_gpu[ix] & X_DN_MASK) {
                const int iy = idn_gpu[4 * ix + 1];
                DPHI_X_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }
            if (imask_gpu[ix] & Y_UP_MASK) {
                const int iy = iup_gpu[4 * ix + 2];
                DPHI_Y_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }
            if (imask_gpu[ix] & Y_DN_MASK) {
                const int iy = idn_gpu[4 * ix + 2];
                DPHI_Y_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }
            if (imask_gpu[ix] & Z_UP_MASK) {
                const int iy = iup_gpu[4 * ix + 3];
                DPHI_Z_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }
            if (imask_gpu[ix] & Z_DN_MASK) {
                const int iy = idn_gpu[4 * ix + 3];
                DPHI_Z_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }
        }
        write_out(r0, r1, r2, r3, out, ix, tgt_comp);
    }
}

#endif /* DPHI_FUSED_KERNEL (assumed guard) */
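/*
 * Boundary kernel: one thread per received half spinor.  The target inner
 * site ix is located with find_neighbor, the half spinor is read from the
 * receive buffer with read_reduced, and its contribution is accumulated into
 * out[ix] with an atomic add, since several buffer pieces may update the
 * same site.
 */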
template <typename HSPINOR_TYPE, class REAL, typename SITE_TYPE, typename GAUGE_TYPE>
__global__ void Dphi_gpu_boundary_kernel(SITE_TYPE *in, SITE_TYPE *out, const GAUGE_TYPE *gauge, int vol_in, int vol_out,
                                         int base_in, int base_out, gd_type piece, char mask, int *iup_gpu,
                                         int *idn_gpu) {
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < vol_in; id += gridDim.x * blockDim.x) {
        int iy = id + base_in;

        /* Target site and accumulators; declarations reconstructed from their use below. */
        int ix;
        SITE_TYPE r;
        HSPINOR_TYPE sn;
        _spinor_zero_f(r);

        if (mask & T_UP_MASK) {
            ix = find_neighbor(iy, DOWN, 0, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_T_UP_GPU(r, sn);
        }
        if (mask & T_DN_MASK) {
            ix = find_neighbor(iy, UP, 0, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_T_DN_GPU(r, sn);
        }
        if (mask & X_UP_MASK) {
            ix = find_neighbor(iy, DOWN, 1, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_X_UP_GPU(r, sn);
        }
        if (mask & X_DN_MASK) {
            ix = find_neighbor(iy, UP, 1, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_X_DN_GPU(r, sn);
        }
        if (mask & Y_UP_MASK) {
            ix = find_neighbor(iy, DOWN, 2, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_Y_UP_GPU(r, sn);
        }
        if (mask & Y_DN_MASK) {
            ix = find_neighbor(iy, UP, 2, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_Y_DN_GPU(r, sn);
        }
        if (mask & Z_UP_MASK) {
            ix = find_neighbor(iy, DOWN, 3, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_Z_UP_GPU(r, sn);
        }
        if (mask & Z_DN_MASK) {
            ix = find_neighbor(iy, UP, 3, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_Z_DN_GPU(r, sn);
        }
        write_assign_atomic_gpu<REAL>(0, &r, out, ix, 0, 1);
    }
}
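/*
 * Clover term application: out = (C(x) + mass) * in, accumulated into the
 * existing output when assign is set.  The clover matrix is stored as four
 * suNfc blocks per site; threads are distributed over (site, spin component,
 * colour component), with dir selecting the spin component.
 */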
template <typename VECTOR_TYPE, class REAL, typename COMPLEX, typename SITE_TYPE>
__global__ void Cphi_gpu_kernel_(SITE_TYPE *dptr, SITE_TYPE *sptr, suNfc *cl_term, double mass, int assign, int N) {
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < N * NF * 4; id += gridDim.x * blockDim.x) {
        const int dir = id / (N * NF);
        const int id2 = id % (N * NF);
        int divider = SUBBLOCKS;
        while (N % divider != 0) {
            divider--; /* assumed: shrink until it divides N */
        }
        const int tgt_comp = (id2 / divider) % NF;
        const int ix = id2 % divider + (id2 / (divider * NF)) * divider;

        /* Declarations and the per-colour accumulation below are reconstructed, assuming the
           usual 2x2 spin-block structure [[A, B], [B^dagger, -A]] of the stored clover term. */
        COMPLEX s0, s1, v1, v2, v3;
        hr_complex cl_0, cl_1;
        v3 = 0;

        if (dir == 0) {
            for (int vcomp = 0; vcomp < NF; vcomp++) {
                read_gpu<REAL>(0, &s0, sptr, ix, vcomp, 1);
                read_gpu<REAL>(0, &s1, sptr, ix, vcomp + NF, 1);
                read_gpu<double>(0, &cl_0, cl_term, ix, NF * tgt_comp + vcomp, 4);
                read_gpu<double>(0, &cl_1, cl_term, ix, NF * tgt_comp + vcomp + NF * NF, 4);
                v1 = cl_0 * s0;
                v2 = cl_1 * s1;
                v3 += v1 + v2;
                if (vcomp == tgt_comp) { v3 += mass * s0; }
            }
            if (assign) {
                write_assign_gpu<REAL>(0, &v3, dptr, ix, tgt_comp, 1);
            } else {
                write_gpu<REAL>(0, &v3, dptr, ix, tgt_comp, 1);
            }
        } else if (dir == 1) {
            for (int vcomp = 0; vcomp < NF; vcomp++) {
                read_gpu<REAL>(0, &s0, sptr, ix, vcomp, 1);
                read_gpu<REAL>(0, &s1, sptr, ix, vcomp + NF, 1);
                read_gpu<double>(0, &cl_0, cl_term, ix, NF * tgt_comp + vcomp, 4);
                read_gpu<double>(0, &cl_1, cl_term, ix, NF * vcomp + tgt_comp + NF * NF, 4);
                v1 = conj(cl_1) * s0;
                v2 = cl_0 * s1;
                v3 += v1 - v2;
                if (vcomp == tgt_comp) { v3 += mass * s1; }
            }
            if (assign) {
                write_assign_gpu<REAL>(0, &v3, dptr, ix, tgt_comp + NF, 1);
            } else {
                write_gpu<REAL>(0, &v3, dptr, ix, tgt_comp + NF, 1);
            }
        } else if (dir == 2) {
            for (int vcomp = 0; vcomp < NF; vcomp++) {
                read_gpu<REAL>(0, &s0, sptr, ix, vcomp + 2 * NF, 1);
                read_gpu<REAL>(0, &s1, sptr, ix, vcomp + 3 * NF, 1);
                read_gpu<double>(0, &cl_0, cl_term, ix, NF * tgt_comp + vcomp + 2 * NF * NF, 4);
                read_gpu<double>(0, &cl_1, cl_term, ix, NF * tgt_comp + vcomp + 3 * NF * NF, 4);
                v1 = cl_0 * s0;
                v2 = cl_1 * s1;
                v3 += v1 + v2;
                if (vcomp == tgt_comp) { v3 += mass * s0; }
            }
            if (assign) {
                write_assign_gpu<REAL>(0, &v3, dptr, ix, tgt_comp + 2 * NF, 1);
            } else {
                write_gpu<REAL>(0, &v3, dptr, ix, tgt_comp + 2 * NF, 1);
            }
        } else {
            for (int vcomp = 0; vcomp < NF; vcomp++) {
                read_gpu<REAL>(0, &s0, sptr, ix, vcomp + 2 * NF, 1);
                read_gpu<REAL>(0, &s1, sptr, ix, vcomp + 3 * NF, 1);
                read_gpu<double>(0, &cl_0, cl_term, ix, NF * tgt_comp + vcomp + 2 * NF * NF, 4);
                read_gpu<double>(0, &cl_1, cl_term, ix, NF * vcomp + tgt_comp + 3 * NF * NF, 4);
                v1 = conj(cl_1) * s0;
                v2 = cl_0 * s1;
                v3 += v1 - v2;
                if (vcomp == tgt_comp) { v3 += mass * s1; }
            }
            if (assign) {
                write_assign_gpu<REAL>(0, &v3, dptr, ix, tgt_comp + 3 * NF, 1);
            } else {
                write_gpu<REAL>(0, &v3, dptr, ix, tgt_comp + 3 * NF, 1);
            }
        }
    }
}
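/*
 * Clover-inverse application via the precomputed LDL^dagger decomposition of
 * the two 2NF x 2NF spin blocks (stored per site in ldl_gpu as packed lower
 * triangles): forward substitution, diagonal scaling and back substitution
 * are performed in place on the spinor components.
 */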
template <typename VECTOR_TYPE, typename COMPLEX, class REAL, typename SITE_TYPE>
__global__ void Cphi_inv_kernel_(SITE_TYPE *dptr, SITE_TYPE *sptr, ldl_t *ldl_gpu, int mass, int assign, int N) {
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += gridDim.x * blockDim.x) {
        hr_complex *up, *dn, c;
        int n;

        /* Reconstructed: the site LDL factors and the spinor are loaded into registers,
           and x aliases the spinor components so the solve can run in place. */
        ldl_t site_ldl;
        SITE_TYPE in, out;
        read_gpu<double>(0, &site_ldl, ldl_gpu, ix, 0, 1);
        read_gpu<REAL>(0, &in, sptr, ix, 0, 1);
        up = site_ldl.up;
        dn = site_ldl.dn;
        COMPLEX *x = (COMPLEX *)&in;

        /* Forward substitution with the lower-triangular factors. */
        for (int i = 0; i < 2 * NF; i++) {
            for (int k = 0; k < i; k++) {
                n = i * (i + 1) / 2 + k;
                _complex_mul_sub_assign(x[i], (COMPLEX)up[n], x[k]);
                _complex_mul_sub_assign(x[i + 2 * NF], (COMPLEX)dn[n], x[k + 2 * NF]);
            }
        }

        /* Diagonal scaling and back substitution with the adjoint factors. */
        for (int i = 2 * NF - 1; i >= 0; i--) {
            n = i * (i + 1) / 2 + i;
            _complex_mulr(x[i], 1. / ((REAL)creal(up[n])), x[i]);
            _complex_mulr(x[i + 2 * NF], 1. / ((REAL)creal(dn[n])), x[i + 2 * NF]);
            for (int k = i + 1; k < 2 * NF; k++) {
                n = k * (k + 1) / 2 + i;
                c = (COMPLEX)conj(up[n]);
                _complex_mul_sub_assign(x[i], c, x[k]);
                c = (COMPLEX)conj(dn[n]);
                _complex_mul_sub_assign(x[i + 2 * NF], c, x[k + 2 * NF]);
            }
        }

        if (assign) {
            read_gpu<REAL>(0, &out, dptr, ix, 0, 1);
            _spinor_add_assign_f(in, out);
        }
        write_gpu<REAL>(0, &in, dptr, ix, 0, 1);
    }
}
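/*
 * Exponential-clover variant: the upper and lower 2x2 spin blocks of
 * exp(A_plus) and exp(A_minus) are read as four suNfc matrices each and
 * applied directly to the spinor; with assign set the result is accumulated
 * into the output field.
 */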
template <typename VECTOR_TYPE, class REAL, typename SITE_TYPE>
__global__ void Cphi_gpu_kernel_(SITE_TYPE *dptr, SITE_TYPE *sptr, suNfc *cl_term_expAplus, suNfc *cl_term_expAminus,
                                 double mass, double invexpmass, int assign, int N, int block_start, int NN_loc) {
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += gridDim.x * blockDim.x) {
        /* Exponentiated clover blocks and work vectors (declarations reconstructed from use below). */
        suNfc expAplus[4], expAminus[4];
        VECTOR_TYPE v1, v2;

        read_gpu<double>(0, &expAplus[0], cl_term_expAplus, ix, 0, 4);
        read_gpu<double>(0, &expAplus[1], cl_term_expAplus, ix, 1, 4);
        read_gpu<double>(0, &expAplus[2], cl_term_expAplus, ix, 2, 4);
        read_gpu<double>(0, &expAplus[3], cl_term_expAplus, ix, 3, 4);
        read_gpu<double>(0, &expAminus[0], cl_term_expAminus, ix, 0, 4);
        read_gpu<double>(0, &expAminus[1], cl_term_expAminus, ix, 1, 4);
        read_gpu<double>(0, &expAminus[2], cl_term_expAminus, ix, 2, 4);
        read_gpu<double>(0, &expAminus[3], cl_term_expAminus, ix, 3, 4);

        SITE_TYPE out, in, tmp;
        read_gpu<REAL>(0, &in, sptr, ix, 0, 1);

        _suNfc_multiply(v1, expAplus[0], in.c[0]);
        _suNfc_multiply(v2, expAplus[1], in.c[1]);
        _vector_add_f(tmp.c[0], v1, v2);

        _suNfc_multiply(v1, expAplus[2], in.c[0]);
        _suNfc_multiply(v2, expAplus[3], in.c[1]);
        _vector_add_f(tmp.c[1], v1, v2);

        _suNfc_multiply(v1, expAminus[0], in.c[2]);
        _suNfc_multiply(v2, expAminus[1], in.c[3]);
        _vector_add_f(tmp.c[2], v1, v2);

        _suNfc_multiply(v1, expAminus[2], in.c[2]);
        _suNfc_multiply(v2, expAminus[3], in.c[3]);
        _vector_add_f(tmp.c[3], v1, v2);

        if (assign) {
            read_gpu<REAL>(0, &out, dptr, ix, 0, 1);
            _spinor_add_assign_f(tmp, out);
        }
        write_gpu<REAL>(0, &tmp, dptr, ix, 0, 1);
    }
}

#endif /* DPHI_GPU_KERNELS_HPP */