Dphi_gpu_kernels.hpp (HiRep 0.1)
/***************************************************************************\
* Copyright (c) 2023, Sofie Martins                                         *
* All rights reserved.                                                      *
\***************************************************************************/

#ifndef DPHI_GPU_KERNELS_HPP
#define DPHI_GPU_KERNELS_HPP

#include "geometry.h"
#include "libhr_core.h"
#include "update.h"
#include "utils.h"

#define SUBBLOCKS 32

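/* Note: this header provides two compile-time variants of the Wilson-Dirac
 * hopping term. Without LARGE_N each thread processes a full lattice site
 * using the DPHI_*_GPU macros below; with LARGE_N the work is additionally
 * split over colour components, presumably to limit register pressure for
 * large gauge groups. SUBBLOCKS is the sub-block length used to decompose the
 * thread index in the LARGE_N kernels. */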
#ifndef LARGE_N
#define DPHI_T_UP_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    in_gauge_field<REAL>(&u, (gauge), (ix), (iy), 0, UP); \
    \
    _vector_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_T_multiply_gpu((sn).c[1], u, (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_mul_add_assign_f((r).c[2], -0.5, (sn).c[1]); \
    \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    \
    _vector_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_T_multiply_gpu((sn).c[1], u, (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_mul_add_assign_f((r).c[3], -0.5, (sn).c[1]);
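/* Each DPHI_<dir>_<UP|DN>_GPU macro accumulates one hopping term into r: it
 * loads the two independent half-spinor components of the neighbour site iy,
 * forms the spin projection (1 +/- gamma_mu) through the add/sub and
 * i-add/i-sub combinations, multiplies by the gauge link between ix and iy
 * (the inverse link in the DN direction), and adds the result to the four
 * components of r with weight -1/2. The _suNf_theta_*_multiply macros also
 * fold in the fermion boundary phases where those are enabled. */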

#define DPHI_T_DN_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 0, DOWN); \
    \
    _vector_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_T_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_mul_sub_assign_f((r).c[2], -0.5, (sn).c[1]); \
    \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    \
    _vector_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_T_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_mul_sub_assign_f((r).c[3], -0.5, (sn).c[1]);

#define DPHI_X_UP_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 1, UP); \
    \
    _vector_i_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_X_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_i_mul_sub_assign_f((r).c[3], -0.5, (sn).c[1]); \
    \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    \
    _vector_i_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_X_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_i_mul_sub_assign_f((r).c[2], -0.5, (sn).c[1]);

#define DPHI_X_DN_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 1, DOWN); \
    \
    _vector_i_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_X_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_i_mul_add_assign_f((r).c[3], -0.5, (sn).c[1]); \
    \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    \
    _vector_i_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_X_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_i_mul_add_assign_f((r).c[2], -0.5, (sn).c[1]);

#define DPHI_Y_UP_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 2, UP); \
    \
    _vector_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Y_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_mul_add_assign_f((r).c[3], -0.5, (sn).c[1]); \
    \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    \
    _vector_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Y_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_mul_sub_assign_f((r).c[2], -0.5, (sn).c[1]);

#define DPHI_Y_DN_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 2, DOWN); \
    \
    _vector_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Y_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_mul_sub_assign_f((r).c[3], -0.5, (sn).c[1]); \
    \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    \
    _vector_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Y_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_mul_add_assign_f((r).c[2], -0.5, (sn).c[1]);

#define DPHI_Z_UP_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 3, UP); \
    \
    _vector_i_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Z_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_i_mul_sub_assign_f((r).c[2], -0.5, (sn).c[1]); \
    \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    \
    _vector_i_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Z_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_i_mul_add_assign_f((r).c[3], -0.5, (sn).c[1]);

#define DPHI_Z_DN_GPU(ix, iy, in, gauge, r, sn, u) \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 0); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 2); \
    in_gauge_field<REAL>(&(u), (gauge), (ix), (iy), 3, DOWN); \
    \
    _vector_i_sub_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Z_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[0], -0.5, (sn).c[1]); \
    _vector_i_mul_add_assign_f((r).c[2], -0.5, (sn).c[1]); \
    \
    in_spinor_field<REAL>(&((sn).c[0]), (in), (iy), 1); \
    in_spinor_field<REAL>(&((sn).c[1]), (in), (iy), 3); \
    \
    _vector_i_add_assign_f((sn).c[0], (sn).c[1]); \
    _suNf_theta_Z_inverse_multiply_gpu((sn).c[1], (u), (sn).c[0]); \
    _vector_mul_add_assign_f((r).c[1], -0.5, (sn).c[1]); \
    _vector_i_mul_sub_assign_f((r).c[3], -0.5, (sn).c[1]);

#else

#define prefetch_gauge(ix, u_cpx, gauge, tgt_comp, vcomp, idn_gpu) \
    read_gpu<REAL>(0, &(u_cpx[0]), gauge, ix, NF * tgt_comp + vcomp, 4); \
    read_gpu<REAL>(0, &(u_cpx[2]), gauge, ix, NF * tgt_comp + vcomp + NF * NF, 4); \
    read_gpu<REAL>(0, &(u_cpx[4]), gauge, ix, NF * tgt_comp + vcomp + 2 * NF * NF, 4); \
    read_gpu<REAL>(0, &(u_cpx[6]), gauge, ix, NF * tgt_comp + vcomp + 3 * NF * NF, 4); \
    read_gpu<REAL>(0, &(u_cpx[1]), gauge, idn_gpu[4 * ix], NF * vcomp + tgt_comp, 4); \
    read_gpu<REAL>(0, &(u_cpx[3]), gauge, idn_gpu[4 * ix + 1], NF * vcomp + tgt_comp + NF * NF, 4); \
    read_gpu<REAL>(0, &(u_cpx[5]), gauge, idn_gpu[4 * ix + 2], NF * vcomp + tgt_comp + 2 * NF * NF, 4); \
    read_gpu<REAL>(0, &(u_cpx[7]), gauge, idn_gpu[4 * ix + 3], NF * vcomp + tgt_comp + 3 * NF * NF, 4);
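/* prefetch_gauge gathers the eight link entries needed for one (tgt_comp, vcomp)
 * matrix element: even slots u_cpx[0,2,4,6] hold element (tgt_comp, vcomp) of the
 * forward links in directions 0..3 at site ix, odd slots u_cpx[1,3,5,7] hold the
 * transposed element (vcomp, tgt_comp) of the links at the down neighbours, so
 * that the conjugation in the DN macros below yields the adjoint link. */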

#define write_out(r0, r1, r2, r3, out, ix, tgt_comp) \
    _complex_mul(r0, -0.5, r0); \
    _complex_mul(r1, -0.5, r1); \
    _complex_mul(r2, -0.5, r2); \
    _complex_mul(r3, -0.5, r3); \
    write_gpu<REAL>(0, &r0, out, ix, tgt_comp, 1); \
    write_gpu<REAL>(0, &r1, out, ix, tgt_comp + NF, 1); \
    write_gpu<REAL>(0, &r2, out, ix, tgt_comp + 2 * NF, 1); \
    write_gpu<REAL>(0, &r3, out, ix, tgt_comp + 3 * NF, 1);
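/* write_out applies the overall -1/2 factor of the hopping term and stores the
 * four accumulated spin components of colour row tgt_comp of the result. */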

#define DPHI_T_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_mul_assign(r0, u_cpx[0], sn_cpx0); \
    _complex_mul_assign(r2, u_cpx[0], sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_mul_assign(r1, u_cpx[0], sn_cpx0); \
    _complex_mul_assign(r3, u_cpx[0], sn_cpx0);
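/* In the LARGE_N variant each macro accumulates, for a single colour row
 * tgt_comp, the contribution of one colour column vcomp of the neighbour
 * spinor: sn_cpx0 holds the spin-projected combination of two spin components
 * and is multiplied by the matching link element in u_cpx. The REPR_IS_REAL
 * branches drop conj() because the link entries are real in that case. */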

#ifdef REPR_IS_REAL
#define DPHI_T_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_mul_assign(r0, (u_cpx[1]), sn_cpx0); \
    _complex_mul_sub_assign(r2, (u_cpx[1]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_mul_assign(r1, (u_cpx[1]), sn_cpx0); \
    _complex_mul_sub_assign(r3, (u_cpx[1]), sn_cpx0);
#else
#define DPHI_T_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_mul_assign(r0, conj(u_cpx[1]), sn_cpx0); \
    _complex_mul_sub_assign(r2, conj(u_cpx[1]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_mul_assign(r1, conj(u_cpx[1]), sn_cpx0); \
    _complex_mul_sub_assign(r3, conj(u_cpx[1]), sn_cpx0);
#endif

#define DPHI_X_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    _complex_mul_assign(r0, u_cpx[2], sn_cpx0); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r3, u_cpx[2], sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    _complex_mul_assign(r1, u_cpx[2], sn_cpx0); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r2, u_cpx[2], sn_cpx0);

#ifdef REPR_IS_REAL
#define DPHI_X_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    _complex_mul_assign(r0, (u_cpx[3]), sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r3, (u_cpx[3]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    _complex_mul_assign(r1, (u_cpx[3]), sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r2, (u_cpx[3]), sn_cpx0);

#else
#define DPHI_X_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    _complex_mul_assign(r0, conj(u_cpx[3]), sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r3, conj(u_cpx[3]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    _complex_mul_assign(r1, conj(u_cpx[3]), sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r2, conj(u_cpx[3]), sn_cpx0);

#endif

#define DPHI_Y_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_mul_assign(r0, u_cpx[4], sn_cpx0); \
    _complex_mul_assign(r3, u_cpx[4], sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_mul_assign(r1, u_cpx[4], sn_cpx0); \
    _complex_mul_sub_assign(r2, u_cpx[4], sn_cpx0);

#ifdef REPR_IS_REAL
#define DPHI_Y_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_mul_assign(r0, (u_cpx[5]), sn_cpx0); \
    _complex_mul_sub_assign(r3, (u_cpx[5]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_mul_assign(r1, (u_cpx[5]), sn_cpx0); \
    _complex_mul_assign(r2, (u_cpx[5]), sn_cpx0);

#else
#define DPHI_Y_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    read_sub_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_mul_assign(r0, conj(u_cpx[5]), sn_cpx0); \
    _complex_mul_sub_assign(r3, conj(u_cpx[5]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_mul_assign(r1, conj(u_cpx[5]), sn_cpx0); \
    _complex_mul_assign(r2, conj(u_cpx[5]), sn_cpx0);

#endif

#define DPHI_Z_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    _complex_mul_assign(r0, u_cpx[6], sn_cpx0); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r2, u_cpx[6], sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    _complex_mul_assign(r1, u_cpx[6], sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r3, u_cpx[6], sn_cpx0);

#ifdef REPR_IS_REAL
#define DPHI_Z_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    _complex_mul_assign(r0, (u_cpx[7]), sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r2, (u_cpx[7]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    _complex_mul_assign(r1, (u_cpx[7]), sn_cpx0); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r3, (u_cpx[7]), sn_cpx0);

#else
#define DPHI_Z_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx) \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 2 * NF, 1); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp, 1); \
    _complex_mul_assign(r0, conj(u_cpx[7]), sn_cpx0); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r2, conj(u_cpx[7]), sn_cpx0); \
    read_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + 3 * NF, 1); \
    _complex_i_plus(sn_cpx0, sn_cpx0); \
    read_assign_gpu<REAL>(0, &(sn_cpx0), in, iy, vcomp + NF, 1); \
    _complex_mul_assign(r1, conj(u_cpx[7]), sn_cpx0); \
    _complex_i_minus(sn_cpx0, sn_cpx0); \
    _complex_mul_assign(r3, conj(u_cpx[7]), sn_cpx0);

#endif
#endif

#define read_reduced(iy, in, sn, piece, base_in) \
    do { \
        const int block_offset = base_in; \
        const HSPINOR_TYPE *in_offset = (HSPINOR_TYPE *)((in) + block_offset); \
        const int iy_loc = (iy)-block_offset; \
        read_gpu<REAL>(0, &(sn), in_offset, iy_loc, 0, 1); \
    } while (0)
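/* read_reduced loads a half spinor from the receive-buffer piece that starts at
 * base_in: iy is the global buffer index and iy_loc its offset inside the piece.
 * The DPHI_RED_* macros below only expand this half spinor back into four spin
 * components, so the buffered data is assumed to be already spin-projected and
 * multiplied by the appropriate gauge link on the sending side. */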

#define DPHI_RED_T_UP_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[2], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_mul_add_assign_f(r.c[3], -0.5, sn.c[1]);

#define DPHI_RED_T_DN_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_mul_sub_assign_f(r.c[2], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_mul_sub_assign_f(r.c[3], -0.5, sn.c[1]);

#define DPHI_RED_X_UP_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_i_mul_sub_assign_f(r.c[3], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_i_mul_sub_assign_f(r.c[2], -0.5, sn.c[1]);

#define DPHI_RED_X_DN_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_i_mul_add_assign_f(r.c[3], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_i_mul_add_assign_f(r.c[2], -0.5, sn.c[1]);

#define DPHI_RED_Y_UP_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[3], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_mul_sub_assign_f(r.c[2], -0.5, sn.c[1]);

#define DPHI_RED_Y_DN_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_mul_sub_assign_f(r.c[3], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_mul_add_assign_f(r.c[2], -0.5, sn.c[1]);

#define DPHI_RED_Z_UP_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_i_mul_sub_assign_f(r.c[2], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_i_mul_add_assign_f(r.c[3], -0.5, sn.c[1]);

#define DPHI_RED_Z_DN_GPU(r, sn) \
    _vector_mul_add_assign_f(r.c[0], -0.5, sn.c[0]); \
    _vector_i_mul_add_assign_f(r.c[2], -0.5, sn.c[0]); \
    _vector_mul_add_assign_f(r.c[1], -0.5, sn.c[1]); \
    _vector_i_mul_sub_assign_f(r.c[3], -0.5, sn.c[1]);
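/* The DPHI_RED_* macros expand a received half spinor back into the four spin
 * components with weight -1/2, mirroring the corresponding DPHI_*_GPU macros
 * above but without a gauge multiplication, which is assumed to have been
 * applied already to the buffered data. */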

#ifndef LARGE_N

template <typename HSPINOR_TYPE, class REAL, typename COMPLEX, typename SITE_TYPE, typename GAUGE_TYPE>
__global__ void Dphi_gpu_inner_kernel(SITE_TYPE *in, SITE_TYPE *out, const GAUGE_TYPE *gauge, int vol_in, int vol_out,
                                      int base_in, int base_out, gd_type piece, char *imask_gpu, int *iup_gpu, int *idn_gpu) {
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < vol_out; id += gridDim.x * blockDim.x) {
        int ix = id + base_out;

        SITE_TYPE r;
        HSPINOR_TYPE sn;
        GAUGE_TYPE u;

        _spinor_zero_f(r);

        /******************************* direction +0 *********************************/
        if (imask_gpu[ix] & T_UP_MASK) {
            const int iy = iup_gpu[4 * ix];
            DPHI_T_UP_GPU(ix, iy, in, gauge, r, sn, u);
        }

        /******************************* direction -0 *********************************/
        if (imask_gpu[ix] & T_DN_MASK) {
            const int iy = idn_gpu[4 * ix];
            DPHI_T_DN_GPU(ix, iy, in, gauge, r, sn, u);
        }

        /******************************* direction +1 *********************************/
        if (imask_gpu[ix] & X_UP_MASK) {
            const int iy = iup_gpu[4 * ix + 1];
            DPHI_X_UP_GPU(ix, iy, in, gauge, r, sn, u);
        }

        /******************************* direction -1 *********************************/
        if (imask_gpu[ix] & X_DN_MASK) {
            const int iy = idn_gpu[4 * ix + 1];
            DPHI_X_DN_GPU(ix, iy, in, gauge, r, sn, u);
        }

        /******************************* direction +2 *********************************/
        if (imask_gpu[ix] & Y_UP_MASK) {
            const int iy = iup_gpu[4 * ix + 2];
            DPHI_Y_UP_GPU(ix, iy, in, gauge, r, sn, u);
        }

        /******************************* direction -2 *********************************/
        if (imask_gpu[ix] & Y_DN_MASK) {
            const int iy = idn_gpu[4 * ix + 2];
            DPHI_Y_DN_GPU(ix, iy, in, gauge, r, sn, u);
        }

        /******************************* direction +3 *********************************/
        if (imask_gpu[ix] & Z_UP_MASK) {
            const int iy = iup_gpu[4 * ix + 3];
            DPHI_Z_UP_GPU(ix, iy, in, gauge, r, sn, u);
        }

        /******************************* direction -3 *********************************/
        if (imask_gpu[ix] & Z_DN_MASK) {
            const int iy = idn_gpu[4 * ix + 3];
            DPHI_Z_DN_GPU(ix, iy, in, gauge, r, sn, u);
        }

        write_out_spinor_field<REAL>(&r, out, ix);
    }
}
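/* Illustrative launch of the inner kernel (a sketch only; the actual template
 * arguments, grid configuration and field pointers are set up by the calling
 * code, e.g. Dphi_gpu.cu, and the names suNf_hspinor, suNf_spinor, suNf,
 * hr_complex, grid and block used here are assumptions taken from the rest of
 * the library):
 *
 *   Dphi_gpu_inner_kernel<suNf_hspinor, double, hr_complex, suNf_spinor, suNf>
 *       <<<grid, block>>>(in, out, gauge, vol_in, vol_out, base_in, base_out,
 *                         piece, imask_gpu, iup_gpu, idn_gpu);
 */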

#else

template <typename HSPINOR_TYPE, class REAL, typename COMPLEX, typename SITE_TYPE, typename GAUGE_TYPE>
__global__ void Dphi_gpu_inner_kernel(SITE_TYPE *in, SITE_TYPE *out, const GAUGE_TYPE *gauge, int vol_in, int vol_out,
                                      int base_in, int base_out, gd_type piece, char *imask_gpu, int *iup_gpu, int *idn_gpu) {
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < vol_out * NF; id += gridDim.x * blockDim.x) {
        // This might degrade performance for unusually sized lattices
        int divider = SUBBLOCKS;
        while (vol_out % divider != 0) {
            divider--;
        }
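        // Decompose the flat thread index into a site index ix and a colour row
        // tgt_comp: consecutive threads within a sub-block of length `divider`
        // handle consecutive sites for the same colour row, which keeps the
        // global-memory accesses below coalesced.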

        const int ix = id % divider + (id / (divider * NF)) * divider + base_out;
        const int tgt_comp = (id / divider) % NF;

        COMPLEX sn_cpx0;
#ifdef REPR_IS_REAL
        REAL u_cpx[8];
#else
        COMPLEX u_cpx[8];
#endif
        COMPLEX r0, r1, r2, r3;
        _complex_0(r0);
        _complex_0(r1);
        _complex_0(r2);
        _complex_0(r3);

        for (int vcomp = 0; vcomp < NF; vcomp++) {
            prefetch_gauge(ix, u_cpx, gauge, tgt_comp, vcomp, idn_gpu);

            /******************************* direction +0 *********************************/
            if (imask_gpu[ix] & T_UP_MASK) {
                const int iy = iup_gpu[4 * ix];
                DPHI_T_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }

            /******************************* direction -0 *********************************/
            if (imask_gpu[ix] & T_DN_MASK) {
                const int iy = idn_gpu[4 * ix];
                DPHI_T_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }

            /******************************* direction +1 *********************************/
            if (imask_gpu[ix] & X_UP_MASK) {
                const int iy = iup_gpu[4 * ix + 1];
                DPHI_X_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }

            /******************************* direction -1 *********************************/
            if (imask_gpu[ix] & X_DN_MASK) {
                const int iy = idn_gpu[4 * ix + 1];
                DPHI_X_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }

            /******************************* direction +2 *********************************/
            if (imask_gpu[ix] & Y_UP_MASK) {
                const int iy = iup_gpu[4 * ix + 2];
                DPHI_Y_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }

            /******************************* direction -2 *********************************/
            if (imask_gpu[ix] & Y_DN_MASK) {
                const int iy = idn_gpu[4 * ix + 2];
                DPHI_Y_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }

            /******************************* direction +3 *********************************/
            if (imask_gpu[ix] & Z_UP_MASK) {
                const int iy = iup_gpu[4 * ix + 3];
                DPHI_Z_UP_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }

            /******************************* direction -3 *********************************/
            if (imask_gpu[ix] & Z_DN_MASK) {
                const int iy = idn_gpu[4 * ix + 3];
                DPHI_Z_DN_GPU(ix, iy, vcomp, in, gauge, r0, r1, r2, r3, sn_cpx0, u_cpx);
            }
        }

        write_out(r0, r1, r2, r3, out, ix, tgt_comp);
    }
}
#endif

// Two boundary kernels cannot run at the same time: they would race on the output field
template <typename HSPINOR_TYPE, class REAL, typename SITE_TYPE, typename GAUGE_TYPE>
__global__ void Dphi_gpu_boundary_kernel(SITE_TYPE *in, SITE_TYPE *out, const GAUGE_TYPE *gauge, int vol_in, int vol_out,
                                         int base_in, int base_out, gd_type piece, char mask, int *iup_gpu, int *idn_gpu) {
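    // For each site iy of the received boundary piece [base_in, base_in + vol_in)
    // the constant mask selects the direction handled by this launch; ix is the
    // bulk site receiving the contribution, found through the neighbour tables,
    // and the result is accumulated atomically since different launches can
    // update the same site.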
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < vol_in; id += gridDim.x * blockDim.x) {
        int ix = 0;
        int iy = id + base_in;

        SITE_TYPE r;
        HSPINOR_TYPE sn;
        GAUGE_TYPE u;

        _spinor_zero_f(r);

        /******************************* direction +0 *********************************/
        if (mask & T_UP_MASK) {
            ix = find_neighbor(iy, DOWN, 0, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_T_UP_GPU(r, sn);
        }

        /******************************* direction -0 *********************************/
        if (mask & T_DN_MASK) {
            ix = find_neighbor(iy, UP, 0, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_T_DN_GPU(r, sn);
        }

        /******************************* direction +1 *********************************/
        if (mask & X_UP_MASK) {
            ix = find_neighbor(iy, DOWN, 1, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_X_UP_GPU(r, sn);
        }

        /******************************* direction -1 *********************************/
        if (mask & X_DN_MASK) {
            ix = find_neighbor(iy, UP, 1, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_X_DN_GPU(r, sn);
        }

        /******************************* direction +2 *********************************/
        if (mask & Y_UP_MASK) {
            ix = find_neighbor(iy, DOWN, 2, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_Y_UP_GPU(r, sn);
        }

        /******************************* direction -2 *********************************/
        if (mask & Y_DN_MASK) {
            ix = find_neighbor(iy, UP, 2, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_Y_DN_GPU(r, sn);
        }

        /******************************* direction +3 *********************************/
        if (mask & Z_UP_MASK) {
            ix = find_neighbor(iy, DOWN, 3, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_Z_UP_GPU(r, sn);
        }

        /******************************* direction -3 *********************************/
        if (mask & Z_DN_MASK) {
            ix = find_neighbor(iy, UP, 3, iup_gpu, idn_gpu);
            read_reduced(iy, in, sn, piece, base_in);
            DPHI_RED_Z_DN_GPU(r, sn);
        }

        write_assign_atomic_gpu<REAL>(0, &r, out, ix, 0, 1);
    }
}

#ifdef WITH_CLOVER
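/* The clover term is stored per site as four NF x NF complex blocks forming a
 * 2x2 matrix in spin space for the upper and the lower pair of spin components.
 * Each thread of Cphi_gpu_kernel_ computes one colour row (tgt_comp) of one
 * spin component (dir) of (Cphi + mass) psi; the rows for dir 1 and dir 3 are
 * built from the conjugate-transposed off-diagonal block and the negated
 * diagonal block, reflecting the Hermitian structure of the stored blocks. */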
template <typename VECTOR_TYPE, class REAL, typename COMPLEX, typename SITE_TYPE>
__global__ void Cphi_gpu_kernel_(SITE_TYPE *dptr, SITE_TYPE *sptr, suNfc *cl_term, double mass, int assign, int N,
                                 int block_start) {
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < N * NF * 4; id += gridDim.x * blockDim.x) {
        const int dir = id / (N * NF);
        const int id2 = id % (N * NF);
        int divider = SUBBLOCKS;
        while (N % divider != 0) {
            divider--;
        }
        const int tgt_comp = (id2 / divider) % NF;
        const int ix = id2 % divider + (id2 / (divider * NF)) * divider;

        COMPLEX v1, v2, v3;
        COMPLEX s0, s1, out;
        COMPLEX cl_0, cl_1;

        if (dir == 0) {
            v3 = 0.0;

            for (int vcomp = 0; vcomp < NF; vcomp++) {
                read_gpu<REAL>(0, &s0, sptr, ix, vcomp, 1);
                read_gpu<REAL>(0, &s1, sptr, ix, vcomp + NF, 1);
                read_gpu<double>(0, &cl_0, cl_term, ix, NF * tgt_comp + vcomp, 4);
                read_gpu<double>(0, &cl_1, cl_term, ix, NF * tgt_comp + vcomp + NF * NF, 4);
                v1 = cl_0 * s0;
                v2 = cl_1 * s1;
                v3 += v1 + v2;
                if (vcomp == tgt_comp) { v3 += mass * s0; }
            }

            if (assign) {
                write_assign_gpu<REAL>(0, &v3, dptr, ix, tgt_comp, 1);
            } else {
                write_gpu<REAL>(0, &v3, dptr, ix, tgt_comp, 1);
            }
        }

        if (dir == 1) {
            v3 = 0.0;

            for (int vcomp = 0; vcomp < NF; vcomp++) {
                // this reread might not be ideal
                read_gpu<REAL>(0, &s0, sptr, ix, vcomp, 1);
                read_gpu<REAL>(0, &s1, sptr, ix, vcomp + NF, 1);
                read_gpu<double>(0, &cl_0, cl_term, ix, NF * tgt_comp + vcomp, 4);
                read_gpu<double>(0, &cl_1, cl_term, ix, NF * vcomp + tgt_comp + NF * NF, 4);
                v1 = conj(cl_1) * s0;
                v2 = cl_0 * s1;
                v3 += (v1 - v2);
                if (vcomp == tgt_comp) { v3 += mass * s1; }
            }

            if (assign) {
                write_assign_gpu<REAL>(0, &v3, dptr, ix, tgt_comp + NF, 1);
            } else {
                write_gpu<REAL>(0, &v3, dptr, ix, tgt_comp + NF, 1);
            }
        }

        if (dir == 2) {
            v3 = 0.0;

            for (int vcomp = 0; vcomp < NF; vcomp++) {
                read_gpu<REAL>(0, &s0, sptr, ix, vcomp + 2 * NF, 1);
                read_gpu<REAL>(0, &s1, sptr, ix, vcomp + 3 * NF, 1);
                read_gpu<double>(0, &cl_0, cl_term, ix, NF * tgt_comp + vcomp + 2 * NF * NF, 4);
                read_gpu<double>(0, &cl_1, cl_term, ix, NF * tgt_comp + vcomp + 3 * NF * NF, 4);
                v1 = cl_0 * s0;
                v2 = cl_1 * s1;
                v3 += v1 + v2;
                if (vcomp == tgt_comp) { v3 += mass * s0; }
            }

            if (assign) {
                write_assign_gpu<REAL>(0, &v3, dptr, ix, tgt_comp + 2 * NF, 1);
            } else {
                write_gpu<REAL>(0, &v3, dptr, ix, tgt_comp + 2 * NF, 1);
            }
        }

        if (dir == 3) {
            v3 = 0.0;

            for (int vcomp = 0; vcomp < NF; vcomp++) {
                read_gpu<REAL>(0, &s0, sptr, ix, vcomp + 2 * NF, 1);
                read_gpu<REAL>(0, &s1, sptr, ix, vcomp + 3 * NF, 1);
                read_gpu<double>(0, &cl_0, cl_term, ix, NF * tgt_comp + vcomp + 2 * NF * NF, 4);
                read_gpu<double>(0, &cl_1, cl_term, ix, NF * vcomp + tgt_comp + 3 * NF * NF, 4);
                v1 = conj(cl_1) * s0;
                v2 = cl_0 * s1;
                v3 += (v1 - v2);
                if (vcomp == tgt_comp) { v3 += mass * s1; }
            }

            if (assign) {
                write_assign_gpu<REAL>(0, &v3, dptr, ix, tgt_comp + 3 * NF, 1);
            } else {
                write_gpu<REAL>(0, &v3, dptr, ix, tgt_comp + 3 * NF, 1);
            }
        }
    }
}

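/* Cphi_inv_kernel_ applies the inverse of the clover block using the
 * precomputed LDL^dagger factorisation stored per site in ldl_gpu: a forward
 * substitution with the unit lower-triangular factor, a diagonal rescaling,
 * and a backward substitution with the conjugated factor, carried out
 * independently for the upper (up) and lower (dn) 2*NF x 2*NF spin-colour
 * blocks of the spinor. */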
template <typename VECTOR_TYPE, typename COMPLEX, class REAL, typename SITE_TYPE>
__global__ void Cphi_inv_kernel_(SITE_TYPE *dptr, SITE_TYPE *sptr, ldl_t *ldl_gpu, int mass, int assign, int N,
                                 int block_start) {
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += gridDim.x * blockDim.x) {
        ldl_t site_ldl;
        hr_complex *up, *dn, c;
        SITE_TYPE out, in;
        int n;

        read_gpu<double>(0, &site_ldl, ldl_gpu, ix, 0, 1);
        read_gpu<REAL>(0, &in, sptr, ix, 0, 1);

        up = site_ldl.up;
        dn = site_ldl.dn;

        COMPLEX *x;
        x = (COMPLEX *)&in;
        for (int i = 0; i < 2 * NF; i++) {
            for (int k = 0; k < i; k++) {
                n = i * (i + 1) / 2 + k;
                _complex_mul_sub_assign(x[i], (COMPLEX)up[n], x[k]);
                _complex_mul_sub_assign(x[i + 2 * NF], (COMPLEX)dn[n], x[k + 2 * NF]);
            }
        }

        for (int i = 2 * NF - 1; i >= 0; i--) {
            n = i * (i + 1) / 2 + i;
            _complex_mulr(x[i], 1. / ((REAL)creal(up[n])), x[i]);
            _complex_mulr(x[i + 2 * NF], 1. / ((REAL)creal(dn[n])), x[i + 2 * NF]);
            for (int k = i + 1; k < 2 * NF; k++) {
                n = k * (k + 1) / 2 + i;

                c = (COMPLEX)conj(up[n]);
                _complex_mul_sub_assign(x[i], c, x[k]);
                c = (COMPLEX)conj(dn[n]);
                _complex_mul_sub_assign(x[i + 2 * NF], c, x[k + 2 * NF]);
            }
        }

        if (assign) {
            read_gpu<REAL>(0, &out, dptr, ix, 0, 1);
            _spinor_add_assign_f(in, out);
        }

        write_gpu<REAL>(0, &in, dptr, ix, 0, 1);
    }
}

#endif

#ifdef WITH_EXPCLOVER

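/* With WITH_EXPCLOVER the clover contribution enters through the exponential of
 * the clover matrix: expAplus acts on the upper two spin components and
 * expAminus on the lower two, each stored per site as four NF x NF blocks that
 * form a 2x2 matrix in spin space, so the kernel reduces to two block 2x2
 * matrix-vector products per site. */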
template <typename VECTOR_TYPE, class REAL, typename SITE_TYPE>
__global__ void Cphi_gpu_kernel_(SITE_TYPE *dptr, SITE_TYPE *sptr, suNfc *cl_term_expAplus, suNfc *cl_term_expAminus,
                                 double mass, double invexpmass, int assign, int N, int block_start, int NN_loc,
                                 int NNexp_loc) {
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += gridDim.x * blockDim.x) {
        suNfc expAplus[4];
        suNfc expAminus[4];

        read_gpu<double>(0, &expAplus[0], cl_term_expAplus, ix, 0, 4);
        read_gpu<double>(0, &expAplus[1], cl_term_expAplus, ix, 1, 4);
        read_gpu<double>(0, &expAplus[2], cl_term_expAplus, ix, 2, 4);
        read_gpu<double>(0, &expAplus[3], cl_term_expAplus, ix, 3, 4);
        read_gpu<double>(0, &expAminus[0], cl_term_expAminus, ix, 0, 4);
        read_gpu<double>(0, &expAminus[1], cl_term_expAminus, ix, 1, 4);
        read_gpu<double>(0, &expAminus[2], cl_term_expAminus, ix, 2, 4);
        read_gpu<double>(0, &expAminus[3], cl_term_expAminus, ix, 3, 4);

        VECTOR_TYPE v1, v2;
        SITE_TYPE out, in, tmp;

        read_gpu<REAL>(0, &in, sptr, ix, 0, 1);

        // Comp 0
        _suNfc_multiply(v1, expAplus[0], in.c[0]);
        _suNfc_multiply(v2, expAplus[1], in.c[1]);
        _vector_add_f(tmp.c[0], v1, v2);

        // Comp 1
        _suNfc_multiply(v1, expAplus[2], in.c[0]);
        _suNfc_multiply(v2, expAplus[3], in.c[1]);
        _vector_add_f(tmp.c[1], v1, v2);

        // Comp 2
        _suNfc_multiply(v1, expAminus[0], in.c[2]);
        _suNfc_multiply(v2, expAminus[1], in.c[3]);
        _vector_add_f(tmp.c[2], v1, v2);

        // Comp 3
        _suNfc_multiply(v1, expAminus[2], in.c[2]);
        _suNfc_multiply(v2, expAminus[3], in.c[3]);
        _vector_add_f(tmp.c[3], v1, v2);

        if (assign) {
            read_gpu<REAL>(0, &out, dptr, ix, 0, 1);
            _spinor_add_assign_f(tmp, out);
        }

        write_gpu<REAL>(0, &tmp, dptr, ix, 0, 1);
    }
}

#endif
#endif