HiRep 0.1
linear_algebra_gpu_kernels.hpp
#ifndef LINEAR_ALGEBRA_GPU_KERNELS_HPP
#define LINEAR_ALGEBRA_GPU_KERNELS_HPP

#include "geometry.h"
#include "Utils/generics.h"
#include "inverters.h"

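/*
 * Conventions shared by every kernel in this file: each thread visits the
 * lattice sites with a grid-stride loop, and a site's FIELD_DIM components
 * are read and written one at a time through read_gpu/write_gpu, which
 * handle the strided device-memory layout of the fields. Kernels that
 * produce a scalar (prod_re_gpu, sqnorm_gpu, ...) only fill a per-site
 * result array; collapsing it to a single number is presumably done by a
 * separate reduction step outside this header.
 */
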
/* Re <s1,s2> */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void prod_re_gpu(SITE_TYPE *s1, SITE_TYPE *s2, quad_double *resField, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        resField[ix] = quad_double(0.0);
        for (int mu = 0; mu < FIELD_DIM; ++mu) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            resField[ix].val += prod_re(&site1, &site2);
        }
    }
}

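/*
 * Usage sketch, illustrative only: the full inner product Re<s1,s2> takes a
 * launch over all N sites followed by a sum over the per-site partial
 * results. FIELD_DIM, the site type and the block size below are
 * placeholders, and the final reduction is assumed to live elsewhere.
 *
 *   int block = 256;
 *   int grid = (N + block - 1) / block;
 *   prod_re_gpu<1, double, suNf_spinor><<<grid, block>>>(s1, s2, res, N);
 *   // then reduce res[0..N) into a single quad_double (sum over sites)
 */
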
/* Im <s1,s2> */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void prod_im_gpu(SITE_TYPE *s1, SITE_TYPE *s2, quad_double *resField, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        resField[ix] = quad_double(0.0);
        for (int mu = 0; mu < FIELD_DIM; ++mu) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            resField[ix].val += prod_im(&site1, &site2);
        }
    }
}

/* <s1,s2> */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void prod_gpu(SITE_TYPE *s1, SITE_TYPE *s2, hr_complex *resField, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        _complex_0(resField[ix]);
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            _complex_add(resField[ix], resField[ix], prod(&site1, &site2));
        }
    }
}

/* Re <g5*s1,s2> */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g5_prod_re_gpu(SITE_TYPE *s1, SITE_TYPE *s2, double *resField, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        resField[ix] = 0.0;
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            resField[ix] += g5_prod_re(&site1, &site2);
        }
    }
}

/* Im <g5*s1,s2> */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g5_prod_im_gpu(SITE_TYPE *s1, SITE_TYPE *s2, double *resField, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        resField[ix] = 0.0;
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            resField[ix] += g5_prod_im(&site1, &site2);
        }
    }
}

/* Re <s1,s1> */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void sqnorm_gpu(SITE_TYPE *s1, quad_double *resField, int N) {
    SITE_TYPE site1;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        resField[ix] = quad_double(0.0);
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            resField[ix].val += prod_re(&site1, &site1);
        }
    }
}

/* per-site max of s1 */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void max_gpu(SITE_TYPE *s1, double *resField, int N) {
    SITE_TYPE site1;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        resField[ix] = 0.0;
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            double m = max(&site1);
            resField[ix] = resField[ix] > m ? resField[ix] : m;
        }
    }
}

/* s1=s2 */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void id_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; ++mu) {
            read_gpu<REAL>(N, &site, s2, ix, mu, FIELD_DIM);
            write_gpu<REAL>(N, &site, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* s1+=r*s2 r real */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void mul_add_assign_gpu(SITE_TYPE *s1, REAL r, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; ++mu) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            mul_add_assign(&site1, r, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* s1+=c*s2 c complex */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE, typename COMPLEX>
__global__ void mulc_add_assign_gpu(SITE_TYPE *s1, COMPLEX c, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; ++mu) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            mulc_add_assign(&site1, c, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

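/*
 * Illustrative only: a typical inverter-style update x += alpha*p over all
 * N sites would be launched as below; the field type, variable names and
 * launch geometry are assumptions, not part of this header.
 *
 *   mulc_add_assign_gpu<1, double, suNf_spinor, hr_complex>
 *       <<<grid, block>>>(x, alpha, p, N);
 */
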
/* s1+=c*g5*s2 c complex */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE, typename COMPLEX>
__global__ void g5_mulc_add_assign_gpu(SITE_TYPE *s1, COMPLEX c, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    SITE_TYPE tmp;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            mulc(&tmp, c, &site2); /* tmp = c*s2 */
            g5(&site2, &tmp); /* site2 = g5*tmp */
            add_assign(&site1, &site2); /* s1 += c*g5*s2 */
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* s1=r*s2 r real */
template <unsigned int FIELD_DIM, typename SITE_TYPE, typename REAL>
__global__ void mul_gpu(SITE_TYPE *s1, REAL r, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            mul(&site1, r, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* s1=c*s2 c complex */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE, typename COMPLEX>
__global__ void mulc_gpu(SITE_TYPE *s1, COMPLEX c, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            mulc(&site1, c, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* r=s1+s2 */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void add_gpu(SITE_TYPE *r, SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    SITE_TYPE res;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            add(&res, &site1, &site2);
            write_gpu<REAL>(N, &res, r, ix, mu, FIELD_DIM);
        }
    }
}

/* r=s1-s2 */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void sub_gpu(SITE_TYPE *r, SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    SITE_TYPE res;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            sub(&res, &site1, &site2);
            write_gpu<REAL>(N, &res, r, ix, mu, FIELD_DIM);
        }
    }
}

/* s1+=s2 */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void add_assign_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            add_assign(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* s1-=s2 */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void sub_assign_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            sub_assign(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* s1=0 */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void zero_gpu(SITE_TYPE *s1, int N) {
    SITE_TYPE site;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            zero(&site);
            write_gpu<REAL>(N, &site, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* s1=-s2 */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void minus_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            minus(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* s1=r1*s2+r2*s3 r1,r2 real */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void lc_gpu(SITE_TYPE *s1, REAL r1, SITE_TYPE *s2, REAL r2, SITE_TYPE *s3, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    SITE_TYPE site3;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site3, s3, ix, mu, FIELD_DIM);
            lc(&site1, r1, &site2, r2, &site3);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* s1+=r1*s2+r2*s3 r1,r2 real */
template <unsigned int FIELD_DIM, typename SITE_TYPE, typename REAL>
__global__ void lc_add_assign_gpu(SITE_TYPE *s1, REAL r1, SITE_TYPE *s2, REAL r2, SITE_TYPE *s3, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    SITE_TYPE site3;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site3, s3, ix, mu, FIELD_DIM);
            lc_add_assign(&site1, r1, &site2, r2, &site3);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* s1=c1*s2+c2*s3 c1,c2 complex */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE, typename COMPLEX>
__global__ void clc_gpu(SITE_TYPE *s1, COMPLEX c1, SITE_TYPE *s2, COMPLEX c2, SITE_TYPE *s3, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    SITE_TYPE site3;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site3, s3, ix, mu, FIELD_DIM);
            clc(&site1, c1, &site2, c2, &site3);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* s1+=c1*s2+c2*s3 c1,c2 complex */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE, typename COMPLEX>
__global__ void clc_add_assign_gpu(SITE_TYPE *s1, COMPLEX c1, SITE_TYPE *s2, COMPLEX c2, SITE_TYPE *s3, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    SITE_TYPE site3;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site3, s3, ix, mu, FIELD_DIM);
            clc_add_assign(&site1, c1, &site2, c2, &site3);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* s1=g5*s2 */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g5_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            g5(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* s1=g5*s1 */
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g5_assign_gpu(SITE_TYPE *s1, int N) {
    SITE_TYPE site1;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            g5_assign(&site1);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

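/* s1=g0*s2 */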
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g0_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            g0(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

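/* s1=g1*s2 */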
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g1_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            g1(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

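/* s1=g2*s2 */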
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g2_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            g2(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

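/* s1=g3*s2 */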
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g3_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            g3(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

/* tools for eva.c */

/* s1+=r*s2 r real */
template <unsigned int FIELD_DIM, typename SITE_TYPE, typename REAL>
__global__ void lc1_gpu(REAL r, SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            mul_add_assign(&site1, r, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

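/* s1=r1*s1+r2*s2 r1,r2 real */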
template <unsigned int FIELD_DIM, typename SITE_TYPE, typename REAL>
__global__ void lc2_gpu(REAL r1, REAL r2, SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            lc(&site1, r1, &site1, r2, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

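/* s3=-(r1*s1+r2*s2+s3) r1,r2 real */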
template <unsigned int FIELD_DIM, typename SITE_TYPE, typename REAL>
__global__ void lc3_gpu(REAL r1, REAL r2, SITE_TYPE *s1, SITE_TYPE *s2, SITE_TYPE *s3, int N) {
    SITE_TYPE site1;
    SITE_TYPE site2;
    SITE_TYPE site3;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site3, s3, ix, mu, FIELD_DIM);
            lc_add_assign(&site3, r1, &site1, r2, &site2);
            minus(&site3, &site3);
            write_gpu<REAL>(N, &site3, s3, ix, mu, FIELD_DIM);
        }
    }
}

#endif