#ifndef LINEAR_ALGEBRA_GPU_CU
#define LINEAR_ALGEBRA_GPU_CU

#include "Utils/generics.h"
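
// The kernels below are templated over the number of components per lattice site
// (FIELD_DIM), the storage precision (REAL), and the per-site type (SITE_TYPE).
// Each thread strides over the N local sites in a grid-stride loop.

// Per-site real part of the scalar product: resField[ix] = sum_mu Re<s1, s2>,
// accumulated in quad-double precision.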
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void prod_re_gpu(SITE_TYPE *s1, SITE_TYPE *s2, quad_double *resField, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        resField[ix] = quad_double(0.0);
        for (int mu = 0; mu < FIELD_DIM; ++mu) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            resField[ix].val += prod_re(&site1, &site2);
        }
    }
}

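// Per-site imaginary part of the scalar product: resField[ix] = sum_mu Im<s1, s2>.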
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void prod_im_gpu(SITE_TYPE *s1, SITE_TYPE *s2, quad_double *resField, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        resField[ix] = quad_double(0.0);
        for (int mu = 0; mu < FIELD_DIM; ++mu) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            resField[ix].val += prod_im(&site1, &site2);
        }
    }
}

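// Per-site complex scalar product: resField[ix] = sum_mu <s1, s2>.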
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void prod_gpu(SITE_TYPE *s1, SITE_TYPE *s2, hr_complex *resField, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        _complex_0(resField[ix]);
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            _complex_add(resField[ix], resField[ix], prod(&site1, &site2));
        }
    }
}

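// Per-site real part of the g5-weighted scalar product: resField[ix] = sum_mu Re<s1, g5 s2>.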
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g5_prod_re_gpu(SITE_TYPE *s1, SITE_TYPE *s2, double *resField, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        resField[ix] = 0.0;
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            resField[ix] += g5_prod_re(&site1, &site2);
        }
    }
}

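// Per-site imaginary part of the g5-weighted scalar product: resField[ix] = sum_mu Im<s1, g5 s2>.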
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g5_prod_im_gpu(SITE_TYPE *s1, SITE_TYPE *s2, double *resField, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        resField[ix] = 0.0;
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            resField[ix] += g5_prod_im(&site1, &site2);
        }
    }
}

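// Per-site square norm: resField[ix] = sum_mu <s1, s1>, in quad-double precision.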
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void sqnorm_gpu(SITE_TYPE *s1, quad_double *resField, int N) {
    SITE_TYPE site1;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        resField[ix] = quad_double(0.0);
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            resField[ix].val += prod_re(&site1, &site1);
        }
    }
}

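// Per-site maximum: resField[ix] = max over components mu of max(s1[ix, mu]).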
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void max_gpu(SITE_TYPE *s1, double *resField, int N) {
    SITE_TYPE site1;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        resField[ix] = 0.0; // assumes max(...) is non-negative
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            const double sitemax = max(&site1);
            resField[ix] = resField[ix] > sitemax ? resField[ix] : sitemax;
        }
    }
}

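// Copy: s1 = s2.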
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void id_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; ++mu) {
            read_gpu<REAL>(N, &site, s2, ix, mu, FIELD_DIM);
            write_gpu<REAL>(N, &site, s1, ix, mu, FIELD_DIM);
        }
    }
}

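// Multiply-add: s1 += r * s2 (real scalar r).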
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void mul_add_assign_gpu(SITE_TYPE *s1, REAL r, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; ++mu) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            mul_add_assign(&site1, r, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

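// Multiply-add: s1 += c * s2 (complex scalar c).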
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE, typename COMPLEX>
__global__ void mulc_add_assign_gpu(SITE_TYPE *s1, COMPLEX c, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; ++mu) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            mulc_add_assign(&site1, c, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

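// g5 multiply-add: s1 += g5 (c * s2).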
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE, typename COMPLEX>
__global__ void g5_mulc_add_assign_gpu(SITE_TYPE *s1, COMPLEX c, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2, tmp;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            mulc(&tmp, c, &site2);
            g5(&site2, &tmp); // site2 = g5 * (c * site2)
            add_assign(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

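// Scalar multiplication: s1 = r * s2.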
template <unsigned int FIELD_DIM, typename SITE_TYPE, typename REAL>
__global__ void mul_gpu(SITE_TYPE *s1, REAL r, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            mul(&site1, r, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

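// Complex scalar multiplication: s1 = c * s2.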
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE, typename COMPLEX>
__global__ void mulc_gpu(SITE_TYPE *s1, COMPLEX c, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            mulc(&site1, c, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

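// Addition: r = s1 + s2.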
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void add_gpu(SITE_TYPE *r, SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2, res;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            add(&res, &site1, &site2);
            write_gpu<REAL>(N, &res, r, ix, mu, FIELD_DIM);
        }
    }
}

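// Subtraction: r = s1 - s2.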
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void sub_gpu(SITE_TYPE *r, SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2, res;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            sub(&res, &site1, &site2);
            write_gpu<REAL>(N, &res, r, ix, mu, FIELD_DIM);
        }
    }
}

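// In-place addition: s1 += s2.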
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void add_assign_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            add_assign(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

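// In-place subtraction: s1 -= s2.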
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void sub_assign_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            sub_assign(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

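// Set to zero: s1 = 0.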
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void zero_gpu(SITE_TYPE *s1, int N) {
    SITE_TYPE site;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            zero(&site);
            write_gpu<REAL>(N, &site, s1, ix, mu, FIELD_DIM);
        }
    }
}

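// Negation: s1 = -s2.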
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void minus_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            minus(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

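// Linear combination: s1 = r1 * s2 + r2 * s3 (real coefficients).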
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void lc_gpu(SITE_TYPE *s1, REAL r1, SITE_TYPE *s2, REAL r2, SITE_TYPE *s3, int N) {
    SITE_TYPE site1, site2, site3;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site3, s3, ix, mu, FIELD_DIM);
            lc(&site1, r1, &site2, r2, &site3);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

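// Linear combination, accumulated: s1 += r1 * s2 + r2 * s3.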
template <unsigned int FIELD_DIM, typename SITE_TYPE, typename REAL>
__global__ void lc_add_assign_gpu(SITE_TYPE *s1, REAL r1, SITE_TYPE *s2, REAL r2, SITE_TYPE *s3, int N) {
    SITE_TYPE site1, site2, site3;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site3, s3, ix, mu, FIELD_DIM);
            lc_add_assign(&site1, r1, &site2, r2, &site3);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

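// Complex linear combination: s1 = c1 * s2 + c2 * s3.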
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE, typename COMPLEX>
__global__ void clc_gpu(SITE_TYPE *s1, COMPLEX c1, SITE_TYPE *s2, COMPLEX c2, SITE_TYPE *s3, int N) {
    SITE_TYPE site1, site2, site3;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site3, s3, ix, mu, FIELD_DIM);
            clc(&site1, c1, &site2, c2, &site3);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

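// Complex linear combination, accumulated: s1 += c1 * s2 + c2 * s3.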
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE, typename COMPLEX>
__global__ void clc_add_assign_gpu(SITE_TYPE *s1, COMPLEX c1, SITE_TYPE *s2, COMPLEX c2, SITE_TYPE *s3, int N) {
    SITE_TYPE site1, site2, site3;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site3, s3, ix, mu, FIELD_DIM);
            clc_add_assign(&site1, c1, &site2, c2, &site3);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

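// Apply gamma_5: s1 = g5 * s2.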
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g5_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            g5(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

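// Apply gamma_5 in place: s1 = g5 * s1.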
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g5_assign_gpu(SITE_TYPE *s1, int N) {
    SITE_TYPE site1;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            g5_assign(&site1);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

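// Apply gamma_0: s1 = g0 * s2.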
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g0_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            g0(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

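// Apply gamma_1: s1 = g1 * s2.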
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g1_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            g1(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

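// Apply gamma_2: s1 = g2 * s2.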
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g2_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            g2(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

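// Apply gamma_3: s1 = g3 * s2.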
template <unsigned int FIELD_DIM, typename REAL, typename SITE_TYPE>
__global__ void g3_gpu(SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            g3(&site1, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

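// Single-coefficient update: s1 += r * s2.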
template <unsigned int FIELD_DIM, typename SITE_TYPE, typename REAL>
__global__ void lc1_gpu(REAL r, SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            mul_add_assign(&site1, r, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

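// Two-coefficient update: s1 = r1 * s1 + r2 * s2.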
template <unsigned int FIELD_DIM, typename SITE_TYPE, typename REAL>
__global__ void lc2_gpu(REAL r1, REAL r2, SITE_TYPE *s1, SITE_TYPE *s2, int N) {
    SITE_TYPE site1, site2;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            lc(&site1, r1, &site1, r2, &site2);
            write_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
        }
    }
}

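// Three-operand combination: s3 = -(s3 + r1 * s1 + r2 * s2).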
template <unsigned int FIELD_DIM, typename SITE_TYPE, typename REAL>
__global__ void lc3_gpu(REAL r1, REAL r2, SITE_TYPE *s1, SITE_TYPE *s2, SITE_TYPE *s3, int N) {
    SITE_TYPE site1, site2, site3;
    for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < N; ix += blockDim.x * gridDim.x) {
        for (int mu = 0; mu < FIELD_DIM; mu++) {
            read_gpu<REAL>(N, &site1, s1, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site2, s2, ix, mu, FIELD_DIM);
            read_gpu<REAL>(N, &site3, s3, ix, mu, FIELD_DIM);
            lc_add_assign(&site3, r1, &site1, r2, &site2);
            minus(&site3, &site3);
            write_gpu<REAL>(N, &site3, s3, ix, mu, FIELD_DIM);
        }
    }
}

#endif