9#ifndef STRIDED_READS_GPU_HPP
10#define STRIDED_READS_GPU_HPP
// Direction of a neighbor site relative to the current site
// (used to pick which gauge link to read in in_gauge_field).
enum DIRECTION { UP = 0, DOWN = 1 };
/**
 * @brief Read the site element (ix, comp) from the strided field layout and
 *        subtract it component-wise from *s (i.e. *s -= in[ix][comp]).
 *
 * Layout (as encoded in the index arithmetic below): sites are grouped in
 * blocks of THREADSIZE; within a block the REAL components of one site
 * element are interleaved with stride THREADSIZE, so consecutive threads
 * touch consecutive addresses (coalesced access on the GPU).
 *
 * @param stride Unused; kept for interface compatibility with callers.
 * @param s      Destination site element (subtract-assign target).
 * @param in     Base pointer of the strided field.
 * @param ix     Site index.
 * @param comp   Index of the field element at this site (e.g. link direction).
 * @param dim    Number of FIELD_TYPE elements stored per lattice site
 *               (1 for spinor fields, 4 for gauge fields — see the callers).
 */
template <typename REAL, typename FIELD_TYPE, typename SITE_TYPE>
__host__ __device__ __forceinline__ void read_sub_assign_gpu(int stride, SITE_TYPE *s, const FIELD_TYPE *in, int ix,
                                                             int comp, int dim) {
    const int field_dim = sizeof(FIELD_TYPE) / sizeof(REAL);    // REALs per field element
    const int n_components = sizeof(SITE_TYPE) / sizeof(REAL);  // REALs per site element
    // Offset of the first REAL of (site ix, component comp).
    int iz = ((ix / THREADSIZE) * THREADSIZE) * dim * field_dim + (ix % THREADSIZE) + comp * n_components * THREADSIZE;
    const int _stride = THREADSIZE;
    REAL *in_cpx = (REAL *)in;
    REAL *in_comp_cpx = (REAL *)s;
    for (int i = 0; i < n_components; ++i) {
        in_comp_cpx[i] -= in_cpx[iz];
        iz += _stride; // advance to the next strided REAL component
    }
}
/**
 * @brief Read the site element (ix, comp) from the strided field layout and
 *        add it component-wise to *s (i.e. *s += in[ix][comp]).
 *
 * Sites are grouped in blocks of THREADSIZE; within a block the REAL
 * components of one site element are interleaved with stride THREADSIZE
 * (coalesced access on the GPU).
 *
 * @param stride Unused; kept for interface compatibility with callers.
 * @param s      Destination site element (add-assign target).
 * @param in     Base pointer of the strided field.
 * @param ix     Site index.
 * @param comp   Index of the field element at this site.
 * @param dim    Number of FIELD_TYPE elements stored per lattice site.
 */
template <typename REAL, typename FIELD_TYPE, typename SITE_TYPE>
__host__ __device__ __forceinline__ void read_assign_gpu(int stride, SITE_TYPE *s, const FIELD_TYPE *in, int ix,
                                                         int comp, int dim) {
    const int field_dim = sizeof(FIELD_TYPE) / sizeof(REAL);    // REALs per field element
    const int n_components = sizeof(SITE_TYPE) / sizeof(REAL);  // REALs per site element
    // Offset of the first REAL of (site ix, component comp).
    int iz = ((ix / THREADSIZE) * THREADSIZE) * dim * field_dim + (ix % THREADSIZE) + comp * n_components * THREADSIZE;
    const int _stride = THREADSIZE;
    REAL *in_cpx = (REAL *)in;
    REAL *in_comp_cpx = (REAL *)s;
    for (int i = 0; i < n_components; ++i) {
        in_comp_cpx[i] += in_cpx[iz];
        iz += _stride; // advance to the next strided REAL component
    }
}
/**
 * @brief Read the site element (ix, comp) from the strided field layout
 *        into *s (plain copy, overwrites *s).
 *
 * Sites are grouped in blocks of THREADSIZE; within a block the REAL
 * components of one site element are interleaved with stride THREADSIZE
 * (coalesced access on the GPU). Uses size_t indices so large fields whose
 * flat REAL index exceeds INT_MAX are addressed correctly.
 *
 * @param stride Unused; kept for interface compatibility with callers.
 * @param s      Destination site element (overwritten).
 * @param in     Base pointer of the strided field.
 * @param ix     Site index.
 * @param comp   Index of the field element at this site.
 * @param dim    Number of FIELD_TYPE elements stored per lattice site.
 */
template <typename REAL, typename FIELD_TYPE, typename SITE_TYPE>
__host__ __device__ __forceinline__ void read_gpu(int stride, SITE_TYPE *s, const FIELD_TYPE *in, size_t ix, int comp,
                                                  int dim) {
    const int field_dim = sizeof(FIELD_TYPE) / sizeof(REAL);    // REALs per field element
    const int n_components = sizeof(SITE_TYPE) / sizeof(REAL);  // REALs per site element
    // Offset of the first REAL of (site ix, component comp).
    size_t iz = ((ix / THREADSIZE) * THREADSIZE) * dim * field_dim + (ix % THREADSIZE) + comp * n_components * THREADSIZE;
    const int _stride = THREADSIZE;
    REAL *in_cpx = (REAL *)in;
    REAL *in_comp_cpx = (REAL *)s;
    for (int i = 0; i < n_components; ++i) {
        in_comp_cpx[i] = in_cpx[iz];
        iz += _stride; // advance to the next strided REAL component
    }
}
/**
 * @brief Write the site element *s into slot (ix, comp) of the strided field
 *        layout (plain copy, overwrites the field entry).
 *
 * Sites are grouped in blocks of THREADSIZE; within a block the REAL
 * components of one site element are interleaved with stride THREADSIZE
 * (coalesced access on the GPU). Uses size_t indices for large fields.
 *
 * @param stride Unused; kept for interface compatibility with callers.
 * @param s      Source site element.
 * @param out    Base pointer of the strided destination field.
 * @param ix     Site index.
 * @param comp   Index of the field element at this site.
 * @param dim    Number of FIELD_TYPE elements stored per lattice site.
 */
template <typename REAL, typename FIELD_TYPE, typename SITE_TYPE>
__host__ __device__ __forceinline__ void write_gpu(int stride, SITE_TYPE *s, FIELD_TYPE *out, size_t ix, int comp,
                                                   int dim) {
    const int field_dim = sizeof(FIELD_TYPE) / sizeof(REAL);    // REALs per field element
    const int n_components = sizeof(SITE_TYPE) / sizeof(REAL);  // REALs per site element
    // Offset of the first REAL of (site ix, component comp).
    size_t iz = ((ix / THREADSIZE) * THREADSIZE) * dim * field_dim + (ix % THREADSIZE) + comp * n_components * THREADSIZE;
    const int _stride = THREADSIZE;
    REAL *out_cpx = (REAL *)out;
    REAL *out_comp_cpx = (REAL *)s;
    for (int i = 0; i < n_components; ++i) {
        out_cpx[iz] = out_comp_cpx[i];
        iz += _stride; // advance to the next strided REAL component
    }
}
/**
 * @brief Load component comp of the spinor at site iy into *v.
 *
 * Thin wrapper over read_gpu; spinor fields store one site element per
 * lattice site, hence dim = 1 (stride argument is unused by read_gpu).
 */
template <typename REAL, typename VECTOR_TYPE, typename SITE_TYPE>
__device__ __forceinline__ void in_spinor_field(VECTOR_TYPE *v, SITE_TYPE *in, int iy, int comp) {
    read_gpu<REAL>(0, v, in, iy, comp, 1);
}
/**
 * @brief Load gauge link component comp into *u, selecting the source site
 *        by direction: the current site ix for UP, the neighbor iy for DOWN.
 *
 * Gauge fields store 4 links per lattice site, hence dim = 4.
 * If dir is neither UP nor DOWN, *u is left untouched.
 */
template <typename REAL, typename GAUGE_TYPE>
__device__ __forceinline__ void in_gauge_field(GAUGE_TYPE *u, const GAUGE_TYPE *in, int ix, int iy, int comp, int dir) {
    if (dir == UP) {
        read_gpu<REAL>(0, u, in, ix, comp, 4);
    } else if (dir == DOWN) {
        read_gpu<REAL>(0, u, in, iy, comp, 4);
    }
}
/**
 * @brief Store the spinor *in at site ix of the strided result field r.
 *
 * Thin wrapper over write_gpu with comp = 0 and dim = 1 (one site element
 * per lattice site for spinor fields).
 */
template <typename REAL, typename SITE_TYPE>
__device__ __forceinline__ void write_out_spinor_field(SITE_TYPE *r, SITE_TYPE *in, int ix) {
    write_gpu<REAL>(0, r, in, ix, 0, 1);
}
/**
 * @brief Add the site element *s component-wise into slot (ix, comp) of the
 *        strided field layout (out[ix][comp] += *s). NOT atomic — callers
 *        must guarantee no two threads update the same slot concurrently
 *        (see write_assign_atomic_gpu for the atomic variant).
 *
 * @param stride Unused; kept for interface compatibility with callers.
 * @param s      Source site element.
 * @param out    Base pointer of the strided destination field.
 * @param ix     Site index.
 * @param comp   Index of the field element at this site.
 * @param dim    Number of FIELD_TYPE elements stored per lattice site.
 */
template <typename REAL, typename FIELD_TYPE, typename SITE_TYPE>
__host__ __device__ __forceinline__ void write_assign_gpu(int stride, SITE_TYPE *s, FIELD_TYPE *out, int ix, int comp,
                                                          int dim) {
    const int field_dim = sizeof(FIELD_TYPE) / sizeof(REAL);    // REALs per field element
    const int n_components = sizeof(SITE_TYPE) / sizeof(REAL);  // REALs per site element
    // Offset of the first REAL of (site ix, component comp).
    int iz = ((ix / THREADSIZE) * THREADSIZE) * dim * field_dim + (ix % THREADSIZE) + comp * n_components * THREADSIZE;
    const int _stride = THREADSIZE;
    REAL *out_cpx = (REAL *)out;
    REAL *out_comp_cpx = (REAL *)s;
    for (int i = 0; i < n_components; ++i) {
        out_cpx[iz] += out_comp_cpx[i];
        iz += _stride; // advance to the next strided REAL component
    }
}
/**
 * @brief Atomically add the site element *s into slot (ix, comp) of the
 *        strided field layout (out[ix][comp] += *s).
 *
 * Uses atomicAdd when available (CUDA requires SM60+ for double-precision
 * atomicAdd; HIP provides it unconditionally); otherwise falls back to a
 * plain, non-atomic add — on those targets callers must avoid concurrent
 * updates to the same slot themselves.
 *
 * @param stride Unused; kept for interface compatibility with callers.
 * @param s      Source site element.
 * @param out    Base pointer of the strided destination field.
 * @param ix     Site index.
 * @param comp   Index of the field element at this site.
 * @param dim    Number of FIELD_TYPE elements stored per lattice site.
 */
template <typename REAL, typename FIELD_TYPE, typename SITE_TYPE>
__host__ __device__ __forceinline__ void write_assign_atomic_gpu(int stride, SITE_TYPE *s, FIELD_TYPE *out, int ix,
                                                                 int comp, int dim) {
    const int field_dim = sizeof(FIELD_TYPE) / sizeof(REAL);    // REALs per field element
    const int n_components = sizeof(SITE_TYPE) / sizeof(REAL);  // REALs per site element
    // Offset of the first REAL of (site ix, component comp).
    int iz = ((ix / THREADSIZE) * THREADSIZE) * dim * field_dim + (ix % THREADSIZE) + comp * n_components * THREADSIZE;
    const int _stride = THREADSIZE;
    REAL *out_cpx = (REAL *)out;
    REAL *out_comp_cpx = (REAL *)s;
    for (int i = 0; i < n_components; ++i) {
#if (__CUDA_ARCH__ >= 600) || defined(HIP)
        atomicAdd(&out_cpx[iz], out_comp_cpx[i]);
#else
        // Non-atomic fallback for targets without double atomicAdd.
        out_cpx[iz] += out_comp_cpx[i];
#endif
        iz += _stride; // advance to the next strided REAL component
    }
}
/**
 * @brief Read one complex entry of the LDL-decomposed clover term at site ix
 *        into *c.
 *
 * @param c  Destination complex number (2 doubles: real, imaginary).
 * @param in Base pointer of the strided ldl_t field.
 * @param ix Site index.
 * @param dn Selects the half of the site element (0 or 1; the offset
 *           dn * n_components / 2 splits the element in two equal parts).
 * @param j  Complex-entry index within the selected half.
 */
__device__ __forceinline__ void read_clover(hr_complex *c, const ldl_t *in, int ix, int dn, int j) {
    const int n_components = sizeof(ldl_t) / sizeof(double); // doubles per site element
    // Base of the THREADSIZE-site block containing ix ...
    int iz = ((ix / THREADSIZE) * THREADSIZE) * n_components;
    // ... plus the lane of site ix within the block ...
    iz += ix % THREADSIZE;
    // ... plus the offset of complex entry j in half dn.
    iz += 2 * j * THREADSIZE + dn * n_components * THREADSIZE / 2;
    double *in_cpx = (double *)in;
    double *in_comp_cpx = (double *)c;
    // Copy real and imaginary part (strided by THREADSIZE in the field).
    for (int i = 0; i < 2; ++i) {
        in_comp_cpx[i] = in_cpx[iz];
        iz += THREADSIZE;
    }
}
/**
 * @brief Write the complex number *c into one entry of the LDL-decomposed
 *        clover term at site ix (inverse of read_clover).
 *
 * @param c   Source complex number (2 doubles: real, imaginary).
 * @param out Base pointer of the strided ldl_t field.
 * @param ix  Site index.
 * @param dn  Selects the half of the site element (0 or 1).
 * @param j   Complex-entry index within the selected half.
 */
__device__ __forceinline__ void write_clover(hr_complex *c, ldl_t *out, int ix, int dn, int j) {
    const int n_components = sizeof(ldl_t) / sizeof(double); // doubles per site element
    // Block base + lane of site ix + offset of complex entry j in half dn.
    int iz = ((ix / THREADSIZE) * THREADSIZE) * n_components;
    iz += ix % THREADSIZE;
    iz += 2 * j * THREADSIZE + dn * n_components * THREADSIZE / 2;
    double *out_cpx = (double *)out;
    double *out_comp_cpx = (double *)c;
    // Store real and imaginary part (strided by THREADSIZE in the field).
    for (int i = 0; i < 2; ++i) {
        out_cpx[iz] = out_comp_cpx[i];
        iz += THREADSIZE;
    }
}
/**
 * @brief Read matrix entry (i, j) of component comp of the 6-component force
 *        field at site ix into the complex *c.
 *
 * @param c    Destination complex number (2 doubles: real, imaginary).
 * @param in   Base pointer of the strided suNf force field (6 per site).
 * @param ix   Site index.
 * @param comp Force component index (0..5).
 * @param i    Matrix row index (0..NF-1).
 * @param j    Matrix column index (0..NF-1).
 */
__device__ __forceinline__ void read_force(hr_complex *c, const suNf *in, int ix, int comp, int i, int j) {
    const int n_components = sizeof(suNf) / sizeof(double); // doubles per suNf
    // Block base (6 suNf per site) + lane of site ix within the block.
    int iz = ((ix / THREADSIZE) * THREADSIZE) * n_components * 6;
    iz += ix % THREADSIZE;
// NOTE(review): guard macro reconstructed from a garbled extraction — the
// original selects between these two offsets (real vs. complex matrix
// entries); confirm the exact macro name against version control.
#ifdef REPR_IS_REAL
    iz += (i * NF + j) * THREADSIZE + comp * n_components * THREADSIZE;
#else
    iz += 2 * (i * NF + j) * THREADSIZE + comp * n_components * THREADSIZE;
#endif
    double *in_cpx = (double *)in;
    double *in_comp_cpx = (double *)c;
    // Copy real and imaginary part; loop variable renamed to avoid shadowing
    // the row-index parameter i.
    for (int k = 0; k < 2; ++k) {
        in_comp_cpx[k] = in_cpx[iz];
        iz += THREADSIZE;
    }
}
/**
 * @brief Write the complex *c into matrix entry (i, j) of component comp of
 *        the 6-component force field at site ix (inverse of read_force).
 *
 * @param c    Source complex number (2 doubles: real, imaginary).
 * @param out  Base pointer of the strided suNf force field (6 per site).
 * @param ix   Site index.
 * @param comp Force component index (0..5).
 * @param i    Matrix row index (0..NF-1).
 * @param j    Matrix column index (0..NF-1).
 */
__device__ __forceinline__ void write_force(hr_complex *c, suNf *out, int ix, int comp, int i, int j) {
    const int n_components = sizeof(suNf) / sizeof(double); // doubles per suNf
    // Block base (6 suNf per site) + lane of site ix within the block.
    int iz = ((ix / THREADSIZE) * THREADSIZE) * n_components * 6;
    iz += ix % THREADSIZE;
// NOTE(review): guard macro reconstructed from a garbled extraction — the
// original selects between these two offsets (real vs. complex matrix
// entries); confirm the exact macro name against version control.
#ifdef REPR_IS_REAL
    iz += (i * NF + j) * THREADSIZE + comp * n_components * THREADSIZE;
#else
    iz += 2 * (i * NF + j) * THREADSIZE + comp * n_components * THREADSIZE;
#endif
    double *out_cpx = (double *)out;
    double *out_comp_cpx = (double *)c;
    // Store real and imaginary part; loop variable renamed to avoid shadowing
    // the row-index parameter i.
    for (int k = 0; k < 2; ++k) {
        out_cpx[iz] = out_comp_cpx[k];
        iz += THREADSIZE;
    }
}
/**
 * @brief Read matrix entry (i, j) of component comp of the 4-component
 *        clover term at site ix into the complex *c.
 *
 * @param c    Destination complex number (2 doubles: real, imaginary).
 * @param in   Base pointer of the strided suNfc field (4 per site).
 * @param ix   Site index.
 * @param comp Clover-term component index (0..3).
 * @param i    Matrix row index (0..NF-1).
 * @param j    Matrix column index (0..NF-1).
 */
__device__ __forceinline__ void read_clover_term_comp(hr_complex *c, const suNfc *in, int ix, int comp, int i, int j) {
    const int n_components = sizeof(suNfc) / sizeof(double); // doubles per suNfc
    // Block base (4 suNfc per site) + lane of site ix within the block.
    int iz = ((ix / THREADSIZE) * THREADSIZE) * 4 * n_components;
    iz += ix % THREADSIZE;
    // Offset of component comp, then of complex matrix entry (i, j).
    iz += comp * n_components * THREADSIZE;
    iz += (i * NF + j) * 2 * THREADSIZE;
    double *in_cpx = (double *)in;
    double *in_comp_cpx = (double *)c;
    // Copy real and imaginary part; loop variable renamed to avoid shadowing
    // the row-index parameter i.
    for (int k = 0; k < 2; ++k) {
        in_comp_cpx[k] = in_cpx[iz];
        iz += THREADSIZE;
    }
}
/**
 * @brief Write the complex *c into matrix entry (i, j) of component comp of
 *        the 4-component clover term at site ix (inverse of
 *        read_clover_term_comp).
 *
 * @param c    Source complex number (2 doubles: real, imaginary).
 * @param out  Base pointer of the strided suNfc field (4 per site).
 * @param ix   Site index.
 * @param comp Clover-term component index (0..3).
 * @param i    Matrix row index (0..NF-1).
 * @param j    Matrix column index (0..NF-1).
 */
__device__ __forceinline__ void write_clover_term_comp(hr_complex *c, suNfc *out, int ix, int comp, int i, int j) {
    const int n_components = sizeof(suNfc) / sizeof(double); // doubles per suNfc
    // Block base (4 suNfc per site) + lane of site ix within the block.
    int iz = ((ix / THREADSIZE) * THREADSIZE) * 4 * n_components;
    iz += ix % THREADSIZE;
    // Offset of component comp, then of complex matrix entry (i, j).
    iz += comp * n_components * THREADSIZE;
    iz += (i * NF + j) * 2 * THREADSIZE;
    double *out_cpx = (double *)out;
    double *out_comp_cpx = (double *)c;
    // Store real and imaginary part; loop variable renamed to avoid shadowing
    // the row-index parameter i.
    for (int k = 0; k < 2; ++k) {
        out_cpx[iz] = out_comp_cpx[k];
        iz += THREADSIZE;
    }
}
// Instantiations of the strided read/write accessors for every field type
// used in the codebase; the included template file consumes (and presumably
// undefines) _FIELD_TYPE / _SITE_TYPE.
// NOTE(review): this section was reconstructed from a garbled extraction; the
// original numbering shows gaps between the #define pairs and each #include
// (possibly blank lines or additional #defines) — confirm against version
// control.

#define _FIELD_TYPE spinor_field
#define _SITE_TYPE suNf_spinor
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE spinor_field_flt
#define _SITE_TYPE suNf_spinor_flt
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE scalar_field
#define _SITE_TYPE double
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE suNg_field
#define _SITE_TYPE suNg
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE suNg_field_flt
#define _SITE_TYPE suNg_flt
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE suNf_field
#define _SITE_TYPE suNf
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE suNf_field_flt
#define _SITE_TYPE suNf_flt
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE suNg_scalar_field
#define _SITE_TYPE suNg_vector
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE suNg_av_field
#define _SITE_TYPE suNg_algebra_vector
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE gtransf
#define _SITE_TYPE suNg
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE ldl_field
#define _SITE_TYPE ldl_t
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE clover_term
#define _SITE_TYPE suNfc
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE clover_force
#define _SITE_TYPE suNf
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE staple_field
#define _SITE_TYPE suNg
#include "TMPL/strided_reads_gpu.h.tmpl"