/***************************************************************************\
* Copyright (c) 2023, Sofie Martins                                         *
* All rights reserved.                                                      *
\***************************************************************************/

#ifndef STRIDED_READS_GPU_HPP
#define STRIDED_READS_GPU_HPP

// Number of sites grouped together in the strided field layout; component
// reads within a group are separated by this stride.
#define THREADSIZE 32

// Direction selector for gauge-link accesses (see in_gauge_field).
enum DIRECTION { UP = 0, DOWN = 1 };

#ifdef __cplusplus

template <typename REAL, typename FIELD_TYPE, typename SITE_TYPE>
__host__ __device__ __forceinline__ void read_sub_assign_gpu(int stride, SITE_TYPE *s, const FIELD_TYPE *in, int ix, int comp,
                                                             int dim) {
    // Subtract-assign: (*s) -= component comp of site ix read from the strided
    // field layout. The layout groups sites in blocks of THREADSIZE; the REAL
    // components of a site are separated by THREADSIZE within a block.
    // The stride parameter is unused (kept for interface compatibility).
    (void)stride;
    const int field_dim = sizeof(FIELD_TYPE) / sizeof(REAL);
    const int n_components = sizeof(SITE_TYPE) / sizeof(REAL);
    // size_t arithmetic avoids int overflow on large lattices (consistent with read_gpu).
    size_t iz = ((size_t)(ix / THREADSIZE) * THREADSIZE) * dim * field_dim + (ix % THREADSIZE) +
                (size_t)comp * n_components * THREADSIZE;
    const int _stride = THREADSIZE;
    const REAL *in_cpx = (const REAL *)in; // keep const: the input field is read-only
    REAL *in_comp_cpx = (REAL *)s;
    for (int i = 0; i < n_components; ++i) {
        in_comp_cpx[i] -= in_cpx[iz];
        iz += _stride;
    }
}

template <typename REAL, typename FIELD_TYPE, typename SITE_TYPE>
__host__ __device__ __forceinline__ void read_assign_gpu(int stride, SITE_TYPE *s, const FIELD_TYPE *in, int ix, int comp,
                                                         int dim) {
    // Add-assign: (*s) += component comp of site ix read from the strided
    // field layout (see read_gpu for the layout description).
    // The stride parameter is unused (kept for interface compatibility).
    (void)stride;
    const int field_dim = sizeof(FIELD_TYPE) / sizeof(REAL);
    const int n_components = sizeof(SITE_TYPE) / sizeof(REAL);
    // size_t arithmetic avoids int overflow on large lattices (consistent with read_gpu).
    size_t iz = ((size_t)(ix / THREADSIZE) * THREADSIZE) * dim * field_dim + (ix % THREADSIZE) +
                (size_t)comp * n_components * THREADSIZE;
    const int _stride = THREADSIZE;
    const REAL *in_cpx = (const REAL *)in; // keep const: the input field is read-only
    REAL *in_comp_cpx = (REAL *)s;
    for (int i = 0; i < n_components; ++i) {
        in_comp_cpx[i] += in_cpx[iz];
        iz += _stride;
    }
}

template <typename REAL, typename FIELD_TYPE, typename SITE_TYPE>
__host__ __device__ __forceinline__ void read_gpu(int stride, SITE_TYPE *s, const FIELD_TYPE *in, size_t ix, int comp,
                                                  int dim) {
    // Read component comp of site ix from the strided field layout into *s.
    // Layout: sites are grouped in blocks of THREADSIZE, and within a block the
    // REAL components of one site are separated by THREADSIZE, so consecutive
    // sites (threads) touch consecutive addresses for each component.
    // The stride parameter is unused (kept for interface compatibility).
    (void)stride;
    const int field_dim = sizeof(FIELD_TYPE) / sizeof(REAL);
    const int n_components = sizeof(SITE_TYPE) / sizeof(REAL);
    size_t iz = ((ix / THREADSIZE) * THREADSIZE) * dim * field_dim + (ix % THREADSIZE) + ((comp)*n_components) * (THREADSIZE);
    const int _stride = THREADSIZE;
    const REAL *in_cpx = (const REAL *)in; // keep const: the input field is read-only
    REAL *in_comp_cpx = (REAL *)s;
    for (int i = 0; i < n_components; ++i) {
        in_comp_cpx[i] = in_cpx[iz];
        iz += _stride;
    }
}

template <typename REAL, typename FIELD_TYPE, typename SITE_TYPE>
__host__ __device__ __forceinline__ void write_gpu(int stride, SITE_TYPE *s, FIELD_TYPE *out, size_t ix, int comp, int dim) {
    // Write *s into component comp of site ix of the strided field layout
    // (inverse of read_gpu). The stride parameter is unused (kept for
    // interface compatibility).
    (void)stride;
    const int field_dim = sizeof(FIELD_TYPE) / sizeof(REAL);
    const int n_components = sizeof(SITE_TYPE) / sizeof(REAL);
    size_t iz = ((ix / THREADSIZE) * THREADSIZE) * dim * field_dim + (ix % THREADSIZE) + (comp)*n_components * (THREADSIZE);
    const int _stride = THREADSIZE;
    REAL *out_cpx = (REAL *)out;
    const REAL *out_comp_cpx = (const REAL *)s; // source site value is only read here
    for (int i = 0; i < n_components; ++i) {
        out_cpx[iz] = out_comp_cpx[i];
        iz += _stride;
    }
}

template <typename REAL, typename VECTOR_TYPE, typename SITE_TYPE>
__device__ __forceinline__ void in_spinor_field(VECTOR_TYPE *v, SITE_TYPE *in, int iy, int comp) {
    // Load component comp of the spinor at site iy from field in into *v
    // (field dimension 1; comp selects a vector component within the site).
    read_gpu<REAL>(0, v, in, iy, comp, 1);
}

template <typename REAL, typename GAUGE_TYPE>
__device__ __forceinline__ void in_gauge_field(GAUGE_TYPE *u, const GAUGE_TYPE *in, int ix, int iy, int comp, int dir) {
    // Load the gauge link in direction comp into *u: the link is read at site
    // ix for dir == UP and at the neighboring site iy for dir == DOWN.
    // Any other dir value performs no read (u is left untouched).
    if (dir == UP) {
        read_gpu<REAL>(0, u, in, ix, comp, 4);
    } else if (dir == DOWN) {
        read_gpu<REAL>(0, u, in, iy, comp, 4);
    }
}

template <typename REAL, typename SITE_TYPE>
__device__ __forceinline__ void write_out_spinor_field(SITE_TYPE *r, SITE_TYPE *in, int ix) {
    // Store the site value *r into component 0 of site ix of the field in
    // (write_gpu's second argument is the site value, third is the field).
    write_gpu<REAL>(0, r, in, ix, 0, 1);
}

template <typename REAL, typename FIELD_TYPE, typename SITE_TYPE>
__host__ __device__ __forceinline__ void write_assign_gpu(int stride, SITE_TYPE *s, FIELD_TYPE *out, int ix, int comp,
                                                          int dim) {
    // Add-assign *s into component comp of site ix of the strided field layout:
    // out[...] += (*s). NOT atomic — see write_assign_atomic_gpu for concurrent
    // accumulation. The stride parameter is unused (kept for interface compatibility).
    (void)stride;
    const int field_dim = sizeof(FIELD_TYPE) / sizeof(REAL);
    const int n_components = sizeof(SITE_TYPE) / sizeof(REAL);
    // size_t arithmetic avoids int overflow on large lattices (consistent with write_gpu).
    size_t iz = ((size_t)(ix / THREADSIZE) * THREADSIZE) * dim * field_dim + (ix % THREADSIZE) +
                (size_t)comp * n_components * THREADSIZE;
    const int _stride = THREADSIZE;
    REAL *out_cpx = (REAL *)out;
    const REAL *out_comp_cpx = (const REAL *)s; // source site value is only read here
    for (int i = 0; i < n_components; ++i) {
        out_cpx[iz] += out_comp_cpx[i];
        iz += _stride;
    }
}

template <typename REAL, typename FIELD_TYPE, typename SITE_TYPE>
__host__ __device__ __forceinline__ void write_assign_atomic_gpu(int stride, SITE_TYPE *s, FIELD_TYPE *out, int ix, int comp,
                                                                 int dim) {
    // Atomically add-assign *s into component comp of site ix of the strided
    // field layout. The stride parameter is unused (kept for interface compatibility).
    (void)stride;
    const int field_dim = sizeof(FIELD_TYPE) / sizeof(REAL);
    const int n_components = sizeof(SITE_TYPE) / sizeof(REAL);
    // size_t arithmetic avoids int overflow on large lattices (consistent with read_gpu).
    size_t iz = ((size_t)(ix / THREADSIZE) * THREADSIZE) * dim * field_dim + (ix % THREADSIZE) +
                (size_t)comp * n_components * THREADSIZE;
    const int _stride = THREADSIZE;
    REAL *out_cpx = (REAL *)out;
    const REAL *out_comp_cpx = (const REAL *)s; // source site value is only read here
    for (int i = 0; i < n_components; ++i) {
// Be aware that for lower architectures this is the same
// as the write assign!!! (atomicAdd on double requires SM60+; on the host
// path __CUDA_ARCH__ is undefined, so the plain += branch is taken too.)
#if (__CUDA_ARCH__ >= 600) || defined(HIP)
        atomicAdd(&out_cpx[iz], out_comp_cpx[i]);
#else
        out_cpx[iz] += out_comp_cpx[i];
#endif
        iz += _stride;
    }
}

__device__ __forceinline__ void read_clover(hr_complex *c, const ldl_t *in, int ix, int dn, int j) {
    // Read one hr_complex (2 doubles) of the LDL clover data at site ix into *c.
    // dn selects the first (0) or second (1) half of the site's components;
    // j indexes the hr_complex within that half.

    // Number of doubles per site
    const int n_components = sizeof(ldl_t) / sizeof(double);

    // offset: Round down to next lowest multiple of 32,
    // find the offset at double level
    int iz = ((ix / THREADSIZE) * THREADSIZE) * n_components;

    // find the first double in the array of the given index
    // components can now be found at strides of 32 away
    iz += ix % THREADSIZE;

    // Move by j hr complexes (containing 2 doubles)
    // also offset by half the components if you want to read down
    iz += 2 * j * THREADSIZE + dn * n_components * THREADSIZE / 2;

    const double *in_cpx = (const double *)in; // keep const: input field is read-only
    double *in_comp_cpx = (double *)c;
    for (int i = 0; i < 2; ++i) {
        in_comp_cpx[i] = in_cpx[iz];
        iz += THREADSIZE;
    }
}

__device__ __forceinline__ void write_clover(hr_complex *c, ldl_t *out, int ix, int dn, int j) {
    // Write one hr_complex (2 doubles) *c into the LDL clover data at site ix
    // (inverse of read_clover; see it for the index derivation).
    const int n_components = sizeof(ldl_t) / sizeof(double);
    // Block base at double granularity, then lane within the 32-site block.
    int iz = ((ix / THREADSIZE) * THREADSIZE) * n_components;
    iz += ix % THREADSIZE;
    // Advance by j hr_complexes (2 doubles each), plus half the site's
    // components when writing the second (dn == 1) half.
    iz += 2 * j * THREADSIZE + dn * n_components * THREADSIZE / 2;
    double *out_cpx = (double *)out;
    const double *out_comp_cpx = (const double *)c; // source value is only read here
    for (int i = 0; i < 2; ++i) {
        out_cpx[iz] = out_comp_cpx[i];
        iz += THREADSIZE;
    }
}

__device__ __forceinline__ void read_force(hr_complex *c, const suNf *in, int ix, int comp, int i, int j) {
    // Read matrix element (i, j) of force component comp at site ix into *c.
    // The field stores 6 suNf matrices per site in the strided layout.
    const int n_components = sizeof(suNf) / sizeof(double);
    int iz = ((ix / THREADSIZE) * THREADSIZE) * n_components * 6;
    iz += ix % THREADSIZE;
    // Move by (i, j) elements — one double each for a real representation,
    // one hr_complex (2 doubles) otherwise — then by comp whole matrices.
#ifdef REPR_IS_REAL
    iz += (i * NF + j) * THREADSIZE + comp * n_components * THREADSIZE;
#else
    iz += 2 * (i * NF + j) * THREADSIZE + comp * n_components * THREADSIZE;
#endif

    const double *in_cpx = (const double *)in; // keep const: input field is read-only
    double *in_comp_cpx = (double *)c;
    // Loop index named k to avoid shadowing the row parameter i.
    for (int k = 0; k < 2; ++k) {
        in_comp_cpx[k] = in_cpx[iz];
        iz += THREADSIZE;
    }
}

__device__ __forceinline__ void write_force(hr_complex *c, suNf *out, int ix, int comp, int i, int j) {
    // Write *c into matrix element (i, j) of force component comp at site ix
    // (inverse of read_force; see it for the index derivation).
    const int n_components = sizeof(suNf) / sizeof(double);
    int iz = ((ix / THREADSIZE) * THREADSIZE) * n_components * 6;
    iz += ix % THREADSIZE;
#ifdef REPR_IS_REAL
    iz += (i * NF + j) * THREADSIZE + comp * n_components * THREADSIZE;
#else
    iz += 2 * (i * NF + j) * THREADSIZE + comp * n_components * THREADSIZE;
#endif
    double *out_cpx = (double *)out;
    const double *out_comp_cpx = (const double *)c; // source value is only read here
    // Loop index named k to avoid shadowing the row parameter i.
    for (int k = 0; k < 2; ++k) {
        out_cpx[iz] = out_comp_cpx[k];
        iz += THREADSIZE;
    }
}

__device__ __forceinline__ void read_clover_term_comp(hr_complex *c, const suNfc *in, int ix, int comp, int i, int j) {
    // Read matrix element (i, j) of clover-term component comp at site ix into *c.
    // The field stores 4 suNfc matrices per site in the strided layout.
    const int n_components = sizeof(suNfc) / sizeof(double);
    int iz = ((ix / THREADSIZE) * THREADSIZE) * 4 * n_components;
    iz += ix % (THREADSIZE);
    iz += comp * n_components * (THREADSIZE);
    iz += (i * NF + j) * 2 * (THREADSIZE); // each element is an hr_complex (2 doubles)
    const double *in_cpx = (const double *)in; // keep const: input field is read-only
    double *in_comp_cpx = (double *)c;
    // Loop index named k to avoid shadowing the row parameter i.
    for (int k = 0; k < 2; ++k) {
        in_comp_cpx[k] = in_cpx[iz];
        iz += THREADSIZE;
    }
}

__device__ __forceinline__ void write_clover_term_comp(hr_complex *c, suNfc *out, int ix, int comp, int i, int j) {
    // Write *c into matrix element (i, j) of clover-term component comp at
    // site ix (inverse of read_clover_term_comp).
    const int n_components = sizeof(suNfc) / sizeof(double);
    int iz = ((ix / THREADSIZE) * THREADSIZE) * 4 * n_components;
    iz += ix % (THREADSIZE);
    iz += comp * n_components * (THREADSIZE);
    iz += (i * NF + j) * 2 * (THREADSIZE); // each element is an hr_complex (2 doubles)
    double *out_cpx = (double *)out;
    const double *out_comp_cpx = (const double *)c; // source value is only read here
    // Loop index named k to avoid shadowing the row parameter i.
    for (int k = 0; k < 2; ++k) {
        out_cpx[iz] = out_comp_cpx[k];
        iz += THREADSIZE;
    }
}

#endif // __cplusplus (templated device helpers above)

#ifdef __cplusplus
extern "C" {
#endif

/* Expand the strided read/write declarations for every field type.
 * Each group defines the parameters consumed by the included template
 * header: field/site type, field dimension, and REAL precision. */

#define _FIELD_TYPE spinor_field
#define _SITE_TYPE suNf_spinor
#define _FIELD_DIM 1
#define _REAL double
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE spinor_field_flt
#define _SITE_TYPE suNf_spinor_flt
#define _FIELD_DIM 1
#define _REAL float
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE scalar_field
#define _SITE_TYPE double
#define _FIELD_DIM 1
#define _REAL double
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE suNg_field
#define _SITE_TYPE suNg
#define _FIELD_DIM 4
#define _REAL double
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE suNg_field_flt
#define _SITE_TYPE suNg_flt
#define _FIELD_DIM 4
#define _REAL float
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE suNf_field
#define _SITE_TYPE suNf
#define _FIELD_DIM 4
#define _REAL double
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE suNf_field_flt
#define _SITE_TYPE suNf_flt
#define _FIELD_DIM 4
#define _REAL float
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE suNg_scalar_field
#define _SITE_TYPE suNg_vector
#define _FIELD_DIM 1
#define _REAL double
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE suNg_av_field
#define _SITE_TYPE suNg_algebra_vector
#define _FIELD_DIM 4
#define _REAL double
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE gtransf
#define _SITE_TYPE suNg
#define _FIELD_DIM 1
#define _REAL double
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE ldl_field
#define _SITE_TYPE ldl_t
#define _FIELD_DIM 1
#define _REAL double
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE clover_term
#define _SITE_TYPE suNfc
#define _FIELD_DIM 4
#define _REAL double
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE clover_force
#define _SITE_TYPE suNf
#define _FIELD_DIM 6
#define _REAL double
#include "TMPL/strided_reads_gpu.h.tmpl"

#define _FIELD_TYPE staple_field
#define _SITE_TYPE suNg
#define _FIELD_DIM 3
#define _REAL double
#include "TMPL/strided_reads_gpu.h.tmpl"

#ifdef __cplusplus
} // extern "C"
#endif

#endif // STRIDED_READS_GPU_HPP