#if (NF == 3) || (NG == 3)

#define _MVM_3x3C_AVX2(mc, mu, mp) \
    do { \
        __m256d temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15, \
            temp16; \
        __m128d chi_3rd; \
        temp1 = _mm256_loadu_pd((double *)(&mu)); \
        temp2 = _mm256_shuffle_pd(temp1, temp1, 0b0000); \
        temp3 = _mm256_shuffle_pd(temp1, temp1, 0b1111); \
        temp8 = _mm256_loadu_pd((double *)(&mu) + 2); \
        temp1 = _mm256_loadu_pd((double *)(&mu) + 6); \
        temp4 = _mm256_shuffle_pd(temp1, temp1, 0b0000); \
        temp5 = _mm256_shuffle_pd(temp1, temp1, 0b1111); \
        temp9 = _mm256_loadu_pd((double *)(&mu) + 8); \
        temp1 = _mm256_loadu_pd((double *)(&mu) + 12); \
        temp6 = _mm256_shuffle_pd(temp1, temp1, 0b0000); \
        temp7 = _mm256_shuffle_pd(temp1, temp1, 0b1111); \
        temp1 = _mm256_loadu_pd((double *)(&mp)); \
        temp14 = _mm256_shuffle_pd(temp1, temp1, 0b0101); \
        temp16 = _mm256_loadu_pd((double *)(&mp) + 2); \
        temp3 = _mm256_mul_pd(temp3, temp14); \
        temp2 = _mm256_fmaddsub_pd(temp2, temp1, temp3); \
        temp5 = _mm256_mul_pd(temp5, temp14); \
        temp4 = _mm256_fmaddsub_pd(temp4, temp1, temp5); \
        temp3 = _mm256_permute2f128_pd(temp4, temp2, 2); \
        temp2 = _mm256_permute2f128_pd(temp2, temp2, 1); \
        temp2 = _mm256_blend_pd(temp2, temp4, 12); \
        temp2 = _mm256_add_pd(temp3, temp2); \
        temp8 = _mm256_permute2f128_pd(temp8, temp8, 1); \
        temp8 = _mm256_blend_pd(temp8, temp9, 12); \
        temp9 = _mm256_permute2f128_pd(temp16, temp16, 1); \
        temp4 = _mm256_blend_pd(temp9, temp16, 12); \
        temp10 = _mm256_shuffle_pd(temp8, temp8, 0b0000); \
        temp13 = _mm256_shuffle_pd(temp8, temp8, 0b1111); \
        temp15 = _mm256_shuffle_pd(temp4, temp4, 0b0101); \
        temp13 = _mm256_mul_pd(temp13, temp15); \
        temp10 = _mm256_fmaddsub_pd(temp10, temp4, temp13); \
        temp2 = _mm256_add_pd(temp2, temp10); \
        temp7 = _mm256_mul_pd(temp7, temp14); \
        temp6 = _mm256_fmaddsub_pd(temp6, temp1, temp7); \
        temp10 = _mm256_permute2f128_pd(temp6, temp6, 1); \
        temp1 = _mm256_add_pd(temp10, temp6); \
        temp6 = _mm256_loadu_pd((double *)(&mu) + 14); \
        temp10 = _mm256_permute2f128_pd(temp6, temp6, 1); \
        temp10 = _mm256_blend_pd(temp10, temp6, 12); \
        temp11 = _mm256_shuffle_pd(temp10, temp10, 0b0000); \
        temp12 = _mm256_shuffle_pd(temp10, temp10, 0b1111); \
        temp12 = _mm256_mul_pd(temp12, temp15); \
        temp11 = _mm256_fmaddsub_pd(temp11, temp4, temp12); \
        temp11 = _mm256_add_pd(temp1, temp11); \
        chi_3rd = _mm256_castpd256_pd128(temp11); \
        _mm256_store_pd((double *)(&mc), temp2); \
        _mm_store_pd((double *)(&mc) + 4, chi_3rd); \
    } while (0)
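
/*
 * Reference semantics (sketch, not part of the build): _MVM_3x3C_AVX2 computes the
 * complex matrix-vector product mc = mu * mp, i.e. mc_i = sum_j mu_ij * mp_j for
 * i, j = 0..2, assuming the row-major, interleaved (re, im) double layout implied by
 * the load offsets above (rows of mu at a stride of 6 doubles). A scalar equivalent
 * over raw doubles would be:
 *
 *   static void mvm_3x3c_ref(double c[6], const double u[18], const double p[6]) {
 *       for (int i = 0; i < 3; i++) {
 *           double re = 0.0, im = 0.0;
 *           for (int j = 0; j < 3; j++) {
 *               re += u[6 * i + 2 * j] * p[2 * j] - u[6 * i + 2 * j + 1] * p[2 * j + 1];
 *               im += u[6 * i + 2 * j] * p[2 * j + 1] + u[6 * i + 2 * j + 1] * p[2 * j];
 *           }
 *           c[2 * i] = re;
 *           c[2 * i + 1] = im;
 *       }
 *   }
 */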

#define _MTVM_3x3C_AVX2(mc, mu, mp) \
    do { \
        __m256d temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
        const __m256d simd_mask = _mm256_set_pd(-1.0, 1.0, -1.0, 1.0); \
        __m128d chi_3rd; \
        temp1 = _mm256_loadu_pd((double *)(&mu)); \
        temp1 = _mm256_mul_pd(temp1, simd_mask); \
        temp2 = _mm256_loadu_pd((double *)(&mu) + 6); \
        temp2 = _mm256_mul_pd(temp2, simd_mask); \
        temp3 = _mm256_permute2f128_pd(temp2, temp1, 2); \
        temp1 = _mm256_permute2f128_pd(temp1, temp1, 1); \
        temp1 = _mm256_blend_pd(temp1, temp2, 12); \
        temp2 = _mm256_shuffle_pd(temp3, temp3, 0b0101); \
        temp4 = _mm256_shuffle_pd(temp1, temp1, 0b0101); \
        temp5 = _mm256_loadu_pd((double *)(&mp)); \
        temp6 = _mm256_shuffle_pd(temp5, temp5, 0b0000); \
        temp5 = _mm256_shuffle_pd(temp5, temp5, 0b1111); \
        temp2 = _mm256_mul_pd(temp5, temp2); \
        temp2 = _mm256_fmaddsub_pd(temp6, temp3, temp2); \
        temp3 = _mm256_mul_pd(temp5, temp4); \
        temp1 = _mm256_fmaddsub_pd(temp6, temp1, temp3); \
        temp3 = _mm256_permute2f128_pd(temp1, temp2, 2); \
        temp2 = _mm256_permute2f128_pd(temp2, temp2, 1); \
        temp1 = _mm256_blend_pd(temp2, temp1, 12); \
        temp1 = _mm256_add_pd(temp3, temp1); \
        temp3 = _mm256_loadu_pd((double *)(&mp) + 2); \
        temp2 = _mm256_permute2f128_pd(temp3, temp3, 1); \
        temp2 = _mm256_blend_pd(temp2, temp3, 12); \
        temp3 = _mm256_shuffle_pd(temp2, temp2, 0b0000); \
        temp2 = _mm256_shuffle_pd(temp2, temp2, 0b1111); \
        temp4 = _mm256_loadu_pd((double *)(&mu) + 12); \
        temp4 = _mm256_mul_pd(temp4, simd_mask); \
        temp7 = _mm256_shuffle_pd(temp4, temp4, 0b0101); \
        temp7 = _mm256_mul_pd(temp2, temp7); \
        temp4 = _mm256_fmaddsub_pd(temp3, temp4, temp7); \
        temp1 = _mm256_add_pd(temp1, temp4); \
        temp4 = _mm256_loadu_pd((double *)(&mu) + 2); \
        temp4 = _mm256_mul_pd(temp4, simd_mask); \
        temp7 = _mm256_loadu_pd((double *)(&mu) + 8); \
        temp7 = _mm256_mul_pd(temp7, simd_mask); \
        temp4 = _mm256_permute2f128_pd(temp4, temp4, 1); \
        temp4 = _mm256_blend_pd(temp4, temp7, 12); \
        temp7 = _mm256_shuffle_pd(temp4, temp4, 0b0101); \
        temp5 = _mm256_mul_pd(temp5, temp7); \
        temp4 = _mm256_fmaddsub_pd(temp6, temp4, temp5); \
        temp5 = _mm256_permute2f128_pd(temp4, temp4, 1); \
        temp5 = _mm256_add_pd(temp5, temp4); \
        temp4 = _mm256_loadu_pd((double *)(&mu) + 14); \
        temp4 = _mm256_mul_pd(temp4, simd_mask); \
        temp6 = _mm256_permute2f128_pd(temp4, temp4, 1); \
        temp4 = _mm256_blend_pd(temp6, temp4, 12); \
        temp6 = _mm256_shuffle_pd(temp4, temp4, 0b0101); \
        temp2 = _mm256_mul_pd(temp2, temp6); \
        temp2 = _mm256_fmaddsub_pd(temp3, temp4, temp2); \
        temp2 = _mm256_add_pd(temp5, temp2); \
        chi_3rd = _mm256_castpd256_pd128(temp2); \
        _mm256_store_pd((double *)(&mc), temp1); \
        _mm_store_pd((double *)(&mc) + 4, chi_3rd); \
    } while (0)
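
/*
 * Reference semantics (sketch, not part of the build): _MTVM_3x3C_AVX2 applies the
 * conjugate transpose of the matrix, mc = mu^dagger * mp, i.e.
 * mc_i = sum_j conj(mu_ji) * mp_j. The simd_mask of alternating (1, -1) flips the
 * sign of the imaginary parts of mu (complex conjugation), and the permute/blend
 * steps regroup the row-major loads so they are consumed column by column. The
 * scalar inner step, in the same layout as the sketch above, would read
 *
 *   re += u[6 * j + 2 * i] * p[2 * j] + u[6 * j + 2 * i + 1] * p[2 * j + 1];
 *   im += u[6 * j + 2 * i] * p[2 * j + 1] - u[6 * j + 2 * i + 1] * p[2 * j];
 */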

#define _double_MVM_3x3C_AVX2(mc, mc2, mu, mp, mp2) \
    do { \
        __m256d temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15, \
            temp16, temp17; \
        __m128d chi_3rd, chi2_3rd; \
        temp1 = _mm256_loadu_pd((double *)(&mu)); \
        temp6 = _mm256_shuffle_pd(temp1, temp1, 0b0000); \
        temp1 = _mm256_shuffle_pd(temp1, temp1, 0b1111); \
        temp2 = _mm256_loadu_pd((double *)(&mu) + 6); \
        temp7 = _mm256_shuffle_pd(temp2, temp2, 0b0000); \
        temp2 = _mm256_shuffle_pd(temp2, temp2, 0b1111); \
        temp3 = _mm256_loadu_pd((double *)(&mu) + 12); \
        temp8 = _mm256_shuffle_pd(temp3, temp3, 0b0000); \
        temp3 = _mm256_shuffle_pd(temp3, temp3, 0b1111); \
        temp4 = _mm256_loadu_pd((double *)(&mp)); \
        temp9 = _mm256_shuffle_pd(temp4, temp4, 0b0101); \
        temp5 = _mm256_loadu_pd((double *)(&mp2)); \
        temp10 = _mm256_shuffle_pd(temp5, temp5, 0b0101); \
        temp12 = _mm256_mul_pd(temp1, temp9); \
        temp11 = _mm256_fmaddsub_pd(temp6, temp4, temp12); \
        temp13 = _mm256_mul_pd(temp2, temp9); \
        temp12 = _mm256_fmaddsub_pd(temp7, temp4, temp13); \
        temp13 = _mm256_permute2f128_pd(temp12, temp11, 2); \
        temp11 = _mm256_permute2f128_pd(temp11, temp11, 1); \
        temp11 = _mm256_blend_pd(temp11, temp12, 12); \
        temp11 = _mm256_add_pd(temp13, temp11); \
        temp12 = _mm256_loadu_pd((double *)(&mu) + 2); \
        temp12 = _mm256_permute2f128_pd(temp12, temp12, 1); \
        temp13 = _mm256_loadu_pd((double *)(&mu) + 8); \
        temp12 = _mm256_blend_pd(temp12, temp13, 12); \
        temp13 = _mm256_loadu_pd((double *)(&mp) + 2); \
        temp16 = _mm256_permute2f128_pd(temp13, temp13, 1); \
        temp13 = _mm256_blend_pd(temp16, temp13, 12); \
        temp15 = _mm256_shuffle_pd(temp12, temp12, 0b0000); \
        temp12 = _mm256_shuffle_pd(temp12, temp12, 0b1111); \
        temp14 = _mm256_shuffle_pd(temp13, temp13, 0b0101); \
        temp14 = _mm256_mul_pd(temp12, temp14); \
        temp13 = _mm256_fmaddsub_pd(temp15, temp13, temp14); \
        temp11 = _mm256_add_pd(temp11, temp13); \
        temp1 = _mm256_mul_pd(temp1, temp10); \
        temp1 = _mm256_fmaddsub_pd(temp6, temp5, temp1); \
        temp2 = _mm256_mul_pd(temp2, temp10); \
        temp7 = _mm256_fmaddsub_pd(temp7, temp5, temp2); \
        temp13 = _mm256_permute2f128_pd(temp7, temp1, 2); \
        temp1 = _mm256_permute2f128_pd(temp1, temp1, 1); \
        temp1 = _mm256_blend_pd(temp1, temp7, 12); \
        temp1 = _mm256_add_pd(temp13, temp1); \
        temp13 = _mm256_loadu_pd((double *)(&mp2) + 2); \
        temp14 = _mm256_permute2f128_pd(temp13, temp13, 1); \
        temp14 = _mm256_blend_pd(temp14, temp13, 12); \
        temp17 = _mm256_shuffle_pd(temp14, temp14, 0b0101); \
        temp12 = _mm256_mul_pd(temp12, temp17); \
        temp12 = _mm256_fmaddsub_pd(temp15, temp14, temp12); \
        temp1 = _mm256_add_pd(temp1, temp12); \
        temp12 = _mm256_mul_pd(temp3, temp9); \
        temp4 = _mm256_fmaddsub_pd(temp8, temp4, temp12); \
        temp3 = _mm256_mul_pd(temp3, temp10); \
        temp3 = _mm256_fmaddsub_pd(temp8, temp5, temp3); \
        temp5 = _mm256_permute2f128_pd(temp3, temp4, 2); \
        temp4 = _mm256_permute2f128_pd(temp4, temp4, 1); \
        temp3 = _mm256_blend_pd(temp4, temp3, 12); \
        temp3 = _mm256_add_pd(temp5, temp3); \
        temp9 = _mm256_loadu_pd((double *)(&mu) + 14); \
        temp10 = _mm256_permute2f128_pd(temp9, temp9, 1); \
        temp9 = _mm256_blend_pd(temp10, temp9, 12); \
        temp10 = _mm256_shuffle_pd(temp9, temp9, 0b0000); \
        temp12 = _mm256_shuffle_pd(temp9, temp9, 0b1111); \
        temp9 = _mm256_blend_pd(temp16, temp13, 12); \
        temp13 = _mm256_shuffle_pd(temp9, temp9, 0b0101); \
        temp2 = _mm256_mul_pd(temp12, temp13); \
        temp7 = _mm256_fmaddsub_pd(temp10, temp9, temp2); \
        temp2 = _mm256_add_pd(temp3, temp7); \
        chi_3rd = _mm256_castpd256_pd128(temp2); \
        chi2_3rd = _mm256_extractf128_pd(temp2, 1); \
        _mm256_store_pd((double *)(&mc), temp11); \
        _mm_store_pd((double *)(&mc) + 4, chi_3rd); \
        _mm256_store_pd((double *)(&mc2), temp1); \
        _mm_store_pd((double *)(&mc2) + 4, chi2_3rd); \
    } while (0)
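
/*
 * Sketch: the _double_ kernels reuse the loads of mu for two right-hand sides,
 * computing mc = mu * mp and mc2 = mu * mp2 in one pass (hypothetical call, the
 * variable names below are illustrative only):
 *
 *   _double_MVM_3x3C_AVX2(chi0, chi1, u, psi0, psi1);  // chi0 = u * psi0, chi1 = u * psi1
 */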

#define _double_MTVM_3x3C_AVX2(mc, mc2, mu, mp, mp2) \
    do { \
        __m256d temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15; \
        __m128d chi_3rd, chi2_3rd; \
        const __m256d simd_mask = _mm256_set_pd(-1.0, 1.0, -1.0, 1.0); \
        temp1 = _mm256_loadu_pd((double *)(&mu)); \
        temp1 = _mm256_mul_pd(temp1, simd_mask); \
        temp2 = _mm256_loadu_pd((double *)(&mu) + 6); \
        temp2 = _mm256_mul_pd(temp2, simd_mask); \
        temp3 = _mm256_permute2f128_pd(temp2, temp1, 2); \
        temp1 = _mm256_permute2f128_pd(temp1, temp1, 1); \
        temp1 = _mm256_blend_pd(temp1, temp2, 12); \
        temp2 = _mm256_shuffle_pd(temp3, temp3, 0b0101); \
        temp4 = _mm256_shuffle_pd(temp1, temp1, 0b0101); \
        temp5 = _mm256_loadu_pd((double *)(&mp)); \
        temp6 = _mm256_shuffle_pd(temp5, temp5, 0b0000); \
        temp5 = _mm256_shuffle_pd(temp5, temp5, 0b1111); \
        temp7 = _mm256_loadu_pd((double *)(&mp2)); \
        temp8 = _mm256_shuffle_pd(temp7, temp7, 0b0000); \
        temp7 = _mm256_shuffle_pd(temp7, temp7, 0b1111); \
        temp10 = _mm256_mul_pd(temp5, temp2); \
        temp9 = _mm256_fmaddsub_pd(temp6, temp3, temp10); \
        temp11 = _mm256_mul_pd(temp5, temp4); \
        temp10 = _mm256_fmaddsub_pd(temp6, temp1, temp11); \
        temp11 = _mm256_permute2f128_pd(temp10, temp9, 2); \
        temp9 = _mm256_permute2f128_pd(temp9, temp9, 1); \
        temp9 = _mm256_blend_pd(temp9, temp10, 12); \
        temp9 = _mm256_add_pd(temp11, temp9); \
        temp10 = _mm256_loadu_pd((double *)(&mp) + 2); \
        temp11 = _mm256_permute2f128_pd(temp10, temp10, 1); \
        temp10 = _mm256_blend_pd(temp11, temp10, 12); \
        temp12 = _mm256_shuffle_pd(temp10, temp10, 0b0000); \
        temp10 = _mm256_shuffle_pd(temp10, temp10, 0b1111); \
        temp13 = _mm256_loadu_pd((double *)(&mu) + 12); \
        temp13 = _mm256_mul_pd(temp13, simd_mask); \
        temp15 = _mm256_shuffle_pd(temp13, temp13, 0b0101); \
        temp10 = _mm256_mul_pd(temp10, temp15); \
        temp10 = _mm256_fmaddsub_pd(temp12, temp13, temp10); \
        temp9 = _mm256_add_pd(temp9, temp10); \
        temp2 = _mm256_mul_pd(temp7, temp2); \
        temp2 = _mm256_fmaddsub_pd(temp8, temp3, temp2); \
        temp3 = _mm256_mul_pd(temp7, temp4); \
        temp1 = _mm256_fmaddsub_pd(temp8, temp1, temp3); \
        temp3 = _mm256_permute2f128_pd(temp1, temp2, 2); \
        temp2 = _mm256_permute2f128_pd(temp2, temp2, 1); \
        temp1 = _mm256_blend_pd(temp2, temp1, 12); \
        temp1 = _mm256_add_pd(temp3, temp1); \
        temp2 = _mm256_loadu_pd((double *)(&mp2) + 2); \
        temp3 = _mm256_permute2f128_pd(temp2, temp2, 1); \
        temp3 = _mm256_blend_pd(temp3, temp2, 12); \
        temp4 = _mm256_shuffle_pd(temp3, temp3, 0b0000); \
        temp3 = _mm256_shuffle_pd(temp3, temp3, 0b1111); \
        temp3 = _mm256_mul_pd(temp3, temp15); \
        temp3 = _mm256_fmaddsub_pd(temp4, temp13, temp3); \
        temp1 = _mm256_add_pd(temp1, temp3); \
        temp3 = _mm256_loadu_pd((double *)(&mu) + 2); \
        temp3 = _mm256_mul_pd(temp3, simd_mask); \
        temp4 = _mm256_loadu_pd((double *)(&mu) + 8); \
        temp4 = _mm256_mul_pd(temp4, simd_mask); \
        temp3 = _mm256_permute2f128_pd(temp3, temp3, 1); \
        temp3 = _mm256_blend_pd(temp3, temp4, 12); \
        temp4 = _mm256_shuffle_pd(temp3, temp3, 0b0101); \
        temp5 = _mm256_mul_pd(temp5, temp4); \
        temp5 = _mm256_fmaddsub_pd(temp6, temp3, temp5); \
        temp4 = _mm256_mul_pd(temp7, temp4); \
        temp3 = _mm256_fmaddsub_pd(temp8, temp3, temp4); \
        temp4 = _mm256_permute2f128_pd(temp3, temp5, 2); \
        temp14 = _mm256_permute2f128_pd(temp5, temp5, 1); \
        temp3 = _mm256_blend_pd(temp14, temp3, 12); \
        temp3 = _mm256_add_pd(temp4, temp3); \
        temp2 = _mm256_blend_pd(temp11, temp2, 12); \
        temp4 = _mm256_shuffle_pd(temp2, temp2, 0b0000); \
        temp2 = _mm256_shuffle_pd(temp2, temp2, 0b1111); \
        temp5 = _mm256_loadu_pd((double *)(&mu) + 14); \
        temp5 = _mm256_mul_pd(temp5, simd_mask); \
        temp6 = _mm256_permute2f128_pd(temp5, temp5, 1); \
        temp5 = _mm256_blend_pd(temp6, temp5, 12); \
        temp6 = _mm256_shuffle_pd(temp5, temp5, 0b0101); \
        temp2 = _mm256_mul_pd(temp2, temp6); \
        temp2 = _mm256_fmaddsub_pd(temp4, temp5, temp2); \
        temp2 = _mm256_add_pd(temp3, temp2); \
        chi_3rd = _mm256_castpd256_pd128(temp2); \
        chi2_3rd = _mm256_extractf128_pd(temp2, 1); \
        _mm256_store_pd((double *)(&mc), temp9); \
        _mm_storeu_pd((double *)(&mc) + 4, chi_3rd); \
        _mm256_store_pd((double *)(&mc2), temp1); \
        _mm_storeu_pd((double *)(&mc2) + 4, chi2_3rd); \
    } while (0)

#endif //(NF == 3) || (NG == 3)

#if (NF == 2) || (NG == 2)

#define _MVM_2x2C_AVX2(mc, mu, mp) \
    do { \
        __m256d temp1, temp2, temp3, temp4, temp5, temp6; \
        temp1 = _mm256_loadu_pd((double *)(&mu)); \
        temp4 = _mm256_shuffle_pd(temp1, temp1, 0b0000); \
        temp1 = _mm256_shuffle_pd(temp1, temp1, 0b1111); \
        temp2 = _mm256_loadu_pd((double *)(&mu) + 4); \
        temp5 = _mm256_shuffle_pd(temp2, temp2, 0b0000); \
        temp2 = _mm256_shuffle_pd(temp2, temp2, 0b1111); \
        temp3 = _mm256_loadu_pd((double *)(&mp)); \
        temp6 = _mm256_shuffle_pd(temp3, temp3, 0b0101); \
        temp1 = _mm256_mul_pd(temp1, temp6); \
        temp1 = _mm256_fmaddsub_pd(temp4, temp3, temp1); \
        temp2 = _mm256_mul_pd(temp2, temp6); \
        temp2 = _mm256_fmaddsub_pd(temp5, temp3, temp2); \
        temp4 = _mm256_permute2f128_pd(temp2, temp1, 2); \
        temp1 = _mm256_permute2f128_pd(temp1, temp1, 1); \
        temp1 = _mm256_blend_pd(temp1, temp2, 12); \
        temp1 = _mm256_add_pd(temp4, temp1); \
        _mm256_store_pd((double *)(&mc), temp1); \
    } while (0)
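
/*
 * Sketch: the 2x2 complex case computes mc = mu * mp with mc_i = sum_j mu_ij * mp_j
 * for i, j = 0..1. Both components of the result fit in a single __m256d, so one
 * 256-bit store covers mc and no __m128d tail store is needed, unlike the 3x3 kernels.
 */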

#define _MTVM_2x2C_AVX2(mc, mu, mp) \
    do { \
        __m256d temp1, temp2, temp3, temp4, temp5, temp6; \
        const __m256d simd_mask = _mm256_set_pd(-1.0, 1.0, -1.0, 1.0); \
        temp1 = _mm256_loadu_pd((double *)(&mu)); \
        temp1 = _mm256_mul_pd(temp1, simd_mask); \
        temp2 = _mm256_loadu_pd((double *)(&mu) + 4); \
        temp2 = _mm256_mul_pd(temp2, simd_mask); \
        temp4 = _mm256_permute2f128_pd(temp2, temp1, 2); \
        temp1 = _mm256_permute2f128_pd(temp1, temp1, 1); \
        temp1 = _mm256_blend_pd(temp1, temp2, 12); \
        temp2 = _mm256_shuffle_pd(temp4, temp4, 0b0101); \
        temp5 = _mm256_shuffle_pd(temp1, temp1, 0b0101); \
        temp3 = _mm256_loadu_pd((double *)(&mp)); \
        temp6 = _mm256_shuffle_pd(temp3, temp3, 0b0000); \
        temp3 = _mm256_shuffle_pd(temp3, temp3, 0b1111); \
        temp2 = _mm256_mul_pd(temp3, temp2); \
        temp2 = _mm256_fmaddsub_pd(temp6, temp4, temp2); \
        temp3 = _mm256_mul_pd(temp3, temp5); \
        temp1 = _mm256_fmaddsub_pd(temp6, temp1, temp3); \
        temp3 = _mm256_permute2f128_pd(temp1, temp2, 2); \
        temp2 = _mm256_permute2f128_pd(temp2, temp2, 1); \
        temp1 = _mm256_blend_pd(temp2, temp1, 12); \
        temp1 = _mm256_add_pd(temp3, temp1); \
        _mm256_store_pd((double *)(&mc), temp1); \
    } while (0)

#define _double_MVM_2x2C_AVX2(mc, mc2, mu, mp, mp2) \
    do { \
        __m256d temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; \
        temp1 = _mm256_loadu_pd((double *)(&mu)); \
        temp2 = _mm256_loadu_pd((double *)(&mu) + 4); \
        temp3 = _mm256_loadu_pd((double *)(&mp)); \
        temp5 = _mm256_shuffle_pd(temp3, temp3, 0b0101); \
        temp4 = _mm256_loadu_pd((double *)(&mp2)); \
        temp6 = _mm256_shuffle_pd(temp4, temp4, 0b0101); \
        temp7 = _mm256_shuffle_pd(temp1, temp1, 0b0000); \
        temp1 = _mm256_shuffle_pd(temp1, temp1, 0b1111); \
        temp9 = _mm256_mul_pd(temp1, temp5); \
        temp8 = _mm256_fmaddsub_pd(temp7, temp3, temp9); \
        temp9 = _mm256_shuffle_pd(temp2, temp2, 0b0000); \
        temp2 = _mm256_shuffle_pd(temp2, temp2, 0b1111); \
        temp5 = _mm256_mul_pd(temp2, temp5); \
        temp3 = _mm256_fmaddsub_pd(temp9, temp3, temp5); \
        temp5 = _mm256_permute2f128_pd(temp3, temp8, 2); \
        temp8 = _mm256_permute2f128_pd(temp8, temp8, 1); \
        temp3 = _mm256_blend_pd(temp8, temp3, 12); \
        temp3 = _mm256_add_pd(temp5, temp3); \
        temp1 = _mm256_mul_pd(temp1, temp6); \
        temp1 = _mm256_fmaddsub_pd(temp7, temp4, temp1); \
        temp2 = _mm256_mul_pd(temp2, temp6); \
        temp2 = _mm256_fmaddsub_pd(temp9, temp4, temp2); \
        temp4 = _mm256_permute2f128_pd(temp2, temp1, 2); \
        temp1 = _mm256_permute2f128_pd(temp1, temp1, 1); \
        temp1 = _mm256_blend_pd(temp1, temp2, 12); \
        temp1 = _mm256_add_pd(temp4, temp1); \
        _mm256_store_pd((double *)(&mc), temp3); \
        _mm256_store_pd((double *)(&mc2), temp1); \
    } while (0)

#define _double_MTVM_2x2C_AVX2(mc, mc2, mu, mp, mp2) \
    do { \
        __m256d temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11; \
        const __m256d simd_mask = _mm256_set_pd(-1.0, 1.0, -1.0, 1.0); \
        temp1 = _mm256_loadu_pd((double *)(&mu)); \
        temp1 = _mm256_mul_pd(temp1, simd_mask); \
        temp2 = _mm256_loadu_pd((double *)(&mu) + 4); \
        temp2 = _mm256_mul_pd(temp2, simd_mask); \
        temp4 = _mm256_permute2f128_pd(temp2, temp1, 2); \
        temp1 = _mm256_permute2f128_pd(temp1, temp1, 1); \
        temp1 = _mm256_blend_pd(temp1, temp2, 12); \
        temp2 = _mm256_shuffle_pd(temp4, temp4, 0b0101); \
        temp5 = _mm256_shuffle_pd(temp1, temp1, 0b0101); \
        temp3 = _mm256_loadu_pd((double *)(&mp)); \
        temp6 = _mm256_shuffle_pd(temp3, temp3, 0b0000); \
        temp3 = _mm256_shuffle_pd(temp3, temp3, 0b1111); \
        temp7 = _mm256_loadu_pd((double *)(&mp2)); \
        temp8 = _mm256_shuffle_pd(temp7, temp7, 0b0000); \
        temp7 = _mm256_shuffle_pd(temp7, temp7, 0b1111); \
        temp9 = _mm256_mul_pd(temp6, temp4); \
        temp10 = _mm256_mul_pd(temp3, temp2); \
        temp9 = _mm256_addsub_pd(temp9, temp10); \
        temp6 = _mm256_mul_pd(temp6, temp1); \
        temp3 = _mm256_mul_pd(temp3, temp5); \
        temp3 = _mm256_addsub_pd(temp6, temp3); \
        temp6 = _mm256_permute2f128_pd(temp3, temp9, 2); \
        temp11 = _mm256_permute2f128_pd(temp9, temp9, 1); \
        temp3 = _mm256_blend_pd(temp11, temp3, 12); \
        temp3 = _mm256_add_pd(temp6, temp3); \
        temp4 = _mm256_mul_pd(temp8, temp4); \
        temp2 = _mm256_mul_pd(temp7, temp2); \
        temp2 = _mm256_addsub_pd(temp4, temp2); \
        temp1 = _mm256_mul_pd(temp8, temp1); \
        temp4 = _mm256_mul_pd(temp7, temp5); \
        temp1 = _mm256_addsub_pd(temp1, temp4); \
        temp4 = _mm256_permute2f128_pd(temp1, temp2, 2); \
        temp2 = _mm256_permute2f128_pd(temp2, temp2, 1); \
        temp1 = _mm256_blend_pd(temp2, temp1, 12); \
        temp2 = _mm256_add_pd(temp4, temp1); \
        _mm256_store_pd((double *)(&mc), temp3); \
        _mm256_store_pd((double *)(&mc2), temp2); \
    } while (0)

#endif //(NF == 2) || (NG == 2)

#if (NF == 3) && !defined(REPR_IS_REAL)

#define _suNf_multiply(mc, mu, mp) \
    _Generic((mc), \
        suNf_vector : ({ _MVM_3x3C_AVX2((mc), (mu), (mp)); }), \
        default : ({ _suNf_multiply_default(mc, mu, mp); }))

#undef _suNf_inverse_multiply
#define _suNf_inverse_multiply(mc, mu, mp) \
    _Generic((mc), \
        suNf_vector : ({ _MTVM_3x3C_AVX2((mc), (mu), (mp)); }), \
        default : ({ _suNf_inverse_multiply_default(mc, mu, mp); }))

#undef _suNf_double_multiply
#define _suNf_double_multiply(mc, mc2, mu, mp, mp2) \
    _Generic((mc), \
        suNf_vector : ({ _double_MVM_3x3C_AVX2((mc), (mc2), (mu), (mp), (mp2)); }), \
        default : ({ _suNf_double_multiply_default(mc, mc2, mu, mp, mp2); }))

#undef _suNf_double_inverse_multiply
#define _suNf_double_inverse_multiply(mc, mc2, mu, mp, mp2) \
    _Generic((mc), \
        suNf_vector : ({ _double_MTVM_3x3C_AVX2((mc), (mc2), (mu), (mp), (mp2)); }), \
        default : ({ _suNf_double_inverse_multiply_default(mc, mc2, mu, mp, mp2); }))
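
/*
 * Usage sketch (illustrative only; the variable names below are hypothetical, the
 * types are the suNf / suNf_vector types defined elsewhere in this codebase):
 *
 *   suNf u;
 *   suNf_vector chi, psi;
 *   _suNf_multiply(chi, u, psi);          // chi = u * psi via the AVX2 kernel
 *   _suNf_inverse_multiply(chi, u, psi);  // chi = u^dagger * psi
 *
 * Arguments whose selected type is not suNf_vector fall back to the _default macros.
 */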

#define _suNg_multiply(mc, mu, mp) \
    _Generic((mc), \
        suNg_vector : ({ _MVM_3x3C_AVX2((mc), (mu), (mp)); }), \
        default : ({ _suNg_multiply_default(mc, mu, mp); }))

#undef _suNg_inverse_multiply
#define _suNg_inverse_multiply(mc, mu, mp) \
    _Generic((mc), \
        suNg_vector : ({ _MTVM_3x3C_AVX2((mc), (mu), (mp)); }), \
        default : ({ _suNg_inverse_multiply_default(mc, mu, mp); }))

#undef _suNg_double_multiply
#define _suNg_double_multiply(mc, mc2, mu, mp, mp2) \
    _Generic((mc), \
        suNg_vector : ({ _double_MVM_3x3C_AVX2((mc), (mc2), (mu), (mp), (mp2)); }), \
        default : ({ _suNg_double_multiply_default(mc, mc2, mu, mp, mp2); }))

#undef _suNg_double_inverse_multiply
#define _suNg_double_inverse_multiply(mc, mc2, mu, mp, mp2) \
    _Generic((mc), \
        suNg_vector : ({ _double_MTVM_3x3C_AVX2((mc), (mc2), (mu), (mp), (mp2)); }), \
        default : ({ _suNg_double_inverse_multiply_default(mc, mc2, mu, mp, mp2); }))

#endif

#if (NF == 2) && !defined(REPR_IS_REAL)

#define _suNf_multiply(mc, mu, mp) \
    _Generic((mc), \
        suNf_vector : ({ _MVM_2x2C_AVX2((mc), (mu), (mp)); }), \
        default : ({ _suNf_multiply_default(mc, mu, mp); }))

#undef _suNf_inverse_multiply
#define _suNf_inverse_multiply(mc, mu, mp) \
    _Generic((mc), \
        suNf_vector : ({ _MTVM_2x2C_AVX2((mc), (mu), (mp)); }), \
        default : ({ _suNf_inverse_multiply_default(mc, mu, mp); }))

#undef _suNf_double_multiply
#define _suNf_double_multiply(mc, mc2, mu, mp, mp2) \
    _Generic((mc), \
        suNf_vector : ({ _double_MVM_2x2C_AVX2((mc), (mc2), (mu), (mp), (mp2)); }), \
        default : ({ _suNf_double_multiply_default(mc, mc2, mu, mp, mp2); }))

#undef _suNf_double_inverse_multiply
#define _suNf_double_inverse_multiply(mc, mc2, mu, mp, mp2) \
    _Generic((mc), \
        suNf_vector : ({ _double_MTVM_2x2C_AVX2((mc), (mc2), (mu), (mp), (mp2)); }), \
        default : ({ _suNf_double_inverse_multiply_default(mc, mc2, mu, mp, mp2); }))

#define _suNg_multiply(mc, mu, mp) \
    _Generic((mc), \
        suNg_vector : ({ _MVM_2x2C_AVX2((mc), (mu), (mp)); }), \
        default : ({ _suNg_multiply_default(mc, mu, mp); }))

#undef _suNg_inverse_multiply
#define _suNg_inverse_multiply(mc, mu, mp) \
    _Generic((mc), \
        suNg_vector : ({ _MTVM_2x2C_AVX2((mc), (mu), (mp)); }), \
        default : ({ _suNg_inverse_multiply_default(mc, mu, mp); }))

#undef _suNg_double_multiply
#define _suNg_double_multiply(mc, mc2, mu, mp, mp2) \
    _Generic((mc), \
        suNg_vector : ({ _double_MVM_2x2C_AVX2((mc), (mc2), (mu), (mp), (mp2)); }), \
        default : ({ _suNg_double_multiply_default(mc, mc2, mu, mp, mp2); }))

#undef _suNg_double_inverse_multiply
#define _suNg_double_inverse_multiply(mc, mc2, mu, mp, mp2) \
    _Generic((mc), \
        suNg_vector : ({ _double_MTVM_2x2C_AVX2((mc), (mc2), (mu), (mp), (mp2)); }), \
        default : ({ _suNg_double_inverse_multiply_default(mc, mc2, mu, mp, mp2); }))
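
/*
 * Usage sketch (illustrative only; variable names are hypothetical):
 *
 *   suNg u;
 *   suNg_vector r1, r2, s1, s2;
 *   _suNg_double_multiply(r1, r2, u, s1, s2);          // r1 = u * s1,        r2 = u * s2
 *   _suNg_double_inverse_multiply(r1, r2, u, s1, s2);  // r1 = u^dagger * s1, r2 = u^dagger * s2
 */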