Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx_h
2 : #error "Do not include this directly; use fd_avx.h"
3 : #endif
4 :
5 : /* Vector double API **************************************************/
6 :
7 : /* A wd_t is a vector where each adjacent pair of 32-bit wide lanes
8 : (e.g. 0-1 / 2-3 / 4-5 / 6-7) holds a double precision IEEE 754
9 : floating point value (a "double").
10 :
11 : Inputs to all operations assume that the values aren't exotic (no
12 : NaNs, no +/-Infs, no denorms) and, if the output of an operation
13 : would produce an exotic value in the IEEE 754 standard, the results
14 : of that operation are undefined. Additionally, correct handling of
15 : signed zero is not guaranteed. Lastly, these will not raise floating
16 : point exceptions or set math errno's.
17 :
18 : Basically, handling of exotics and signed zero will generally be
19 : reasonable but most of that relies on the underlying compiler and
20 : hardware having conformant behavior and this is flaky at the best of
21 : times. So it is best for developers not to assume conformant
22 : behavior.
23 :
24 : Note that, when doing conditional operations on a vector double, the
25 : vector conditional should have identical values in adjacent pairs of
26 : lanes. Results are undefined otherwise.
27 :
28 : These mirror the other APIs as much as possible. Macros are
29 : preferred over static inlines when it is possible to do it robustly
30 : to reduce the risk of the compiler mucking it up. */
31 :
32 14418691 : #define wd_t __m256d
33 :
34 : /* Constructors */
35 :
36 : /* Given the double values, return ... */
37 :
38 1179648 : #define wd(d0,d1,d2,d3) _mm256_setr_pd( (d0), (d1), (d2), (d3) ) /* [ d0 d1 d2 d3 ] */
39 :
40 524288 : #define wd_bcast(d0) _mm256_set1_pd( (d0) ) /* [ d0 d0 d0 d0 ] */
41 :
42 : static inline wd_t /* [ d0 d1 d0 d1 ] */
43 196608 : wd_bcast_pair( double d0, double d1 ) {
44 196608 : return _mm256_setr_pd( d0, d1, d0, d1 );
45 196608 : }
46 :
47 : static inline wd_t /* [ d0 d0 d1 d1 ] */
48 196608 : wd_bcast_wide( double d0, double d1 ) {
49 196608 : return _mm256_setr_pd( d0, d0, d1, d1 );
50 196608 : }
51 :
52 : /* wd_permute returns [ d(imm_i0) d(imm_i1) d(imm_i2) d(imm_i3) ].
53 : imm_i* should be compile time constants in 0:3. */
54 :
55 : #define wd_permute(x,imm_i0,imm_i1,imm_i2,imm_i3) _mm256_permute4x64_pd( (x), (imm_i0)+4*(imm_i1)+16*(imm_i2)+64*(imm_i3) )
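/* For example, a hypothetical usage sketch of the constructors above
   (not part of this header; assumes fd_avx.h has been included):

     wd_t x = wd( 1., 2., 3., 4. );       // [ 1. 2. 3. 4. ]
     wd_t y = wd_bcast_pair( 5., 6. );    // [ 5. 6. 5. 6. ]
     wd_t z = wd_permute( x, 3,2,1,0 );   // [ 4. 3. 2. 1. ]
*/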
56 :
57 : /* Predefined constants */
58 :
59 : #define wd_zero() _mm256_setzero_pd() /* Return [ 0. 0. 0. 0. ] */
60 10617603 : #define wd_one() _mm256_set1_pd(1.) /* Return [ 1. 1. 1. 1. ] */
61 :
62 : /* Memory operations */
63 :
64 : /* wd_ld returns the 4 doubles at the 32-byte aligned / 32-byte sized
65 : location p as a vector double. wd_ldu is the same but p does not have
66 : to be aligned. wd_st writes the vector double to the 32-byte aligned
67 : / 32-byte sized location p as 4 doubles. wd_stu is the same but p
68 : does not have to be aligned. In all these, 64-bit lane l will be at
69 : p[l]. FIXME: USE ATTRIBUTES ON P PASSED TO THESE? */
70 :
71 10617603 : #define wd_ld(p) _mm256_load_pd( (p) )
72 42470412 : #define wd_ldu(p) _mm256_loadu_pd( (p) )
73 10617603 : #define wd_st(p,d) _mm256_store_pd( (p), (d) )
74 42470412 : #define wd_stu(p,d) _mm256_storeu_pd( (p), (d) )
75 :
76 : /* wd_ldif is an optimized equivalent to wd_notczero(c,wd_ldu(p)) (may
77 : have different behavior if c is not a proper vector conditional). It
78 : is provided for symmetry with the wd_stif operation. wd_stif stores
79 : x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
80 : Undefined behavior if c(n) is not a proper paired lane vector
81 : conditional. */
82 :
83 : #define wd_ldif(c,p) _mm256_maskload_pd( (p),(c))
84 : #define wd_stif(c,p,x) _mm256_maskstore_pd((p),(c),(x))
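/* Illustrative sketch of the memory operations (not part of this
   header; assumes fd_avx.h is included and W_ATTR provides the 32-byte
   alignment attribute used elsewhere in this file):

     double buf[4] W_ATTR;
     wd_st( buf, wd( 1., 2., 3., 4. ) );                     // buf = { 1., 2., 3., 4. }
     wd_t x = wd_ld( buf );                                  // [ 1. 2. 3. 4. ]
     wd_stif( wd_lt( x, wd_bcast( 3. ) ), buf, wd_zero() );  // zeroes buf[0] and buf[1] only
*/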
85 :
86 : /* Element operations */
87 :
88 : /* wd_extract extracts the double in 64-bit lane imm (e.g. indexed
89 : 0,1,2,3, corresponding to 32-bit lane pairs 0-1, 2-3, 4-5, 6-7
90 : respectively) from the vector double as a double. wd_insert returns
91 : the vector double formed by replacing the value in 64-bit lane imm of
92 : a with the provided double. imm should be a compile time constant in
93 : 0:3. wd_extract_variable and wd_insert_variable are slower variants
94 : where the 64-bit lane n does not have to be known at compile time (it
95 : should still be in 0:3). */
96 :
97 : static inline double
98 42470412 : wd_extract( wd_t a, int imm ) { /* FIXME: USE EPI64 HACKS? */
99 42470412 : double d[4] W_ATTR;
100 42470412 : _mm256_store_pd( d, a );
101 42470412 : return d[imm];
102 42470412 : }
103 :
104 : #if FD_USING_CLANG || !FD_HAS_OPTIMIZATION
105 :
106 : /* Sigh ... clang is sad and can't handle passing compile time const
107 : expressions through a static inline */
108 42470412 : #define wd_insert( a, imm, v ) (__extension__({ \
109 42470412 : union { double v; long i; } _wd_insert_t; \
110 42470412 : _wd_insert_t.v = v; \
111 42470412 : _mm256_castsi256_pd( _mm256_insert_epi64( _mm256_castpd_si256( (a) ), _wd_insert_t.i, (imm) ) ); \
112 42470412 : }))
113 :
114 : #else
115 :
116 : static inline wd_t
117 : wd_insert( wd_t a, int imm, double v ) {
118 : union { double v; long i; } t;
119 : t.v = v;
120 : return _mm256_castsi256_pd( _mm256_insert_epi64( _mm256_castpd_si256( a ), t.i, imm ) );
121 : }
122 :
123 : #endif
124 :
125 : static inline double
126 42470412 : wd_extract_variable( wd_t a, int n ) {
127 42470412 : double d[4] W_ATTR;
128 42470412 : _mm256_store_pd( d, a );
129 42470412 : return d[n];
130 42470412 : }
131 :
132 : static inline wd_t
133 42470412 : wd_insert_variable( wd_t a, int n, double v ) {
134 42470412 : double d[4] W_ATTR;
135 42470412 : _mm256_store_pd( d, a );
136 42470412 : d[n] = v;
137 42470412 : return _mm256_load_pd( d );
138 42470412 : }
139 :
140 : /* Arithmetic operations */
141 :
142 : /* wd_neg(a) returns [ -a0 -a1 ... -a3 ] (i.e. -a )
143 : wd_sign(a) returns [ sign(a0) sign(a1) ... sign(a3) ]
144 : wd_abs(a) returns [ fabs(a0) fabs(a1) ... fabs(a3) ] (i.e. abs(a))
145 : wd_negabs(a) returns [ -fabs(a0) -fabs(a1) ... -fabs(a3) ] (i.e. -abs(a))
146 : wd_ceil(a) returns [ ceil(a0) ceil(a1) ... ceil(a3) ] (i.e. ceil(a))
147 : wd_floor(a) returns [ floor(a0) floor(a1) ... floor(a3) ] (i.e. floor(a))
148 : wd_rint(a) returns [ rint(a0) rint(a1) ... rint(a3) ] (i.e. roundb(a))
149 : wd_trunc(a) returns [ trunc(a0) trunc(a1) ... trunc(a3) ] (i.e. fix(a))
150 : wd_sqrt(a) returns [ sqrt(a0) sqrt(a1) ... sqrt(a3) ] (i.e. sqrt(a))
151 : wd_rcp_fast(a) returns [ ~rcp(a0) ~rcp(a1) ... ~rcp(a3) ]
152 : wd_rsqrt_fast(a) returns [ ~rsqrt(a0) ~rsqrt(a1) ... ~rsqrt(a3) ]
153 :
154 : wd_add( a,b) returns [ a0+b0 a1+b1 ... a3+b3 ] (i.e. a +b)
155 : wd_sub( a,b) returns [ a0-b0 a1-b1 ... a3-b3 ] (i.e. a -b)
156 : wd_mul( a,b) returns [ a0*b0 a1*b1 ... a3*b3 ] (i.e. a.*b)
157 : wd_div( a,b) returns [ a0/b0 a1/b1 ... a3/b3 ] (i.e. a./b)
158 : wd_min( a,b) returns [ fmin(a0,b0) fmin(a1,b1) ... fmin(a3,b3) ] (i.e. min([a;b]) (a and b are 1x4)
159 : wd_max( a,b) returns [ fmax(a0,b0) fmax(a1,b1) ... fmax(a3,b3) ] (i.e. max([a;b]) (a and b are 1x4)
160 : wd_copysign(a,b) returns [ copysign(a0,b0) copysign(a1,b1) ... copysign(a3,b3) ]
161 : wd_flipsign(a,b) returns [ flipsign(a0,b0) flipsign(a1,b1) ... flipsign(a3,b3) ]
162 :
163 : wd_fma( a,b,c) returns [ fma(a0,b0, c0) fma(a1,b1, c1) ... fma(a3,b3, c3) ] (i.e. a.*b+c)
164 : wd_fms( a,b,c) returns [ fma(a0,b0,-c0) fma(a1,b1,-c1) ... fma(a3,b3,-c3) ] (i.e. a.*b-c)
165 : wd_fnma(a,b,c) returns [ -fma(a0,b0,-c0) -fma(a1,b1,-c1) ... -fma(a3,b3,-c3) ] (i.e. -a.*b+c)
166 :
167 : where sign(a) is -1. if a's sign bit is set and +1. otherwise, rcp(a)
168 : is 1./a, rsqrt(a) is 1./sqrt(a), and flipsign(a,b) returns -a if b's
169 : sign bit is set and a otherwise.
170 :
171 : rint is in round-to-nearest-even rounding mode (note rint and
172 : nearbyint are identical once floating point exceptions are ignored).
173 :
174 : sqrt should typically be full accuracy.
175 :
176 : rcp_fast and rsqrt_fast should typically be ~12 or more bits
177 : accurate (~3 or more decimal digits) such that (nearly) full accuracy
178 : can be achieved with two to three rounds of Newton-Raphson polishing.
179 : Bit level replicable code should avoid rcp_fast and rsqrt_fast though
180 : as the approximations used can vary between generations /
181 : steppings / microcode updates of x86 processors (including Intel and
182 : AMD). */
183 :
184 : #define wd_neg(a) _mm256_xor_pd( _mm256_set1_pd( -0. ), (a) )
185 : #define wd_sign(a) _mm256_xor_pd( _mm256_set1_pd( 1. ), _mm256_and_pd( _mm256_set1_pd( -0. ), (a) ) )
186 : #define wd_abs(a) _mm256_andnot_pd( _mm256_set1_pd( -0. ), (a) )
187 : #define wd_negabs(a) _mm256_or_pd( _mm256_set1_pd( -0. ), (a) )
188 : #define wd_ceil(a) _mm256_ceil_pd( (a) )
189 : #define wd_floor(a) _mm256_floor_pd( (a) )
190 : #define wd_rint(a) _mm256_round_pd( (a), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC )
191 : #define wd_trunc(a) _mm256_round_pd( (a), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC )
192 : #define wd_sqrt(a) _mm256_sqrt_pd( (a) )
193 : #define wd_rcp_fast(a) _mm256_cvtps_pd( _mm_rcp_ps( _mm256_cvtpd_ps( (a) ) ) )
194 : #define wd_rsqrt_fast(a) _mm256_cvtps_pd( _mm_rsqrt_ps( _mm256_cvtpd_ps( (a) ) ) )
195 :
196 : #define wd_add(a,b) _mm256_add_pd( (a), (b) )
197 524288 : #define wd_sub(a,b) _mm256_sub_pd( (a), (b) )
198 : #define wd_mul(a,b) _mm256_mul_pd( (a), (b) )
199 : #define wd_div(a,b) _mm256_div_pd( (a), (b) )
200 : #define wd_min(a,b) _mm256_min_pd( (a), (b) )
201 : #define wd_max(a,b) _mm256_max_pd( (a), (b) )
202 : #define wd_copysign(a,b) _mm256_or_pd( _mm256_andnot_pd( _mm256_set1_pd( -0. ), (a) ), \
203 : _mm256_and_pd( _mm256_set1_pd( -0. ), (b) ) )
204 : #define wd_flipsign(a,b) _mm256_xor_pd( (a), _mm256_and_pd( _mm256_set1_pd( -0. ), (b) ) )
205 :
206 : #define wd_fma(a,b,c) _mm256_fmadd_pd( (a), (b), (c) )
207 : #define wd_fms(a,b,c) _mm256_fmsub_pd( (a), (b), (c) )
208 : #define wd_fnma(a,b,c) _mm256_fnmadd_pd( (a), (b), (c) )
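/* As a concrete example of the Newton-Raphson polishing mentioned
   above, the following hypothetical sketch refines the ~12 bit
   wd_rcp_fast estimate of 1/a (wd_rcp_nr2 is not part of this API;
   each step computes y += y*(1 - a*y) and roughly doubles the number
   of accurate bits):

     static inline wd_t
     wd_rcp_nr2( wd_t a ) {
       wd_t y = wd_rcp_fast( a );             // ~12 bit estimate of 1/a
       wd_t e = wd_fnma( a, y, wd_one() );    // e = 1 - a*y
       y = wd_fma( y, e, y );                 // ~24 bits
       e = wd_fnma( a, y, wd_one() );
       y = wd_fma( y, e, y );                 // ~48 bits
       return y;
     }
*/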
209 :
210 : /* Binary operations */
211 :
212 : /* Note: binary operations are not well defined on vector doubles.
213 : If doing tricks with floating point binary representations, the user
214 : should use wd_to_wl_raw as necessary. */
215 :
216 : /* Logical operations */
217 :
218 : /* These all return proper paired lane vector conditionals */
219 :
220 : #define wd_lnot(a) /* [ !a0 !a0 !a1 !a1 ... !a3 !a3 ] */ \
221 : _mm256_castpd_si256( _mm256_cmp_pd( (a), _mm256_setzero_pd(), _CMP_EQ_OQ ) )
222 : #define wd_lnotnot(a) /* [ !!a0 !!a0 !!a1 !!a1 ... !!a3 !!a3 ] */ \
223 : _mm256_castpd_si256( _mm256_cmp_pd( (a), _mm256_setzero_pd(), _CMP_NEQ_OQ ) )
224 : #define wd_signbit(a) /* [ signbit(a0) signbit(a0) signbit(a1) signbit(a1) ... signbit(a3) signbit(a3) ] */ \
225 : _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( _mm256_srai_epi32( _mm256_castpd_si256( (a) ), 31 ) ), \
226 : _MM_SHUFFLE(3,3,1,1) ) )
227 :
228 : #define wd_eq(a,b) _mm256_castpd_si256( _mm256_cmp_pd( (a), (b), _CMP_EQ_OQ ) ) /* [ a0==b0 a0==b0 a1==b1 a1==b1 ... a3==b3 a3==b3 ] */
229 : #define wd_gt(a,b) _mm256_castpd_si256( _mm256_cmp_pd( (a), (b), _CMP_GT_OQ ) ) /* [ a0> b0 a0> b0 a1> b1 a1> b1 ... a3> b3 a3> b3 ] */
230 524288 : #define wd_lt(a,b) _mm256_castpd_si256( _mm256_cmp_pd( (a), (b), _CMP_LT_OQ ) ) /* [ a0< b0 a0< b0 a1< b1 a1< b1 ... a3< b3 a3< b3 ] */
231 : #define wd_ne(a,b) _mm256_castpd_si256( _mm256_cmp_pd( (a), (b), _CMP_NEQ_OQ ) ) /* [ a0!=b0 a0!=b0 a1!=b1 a1!=b1 ... a3!=b3 a3!=b3 ] */
232 : #define wd_ge(a,b) _mm256_castpd_si256( _mm256_cmp_pd( (a), (b), _CMP_GE_OQ ) ) /* [ a0>=b0 a0>=b0 a1>=b1 a1>=b1 ... a3>=b3 a3>=b3 ] */
233 : #define wd_le(a,b) _mm256_castpd_si256( _mm256_cmp_pd( (a), (b), _CMP_LE_OQ ) ) /* [ a0<=b0 a0<=b0 a1<=b1 a1<=b1 ... a3<=b3 a3<=b3 ] */
234 :
235 : /* Conditional operations */
236 :
237 : /* c should be a proper paired lane vector conditional for these */
238 :
239 : #define wd_czero(c,a) _mm256_andnot_pd( _mm256_castsi256_pd( (c) ), (a) ) /* [ c01?0.:a0 c23?0.:a1 c45?0.:a2 c67?0.:a3 ] */
240 : #define wd_notczero(c,a) _mm256_and_pd( _mm256_castsi256_pd( (c) ), (a) ) /* [ c01?a0:0. c23?a1:0. c45?a2:0. c67?a3:0. ] */
241 :
242 524288 : #define wd_if(c,t,f) _mm256_blendv_pd( (f), (t), _mm256_castsi256_pd( (c) ) ) /* [ c01?t0:f0 c23?t1:f1 c45?t2:f2 c67?t3:f3 ] */
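/* Illustrative sketch of branch free selection with the comparisons
   and conditionals above (not part of this header; note wd_gt returns
   a proper paired lane vector conditional so it can feed wd_if
   directly):

     wd_t x   = wd( 1., 5., 2., 9. );
     wd_t cap = wd_bcast( 4. );
     wd_t y   = wd_if( wd_gt( x, cap ), cap, x );   // [ 1. 4. 2. 4. ]

   (equivalent to wd_min( x, cap ) in this particular case) */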
243 :
244 : /* Conversion operations */
245 :
246 : /* Summarizing:
247 :
248 : wd_to_wc(d) returns [ !!d0 !!d0 !!d1 !!d1 ... !!d3 !!d3 ] ... proper paired lane vector conditional
249 :
250 : wd_to_wf(d,f,0) returns [ (float)d0 (float)d1 (float)d2 (float)d3 f4 f5 f6 f7 ]
251 : wd_to_wf(d,f,1) returns [ f0 f1 f2 f3 (float)d0 (float)d1 (float)d2 (float)d3 ]
252 :
253 : wd_to_wi(d,i,0) returns [ (int)d0 (int)d1 (int)d2 (int)d3 i4 i5 i6 i7 ]
254 : wd_to_wi(d,i,1) returns [ i0 i1 i2 i3 (int)d0 (int)d1 (int)d2 (int)d3 ]
255 :
256 : wd_to_wi_fast(d,i,0) returns [ (int)rint(d0) (int)rint(d1) (int)rint(d2) (int)rint(d3) i4 i5 i6 i7 ]
257 : wd_to_wi_fast(d,i,1) returns [ i0 i1 i2 i3 (int)rint(d0) (int)rint(d1) (int)rint(d2) (int)rint(d3) ]
258 :
259 : wd_to_wu(d,u,0) returns [ (uint)d0 (uint)d1 (uint)d2 (uint)d3 u4 u5 u6 u7 ]
260 : wd_to_wu(d,u,1) returns [ u0 u1 u2 u3 (uint)d0 (uint)d1 (uint)d2 (uint)d3 ]
261 :
262 : wd_to_wu_fast(d,u,0) returns [ (uint)rint(d0) (uint)rint(d1) (uint)rint(d2) (uint)rint(d3) u4 u5 u6 u7 ]
263 : wd_to_wu_fast(d,u,1) returns [ u0 u1 u2 u3 (uint)rint(d0) (uint)rint(d1) (uint)rint(d2) (uint)rint(d3) ]
264 :
265 : wd_to_wl(d) returns [ (long)d0 (long)d1 (long)d2 (long)d3 ]
266 :
267 : wd_to_wv(d) returns [ (ulong)d0 (ulong)d1 (ulong)d2 (ulong)d3 ]
268 :
269 : where rint is configured for round-to-nearest-even rounding (Intel
270 : architecture defaults to round-nearest-even here ... sigh, they still
271 : don't fully get it) and imm_hi should be a compile time constant.
272 : That is, the fast variants assume that floating point inputs already
273 : hold integral values in the appropriate range for the output type.
274 :
275 : Note that wd_to_{wf,wi,wi_fast,wu,wu_fast} insert the converted values into
276 : lanes 0:3 (imm_hi==0) or 4:7 (imm_hi!=0) of the provided vector.
277 :
278 : The raw variants return just the raw bits as the corresponding vector
279 : type. wd_to_wl_raw allows doing advanced bit tricks on a vector
280 : double. The others are probably dubious but are provided for
281 : completeness. */
282 :
283 : #define wd_to_wc(d) _mm256_castpd_si256( _mm256_cmp_pd( (d), _mm256_setzero_pd(), _CMP_NEQ_OQ ) )
284 :
285 : #define wd_to_wf(d,f,imm_hi) _mm256_insertf128_ps( (f), _mm256_cvtpd_ps( (d) ), !!(imm_hi) )
286 :
287 : #define wd_to_wi(d,i,imm_hi) wd_to_wi_fast( _mm256_round_pd( (d), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ), (i), (imm_hi) )
288 : #define wd_to_wu(d,u,imm_hi) wd_to_wu_fast( _mm256_round_pd( (d), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ), (u), (imm_hi) )
289 :
290 : /* FIXME: IS IT FASTER TO USE INSERT / EXTRACT FOR THESE? */
291 :
292 196608 : static inline __m256i wd_to_wl( wd_t d ) { /* FIXME: workaround wl_t isn't declared at this point */
293 196608 : union { double d[4]; __m256d v[1]; } t[1];
294 196608 : union { long l[4]; __m256i v[1]; } u[1];
295 196608 : _mm256_store_pd( t->d, d );
296 196608 : u->l[0] = (long)t->d[0];
297 196608 : u->l[1] = (long)t->d[1];
298 196608 : u->l[2] = (long)t->d[2];
299 196608 : u->l[3] = (long)t->d[3];
300 196608 : return _mm256_load_si256( u->v );
301 196608 : }
302 :
303 196608 : static inline __m256i wd_to_wv( wd_t d ) { /* FIXME: workaround wv_t isn't declared at this point */
304 196608 : union { double d[4]; __m256d v[1]; } t[1];
305 196608 : union { ulong u[4]; __m256i v[1]; } u[1];
306 196608 : _mm256_store_pd( t->d, d );
307 196608 : u->u[0] = (ulong)t->d[0];
308 196608 : u->u[1] = (ulong)t->d[1];
309 196608 : u->u[2] = (ulong)t->d[2];
310 196608 : u->u[3] = (ulong)t->d[3];
311 196608 : return _mm256_load_si256( u->v );
312 196608 : }
313 :
314 : #define wd_to_wi_fast(d,i,imm_hi) _mm256_insertf128_si256( (i), _mm256_cvtpd_epi32( (d) ), !!(imm_hi) )
315 :
316 786432 : static inline wu_t wd_to_wu_fast( wd_t d, wu_t u, int imm_hi ) {
317 :
318 : /* Note: Given that _mm256_cvtpd_epi32 existed for a long time, Intel
319 : clearly had the hardware under the hood for _mm256_cvtpd_epu32 but
320 : didn't bother to expose it pre-Skylake-X ... sigh (all too typical
321 : unfortunately). We use _mm256_cvtpd_epu32 where supported because
322 : it is faster and it replicates the same IB behaviors as the compiler
323 : generated scalar ASM for float to uint casts on these targets.
324 :
325 : Pre-Skylake-X, we emulate it by noting that subtracting 2^31
326 : from a double holding an integer in [0,2^32) is exact and the
327 : result can be exactly converted to a signed integer by
328 : _mm256_cvtpd_epi32. We then use twos complement hacks to add back
329 : any shift. This also replicates the compiler's IB behaviors on
330 : these ISAs for float to int casts. */
331 :
332 262144 : # if defined(__AVX512F__) && defined(__AVX512VL__)
333 262144 : __m128i v = _mm256_cvtpd_epu32( d );
334 : # else
335 : /**/ // Assumes d is integer in [0,2^32)
336 524288 : wd_t s = wd_bcast( (double)(1UL<<31) ); // (double)2^31
337 524288 : wc_t c = wd_lt ( d, s ); // -1L if d<2^31, 0L o.w.
338 524288 : wd_t ds = wd_sub( d, s ); // (double)(d-2^31)
339 524288 : __m128 b = _mm_shuffle_ps( _mm_castsi128_ps( _mm256_extractf128_si256( c, 0 ) ),
340 524288 : _mm_castsi128_ps( _mm256_extractf128_si256( c, 1 ) ),
341 524288 : _MM_SHUFFLE(2,0,2,0) ); // -1 if d<2^31, 0 o.w.
342 524288 : __m128i v0 = _mm256_cvtpd_epi32( wd_if( c, d, ds ) ); // (uint)(d if d<2^31, d-2^31 o.w.)
343 524288 : __m128i v1 = _mm_add_epi32( v0, _mm_set1_epi32( (int)(1U<<31) ) ); // (uint)(d+2^31 if d<2^31, d o.w.)
344 524288 : __m128i v = _mm_castps_si128( _mm_blendv_ps( _mm_castsi128_ps( v1 ),
345 524288 : _mm_castsi128_ps( v0 ), b ) ); // (uint)d
346 524288 : # endif
347 786432 : return imm_hi ? _mm256_insertf128_si256( u, v, 1 ) : _mm256_insertf128_si256( u, v, 0 ); // compile time
348 :
349 786432 : }
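/* A scalar model of the pre-Skylake-X branch above (hypothetical, for
   illustration only; assumes d already holds an integral value in
   [0,2^32)):

     static inline uint
     wd_to_wu_fast_scalar_model( double d ) {
       double s = (double)(1UL<<31);
       if( d<s ) return (uint)(int)d;          // fits signed int range, conversion is exact
       return (uint)(int)(d-s) + (1U<<31);     // shift down, convert exactly, add 2^31 back
     }
*/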
350 :
351 : #define wd_to_wc_raw(a) _mm256_castpd_si256( (a) )
352 : #define wd_to_wf_raw(a) _mm256_castpd_ps( (a) )
353 : #define wd_to_wi_raw(a) _mm256_castpd_si256( (a) )
354 : #define wd_to_wu_raw(a) _mm256_castpd_si256( (a) )
355 : #define wd_to_wl_raw(a) _mm256_castpd_si256( (a) )
356 : #define wd_to_wv_raw(a) _mm256_castpd_si256( (a) )
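/* Illustrative sketch of the conversions (not part of this header):

     wd_t d = wd( 1.5, -2.5, 3.5, -4.5 );
     __m256i l = wd_to_wl( d );       // [ 1L -2L 3L -4L ] (the C cast truncates toward zero)
     __m256i b = wd_to_wl_raw( d );   // raw IEEE 754 bit pattern of each lane, unconverted
*/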
357 :
358 : /* Reduction operations */
359 :
360 : static inline wd_t
361 196608 : wd_sum_all( wd_t x ) { /* Returns wd_bcast( sum( x ) ) */
362 196608 : x = _mm256_add_pd( x, _mm256_permute2f128_pd( x, x, 1 ) ); /* x02 x13 ... */
363 196608 : return _mm256_hadd_pd( x, x ); /* xsum ... */
364 196608 : }
365 :
366 : static inline wd_t
367 196608 : wd_min_all( wd_t a ) { /* Returns wd_bcast( min( x ) ) */
368 196608 : a = _mm256_min_pd( a, _mm256_permute2f128_pd( a, a, 1 ) );
369 196608 : return _mm256_min_pd( a, _mm256_permute_pd( a, 5 ) );
370 196608 : }
371 :
372 : static inline wd_t
373 196608 : wd_max_all( wd_t a ) { /* Returns wd_bcast( max( x ) ) */
374 196608 : a = _mm256_max_pd( a, _mm256_permute2f128_pd( a, a, 1 ) );
375 196608 : return _mm256_max_pd( a, _mm256_permute_pd( a, 5 ) );
376 196608 : }
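/* Illustrative sketch of the reductions (not part of this header):

     wd_t x = wd( 1., 2., 3., 4. );
     wd_t s = wd_sum_all( x );    // [ 10. 10. 10. 10. ]
     wd_t m = wd_min_all( x );    // [  1.  1.  1.  1. ]
     wd_t M = wd_max_all( x );    // [  4.  4.  4.  4. ]
*/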
377 :
378 : /* Misc operations */
379 :
380 : /* wd_gather(b,i,imm_hi) returns
381 : [ b[i(0)] b[i(1)] b[i(2)] b[i(3)] ] if imm_hi is 0 and
382 : [ b[i(4)] b[i(5)] b[i(6)] b[i(7)] ] o.w.
383 : where b is a "double const*", i is wi_t and imm_hi is a compile time
384 : constant. */
385 :
386 21235206 : #define wd_gather(b,i,imm_hi) _mm256_i32gather_pd( (b), _mm256_extractf128_si256( (i), !!(imm_hi) ), 8 )
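/* Illustrative sketch of wd_gather (not part of this header; assumes
   the wi_t / wi API from fd_avx.h for the index vector):

     static double const tbl[8] = { 0., 10., 20., 30., 40., 50., 60., 70. };
     wi_t idx = wi( 1,3,5,7, 0,2,4,6 );
     wd_t lo  = wd_gather( tbl, idx, 0 );   // [ 10. 30. 50. 70. ] from i(0:3)
     wd_t hi  = wd_gather( tbl, idx, 1 );   // [  0. 20. 40. 60. ] from i(4:7)
*/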
387 :
388 : /* wd_transpose_4x4 transposes the 4x4 matrix stored in wd_t r0,r1,r2,r3
389 : and stores the result in 4x4 matrix wd_t c0,c1,c2,c3. All
390 : c0,c1,c2,c3 should be different for a well defined result.
391 : Otherwise, in-place operation and/or using the same wd_t to specify
392 : multiple rows of r is fine. */
393 :
394 196608 : #define wd_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do { \
395 196608 : wd_t _wd_transpose_r0 = (r0); wd_t _wd_transpose_r1 = (r1); wd_t _wd_transpose_r2 = (r2); wd_t _wd_transpose_r3 = (r3); \
396 196608 : wd_t _wd_transpose_t; \
397 196608 : /* Transpose 2x2 blocks */ \
398 196608 : _wd_transpose_t = _wd_transpose_r0; _wd_transpose_r0 = _mm256_permute2f128_pd( _wd_transpose_t, _wd_transpose_r2, 0x20 ); \
399 196608 : /**/ _wd_transpose_r2 = _mm256_permute2f128_pd( _wd_transpose_t, _wd_transpose_r2, 0x31 ); \
400 196608 : _wd_transpose_t = _wd_transpose_r1; _wd_transpose_r1 = _mm256_permute2f128_pd( _wd_transpose_t, _wd_transpose_r3, 0x20 ); \
401 196608 : /**/ _wd_transpose_r3 = _mm256_permute2f128_pd( _wd_transpose_t, _wd_transpose_r3, 0x31 ); \
402 196608 : /* Transpose 1x1 blocks */ \
403 196608 : /**/ (c0) = _mm256_unpacklo_pd( _wd_transpose_r0, _wd_transpose_r1 ); \
404 196608 : /**/ (c1) = _mm256_unpackhi_pd( _wd_transpose_r0, _wd_transpose_r1 ); \
405 196608 : /**/ (c2) = _mm256_unpacklo_pd( _wd_transpose_r2, _wd_transpose_r3 ); \
406 196608 : /**/ (c3) = _mm256_unpackhi_pd( _wd_transpose_r2, _wd_transpose_r3 ); \
407 196608 : } while(0)
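/* Illustrative sketch of wd_transpose_4x4 (not part of this header):

     wd_t r0 = wd(  1.,  2.,  3.,  4. );
     wd_t r1 = wd(  5.,  6.,  7.,  8. );
     wd_t r2 = wd(  9., 10., 11., 12. );
     wd_t r3 = wd( 13., 14., 15., 16. );
     wd_t c0, c1, c2, c3;
     wd_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 );
     // c0 = [ 1. 5.  9. 13. ], c1 = [ 2. 6. 10. 14. ],
     // c2 = [ 3. 7. 11. 15. ], c3 = [ 4. 8. 12. 16. ]
*/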
|