#ifndef HEADER_fd_src_util_simd_fd_avx_h
#error "Do not include this directly; use fd_avx.h"
#endif

/* Vector float API ***************************************************/

/* A wf_t is a vector where each 32-bit wide lane holds a single
   precision IEEE 754 floating point value (a "float").

   Inputs to all operations assume that the values aren't exotic (no
   NaNs, no +/-Infs, no denorms) and, if the output of an operation
   would produce an exotic value in the IEEE 754 standard, the results
   of that operation are undefined. Additionally, correct handling of
   signed zero is not guaranteed. Lastly, these will not raise floating
   point exceptions or set math errno's.

   Basically, handling of exotics and signed zero will generally be
   reasonable but most of that relies on the underlying compiler and
   hardware having conformant behavior and this is flaky at the best of
   times. So it is best for developers not to assume conformant
   behavior.

   These mirror the other APIs as much as possible. Macros are
   preferred over static inlines when it is possible to do it robustly
   to reduce the risk of the compiler mucking it up. */

#define wf_t __m256

/* Constructors */

/* Given the float values, return ... */

#define wf(f0,f1,f2,f3,f4,f5,f6,f7) /* [ f0 f1 f2 f3 f4 f5 f6 f7 ] */ \
  _mm256_setr_ps( (f0), (f1), (f2), (f3), (f4), (f5), (f6), (f7) )

#define wf_bcast(f0) _mm256_set1_ps( (f0) ) /* [ f0 f0 f0 f0 f0 f0 f0 f0 ] */

static inline wf_t /* [ f0 f1 f0 f1 f0 f1 f0 f1 ] */
wf_bcast_pair( float f0, float f1 ) {
  return _mm256_setr_ps( f0, f1, f0, f1, f0, f1, f0, f1 );
}

static inline wf_t /* [ f0 f0 f0 f0 f1 f1 f1 f1 ] */
wf_bcast_lohi( float f0, float f1 ) {
  return _mm256_setr_ps( f0, f0, f0, f0, f1, f1, f1, f1 );
}

static inline wf_t /* [ f0 f1 f2 f3 f0 f1 f2 f3 ] */
wf_bcast_quad( float f0, float f1, float f2, float f3 ) {
  return _mm256_setr_ps( f0, f1, f2, f3, f0, f1, f2, f3 );
}

static inline wf_t /* [ f0 f0 f1 f1 f2 f2 f3 f3 ] */
wf_bcast_wide( float f0, float f1, float f2, float f3 ) {
  return _mm256_setr_ps( f0, f0, f1, f1, f2, f2, f3, f3 );
}
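
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): composing the constructors
   above. */

static inline wf_t
wf_example_alternating_ramp( void ) {
  wf_t a = wf( 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f ); /* [ 0 1 2 3 4 5 6 7 ] */
  wf_t b = wf_bcast_pair( -1.f, 1.f );                   /* [ -1 1 -1 1 -1 1 -1 1 ] */
  return _mm256_mul_ps( a, b );                          /* [ -0 1 -2 3 -4 5 -6 7 ] (wf_mul is defined below) */
}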

/* No general wf_permute due to cross-128-bit lane limitations in AVX.
   Useful cases are provided below. Given [ f0 f1 f2 f3 f4 f5 f6 f7 ],
   return ... */

#define wf_bcast_even(f)    _mm256_permute_ps( (f), _MM_SHUFFLE(2,2,0,0) ) /* [ f0 f0 f2 f2 f4 f4 f6 f6 ] */
#define wf_bcast_odd(f)     _mm256_permute_ps( (f), _MM_SHUFFLE(3,3,1,1) ) /* [ f1 f1 f3 f3 f5 f5 f7 f7 ] */
#define wf_exch_adj(f)      _mm256_permute_ps( (f), _MM_SHUFFLE(2,3,0,1) ) /* [ f1 f0 f3 f2 f5 f4 f7 f6 ] */
#define wf_exch_adj_pair(f) _mm256_permute_ps( (f), _MM_SHUFFLE(1,0,3,2) ) /* [ f2 f3 f0 f1 f6 f7 f4 f5 ] */

static inline wf_t
wf_exch_adj_quad( wf_t f ) { /* [ f4 f5 f6 f7 f0 f1 f2 f3 ] */
  return _mm256_permute2f128_ps( f, f, 1 );
}

/* Predefined constants */

#define wf_zero() _mm256_setzero_ps()   /* Return [ 0.f 0.f 0.f 0.f 0.f 0.f 0.f 0.f ] */
#define wf_one()  _mm256_set1_ps( 1.f ) /* Return [ 1.f 1.f 1.f 1.f 1.f 1.f 1.f 1.f ] */

/* Memory operations */

/* wf_ld returns the 8 floats at the 32-byte aligned / 32-byte sized
   location p as a vector float. wf_ldu is the same but p does not have
   to be aligned. wf_st writes the vector float to the 32-byte aligned
   / 32-byte sized location p as 8 floats. wf_stu is the same but p
   does not have to be aligned. In all these, lane l will be at p[l].
   FIXME: USE ATTRIBUTES ON P PASSED TO THESE? */

#define wf_ld(p)    _mm256_load_ps( (p) )
#define wf_ldu(p)   _mm256_loadu_ps( (p) )
#define wf_st(p,x)  _mm256_store_ps( (p), (x) )
#define wf_stu(p,x) _mm256_storeu_ps( (p), (x) )

/* wf_ldif is an optimized equivalent to wf_notczero(c,wf_ldu(p)) (may
   have different behavior if c is not a proper vector conditional). It
   is provided for symmetry with the wf_stif operation. wf_stif stores
   x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
   Undefined behavior if c is not a proper vector conditional. */

#define wf_ldif(c,p)   _mm256_maskload_ps(  (p), (c) )
#define wf_stif(c,p,x) _mm256_maskstore_ps( (p), (c), (x) )
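
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): wf_stif can write a partial tail
   of an array without touching memory past the end. The mask
   construction assumes AVX2 integer ops, consistent with the rest of
   this file. */

static inline void
wf_example_store_tail( float * d, wf_t x, int n ) { /* store lanes [0,n) of x to d, n in [0,8] */
  __m256i lane = _mm256_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7 );
  __m256i c    = _mm256_cmpgt_epi32( _mm256_set1_epi32( n ), lane ); /* proper vector conditional, -1 if lane<n */
  wf_stif( c, d, x );
}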

/* Element operations */

/* wf_extract extracts the float in lane imm from the vector float
   as a float. wf_insert returns the vector float formed by replacing
   the value in lane imm of a with the provided float. imm should be a
   compile time constant in 0:7. wf_extract_variable and
   wf_insert_variable are slower but the lane n does not have to be
   known at compile time (should still be in 0:7). */

/* FIXME: ARE THESE BETTER IMPLEMENTED VIA BOUNCING OFF THE STACK (IT
   SEEMS PRETTY CLEAR THAT INTEL DIDN'T INTEND THIS TO BE POSSIBLE,
   ESPECIALLY GIVEN THAT _MM256_CVTSS_F32 IS MISSING IN GCC)?
   ALTERNATIVELY, IS IT WORTHWHILE TO SPECIAL CASE 0 AND 4 EXTRACTION
   AS PER THE BELOW? */

#if FD_USING_CLANG || !FD_HAS_OPTIMIZATION

/* Sigh ... clang is sad and can't handle passing compile time const
   expressions through a static inline */

static inline float
wf_extract( wf_t a, int imm ) {
  union { float f[8]; __m256 v[1]; } t[1];
  _mm256_store_ps( t->f, a );
  return t->f[ imm ];
}

#else

static inline float
wf_extract( wf_t a, int imm ) {
  int avx_lane = imm >> 2; /* compile time eval */
  int sse_lane = imm & 3;  /* compile time eval */
  __m128 t = _mm256_extractf128_ps( a, avx_lane );
  if( sse_lane ) /* compile time eval */
    t = _mm_castsi128_ps( _mm_insert_epi32( _mm_setzero_si128(), _mm_extract_epi32( _mm_castps_si128( t ), sse_lane ), 0 ) );
  return _mm_cvtss_f32( t );
}

#endif

#define wf_insert(a,imm,v)                                               \
  _mm256_castsi256_ps( _mm256_insert_epi32( _mm256_castps_si256( (a) ), \
    _mm_extract_epi32( _mm_castps_si128( _mm_set_ss( (v) ) ), 0 ), (imm) ) )

static inline float
wf_extract_variable( wf_t a, int n ) {
  float f[8] W_ATTR;
  _mm256_store_ps( f, a );
  return f[n];
}

static inline wf_t
wf_insert_variable( wf_t a, int n, float v ) {
  float f[8] W_ATTR;
  _mm256_store_ps( f, a );
  f[n] = v;
  return _mm256_load_ps( f );
}
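
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): lane access with compile time
   and runtime indices. */

static inline float
wf_example_lane_access( wf_t a, int n ) {
  a = wf_insert( a, 3, 2.5f );             /* replace lane 3 (imm is compile time) */
  float f3 = wf_extract( a, 3 );           /* read lane 3 back */
  return f3 + wf_extract_variable( a, n ); /* n need not be compile time (still in 0:7) */
}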

/* Given [a0 a1 a2 a3 a4 a5 a6 a7], [b0 b1 b2 b3 b4 b5 b6 b7] and/or
   [c0 c1 c2 c3 c4 c5 c6 c7], return ... */

/* Arithmetic operations */

/* wf_neg(a)        returns [ -a0 -a1 ... -a7 ] (i.e. -a)
   wf_sign(a)       returns [ signf(a0) signf(a1) ... signf(a7) ]
   wf_abs(a)        returns [ fabsf(a0) fabsf(a1) ... fabsf(a7) ] (i.e. abs(a))
   wf_negabs(a)     returns [ -fabsf(a0) -fabsf(a1) ... -fabsf(a7) ] (i.e. -abs(a))
   wf_ceil(a)       returns [ ceilf(a0) ceilf(a1) ... ceilf(a7) ] (i.e. ceil(a))
   wf_floor(a)      returns [ floorf(a0) floorf(a1) ... floorf(a7) ] (i.e. floor(a))
   wf_rint(a)       returns [ rintf(a0) rintf(a1) ... rintf(a7) ] (i.e. roundb(a))
   wf_trunc(a)      returns [ truncf(a0) truncf(a1) ... truncf(a7) ] (i.e. fix(a))
   wf_sqrt(a)       returns [ sqrtf(a0) sqrtf(a1) ... sqrtf(a7) ] (i.e. sqrt(a))
   wf_rcp_fast(a)   returns [ ~rcpf(a0) ~rcpf(a1) ... ~rcpf(a7) ]
   wf_rsqrt_fast(a) returns [ ~rsqrtf(a0) ~rsqrtf(a1) ... ~rsqrtf(a7) ]

   wf_add(a,b)      returns [ a0+b0 a1+b1 ... a7+b7 ] (i.e. a +b)
   wf_sub(a,b)      returns [ a0-b0 a1-b1 ... a7-b7 ] (i.e. a -b)
   wf_mul(a,b)      returns [ a0*b0 a1*b1 ... a7*b7 ] (i.e. a.*b)
   wf_div(a,b)      returns [ a0/b0 a1/b1 ... a7/b7 ] (i.e. a./b)
   wf_min(a,b)      returns [ fminf(a0,b0) fminf(a1,b1) ... fminf(a7,b7) ] (i.e. min([a;b]), a and b 1x8)
   wf_max(a,b)      returns [ fmaxf(a0,b0) fmaxf(a1,b1) ... fmaxf(a7,b7) ] (i.e. max([a;b]), a and b 1x8)
   wf_copysign(a,b) returns [ copysignf(a0,b0) copysignf(a1,b1) ... copysignf(a7,b7) ]
   wf_flipsign(a,b) returns [ flipsignf(a0,b0) flipsignf(a1,b1) ... flipsignf(a7,b7) ]

   wf_fma(a,b,c)    returns [  fmaf(a0,b0, c0)  fmaf(a1,b1, c1) ...  fmaf(a7,b7, c7) ] (i.e.  a.*b+c)
   wf_fms(a,b,c)    returns [  fmaf(a0,b0,-c0)  fmaf(a1,b1,-c1) ...  fmaf(a7,b7,-c7) ] (i.e.  a.*b-c)
   wf_fnma(a,b,c)   returns [ -fmaf(a0,b0,-c0) -fmaf(a1,b1,-c1) ... -fmaf(a7,b7,-c7) ] (i.e. -a.*b+c)

   where sign(a) is -1. if a's sign bit is set and +1. otherwise,
   rcp(a) is 1./a, rsqrt(a) is 1./sqrt(a), and flipsign(a,b) returns -a
   if b's sign bit is set and a otherwise.

   rintf is in round-to-nearest-even rounding mode (note rintf and
   nearbyintf are identical once floating point exceptions are ignored).

   sqrtf should typically be full accuracy.

   rcp_fast and rsqrt_fast should typically be ~12 or more bits accurate
   (~3 or more decimal digits) such that (nearly) full accuracy can be
   achieved with two to three rounds of Newton-Raphson polishing. Bit
   level replicable code should avoid rcp_fast and rsqrt_fast though as
   the approximations used can vary between various generations /
   steppings / microcode updates of x86 processors (including Intel and
   AMD). */

#define wf_neg(a)        _mm256_xor_ps(    _mm256_set1_ps( -0.f ), (a) )
#define wf_sign(a)       _mm256_xor_ps(    _mm256_set1_ps(  1.f ), _mm256_and_ps( _mm256_set1_ps( -0.f ), (a) ) )
#define wf_abs(a)        _mm256_andnot_ps( _mm256_set1_ps( -0.f ), (a) )
#define wf_negabs(a)     _mm256_or_ps(     _mm256_set1_ps( -0.f ), (a) )
#define wf_ceil(a)       _mm256_ceil_ps(  (a) )
#define wf_floor(a)      _mm256_floor_ps( (a) )
#define wf_rint(a)       _mm256_round_ps( (a), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC )
#define wf_trunc(a)      _mm256_round_ps( (a), _MM_FROUND_TO_ZERO       | _MM_FROUND_NO_EXC )
#define wf_sqrt(a)       _mm256_sqrt_ps(  (a) )
#define wf_rcp_fast(a)   _mm256_rcp_ps(   (a) )
#define wf_rsqrt_fast(a) _mm256_rsqrt_ps( (a) )

#define wf_add(a,b)      _mm256_add_ps( (a), (b) )
#define wf_sub(a,b)      _mm256_sub_ps( (a), (b) )
#define wf_mul(a,b)      _mm256_mul_ps( (a), (b) )
#define wf_div(a,b)      _mm256_div_ps( (a), (b) )
#define wf_min(a,b)      _mm256_min_ps( (a), (b) )
#define wf_max(a,b)      _mm256_max_ps( (a), (b) )
#define wf_copysign(a,b) _mm256_or_ps( _mm256_andnot_ps( _mm256_set1_ps( -0.f ), (a) ), \
                                       _mm256_and_ps(    _mm256_set1_ps( -0.f ), (b) ) )
#define wf_flipsign(a,b) _mm256_xor_ps( (a), _mm256_and_ps( _mm256_set1_ps( -0.f ), (b) ) )

#define wf_fma(a,b,c)  _mm256_fmadd_ps(  (a), (b), (c) )
#define wf_fms(a,b,c)  _mm256_fmsub_ps(  (a), (b), (c) )
#define wf_fnma(a,b,c) _mm256_fnmadd_ps( (a), (b), (c) )
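
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): polishing wf_rcp_fast with
   Newton-Raphson as described above. Each round of r += r*(1-a*r)
   roughly doubles the number of accurate bits, so two rounds take a
   ~12 bit estimate to (nearly) full single precision. */

static inline wf_t
wf_example_rcp( wf_t a ) {
  wf_t r = wf_rcp_fast( a );                     /* r ~ 1/a, ~12 bit accurate */
  r = wf_fma( r, wf_fnma( a, r, wf_one() ), r ); /* r += r*(1 - a*r) */
  r = wf_fma( r, wf_fnma( a, r, wf_one() ), r ); /* second round */
  return r;
}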

/* Binary operations */

/* Note: binary operations are not well defined on vector floats.
   If doing tricks with floating point binary representations, the user
   should use wf_to_wi_raw as necessary. */

/* Logical operations */

/* These all return proper vector conditionals */

#define wf_lnot(a)    _mm256_castps_si256( _mm256_cmp_ps( (a), _mm256_setzero_ps(), _CMP_EQ_OQ  ) ) /* [  !a0  !a1 ...  !a7 ] */
#define wf_lnotnot(a) _mm256_castps_si256( _mm256_cmp_ps( (a), _mm256_setzero_ps(), _CMP_NEQ_OQ ) ) /* [ !!a0 !!a1 ... !!a7 ] */
#define wf_signbit(a) _mm256_srai_epi32( _mm256_castps_si256( (a) ), 31 )

#define wf_eq(a,b) _mm256_castps_si256( _mm256_cmp_ps( (a), (b), _CMP_EQ_OQ  ) ) /* [ a0==b0 a1==b1 ... a7==b7 ] */
#define wf_gt(a,b) _mm256_castps_si256( _mm256_cmp_ps( (a), (b), _CMP_GT_OQ  ) ) /* [ a0> b0 a1> b1 ... a7> b7 ] */
#define wf_lt(a,b) _mm256_castps_si256( _mm256_cmp_ps( (a), (b), _CMP_LT_OQ  ) ) /* [ a0< b0 a1< b1 ... a7< b7 ] */
#define wf_ne(a,b) _mm256_castps_si256( _mm256_cmp_ps( (a), (b), _CMP_NEQ_OQ ) ) /* [ a0!=b0 a1!=b1 ... a7!=b7 ] */
#define wf_ge(a,b) _mm256_castps_si256( _mm256_cmp_ps( (a), (b), _CMP_GE_OQ  ) ) /* [ a0>=b0 a1>=b1 ... a7>=b7 ] */
#define wf_le(a,b) _mm256_castps_si256( _mm256_cmp_ps( (a), (b), _CMP_LE_OQ  ) ) /* [ a0<=b0 a1<=b1 ... a7<=b7 ] */

/* Conditional operations */

#define wf_czero(c,f)    _mm256_andnot_ps( _mm256_castsi256_ps( (c) ), (f) ) /* [ c0?0.f:f0 c1?0.f:f1 ... c7?0.f:f7 ] */
#define wf_notczero(c,f) _mm256_and_ps(    _mm256_castsi256_ps( (c) ), (f) ) /* [ c0?f0:0.f c1?f1:0.f ... c7?f7:0.f ] */

#define wf_if(c,t,f) _mm256_blendv_ps( (f), (t), _mm256_castsi256_ps( (c) ) ) /* [ c0?t0:f0 c1?t1:f1 ... c7?t7:f7 ] */

/* Conversion operations */

/* Summarizing:

   wf_to_wc(a)      returns [ !!a0 !!a1 ... !!a7 ]

   wf_to_wi(a)      returns [ (int)a0 (int)a1 ... (int)a7 ]
   wf_to_wi_fast(a) returns [ (int)rintf(a0) (int)rintf(a1) ... (int)rintf(a7) ]

   wf_to_wu(a)      returns [ (uint)a0 (uint)a1 ... (uint)a7 ]
   wf_to_wu_fast(a) returns [ (uint)rintf(a0) (uint)rintf(a1) ... (uint)rintf(a7) ]

   wf_to_wd(a,0)    returns [ (double)a0 (double)a1 (double)a2 (double)a3 ]
   wf_to_wd(a,1)    returns [ (double)a4 (double)a5 (double)a6 (double)a7 ]

   wf_to_wl(a,0)    returns [ (long)a0 (long)a1 (long)a2 (long)a3 ]
   wf_to_wl(a,1)    returns [ (long)a4 (long)a5 (long)a6 (long)a7 ]

   wf_to_wv(a,0)    returns [ (ulong)a0 (ulong)a1 (ulong)a2 (ulong)a3 ]
   wf_to_wv(a,1)    returns [ (ulong)a4 (ulong)a5 (ulong)a6 (ulong)a7 ]

   where rintf is configured for round-to-nearest-even rounding (Intel
   architecture defaults to round-nearest-even here ... sigh, they still
   don't fully get it) and imm_hi should be a compile time constant.
   That is, the fast variants assume that the floating point inputs are
   already integral values in the appropriate range for the output type.

   For wf_to_{wd,wl,wv}, the permutation used for the conversion is less
   flexible due to cross 128-bit lane limitations in AVX. If imm_hi==0,
   the conversion is done on lanes 0:3. Otherwise, the conversion is
   done on lanes 4:7.

   The raw variants return just the raw bits as the corresponding vector
   type. wf_to_wi_raw in particular allows doing advanced bit tricks on
   a vector float. The others are probably dubious but they are
   provided for completeness. */

#define wf_to_wc(a)        _mm256_castps_si256( _mm256_cmp_ps( (a), _mm256_setzero_ps(), _CMP_NEQ_OQ ) )
#define wf_to_wi(a)        wf_to_wi_fast( _mm256_round_ps( (a), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) )
#define wf_to_wu(a)        wf_to_wu_fast( _mm256_round_ps( (a), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) )
#define wf_to_wd(a,imm_hi) _mm256_cvtps_pd( _mm256_extractf128_ps( (a), !!(imm_hi) ) )

/* FIXME: IS IT FASTER TO USE INSERT / EXTRACT FOR THESE? */

static inline __m256i wf_to_wl( wf_t f, int imm_hi ) { /* FIXME: workaround: wl_t isn't declared at this point */
  union { float f[4]; __m128  v[1]; } t[1];
  union { long  l[4]; __m256i v[1]; } u[1];
  _mm_store_ps( t->f, imm_hi ? _mm256_extractf128_ps( f, 1 ) : _mm256_extractf128_ps( f, 0 ) /* compile time */ );
  u->l[0] = (long)t->f[0];
  u->l[1] = (long)t->f[1];
  u->l[2] = (long)t->f[2];
  u->l[3] = (long)t->f[3];
  return _mm256_load_si256( u->v );
}

static inline __m256i wf_to_wv( wf_t f, int imm_hi ) { /* FIXME: workaround: wv_t isn't declared at this point */
  union { float f[4]; __m128  v[1]; } t[1];
  union { ulong u[4]; __m256i v[1]; } u[1];
  _mm_store_ps( t->f, imm_hi ? _mm256_extractf128_ps( f, 1 ) : _mm256_extractf128_ps( f, 0 ) /* compile time */ );
  u->u[0] = (ulong)t->f[0];
  u->u[1] = (ulong)t->f[1];
  u->u[2] = (ulong)t->f[2];
  u->u[3] = (ulong)t->f[3];
  return _mm256_load_si256( u->v );
}

#define wf_to_wi_fast(a) _mm256_cvtps_epi32( (a) )

/* Note: Given that _mm256_cvtps_epi32 has existed for a long time,
   Intel clearly had the hardware under the hood for _mm256_cvtps_epu32
   but didn't bother to expose it pre-Skylake-X ... sigh (all too
   typical unfortunately). We use _mm256_cvtps_epu32 where supported
   because it is faster and it replicates the same IB behaviors as the
   compiler generated scalar ASM for float to uint casts on these
   targets.

   Pre-Skylake-X, we emulate it by noting that subtracting 2^31 from
   a float holding an integer in [2^31,2^32) is exact and the result can
   be exactly converted to a signed integer by _mm256_cvtps_epi32. We
   then use twos complement hacks to add back any shift. This also
   replicates the compiler's IB behaviors on these ISAs for float to
   uint casts. */

#if defined(__AVX512F__) && defined(__AVX512VL__)
#define wf_to_wu_fast( a ) _mm256_cvtps_epu32( (a) )
#else
static inline __m256i wf_to_wu_fast( wf_t a ) { /* FIXME: workaround: wu_t isn't declared at this point */
  /**/                                                                    /* Assumes a is integer in [0,2^32) */
  wf_t    s  = wf_bcast( (float)(1U<<31) );                               /* 2^31 */
  wc_t    c  = wf_lt ( a, s );                                            /* -1 if a<2^31, 0 o.w. */
  wf_t    as = wf_sub( a, s );                                            /* a-2^31 */
  __m256i u  = _mm256_cvtps_epi32( wf_if( c, a, as ) );                   /* (uint)(a      if a<2^31, a-2^31 o.w.) */
  __m256i us = _mm256_add_epi32( u, _mm256_set1_epi32( (int)(1U<<31) ) ); /* (uint)(a+2^31 if a<2^31, a      o.w.) */
  return _mm256_castps_si256( _mm256_blendv_ps( _mm256_castsi256_ps( us ), _mm256_castsi256_ps( u ), _mm256_castsi256_ps( c ) ) );
}
#endif

#define wf_to_wc_raw(a) _mm256_castps_si256( (a) )
#define wf_to_wi_raw(a) _mm256_castps_si256( (a) )
#define wf_to_wu_raw(a) _mm256_castps_si256( (a) )
#define wf_to_wd_raw(a) _mm256_castps_pd(    (a) )
#define wf_to_wl_raw(a) _mm256_castps_si256( (a) )
#define wf_to_wv_raw(a) _mm256_castps_si256( (a) )
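
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): a bit trick via wf_to_wi_raw
   that extracts the biased IEEE 754 exponent field of each lane.
   Assumes AVX2 integer ops, consistent with the rest of this file. */

static inline __m256i /* wi_t */
wf_example_raw_exponent( wf_t a ) {
  __m256i i = wf_to_wi_raw( a ); /* raw float bits */
  return _mm256_and_si256( _mm256_srli_epi32( i, 23 ), _mm256_set1_epi32( 0xff ) ); /* (bits>>23) & 0xff */
}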

/* Reduction operations */

static inline wf_t
wf_sum_all( wf_t x ) { /* Returns wf_bcast( sum( x ) ) */
  x = _mm256_add_ps( x, _mm256_permute2f128_ps( x, x, 1 ) ); /* x04 x15 x26 x37 ... */
  x = _mm256_hadd_ps( x, x );                                /* x0145 x2367 ... */
  return _mm256_hadd_ps( x, x );                             /* xsum ... */
}

static inline wf_t
wf_min_all( wf_t x ) { /* Returns wf_bcast( min( x ) ) */
  __m256 y = _mm256_permute2f128_ps( x, x, 1 );          /* x4 x5 x6 x7 x0 x1 x2 x3 */
  x = _mm256_min_ps( x, y );                             /* x04 x15 x26 x37 ... */
  y = _mm256_permute_ps( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x26 x37 x04 x15 ... */
  x = _mm256_min_ps( x, y );                             /* x0246 x1357 ... */
  y = _mm256_permute_ps( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x1357 x0246 ... */
  x = _mm256_min_ps( x, y );                             /* xmin ... */
  return x;
}

static inline wf_t
wf_max_all( wf_t x ) { /* Returns wf_bcast( max( x ) ) */
  __m256 y = _mm256_permute2f128_ps( x, x, 1 );          /* x4 x5 x6 x7 x0 x1 x2 x3 */
  x = _mm256_max_ps( x, y );                             /* x04 x15 x26 x37 ... */
  y = _mm256_permute_ps( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x26 x37 x04 x15 ... */
  x = _mm256_max_ps( x, y );                             /* x0246 x1357 ... */
  y = _mm256_permute_ps( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x1357 x0246 ... */
  x = _mm256_max_ps( x, y );                             /* xmax ... */
  return x;
}
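
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): the reductions compose with the
   arithmetic ops, e.g. an 8-element dot product. */

static inline float
wf_example_dot8( float const * x, float const * y ) { /* x,y 32-byte aligned */
  wf_t s = wf_sum_all( wf_mul( wf_ld( x ), wf_ld( y ) ) ); /* [ dot dot ... dot ] */
  return wf_extract( s, 0 );
}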

/* Misc operations */

/* wf_gather(b,i) returns [ b[i(0)] b[i(1)] ... b[i(7)] ] where b is a
   "float const *" and i is a wi_t. */

#define wf_gather(b,i) _mm256_i32gather_ps( (b), (i), 4 )
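
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): an 8-wide table lookup. The
   index vector is built directly as a __m256i since wi_t is declared
   elsewhere in fd_avx.h. */

static inline wf_t
wf_example_reverse_lookup( float const * table ) { /* table indexed 0:7 */
  __m256i idx = _mm256_setr_epi32( 7, 6, 5, 4, 3, 2, 1, 0 );
  return wf_gather( table, idx ); /* [ table[7] table[6] ... table[0] ] */
}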

/* wf_transpose_8x8 transposes the 8x8 matrix stored in wf_t r0,r1,...r7
   and stores the result in 8x8 matrix wf_t c0,c1,...c7. All
   c0,c1,...c7 should be different for a well defined result.
   Otherwise, in-place operation and/or using the same wf_t to specify
   multiple rows of r is fine. */

#define wf_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 ) do {                                           \
    wf_t _wf_transpose_r0 = (r0); wf_t _wf_transpose_r1 = (r1); wf_t _wf_transpose_r2 = (r2); wf_t _wf_transpose_r3 = (r3); \
    wf_t _wf_transpose_r4 = (r4); wf_t _wf_transpose_r5 = (r5); wf_t _wf_transpose_r6 = (r6); wf_t _wf_transpose_r7 = (r7); \
    wf_t _wf_transpose_t;                                                                                                   \
    /* Transpose 4x4 blocks */                                                                                              \
    _wf_transpose_t = _wf_transpose_r0; _wf_transpose_r0 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r4, 0x20 ); \
    /**/                                _wf_transpose_r4 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r4, 0x31 ); \
    _wf_transpose_t = _wf_transpose_r1; _wf_transpose_r1 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r5, 0x20 ); \
    /**/                                _wf_transpose_r5 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r5, 0x31 ); \
    _wf_transpose_t = _wf_transpose_r2; _wf_transpose_r2 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r6, 0x20 ); \
    /**/                                _wf_transpose_r6 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r6, 0x31 ); \
    _wf_transpose_t = _wf_transpose_r3; _wf_transpose_r3 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r7, 0x20 ); \
    /**/                                _wf_transpose_r7 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r7, 0x31 ); \
    /* Transpose 2x2 blocks */                                                                                              \
    _wf_transpose_t = _wf_transpose_r0; _wf_transpose_r0 = _mm256_unpacklo_ps( _wf_transpose_t, _wf_transpose_r2 );         \
    /**/                                _wf_transpose_r2 = _mm256_unpackhi_ps( _wf_transpose_t, _wf_transpose_r2 );         \
    _wf_transpose_t = _wf_transpose_r1; _wf_transpose_r1 = _mm256_unpacklo_ps( _wf_transpose_t, _wf_transpose_r3 );         \
    /**/                                _wf_transpose_r3 = _mm256_unpackhi_ps( _wf_transpose_t, _wf_transpose_r3 );         \
    _wf_transpose_t = _wf_transpose_r4; _wf_transpose_r4 = _mm256_unpacklo_ps( _wf_transpose_t, _wf_transpose_r6 );         \
    /**/                                _wf_transpose_r6 = _mm256_unpackhi_ps( _wf_transpose_t, _wf_transpose_r6 );         \
    _wf_transpose_t = _wf_transpose_r5; _wf_transpose_r5 = _mm256_unpacklo_ps( _wf_transpose_t, _wf_transpose_r7 );         \
    /**/                                _wf_transpose_r7 = _mm256_unpackhi_ps( _wf_transpose_t, _wf_transpose_r7 );         \
    /* Transpose 1x1 blocks */                                                                                              \
    /**/ (c0) = _mm256_unpacklo_ps( _wf_transpose_r0, _wf_transpose_r1 );                                                   \
    /**/ (c1) = _mm256_unpackhi_ps( _wf_transpose_r0, _wf_transpose_r1 );                                                   \
    /**/ (c2) = _mm256_unpacklo_ps( _wf_transpose_r2, _wf_transpose_r3 );                                                   \
    /**/ (c3) = _mm256_unpackhi_ps( _wf_transpose_r2, _wf_transpose_r3 );                                                   \
    /**/ (c4) = _mm256_unpacklo_ps( _wf_transpose_r4, _wf_transpose_r5 );                                                   \
    /**/ (c5) = _mm256_unpackhi_ps( _wf_transpose_r4, _wf_transpose_r5 );                                                   \
    /**/ (c6) = _mm256_unpacklo_ps( _wf_transpose_r6, _wf_transpose_r7 );                                                   \
    /**/ (c7) = _mm256_unpackhi_ps( _wf_transpose_r6, _wf_transpose_r7 );                                                   \
  } while(0)
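
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): transposing an 8x8 row-major
   matrix in memory in place. */

static inline void
wf_example_transpose_mem( float * m ) { /* m is 8x8, row-major, stride 8, 32-byte aligned */
  wf_t r0 = wf_ld( m      ); wf_t r1 = wf_ld( m +  8 ); wf_t r2 = wf_ld( m + 16 ); wf_t r3 = wf_ld( m + 24 );
  wf_t r4 = wf_ld( m + 32 ); wf_t r5 = wf_ld( m + 40 ); wf_t r6 = wf_ld( m + 48 ); wf_t r7 = wf_ld( m + 56 );
  wf_t c0; wf_t c1; wf_t c2; wf_t c3; wf_t c4; wf_t c5; wf_t c6; wf_t c7;
  wf_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 );
  wf_st( m,      c0 ); wf_st( m +  8, c1 ); wf_st( m + 16, c2 ); wf_st( m + 24, c3 );
  wf_st( m + 32, c4 ); wf_st( m + 40, c5 ); wf_st( m + 48, c6 ); wf_st( m + 56, c7 );
}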