Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_sse_h
2 : #error "Do not include this directly; use fd_sse.h"
3 : #endif /* HEADER_fd_src_util_simd_fd_sse_h (the build fails above when this guard macro is not already defined) */
4 :
5 : /* Vector uint API ****************************************************/
6 :
7 : /* A vu_t is a vector where each 32-bit wide lane holds an unsigned
8 : 32-bit integer (a "uint"). The operations below mirror the other
9 : vector APIs (e.g. vc and vf) as much as possible.
10 :
11 : Macros are preferred over static inlines when it is possible to
12 : implement an operation robustly as a macro, to reduce the risk of
13 : the compiler mucking it up. */
14 :
15 330519115 : #define vu_t __m128i /* vector uint: a 128-bit integer vector whose four 32-bit lanes this API treats as uints */
16 :
17 : /* Constructors */
18 :
19 : /* Given the uint values, return ... */
20 :
21 95128182 : #define vu(u0,u1,u2,u3) _mm_setr_epi32( (int)(u0), (int)(u1), (int)(u2), (int)(u3) ) /* [ u0 u1 u2 u3 ] (u0 lands in lane 0; each argument is cast to int for the intrinsic and evaluated once) */
22 :
23 393216 : #define vu_bcast(u0) _mm_set1_epi32( (int)(u0) ) /* [ u0 u0 u0 u0 ] (u0 is cast to int for the intrinsic and evaluated once) */
24 :
25 : static inline vu_t /* [ u0 u1 u0 u1 ] */
26 196608 : vu_bcast_pair( uint u0, uint u1 ) { /* Repeats the pair (u0,u1): lanes 0 and 2 hold u0, lanes 1 and 3 hold u1 */
27 196608 : int i0 = (int)u0; int i1 = (int)u1; /* The intrinsic takes ints; cast once up front */
28 196608 : return _mm_setr_epi32( i0, i1, i0, i1 ); /* setr: first argument goes to lane 0 */
29 196608 : }
30 :
31 : static inline vu_t /* [ u0 u0 u1 u1 ] */
32 196608 : vu_bcast_wide( uint u0, uint u1 ) { /* Duplicates each input into adjacent lanes: lanes 0 and 1 hold u0, lanes 2 and 3 hold u1 */
33 196608 : int i0 = (int)u0; int i1 = (int)u1; /* The intrinsic takes ints; cast once up front */
34 196608 : return _mm_setr_epi32( i0, i0, i1, i1 ); /* setr: first argument goes to lane 0 */
35 196608 : }
36 :
37 : /* vu_permute returns [ x(imm_i0) x(imm_i1) x(imm_i2) x(imm_i3) ].
38 : imm_i* should be compile time constants in 0:3. */
39 :
40 444660340 : #define vu_permute(x,imm_i0,imm_i1,imm_i2,imm_i3) _mm_shuffle_epi32( (x), _MM_SHUFFLE( (imm_i3), (imm_i2), (imm_i1), (imm_i0) ) ) /* _MM_SHUFFLE takes its selectors highest lane first, hence the reversed argument order */
41 :
42 : /* vu_permute2 returns [ a(imm_i0) a(imm_i1) b(imm_i2) b(imm_i3) ].
43 : imm_i* should be compile time constants in 0:3. */
44 :
45 889320680 : #define vu_permute2(a,b,imm_i0,imm_i1,imm_i2,imm_i3) ((vu_t)_mm_shuffle_ps( (vf_t)(a), (vf_t)(b), _MM_SHUFFLE( (imm_i3), (imm_i2), (imm_i1), (imm_i0) ) )) /* Done with the float shuffle (lanes 0:1 from a, lanes 2:3 from b); the vf_t / vu_t casts only reinterpret the vector type */
46 :
47 : /* Predefined constants */
48 :
49 44466034 : #define vu_zero() _mm_setzero_si128() /* Return [ 0U 0U 0U 0U ] (all 128 bits cleared) */
50 61538355 : #define vu_one() _mm_set1_epi32( 1 ) /* Return [ 1U 1U 1U 1U ] (1 broadcast to every lane) */
51 :
52 : /* Memory operations */
53 :
54 : /* vu_ld returns the 4 uints at the 16-byte aligned / 16-byte sized
55 : location p as a vector uint. vu_ldu is the same but p does not have
56 : to be aligned. vu_st writes the vector uint to the 16-byte aligned /
57 : 16-byte sized location p as 4 uints. vu_stu is the same but p does
58 : not have to be aligned. In all of these, lane l will be at p[l]. FIXME:
59 : USE ATTRIBUTES ON P PASSED TO THESE?
60 :
61 : Note: gcc knows a __m128i may alias. */
62 :
63 160538364 : static inline vu_t vu_ld( uint const * p ) { return _mm_load_si128( (__m128i const *)p ); } /* Aligned load of p[0:3]; uses the aligned-load intrinsic, so p must be 16-byte aligned */
64 193538367 : static inline void vu_st( uint * p, vu_t i ) { _mm_store_si128( (__m128i *)p, i ); } /* Aligned store of the 4 lanes of i to p[0:3]; p must be 16-byte aligned */
65 :
66 246153420 : static inline vu_t vu_ldu( void const * p ) { return _mm_loadu_si128( (__m128i const *)p ); } /* Unaligned load of the 16 bytes at p; takes void const * so any pointer type is accepted */
67 246153420 : static inline void vu_stu( void * p, vu_t i ) { _mm_storeu_si128( (__m128i *)p, i ); } /* Unaligned store of the 16 bytes of i to p; takes void * so any pointer type is accepted */
68 :
69 : /* vu_ldif is an optimized equivalent to vu_notczero(c,vu_ldu(p)) (may
70 : have different behavior if c is not a proper vector conditional). It
71 : is provided for symmetry with the vu_stif operation. vu_stif stores
72 : x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
73 : Undefined behavior if c is not a proper vector conditional. */
74 :
75 : #define vu_ldif(c,p) _mm_maskload_epi32( (p),(c)) /* Masked load: lanes where c is false come back as 0. NOTE(review): the intrinsic's pointer parameter is int const * and it is an AVX2 intrinsic that is not guarded here the way vu_gather is -- confirm callers / build flags */
76 : #define vu_stif(c,p,x) _mm_maskstore_epi32((p),(c),(x)) /* Masked store: only lanes where c is true are written. NOTE(review): the intrinsic's pointer parameter is int * (AVX2) -- confirm callers / build flags */
77 :
78 : /* Element operations */
79 :
80 : /* vu_extract extracts the uint in lane imm from the vector uint as an
81 : uint. vu_insert returns the vector uint formed by replacing the
82 : value in lane imm of a with the provided uint. imm should be a
83 : compile time constant in 0:3. vu_extract_variable and
84 : vu_insert_variable are slower but the lane n does not have to be
85 : known at compile time (n should still be in 0:3).
86 :
87 : Note: C99 TC3 allows type punning through a union. */
88 :
89 246153420 : #define vu_extract(a,imm) ((uint)_mm_extract_epi32( (a), (imm) )) /* Lane imm of a; the intrinsic yields an int, cast back to uint */
90 246153420 : #define vu_insert(a,imm,v) _mm_insert_epi32( (a), (int)(v), (imm) ) /* Copy of a with lane imm replaced by v (v is cast to int for the intrinsic) */
91 :
92 : static inline uint
93 246153420 : vu_extract_variable( vu_t a, int n ) { /* Returns lane n of a; n is a run-time value and is not range checked here (caller keeps it in 0:3) */
94 246153420 : union { __m128i m[1]; uint u[4]; } t[1]; /* Scratch that overlays the vector with 4 uints (type punning through a union) */
95 246153420 : _mm_store_si128( t->m, a ); /* Spill the vector to the scratch */
96 246153420 : return t->u[n]; /* Read the requested lane back as a scalar */
97 246153420 : }
98 :
99 : static inline vu_t
100 246153420 : vu_insert_variable( vu_t a, int n, uint v ) { /* Returns a with lane n replaced by v; n is a run-time value and is not range checked here (caller keeps it in 0:3) */
101 246153420 : union { __m128i m[1]; uint u[4]; } t[1]; /* Scratch that overlays the vector with 4 uints (type punning through a union) */
102 246153420 : _mm_store_si128( t->m, a ); /* Spill the vector to the scratch */
103 246153420 : t->u[n] = v; /* Overwrite the requested lane */
104 246153420 : return _mm_load_si128( t->m ); /* Reload the modified vector */
105 246153420 : }
106 :
107 : /* Given [a0 a1 a2 a3] and/or [b0 b1 b2 b3], return ... */
108 :
109 : /* Arithmetic operations */
110 :
111 : #define vu_neg(a) _mm_sub_epi32( _mm_setzero_si128(), (a) ) /* [ -a0 -a1 ... -a3 ] (twos complement handling; computed as 0-a with wrapping) */
112 : #define vu_abs(a) (a) /* [ |a0| |a1| ... |a3| ] (twos complement handling; identity, as the lanes are treated as unsigned) */
113 :
114 : #define vu_min(a,b) _mm_min_epu32( (a), (b) ) /* [ min(a0,b0) min(a1,b1) ... min(a3,b3) ] (unsigned compare) */
115 : #define vu_max(a,b) _mm_max_epu32( (a), (b) ) /* [ max(a0,b0) max(a1,b1) ... max(a3,b3) ] (unsigned compare) */
116 2772524540 : #define vu_add(a,b) _mm_add_epi32( (a), (b) ) /* [ a0 +b0 a1 +b1 ... a3 +b3 ] (wraps mod 2^32) */
117 : #define vu_sub(a,b) _mm_sub_epi32( (a), (b) ) /* [ a0 -b0 a1 -b1 ... a3 -b3 ] (wraps mod 2^32) */
118 : #define vu_mul(a,b) _mm_mullo_epi32( (a), (b) ) /* [ a0 *b0 a1 *b1 ... a3 *b3 ] (low 32 bits of each product) */
119 :
120 : /* Binary operations */
121 :
122 : /* Note: vu_shl/vu_shr is a left/unsigned (logical) right shift by
123 : imm bits; imm should be a compile time constant in 0:31. The
124 : variable variants are slower but do not require the shift amount
125 : to be known at compile time (should still be in 0:31). */
126 :
127 : #define vu_not(a) _mm_xor_si128( _mm_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a3 ] (xor with all ones flips every bit) */
128 :
129 : #define vu_shl(a,imm) _mm_slli_epi32( (a), (imm) ) /* [ a0<<imm a1<<imm ... a3<<imm ] (zeros shifted in) */
130 : #define vu_shr(a,imm) _mm_srli_epi32( (a), (imm) ) /* [ a0>>imm a1>>imm ... a3>>imm ] (logical shift: zeros shifted in) */
131 :
132 : #define vu_shl_variable(a,n) _mm_sll_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ) /* [ a0<<n a1<<n ... a3<<n ]; the intrinsic reads the count from the low 64 bits of a vector, so n is placed there */
133 : #define vu_shr_variable(a,n) _mm_srl_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ) /* [ a0>>n a1>>n ... a3>>n ] (logical); count passed in the low 64 bits of a vector as above */
134 :
135 : #define vu_shl_vector(a,b) _mm_sllv_epi32( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a3<<b3 ] (per-lane counts). NOTE(review): AVX2 intrinsic, not guarded here the way vu_gather is -- confirm build flags */
136 : #define vu_shr_vector(a,b) _mm_srlv_epi32( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a3>>b3 ] (per-lane counts, logical). NOTE(review): AVX2 intrinsic -- confirm build flags */
137 :
138 : #define vu_and(a,b) _mm_and_si128( (a), (b) ) /* [ a0 &b0 a1 &b1 ... a3 &b3 ] */
139 : #define vu_andnot(a,b) _mm_andnot_si128( (a), (b) ) /* [ (~a0)&b0 (~a1)&b1 ... (~a3)&b3 ] (note: the FIRST operand is the one complemented) */
140 901626960 : #define vu_or(a,b) _mm_or_si128( (a), (b) ) /* [ a0 |b0 a1 |b1 ... a3 |b3 ] */
141 3973981260 : #define vu_xor(a,b) _mm_xor_si128( (a), (b) ) /* [ a0 ^b0 a1 ^b1 ... a3 ^b3 ] */
142 :
143 : /* vu_rol(x,n) returns vu( rotate_left (x0,n), rotate_left (x1,n), ... )
144 : vu_ror(x,n) returns vu( rotate_right(x0,n), rotate_right(x1,n), ... ) */
145 :
146 : #if FD_HAS_AVX512 /* Use the native rotate intrinsics when the build advertises AVX-512 */
147 440131112 : #define vu_rol(a,imm) _mm_rol_epi32( (a), (imm) )
148 : #define vu_ror(a,imm) _mm_ror_epi32( (a), (imm) )
149 : #else /* Otherwise build the rotate from two shifts and an or */
150 884456528 : static inline vu_t vu_rol( vu_t a, int imm ) { return vu_or( vu_shl( a, imm & 31 ), vu_shr( a, (-imm) & 31 ) ); } /* (a<<(imm&31)) | (a>>((-imm)&31)); masking keeps both shift counts in 0:31 */
151 4194304 : static inline vu_t vu_ror( vu_t a, int imm ) { return vu_or( vu_shr( a, imm & 31 ), vu_shl( a, (-imm) & 31 ) ); } /* (a>>(imm&31)) | (a<<((-imm)&31)); masking keeps both shift counts in 0:31 */
152 : #endif /* FD_HAS_AVX512 */
153 :
154 6291456 : static inline vu_t vu_rol_variable( vu_t a, int n ) { return vu_or( vu_shl_variable( a, n&31 ), vu_shr_variable( a, (-n)&31 ) ); } /* Rotate every lane left by a run-time n; n is reduced mod 32 by the masks */
155 6291456 : static inline vu_t vu_ror_variable( vu_t a, int n ) { return vu_or( vu_shr_variable( a, n&31 ), vu_shl_variable( a, (-n)&31 ) ); } /* Rotate every lane right by a run-time n; n is reduced mod 32 by the masks */
156 :
157 0 : static inline vu_t vu_rol_vector( vu_t a, vi_t b ) { /* Rotate lane l of a left by b(l) bits (per-lane rotate counts) */
158 0 : vi_t m = vi_bcast( 31 ); /* Mask that reduces each count to 0:31 */
159 0 : return vu_or( vu_shl_vector( a, vi_and( b, m ) ), vu_shr_vector( a, vi_and( vi_neg( b ), m ) ) ); /* (a<<(b&31)) | (a>>((-b)&31)) per lane */
160 0 : }
161 :
162 0 : static inline vu_t vu_ror_vector( vu_t a, vi_t b ) { /* Rotate lane l of a right by b(l) bits (per-lane rotate counts) */
163 0 : vi_t m = vi_bcast( 31 ); /* Mask that reduces each count to 0:31 */
164 0 : return vu_or( vu_shr_vector( a, vi_and( b, m ) ), vu_shl_vector( a, vi_and( vi_neg( b ), m ) ) ); /* (a>>(b&31)) | (a<<((-b)&31)) per lane */
165 0 : }
166 :
167 393216 : static inline vu_t vu_bswap( vu_t a ) { /* Reverses the byte order within each 32-bit lane */
168 393216 : vu_t m = vu_bcast( 0x00FF00FFU ); /* Probably hoisted; selects the low byte of each 16-bit half */
169 393216 : vu_t t = vu_rol( a, 16 ); /* Swap E/O 16-bit pairs */
170 393216 : return vu_or( vu_andnot( m, vu_shl( t, 8 ) ), vu_and( m, vu_shr( t, 8 ) ) ); /* Swap E/O 8-bit pairs: low bytes moved up are kept by ~m, high bytes moved down are kept by m */
171 393216 : }
172 :
173 : /* Logical operations */
174 :
175 : /* As noted below in the vu_to_{vf,vd} converters, Intel clearly has
176 : the hardware to do a _mm_cmpgt_epu32 given that _mm_cmpgt_epi32
177 : exists but doesn't expose it in the ISA pre AVX-512. Sigh ... twos
178 : complement bit tricks to the rescue for vu_{gt,lt,ge,le}. */
179 :
180 : #define vu_lnot(a) _mm_cmpeq_epi32( (a), _mm_setzero_si128() ) /* [ !a0 !a1 ... !a3 ] (lane is all ones where a is 0, all zeros otherwise) */
181 : #define vu_lnotnot(a) /* [ !!a0 !!a1 ... !!a3 ] */ \
182 : _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( (a), _mm_setzero_si128() ) ) /* bitwise complement of the a==0 mask */
183 :
184 : #define vu_eq(a,b) _mm_cmpeq_epi32( (a), (b) ) /* [ a0==b0 a1==b1 ... a3==b3 ] */
185 : #define vu_gt(a,b) /* [ a0> b0 a1> b1 ... a3> b3 ] */ \
186 : _mm_cmpgt_epi32( _mm_sub_epi32( (a), _mm_set1_epi32( (int)(1U<<31) ) ), /* bias both operands by 2^31 so the signed compare orders them as uints */ \
187 : _mm_sub_epi32( (b), _mm_set1_epi32( (int)(1U<<31) ) ) )
188 : #define vu_lt(a,b) vu_gt( (b), (a) ) /* [ a0< b0 a1< b1 ... a3< b3 ] */
189 : #define vu_ne(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a3!=b3 ] (complement of vu_eq) */
190 : #define vu_ge(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), vu_gt( (b), (a) ) ) /* [ a0>=b0 a1>=b1 ... a3>=b3 ] (complement of b>a) */
191 : #define vu_le(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), vu_gt( (a), (b) ) ) /* [ a0<=b0 a1<=b1 ... a3<=b3 ] (complement of a>b) */
192 :
193 : /* Conditional operations */
194 :
195 : #define vu_czero(c,f) _mm_andnot_si128( (c), (f) ) /* [ c0?0U:f0 c1?0U:f1 ... c3?0U:f3 ] (computed as (~c)&f) */
196 : #define vu_notczero(c,f) _mm_and_si128( (c), (f) ) /* [ c0?f0:0U c1?f1:0U ... c3?f3:0U ] (computed as c&f) */
197 :
198 524288 : #define vu_if(c,t,f) _mm_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c3?t3:f3 ] (byte-wise blend keyed on the top bit of each byte of c) */
199 :
200 : /* Conversion operations */
201 :
202 : /* Summarizing:
203 :
204 : vu_to_vc(a) returns [ !!a0 !!a1 ... !!a3 ]
205 :
206 : vu_to_vf(a) returns [ (float)a0 (float)a1 ... (float)a3 ]
207 :
208 : vu_to_vi(a) returns [ (int)a0 (int)a1 ... (int)a3 ]
209 :
210 : vu_to_vd(a,imm_i0,imm_i1) returns [ (double)a(imm_i0) (double)a(imm_i1) ]
211 :
212 : vu_to_vl(a,imm_i0,imm_i1) returns [ (long)a(imm_i0) (long)a(imm_i1) ]
213 :
214 : vu_to_vv(a,imm_i0,imm_i1) returns [ (ulong)a(imm_i0) (ulong)a(imm_i1) ]
215 :
216 : where imm_i* should be a compile time constant in 0:3.
217 :
218 : The raw variants just treat the raw bits as the corresponding vector
219 : type. For vu_to_vc_raw, the user promises vu contains a proper
220 : vector conditional (i.e. 0 or -1 in each lane). vu_to_vf_raw is
221 : useful for doing advanced bit tricks on floating point values. The
222 : others are probably dubious but are provided for completeness. */
223 :
224 : #define vu_to_vc(a) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( (a), _mm_setzero_si128() ) ) /* [ !!a0 !!a1 ... !!a3 ]: complement of the a==0 mask (same expansion as vu_lnotnot) */
225 : #define vu_to_vi(a) (a) /* Same bits, lanes reinterpreted as ints */
226 :
227 3538944 : static inline __m128d vu_to_vd_core( vu_t u ) { /* Converts lanes 0 and 1 of u (as uints) to [ (double)u0 (double)u1 ]. FIXME: workaround vd_t isn't declared at this point */
228 :
229 : /* Note: Given that _mm_cvtepi32_pd exists, Intel clearly has the
230 : hardware under the hood to support a _mm_cvtepu32_pd but didn't
231 : bother to expose it pre AVX-512 ... sigh (all too typical
232 : unfortunately). We can do a mix of twos complement and floating
233 : point hacks to emulate it without spilling. */
234 :
235 3538944 : __m128i c = _mm_cmpgt_epi32( _mm_setzero_si128(), u ); // 0 if u<2^31, -1 o.w (signed 0>u test picks lanes with the top bit set)
236 3538944 : __m128d d = _mm_cvtepi32_pd( u ); // u if u<2^31, u-2^32 o.w, exact (signed conversion of lanes 0:1)
237 3538944 : __m128d ds = _mm_add_pd( d, _mm_set1_pd( (double)(1UL<<32) ) ); // u+2^32 if u<2^31, u o.w, exact
238 3538944 : __m128i cl = _mm_cvtepi32_epi64( c ); // 0L if u<2^31, -1L o.w (mask widened to the 64-bit double lanes)
239 3538944 : return _mm_blendv_pd( d, ds, _mm_castsi128_pd( cl ) ); // u (d where u<2^31, the corrected ds otherwise)
240 :
241 3538944 : }
242 :
243 : #define vu_to_vd(a,imm_i0,imm_i1) vu_to_vd_core( _mm_shuffle_epi32( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) ) /* Move lanes imm_i0 / imm_i1 of a into lanes 0 / 1, then convert those two lanes to double */
244 :
245 196608 : static inline vf_t vu_to_vf( vu_t u ) { /* Returns [ (float)u0 (float)u1 (float)u2 (float)u3 ] */
246 :
247 : /* See note above re ISA dubiousness. Note that we can't do the same
248 : trick as vu_to_vd due to single precision roundoff limitations (the
249 : _mm_cvtepi32_pd equivalent would not be exact such that add to
250 : correct the twos complement mangling would add a possible second
251 : roundoff error ... this would result in slightly different values
252 : occasionally when u is >~ 2^31). We instead convert the two
253 : halves to double (exact), convert the double to float (single
254 : roundoff error) and then concat the two float halves to make a
255 : correctly rounded implementation. */
256 :
257 196608 : return _mm_shuffle_ps( _mm_cvtpd_ps( vu_to_vd_core(u) ), _mm_cvtpd_ps( vu_to_vd(u,2,3) ), _MM_SHUFFLE(1,0,1,0) ); /* lanes 0:1 come from the u0,u1 conversion, lanes 2:3 from the u2,u3 conversion */
258 196608 : }
259 :
260 : #define vu_to_vl(a,imm_i0,imm_i1) _mm_cvtepu32_epi64( _mm_shuffle_epi32( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) ) /* Move lanes imm_i0 / imm_i1 into lanes 0 / 1, then zero extend each to 64 bits */
261 : #define vu_to_vv(a,imm_i0,imm_i1) _mm_cvtepu32_epi64( _mm_shuffle_epi32( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) ) /* Same expansion as vu_to_vl: zero extension gives the same bits for the long and ulong results */
262 :
263 : #define vu_to_vc_raw(a) (a) /* No-op: the bits are used as is */
264 : #define vu_to_vf_raw(a) _mm_castsi128_ps( (a) ) /* Bit cast to the float vector type; no value conversion */
265 : #define vu_to_vi_raw(a) (a) /* No-op: the bits are used as is */
266 : #define vu_to_vd_raw(a) _mm_castsi128_pd( (a) ) /* Bit cast to the double vector type; no value conversion */
267 : #define vu_to_vl_raw(a) (a) /* No-op: the bits are used as is */
268 : #define vu_to_vv_raw(a) (a) /* No-op: the bits are used as is */
269 :
270 : /* Reduction operations */
271 :
272 : static inline vu_t
273 196608 : vu_sum_all( vu_t x ) { /* Returns vu_bcast( sum( x ) ) */
274 196608 : x = _mm_hadd_epi32( x, x ); /* x01 x23 ... (adjacent pairs summed; pattern repeats in lanes 2:3) */
275 196608 : return _mm_hadd_epi32( x, x ); /* xsum ... (x01+x23 in every lane) */
276 196608 : }
277 :
278 : static inline vu_t
279 196608 : vu_min_all( vu_t x ) { /* Returns vu_bcast( min( x ) ) (unsigned min) */
280 196608 : __m128i y; /* Permuted copy of x used as the second min operand */
281 196608 : y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x2 x3 x0 x1 */
282 196608 : x = _mm_min_epu32( x, y ); /* x02 x13 ... */
283 196608 : y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x13 x02 ... */
284 196608 : x = _mm_min_epu32( x, y ); /* xmin ... (every lane now holds the overall min) */
285 196608 : return x;
286 196608 : }
287 :
288 : static inline vu_t
289 196608 : vu_max_all( vu_t x ) { /* Returns vu_bcast( max( x ) ) (unsigned max) */
290 196608 : __m128i y; /* Permuted copy of x used as the second max operand */
291 196608 : y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x2 x3 x0 x1 */
292 196608 : x = _mm_max_epu32( x, y ); /* x02 x13 ... */
293 196608 : y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x13 x02 ... */
294 196608 : x = _mm_max_epu32( x, y ); /* xmax ... (every lane now holds the overall max) */
295 196608 : return x;
296 196608 : }
297 :
298 : /* Misc operations */
299 :
300 : /* vu_gather(b,i) returns [ b[i(0)] b[i(1)] ... b[i(3)] ] where b is a
301 : "uint const *" and i is a vi_t. We use a static inline here instead
302 : of a define to keep strict type checking while working around yet
303 : another Intel intrinsic type mismatch issue. */
304 :
305 : #if defined(__AVX2__) /* The gather intrinsic used below is only compiled when AVX2 is enabled */
306 61538355 : static inline vu_t vu_gather( uint const * b, vi_t i ) { /* Returns [ b[i(0)] b[i(1)] b[i(2)] b[i(3)] ] */
307 61538355 : return _mm_i32gather_epi32( (int const *)b, (i), 4 ); /* The intrinsic wants an int pointer, hence the cast; scale 4 turns each index into a byte offset of 4*i(l) */
308 61538355 : }
309 : #endif /* defined(__AVX2__) */
310 :
311 : /* vu_transpose_4x4 transposes the 4x4 matrix stored in vu_t r0,r1,r2,r3
312 : and stores the result in 4x4 matrix vu_t c0,c1,c2,c3. All
313 : c0,c1,c2,c3 should be different for a well defined result.
314 : Otherwise, in-place operation and/or using the same vu_t to specify
315 : multiple rows of r is fine. */
316 :
317 196608 : #define vu_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do { \
318 196608 : vu_t _vu_transpose_r0 = (r0); vu_t _vu_transpose_r1 = (r1); vu_t _vu_transpose_r2 = (r2); vu_t _vu_transpose_r3 = (r3); /* Copy the rows into locals first: each row argument is evaluated once, before any output is written */ \
319 196608 : vu_t _vu_transpose_t; \
320 196608 : /* Transpose 2x2 blocks */ \
321 196608 : _vu_transpose_t = _vu_transpose_r0; _vu_transpose_r0 = _mm_unpacklo_epi32( _vu_transpose_t, _vu_transpose_r2 ); /* [ r0(0) r2(0) r0(1) r2(1) ] */ \
322 196608 : /**/ _vu_transpose_r2 = _mm_unpackhi_epi32( _vu_transpose_t, _vu_transpose_r2 ); /* [ r0(2) r2(2) r0(3) r2(3) ] */ \
323 196608 : _vu_transpose_t = _vu_transpose_r1; _vu_transpose_r1 = _mm_unpacklo_epi32( _vu_transpose_t, _vu_transpose_r3 ); /* [ r1(0) r3(0) r1(1) r3(1) ] */ \
324 196608 : /**/ _vu_transpose_r3 = _mm_unpackhi_epi32( _vu_transpose_t, _vu_transpose_r3 ); /* [ r1(2) r3(2) r1(3) r3(3) ] */ \
325 196608 : /* Transpose 1x1 blocks */ \
326 196608 : /**/ (c0) = _mm_unpacklo_epi32( _vu_transpose_r0, _vu_transpose_r1 ); /* [ r0(0) r1(0) r2(0) r3(0) ] */ \
327 196608 : /**/ (c1) = _mm_unpackhi_epi32( _vu_transpose_r0, _vu_transpose_r1 ); /* [ r0(1) r1(1) r2(1) r3(1) ] */ \
328 196608 : /**/ (c2) = _mm_unpacklo_epi32( _vu_transpose_r2, _vu_transpose_r3 ); /* [ r0(2) r1(2) r2(2) r3(2) ] */ \
329 196608 : /**/ (c3) = _mm_unpackhi_epi32( _vu_transpose_r2, _vu_transpose_r3 ); /* [ r0(3) r1(3) r2(3) r3(3) ] */ \
330 196608 : } while(0)
|