Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_sse_h
2 : #error "Do not include this directly; use fd_sse.h"
3 : #endif
4 :
5 : /* Vector long API ****************************************************/
6 :
7 : /* A vl_t is a vector where each adjacent pair of 32-bit wide lanes
8 : (e.g. 0-1 / 2-3) holds a signed 64-bit twos-complement integer (a
9 : "long").
10 :
11 : These mirror the other APIs as much as possible. Macros are
12 : preferred over static inlines when it is possible to do it robustly
13 : to reduce the risk of the compiler mucking it up. */
14 :
15 142737459 : #define vl_t __m128i
16 :
17 : /* Constructors */
18 :
19 : /* Given the long values, return ... */
20 :
21 589824 : #define vl(l0,l1) _mm_set_epi64x( (l1), (l0) ) /* [ l0 l1 ] ... sigh ... backwards intel */
22 :
23 : #define vl_bcast(l0) _mm_set1_epi64x( (l0) ) /* [ l0 l0 ] */
24 :
25 : /* vl_permute returns [ l(imm_i0) l(imm_i1) ]. imm_i* should be compile
26 : time constants in 0:1. */
27 :
28 262144 : #define vl_permute( v, imm_i0, imm_i1 ) _mm_castpd_si128( _mm_permute_pd( _mm_castsi128_pd( (v) ), (imm_i0) + 2*(imm_i1) ) )
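
/* A minimal usage sketch of the constructors and vl_permute above. The
   example_vl_swap name is hypothetical and not part of the vl API;
   assumes fd_sse.h is included on a target this header already
   supports. */

static inline vl_t
example_vl_swap( void ) {
  vl_t a = vl( 10L, 20L );      /* [ 10 20 ] */
  vl_t b = vl_bcast( 7L );      /* [  7  7 ] */
  (void)b;
  return vl_permute( a, 1, 0 ); /* [ 20 10 ] */
}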
29 :
30 : /* Predefined constants */
31 :
32 : #define vl_zero() _mm_setzero_si128() /* Return [ 0L 0L ] */
33 141361203 : #define vl_one() _mm_set1_epi64x( 1L ) /* Return [ 1L 1L ] */
34 :
35 : /* Memory operations */
36 :
37 : /* vl_ld returns the 2 longs at the 16-byte aligned / 16-byte sized
38 : location p as a vector long. vl_ldu is the same but p does not have
39 : to be aligned. vl_st writes the vector long to the 16-byte aligned /
40 : 16-byte sized location p as 2 longs. vl_stu is the same but p does
41 : not have to be aligned. In all these, 64-bit lane l will be at p[l].
42 : FIXME: USE ATTRIBUTES ON P PASSED TO THESE?
43 :
44 : Note: gcc knows a __m128i may alias. */
45 :
46 141361203 : static inline vl_t vl_ld( long const * p ) { return _mm_load_si128( (__m128i const *)p ); }
47 141361203 : static inline void vl_st( long * p, vl_t i ) { _mm_store_si128( (__m128i *)p, i ); }
48 :
49 282722406 : static inline vl_t vl_ldu( void const * p ) { return _mm_loadu_si128( (__m128i const *)p ); }
50 282722406 : static inline void vl_stu( void * p, vl_t i ) { _mm_storeu_si128( (__m128i *)p, i ); }
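
/* A minimal usage sketch: swap the two longs stored at a 16-byte
   aligned location. The example_vl_swap_in_memory name is hypothetical
   and not part of the vl API. */

static inline void
example_vl_swap_in_memory( long * p ) { /* p 16-byte aligned, holds p[0:1] */
  vl_t x = vl_ld( p );               /* x = [ p[0] p[1] ] */
  vl_st( p, vl_permute( x, 1, 0 ) ); /* p[0] and p[1] exchanged */
}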
51 :
52 : /* vl_ldif is an optimized equivalent to vl_notczero(c,vl_ldu(p)) (may
53 : have different behavior if c is not a proper vector conditional). It
54 : is provided for symmetry with the vl_stif operation. vl_stif stores
55 : x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
56 : Undefined behavior if c is not a proper vector conditional. */
57 :
58 : #define vl_ldif(c,p) _mm_maskload_epi64( (p),(c))
59 : #define vl_stif(c,p,x) _mm_maskstore_epi64((p),(c),(x))
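
/* A minimal usage sketch: store lane 0 of x to p[0] and leave p[1]
   untouched. The example_vl_store_lane0 name is hypothetical and not
   part of the vl API; the conditional is built by hand here and the
   cast matches the pointer type the underlying maskstore intrinsic
   expects. */

static inline void
example_vl_store_lane0( long * p, vl_t x ) {
  vl_t c = vl( -1L, 0L );          /* lane 0 true, lane 1 false */
  vl_stif( c, (long long *)p, x ); /* p[0] = x0; p[1] unchanged */
}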
60 :
61 : /* Element operations */
62 :
63 : /* vl_extract extracts the long in lane imm from the vector long as a
64 : long. vl_insert returns the vector long formed by replacing the
65 : value in lane imm of a with the provided long. imm should be a
66 : compile time known in 0:1. vl_extract_variable and
67 : vl_insert_variable are slower but the lane n does not have to be
68 : known at compile time (should still be in 0:1).
69 :
70 : Note: C99 TC3 allows type punning through a union. */
71 :
72 282722406 : #define vl_extract(a,imm) _mm_extract_epi64( (a), (imm) )
73 :
74 282722406 : #define vl_insert(a,imm,v) _mm_insert_epi64( (a), (v), (imm) )
75 :
76 : static inline long
77 282722406 : vl_extract_variable( vl_t a, int n ) {
78 282722406 : union { __m128i m[1]; long l[2]; } t[1];
79 282722406 : _mm_store_si128( t->m, a );
80 282722406 : return t->l[n];
81 282722406 : }
82 :
83 : static inline vl_t
84 282722406 : vl_insert_variable( vl_t a, int n, long v ) {
85 282722406 : union { __m128i m[1]; long l[2]; } t[1];
86 282722406 : _mm_store_si128( t->m, a );
87 282722406 : t->l[n] = v;
88 282722406 : return _mm_load_si128( t->m );
89 282722406 : }
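
/* A minimal usage sketch: pull both lanes out as scalar longs. The
   example_vl_lane_sum name is hypothetical and not part of the vl
   API. */

static inline long
example_vl_lane_sum( vl_t a ) {
  return vl_extract( a, 0 ) + vl_extract( a, 1 ); /* a0 + a1 (may wrap) */
}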
90 :
91 : /* Given [a0 a1] and/or [b0 b1], return ... */
92 :
93 : /* Arithmetic operations */
94 :
95 : #define vl_neg(a) _mm_sub_epi64( _mm_setzero_si128(), (a) ) /* [ -a0 -a1 ] (twos complement handling) */
96 :
97 : /* Note: _mm_{abs,min,max}_epi64 are missing pre AVX-512. We emulate
98 : these below (and use the AVX-512 versions if possible). Likewise,
99 : there is no _mm_mullo_epi64 pre AVX-512. Since this is not cheap to
100 : emulate, we do not provide a vl_mul for the time being (we could
101 : consider exposing it on AVX-512 targets though). There is, however,
102 : a 64L*64L->64L multiply (where the lower 32 bits of each lane are
103 : sign extended to 64 bits beforehand) and that is very useful. So we do
104 : provide that. */
105 :
106 196608 : #define vl_add(a,b) _mm_add_epi64( (a), (b) ) /* [ a0 +b0 a1 +b1 ] */
107 : #define vl_sub(a,b) _mm_sub_epi64( (a), (b) ) /* [ a0 -b0 a1 -b1 ] */
108 : //#define vl_mul(a,b) _mm_mullo_epi64( (a), (b) ) /* [ a0 *b0 a1 *b1 ] */
109 : #define vl_mul_ll(a,b) _mm_mul_epi32( (a), (b) ) /* [ a0l*b0l a1l*b1l ] */
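
/* A minimal usage sketch: multiply-accumulate using the sign extended
   low 32 bits of each lane. The example_vl_madd_ll name is hypothetical
   and not part of the vl API. */

static inline vl_t
example_vl_madd_ll( vl_t a, vl_t b, vl_t c ) {
  return vl_add( vl_mul_ll( a, b ), c ); /* [ a0l*b0l+c0 a1l*b1l+c1 ] */
}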
110 :
111 : /* Binary operations */
112 :
113 : /* Note: vl_shl/vl_shr/vl_shru is a left/signed right/unsigned right
114 : shift by imm bits; imm should be a compile time constant in 0:63.
115 : The variable variants are slower but do not require the shift amount
116 : to be known at compile time (should still be in 0:63). Also, AVX is
117 : missing _mm_sra*_epi64 intrinsics. We emulate these below. */
118 :
119 : #define vl_not(a) _mm_xor_si128( _mm_set1_epi64x( -1L ), (a) ) /* [ ~a0 ~a1 ] */
120 :
121 : #define vl_shl(a,imm) _mm_slli_epi64( (a), (imm) ) /* [ a0<<imm a1<<imm ] */
122 : //#define vl_shr(a,imm) _mm_srai_epi64( (a), (imm) ) /* [ a0>>imm a1>>imm ] (treat a as signed)*/
123 : #define vl_shru(a,imm) _mm_srli_epi64( (a), (imm) ) /* [ a0>>imm a1>>imm ] (treat a as unsigned) */
124 :
125 : #define vl_shl_variable(a,n) _mm_sll_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
126 : //#define vl_shr_variable(a,n) _mm_sra_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
127 : #define vl_shru_variable(a,n) _mm_srl_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
128 :
129 : #define vl_shl_vector(a,b) _mm_sllv_epi64( (a), (b) ) /* [ a0<<b0 a1<<b1 ] */
130 : //#define vl_shr_vector(a,b) _mm_srav_epi64( (a), (b) ) /* [ a0>>b0 a1>>b1 ] (treat a as signed) */
131 : #define vl_shru_vector(a,b) _mm_srlv_epi64( (a), (b) ) /* [ a0>>b0 a1>>b1 ] (treat a as unsigned) */
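
/* A minimal usage sketch: extract the high 32 bits of each lane with an
   unsigned right shift (the sign extending vl_shr is provided further
   below). The example_vl_high_halves name is hypothetical and not part
   of the vl API. */

static inline vl_t
example_vl_high_halves( vl_t a ) {
  return vl_shru( a, 32 ); /* [ (ulong)a0>>32 (ulong)a1>>32 ] zero filled */
}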
132 :
133 : #define vl_and(a,b) _mm_and_si128( (a), (b) ) /* [ a0 &b0 a1 &b1 ] */
134 : #define vl_andnot(a,b) _mm_andnot_si128( (a), (b) ) /* [ (~a0)&b0 (~a1)&b1 ] */
135 41943040 : #define vl_or(a,b) _mm_or_si128( (a), (b) ) /* [ a0 |b0 a1 |b1 ] */
136 : #define vl_xor(a,b) _mm_xor_si128( (a), (b) ) /* [ a0 ^b0 a1 ^b1 ] */
137 :
138 : /* vl_rol(x,n) returns vl( rotate_left (x0,n), rotate_left (x1,n) )
139 : vl_ror(x,n) returns vl( rotate_right(x0,n), rotate_right(x1,n) ) */
140 :
141 : #if FD_HAS_AVX512
142 : #define vl_rol(a,imm) _mm_rol_epi64( (a), (imm) )
143 : #define vl_ror(a,imm) _mm_ror_epi64( (a), (imm) )
144 : #else
145 8388608 : static inline vl_t vl_rol( vl_t a, int imm ) { return vl_or( vl_shl( a, imm & 63 ), vl_shru( a, (-imm) & 63 ) ); }
146 8388608 : static inline vl_t vl_ror( vl_t a, int imm ) { return vl_or( vl_shru( a, imm & 63 ), vl_shl( a, (-imm) & 63 ) ); }
147 : #endif
148 :
149 12582912 : static inline vl_t vl_rol_variable( vl_t a, int n ) { return vl_or( vl_shl_variable( a, n&63 ), vl_shru_variable( a, (-n)&63 ) ); }
150 12582912 : static inline vl_t vl_ror_variable( vl_t a, int n ) { return vl_or( vl_shru_variable( a, n&63 ), vl_shl_variable( a, (-n)&63 ) ); }
151 :
152 0 : static inline vl_t vl_rol_vector( vl_t a, vl_t b ) {
153 0 : vl_t m = vl_bcast( 63L );
154 0 : return vl_or( vl_shl_vector( a, vl_and( b, m ) ), vl_shru_vector( a, vl_and( vl_neg( b ), m ) ) );
155 0 : }
156 :
157 0 : static inline vl_t vl_ror_vector( vl_t a, vl_t b ) {
158 0 : vl_t m = vl_bcast( 63L );
159 0 : return vl_or( vl_shru_vector( a, vl_and( b, m ) ), vl_shl_vector( a, vl_and( vl_neg( b ), m ) ) );
160 0 : }
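
/* A minimal usage sketch: a fixed left rotate of the kind used in hash
   round functions. The example_vl_rotl17 name is hypothetical and not
   part of the vl API. */

static inline vl_t
example_vl_rotl17( vl_t a ) {
  return vl_rol( a, 17 ); /* [ rotate_left(a0,17) rotate_left(a1,17) ] */
}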
161 :
162 : /* Logical operations */
163 :
164 : #define vl_lnot(a) _mm_cmpeq_epi64( (a), _mm_setzero_si128() ) /* [ !a0 !a1 ] */
165 : #define vl_lnotnot(a) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpeq_epi64( (a), _mm_setzero_si128() ) ) /* [ !!a0 !!a1 ] */
166 :
167 : #define vl_eq(a,b) _mm_cmpeq_epi64( (a), (b) ) /* [ a0==b0 a1==b1 ] */
168 : #define vl_gt(a,b) _mm_cmpgt_epi64( (a), (b) ) /* [ a0> b0 a1> b1 ] */
169 25165824 : #define vl_lt(a,b) _mm_cmpgt_epi64( (b), (a) ) /* [ a0< b0 a1< b1 ] */
170 : #define vl_ne(a,b) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpeq_epi64( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ] */
171 : #define vl_ge(a,b) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpgt_epi64( (b), (a) ) ) /* [ a0>=b0 a1>=b1 ] */
172 : #define vl_le(a,b) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpgt_epi64( (a), (b) ) ) /* [ a0<=b0 a1<=b1 ] */
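
/* A minimal usage sketch: comparisons yield proper vector conditionals
   (each 64-bit lane is all ones or all zeros). The
   example_vl_is_negative name is hypothetical and not part of the vl
   API. */

static inline vl_t
example_vl_is_negative( vl_t a ) {
  return vl_lt( a, vl_zero() ); /* lane l is -1L if al<0L, 0L otherwise */
}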
173 :
174 : /* Conditional operations */
175 :
176 : #define vl_czero(c,f) _mm_andnot_si128( (c), (f) ) /* [ c0?0L:f0 c1?0L:f1 ] */
177 : #define vl_notczero(c,f) _mm_and_si128( (c), (f) ) /* [ c0?f0:0L c1?f1:0L ] */
178 :
179 655360 : #define vl_if(c,t,f) _mm_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ] */
180 :
181 : #if defined(__AVX512F__) && defined(__AVX512VL__) /* See note above */
182 : #define vl_abs(a) _mm_abs_epi64( (a) )
183 65536 : #define vl_min(a,b) _mm_min_epi64( (a), (b) )
184 65536 : #define vl_max(a,b) _mm_max_epi64( (a), (b) )
185 : #else
186 131072 : static inline vl_t vl_abs( vl_t a ) { return vl_if( vl_lt( a, vl_zero() ), vl_neg( a ), a ); }
187 262144 : static inline vl_t vl_min( vl_t a, vl_t b ) { return vl_if( vl_lt( a, b ), a, b ); }
188 262144 : static inline vl_t vl_max( vl_t a, vl_t b ) { return vl_if( vl_gt( a, b ), a, b ); }
189 : #endif
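
/* A minimal usage sketch: lane-wise clamp built from the min / max
   above (works on both the AVX-512 and emulated paths). The
   example_vl_clamp name is hypothetical and not part of the vl API. */

static inline vl_t
example_vl_clamp( vl_t x, vl_t lo, vl_t hi ) {
  return vl_min( vl_max( x, lo ), hi ); /* [ clamp(x0) clamp(x1) ] */
}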
190 :
191 12582912 : static inline vl_t vl_shr( vl_t a, int imm ) {
192 12582912 : vc_t c = vl_lt( a, vl_zero() ); /* Note that vc_t is binary compat with vl_t */
193 12582912 : return _mm_xor_si128( _mm_srli_epi64( _mm_xor_si128( a, c ), imm ), c );
194 12582912 : }
195 12582912 : static inline vl_t vl_shr_variable( vl_t a, int n ) {
196 12582912 : vc_t c = vl_lt( a, vl_zero() ); /* Note that vc_t is binary compat with vl_t */
197 12582912 : return _mm_xor_si128( _mm_srl_epi64( _mm_xor_si128( a, c ), _mm_insert_epi64( _mm_setzero_si128(), n, 0 ) ), c );
198 12582912 : }
199 0 : static inline vl_t vl_shr_vector( vl_t a, vl_t n ) {
200 0 : vc_t c = vl_lt( a, vl_zero() ); /* Note that vc_t is binary compat with vl_t */
201 0 : return _mm_xor_si128( _mm_srlv_epi64( _mm_xor_si128( a, c ), n ), c );
202 0 : }
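
/* The emulation above works because xoring a lane with its sign mask c
   (-1L for negative lanes, 0L otherwise) before and after a logical
   right shift makes the shift fill with copies of the sign bit,
   reproducing the arithmetic shift. A minimal usage sketch (the
   example_vl_div16 name is hypothetical and not part of the vl API): */

static inline vl_t
example_vl_div16( vl_t a ) {
  return vl_shr( a, 4 ); /* [ a0>>4 a1>>4 ], i.e. floor division by 16 */
}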
203 :
204 : /* Conversion operations */
205 :
206 : /* Summarizing:
207 :
208 : vl_to_vc(l) returns [ !!l0 !!l0 !!l1 !!l1 ]
209 :
210 : vl_to_vf(l,f,0) returns [ (float)l0 (float)l1 f2 f3 ]
211 : vl_to_vf(l,f,1) returns [ f0 f1 (float)l0 (float)l1 ]
212 :
213 : vl_to_vi(l,i,0) returns [ (int)l0 (int)l1 i2 i3 ]
214 : vl_to_vi(l,i,1) returns [ i0 i1 (int)l0 (int)l1 ]
215 :
216 : vl_to_vu(l,u,0) returns [ (uint)l0 (uint)l1 u2 u3 ]
217 : vl_to_vu(l,u,1) returns [ u0 u1 (uint)l0 (uint)l1 ]
218 :
219 : vl_to_vd(l) returns [ (double)l0 (double)l1 ]
220 :
221 : vl_to_vv(l) returns [ (ulong)l0 (ulong)l1 ]
222 :
223 : The raw variants just treat the raw bits as the corresponding vector
224 : type. For vl_to_vc_raw, the user promises vl contains a proper
225 : vector conditional (e.g. 0 or -1 in each lane). The others are
226 : provided to facilitate doing advanced bit tricks on floating point
227 : values. */
228 :
229 : #define vl_to_vc(a) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpeq_epi64( (a), _mm_setzero_si128() ) )
230 :
231 393216 : static inline vf_t vl_to_vf( vl_t l, vf_t f, int imm_hi ) {
232 393216 : float f0 = (float)_mm_extract_epi64( l, 0 );
233 393216 : float f1 = (float)_mm_extract_epi64( l, 1 );
234 393216 : return imm_hi ? vf_insert( vf_insert( f, 2, f0 ), 3, f1 ) : vf_insert( vf_insert( f, 0, f0 ), 1, f1 ); /* Compile time */
235 393216 : }
236 :
237 393216 : static inline vi_t vl_to_vi( vl_t l, vi_t i, int imm_hi ) {
238 393216 : vf_t _l = _mm_castsi128_ps( l ); /* [ x0l x0h x1l x1h ] */
239 393216 : vf_t _i = _mm_castsi128_ps( i );
240 393216 : if( imm_hi ) _l = _mm_shuffle_ps( _i, _l, _MM_SHUFFLE(2,0,1,0) ); /* Compile time */
241 196608 : else _l = _mm_shuffle_ps( _l, _i, _MM_SHUFFLE(3,2,2,0) );
242 393216 : return _mm_castps_si128( _l );
243 393216 : }
244 :
245 393216 : static inline vu_t vl_to_vu( vl_t l, vu_t u, int imm_hi ) {
246 393216 : vf_t _l = _mm_castsi128_ps( l ); /* [ x0l x0h x1l x1h ] */
247 393216 : vf_t _u = _mm_castsi128_ps( u );
248 393216 : if( imm_hi ) _l = _mm_shuffle_ps( _u, _l, _MM_SHUFFLE(2,0,1,0) ); /* Compile time */
249 196608 : else _l = _mm_shuffle_ps( _l, _u, _MM_SHUFFLE(3,2,2,0) );
250 393216 : return _mm_castps_si128( _l );
251 393216 : }
252 :
253 196608 : static inline vd_t vl_to_vd( vl_t l ) {
254 196608 : return _mm_setr_pd( (double)_mm_extract_epi64( l, 0 ), (double)_mm_extract_epi64( l, 1 ) );
255 196608 : }
256 :
257 : #define vl_to_vv(a) (a)
258 :
259 : #define vl_to_vc_raw(a) (a)
260 : #define vl_to_vf_raw(a) _mm_castsi128_ps( (a) )
261 : #define vl_to_vi_raw(a) (a)
262 : #define vl_to_vu_raw(a) (a)
263 : #define vl_to_vd_raw(a) _mm_castsi128_pd( (a) )
264 : #define vl_to_vv_raw(a) (a)
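
/* A minimal usage sketch: convert both lanes to double and halve them.
   Raw _mm_ intrinsics are used for the double math to stay within calls
   known to exist; the example_vl_halved_as_double name is hypothetical
   and not part of the vl API. */

static inline vd_t
example_vl_halved_as_double( vl_t a ) {
  return _mm_mul_pd( vl_to_vd( a ), _mm_set1_pd( 0.5 ) ); /* [ 0.5*a0 0.5*a1 ] */
}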
265 :
266 : /* Reduction operations */
267 :
268 : static inline vl_t
269 196608 : vl_sum_all( vl_t x ) { /* Returns vl_bcast( sum( x ) ) */
270 196608 : return vl_add( x, vl_permute( x, 1, 0 ) );
271 196608 : }
272 :
273 : static inline vl_t
274 196608 : vl_min_all( vl_t x ) { /* Returns vl_bcast( min( x ) ) */
275 196608 : return vl_min( x, vl_permute( x, 1, 0 ) );
276 196608 : }
277 :
278 : static inline vl_t
279 196608 : vl_max_all( vl_t x ) { /* Returns vl_bcast( max( x ) ) */
280 196608 : return vl_max( x, vl_permute( x, 1, 0 ) );
281 196608 : }
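
/* A minimal usage sketch: reduce to a scalar by broadcasting the sum
   and extracting one lane. The example_vl_hsum name is hypothetical and
   not part of the vl API. */

static inline long
example_vl_hsum( vl_t a ) {
  return vl_extract( vl_sum_all( a ), 0 ); /* a0 + a1 (may wrap) */
}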
282 :
283 : /* Misc operations */
284 :
285 : /* vl_gather(b,i,imm_i0,imm_i1) returns [ b[i(imm_i0)] b[i(imm_i1)] ]
286 : where b is a "long const *" and i is a vi_t and imm_i0,imm_i1 are
287 : compile time constants in 0:3. We use a static inline here instead
288 : of a define to keep strict type checking while working around yet
289 : another Intel intrinsic type mismatch issue. And we use a define to
290 : work around clang sadness with passing a compile time constant into a
291 : static inline. */
292 :
293 : #if defined(__AVX2__)
294 565444812 : static inline vl_t _vl_gather( long const * b, vi_t i ) {
295 565444812 : return _mm_i32gather_epi64( (long long const *)b, i, 8 );
296 565444812 : }
297 : #endif
298 :
299 565444812 : #define vl_gather(b,i,imm_i0,imm_i1) _vl_gather( (b), _mm_shuffle_epi32( (i), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )
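
/* A minimal usage sketch: gather two longs selected by lanes 0 and 2 of
   a vi_t index vector (requires the same AVX2 support as vl_gather
   itself). The example_vl_gather02 name is hypothetical and not part of
   the vl API. */

static inline vl_t
example_vl_gather02( long const * b, vi_t i ) {
  return vl_gather( b, i, 0, 2 ); /* [ b[i0] b[i2] ] */
}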
300 :
301 : /* vl_transpose_2x2 transposes the 2x2 matrix stored in vl_t r0,r1
302 : and stores the result in the 2x2 matrix vl_t c0,c1. c0 and c1 should be
303 : different for a well defined result. Otherwise, in-place operation
304 : and/or using the same vl_t to specify multiple rows of r is fine. */
305 :
306 196608 : #define vl_transpose_2x2( r0,r1, c0,c1 ) do { \
307 196608 : vl_t _vl_transpose_r0 = (r0); vl_t _vl_transpose_r1 = (r1); \
308 196608 : (c0) = _mm_unpacklo_epi64( _vl_transpose_r0, _vl_transpose_r1 ); \
309 196608 : (c1) = _mm_unpackhi_epi64( _vl_transpose_r0, _vl_transpose_r1 ); \
310 196608 : } while(0)
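
/* A minimal usage sketch: in-place 2x2 transpose, which the macro above
   supports because it copies r0 / r1 into temporaries before writing
   c0 / c1. The example_vl_transpose_inplace name is hypothetical and
   not part of the vl API. */

static inline void
example_vl_transpose_inplace( vl_t * r0, vl_t * r1 ) {
  vl_transpose_2x2( *r0, *r1, *r0, *r1 );
}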