#ifndef HEADER_fd_src_util_simd_fd_sse_h
#error "Do not include this directly; use fd_sse.h"
#endif

/* Vector ulong API ***************************************************/

/* A vv_t is a vector where each adjacent pair of 32-bit wide lanes
   (e.g. 0-1 / 2-3) holds an unsigned 64-bit integer (a "ulong").

   These mirror the other APIs as much as possible.  Macros are
   preferred over static inlines when it is possible to do it robustly
   to reduce the risk of the compiler mucking it up. */

#define vv_t __m128i

/* Constructors */
/* Given the ulong values, return ... */

#define vv(v0,v1) _mm_set_epi64x( (long)(v1), (long)(v0) ) /* [ v0 v1 ] ... sigh ... backwards intel */

#define vv_bcast(v0) _mm_set1_epi64x( (long)(v0) ) /* [ v0 v0 ] */

/* vv_permute returns [ v(imm_i0) v(imm_i1) ].  imm_i* should be
   compile time constants in 0:1. */

#define vv_permute( v, imm_i0, imm_i1 ) _mm_castpd_si128( _mm_permute_pd( _mm_castsi128_pd( (v) ), (imm_i0) + 2*(imm_i1) ) )
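
/* Usage sketch (the vv_example_* helpers here and below are
   hypothetical illustrations, not part of the API): broadcast lane 0
   of x to both lanes; vv_permute( x, 1, 0 ) would likewise swap the
   two lanes. */

static inline vv_t vv_example_bcast_lane0( vv_t x ) { return vv_permute( x, 0, 0 ); } /* [ x0 x0 ] */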

/* Predefined constants */

#define vv_zero() _mm_setzero_si128()   /* Return [ 0UL 0UL ] */
#define vv_one()  _mm_set1_epi64x( 1L ) /* Return [ 1UL 1UL ] */

/* Memory operations */

/* vv_ld returns the 2 ulongs at the 16-byte aligned / 16-byte sized
   location p as a vector ulong.  vv_ldu is the same but p does not
   have to be aligned.  vv_st writes the vector ulong to the 16-byte
   aligned / 16-byte sized location p as 2 ulongs.  vv_stu is the same
   but p does not have to be aligned.  In all of these, 64-bit lane l
   will be at p[l].
   FIXME: USE ATTRIBUTES ON P PASSED TO THESE?

   Note: gcc knows a __m128i may alias. */

static inline vv_t vv_ld( ulong const * p ) { return _mm_load_si128( (__m128i const *)p ); }
static inline void vv_st( ulong * p, vv_t i ) { _mm_store_si128( (__m128i *)p, i ); }

static inline vv_t vv_ldu( void const * p ) { return _mm_loadu_si128( (__m128i const *)p ); }
static inline void vv_stu( void * p, vv_t i ) { _mm_storeu_si128( (__m128i *)p, i ); }
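
/* Usage sketch (hypothetical): copy 2 ulongs from a possibly
   unaligned src to a 16-byte aligned dst. */

static inline void vv_example_copy2( ulong * dst, void const * src ) { vv_st( dst, vv_ldu( src ) ); }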

/* vv_ldif is an optimized equivalent to vv_notczero(c,vv_ldu(p)) (may
   have different behavior if c is not a proper vector conditional).
   It is provided for symmetry with the vv_stif operation.  vv_stif
   stores x(n) to p[n] if c(n) is true and leaves p[n] unchanged
   otherwise.  Undefined behavior if c is not a proper vector
   conditional. */

#define vv_ldif(c,p)   _mm_maskload_epi64( (p),(c))
#define vv_stif(c,p,x) _mm_maskstore_epi64((p),(c),(x))

/* Element operations */

/* vv_extract extracts the ulong in lane imm from the vector ulong as
   a ulong.  vv_insert returns the vector ulong formed by replacing the
   value in lane imm of a with the provided ulong.  imm should be a
   compile time known in 0:1.  vv_extract_variable and
   vv_insert_variable are slower variants for which the lane n does not
   have to be known at compile time (it should still be in 0:1).

   Note: C99 TC3 allows type punning through a union. */

#define vv_extract(a,imm)  ((ulong)_mm_extract_epi64( (a), (imm) ))

#define vv_insert(a,imm,v) _mm_insert_epi64( (a), (long)(v), (imm) )

static inline ulong
vv_extract_variable( vv_t a, int n ) {
  union { __m128i m[1]; ulong u[2]; } t[1];
  _mm_store_si128( t->m, a );
  return t->u[n];
}

static inline vv_t
vv_insert_variable( vv_t a, int n, ulong v ) {
  union { __m128i m[1]; ulong u[2]; } t[1];
  _mm_store_si128( t->m, a );
  t->u[n] = v;
  return _mm_load_si128( t->m );
}
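
/* Usage sketch (hypothetical): swap the two lanes of a via
   extract/insert.  (vv_permute above is the cheap way to do this;
   this just illustrates the compile time lane indexing.) */

static inline vv_t
vv_example_swap_slow( vv_t a ) {
  ulong a0 = vv_extract( a, 0 );
  ulong a1 = vv_extract( a, 1 );
  return vv_insert( vv_insert( a, 0, a1 ), 1, a0 ); /* [ a1 a0 ] */
}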

/* Given [a0 a1] and/or [b0 b1], return ... */

/* Arithmetic operations */

#define vv_neg(a) _mm_sub_epi64( _mm_setzero_si128(), (a) ) /* [ -a0  -a1  ] */
#define vv_abs(a) (a)                                       /* [ |a0| |a1| ] */

/* Note: _mm_{min,max}_epu64 are missing pre AVX-512.  We emulate these
   on pre AVX-512 targets below (and use the AVX-512 versions if
   possible).  Likewise, there is no _mm_mullo_epi64 pre AVX-512.
   Since this is not cheap to emulate, we do not provide a vv_mul for
   the time being (we could consider exposing it on AVX-512 targets
   though).  There is a 64L*64L->64 multiply (where the lower 32-bits
   of each lane will be zero extended to 64-bits beforehand) though and
   that is very useful.  So we do provide that. */

#define vv_add(a,b)    _mm_add_epi64( (a), (b) )   /* [ a0 +b0    a1 +b1  ] */
#define vv_sub(a,b)    _mm_sub_epi64( (a), (b) )   /* [ a0 -b0    a1 -b1  ] */
//#define vv_mul(a,b)  _mm_mullo_epi64( (a), (b) ) /* [ a0 *b0    a1 *b1  ] */
#define vv_mul_ll(a,b) _mm_mul_epu32( (a), (b) )   /* [ a0l*b0l   a1l*b1l ] */
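
/* Worked example (hypothetical): vv_mul_ll sees only the low 32 bits
   of each lane, so squaring vv_bcast( 0xffffffffUL ) yields
   0xfffffffe00000001UL (i.e. (2^32-1)^2) in each lane. */

static inline vv_t vv_example_sqr_lo( vv_t a ) { return vv_mul_ll( a, a ); } /* [ a0l*a0l a1l*a1l ] */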

/* Binary operations */

/* Note: vv_shl/vv_shr is a left/right shift by imm bits; imm should be
   a compile time constant in 0:63.  The variable variants are slower
   but do not require the shift amount to be known at compile time (it
   should still be in 0:63). */

#define vv_not(a) _mm_xor_si128( _mm_set1_epi64x( -1L ), (a) ) /* [ ~a0 ~a1 ] */

#define vv_shl(a,imm) _mm_slli_epi64( (a), (imm) ) /* [ a0<<imm a1<<imm ] */
#define vv_shr(a,imm) _mm_srli_epi64( (a), (imm) ) /* [ a0>>imm a1>>imm ] */

#define vv_shl_variable(a,n) _mm_sll_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
#define vv_shr_variable(a,n) _mm_srl_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )

#define vv_shl_vector(a,b) _mm_sllv_epi64( (a), (b) ) /* [ a0<<b0 a1<<b1 ] */
#define vv_shr_vector(a,b) _mm_srlv_epi64( (a), (b) ) /* [ a0>>b0 a1>>b1 ] */

#define vv_and(a,b)    _mm_and_si128(    (a), (b) ) /* [ a0 &b0    a1 &b1   ] */
#define vv_andnot(a,b) _mm_andnot_si128( (a), (b) ) /* [ (~a0)&b0  (~a1)&b1 ] */
#define vv_or(a,b)     _mm_or_si128(     (a), (b) ) /* [ a0 |b0    a1 |b1   ] */
#define vv_xor(a,b)    _mm_xor_si128(    (a), (b) ) /* [ a0 ^b0    a1 ^b1   ] */

/* vv_rol(x,n) returns vv( rotate_left (x0,n), rotate_left (x1,n) )
   vv_ror(x,n) returns vv( rotate_right(x0,n), rotate_right(x1,n) ) */

#if FD_HAS_AVX512
#define vv_rol(a,imm) _mm_rol_epi64( (a), (imm) )
#define vv_ror(a,imm) _mm_ror_epi64( (a), (imm) )
#else
static inline vv_t vv_rol( vv_t a, int imm ) { return vv_or( vv_shl( a, imm & 63 ), vv_shr( a, (-imm) & 63 ) ); }
static inline vv_t vv_ror( vv_t a, int imm ) { return vv_or( vv_shr( a, imm & 63 ), vv_shl( a, (-imm) & 63 ) ); }
#endif
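
/* Worked example (hypothetical): a 64-bit rotate is the OR of
   complementary logical shifts, e.g. vv_rol( x, 8 ) behaves like the
   below; the &63 reductions in the fallback above keep the shift
   counts in range for a rotate by 0. */

static inline vv_t vv_example_rol8( vv_t x ) { return vv_or( vv_shl( x, 8 ), vv_shr( x, 56 ) ); } /* == vv_rol( x, 8 ) */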

static inline vv_t vv_rol_variable( vv_t a, int n ) { return vv_or( vv_shl_variable( a, n&63 ), vv_shr_variable( a, (-n)&63 ) ); }
static inline vv_t vv_ror_variable( vv_t a, int n ) { return vv_or( vv_shr_variable( a, n&63 ), vv_shl_variable( a, (-n)&63 ) ); }

static inline vv_t vv_rol_vector( vv_t a, vl_t b ) {
  vl_t m = vl_bcast( 63L );
  return vv_or( vv_shl_vector( a, vl_and( b, m ) ), vv_shr_vector( a, vl_and( vl_neg( b ), m ) ) );
}

static inline vv_t vv_ror_vector( vv_t a, vl_t b ) {
  vl_t m = vl_bcast( 63L );
  return vv_or( vv_shr_vector( a, vl_and( b, m ) ), vv_shl_vector( a, vl_and( vl_neg( b ), m ) ) );
}

#define vv_bswap(a) vu_to_vv_raw( vu_bswap( vv_to_vu_raw( vv_rol( (a), 32 ) ) ) )
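
/* Why vv_bswap works (illustrative scalar model of one lane, not part
   of the API): the rotate by 32 swaps the 32-bit halves and the
   32-bit lane byte swap (vu_bswap) then reverses the bytes within
   each half, completing the full 64-bit byte reversal. */

static inline ulong
vv_example_bswap64( ulong x ) {
  x = (x<<32) | (x>>32);                                                   /* swap 32-bit halves (the vv_rol by 32) */
  x = ((x & 0x0000ffff0000ffffUL)<<16) | ((x>>16) & 0x0000ffff0000ffffUL); /* swap 16-bit halves within each 32     */
  x = ((x & 0x00ff00ff00ff00ffUL)<< 8) | ((x>> 8) & 0x00ff00ff00ff00ffUL); /* swap bytes within each 16             */
  return x;
}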

/* Logical operations */

/* Like noted below in the converters, Intel clearly has the hardware
   to do a _mm_cmpgt_epu64 given that _mm_cmpgt_epi64 exists but
   doesn't expose it in the ISA pre AVX-512.  Sigh ... twos complement
   bit tricks to the rescue for vv_{gt,lt,ge,le}. */

#define vv_lnot(a)    _mm_cmpeq_epi64( (a), _mm_setzero_si128() )                                          /* [  !a0  !a1 ] */
#define vv_lnotnot(a) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpeq_epi64( (a), _mm_setzero_si128() ) ) /* [ !!a0 !!a1 ] */

#define vv_eq(a,b) _mm_cmpeq_epi64( (a), (b) )                                                /* [ a0==b0 a1==b1 ] */
#define vv_gt(a,b) _mm_cmpgt_epi64( _mm_sub_epi64( (a), _mm_set1_epi64x( (long)(1UL<<63) ) ), /* [ a0> b0 a1> b1 ] */ \
                                    _mm_sub_epi64( (b), _mm_set1_epi64x( (long)(1UL<<63) ) ) )
#define vv_lt(a,b) vv_gt( (b), (a) )                                                          /* [ a0< b0 a1< b1 ] */
#define vv_ne(a,b) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpeq_epi64( (a), (b) ) )       /* [ a0!=b0 a1!=b1 ] */
#define vv_ge(a,b) _mm_xor_si128( _mm_set1_epi64x( -1L ), vv_gt( (b), (a) ) )                 /* [ a0>=b0 a1>=b1 ] */
#define vv_le(a,b) _mm_xor_si128( _mm_set1_epi64x( -1L ), vv_gt( (a), (b) ) )                 /* [ a0<=b0 a1<=b1 ] */
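
/* The trick in vv_gt (illustrative scalar model of one lane):
   subtracting 2^63 from both operands maps unsigned order onto signed
   order, so the signed compare gives the unsigned answer. */

static inline int vv_example_ugt( ulong a, ulong b ) {
  return (long)( a-(1UL<<63) ) > (long)( b-(1UL<<63) ); /* == (a>b) */
}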

/* Conditional operations */

#define vv_czero(c,f)    _mm_andnot_si128( (c), (f) ) /* [ c0?0UL:f0 c1?0UL:f1 ] */
#define vv_notczero(c,f) _mm_and_si128(    (c), (f) ) /* [ c0?f0:0UL c1?f1:0UL ] */

#define vv_if(c,t,f) _mm_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ] */

#if defined(__AVX512F__) && defined(__AVX512VL__) /* See note above */
#define vv_min(a,b) _mm_min_epu64( (a), (b) )
#define vv_max(a,b) _mm_max_epu64( (a), (b) )
#else
static inline vv_t vv_min( vv_t a, vv_t b ) { return vv_if( vv_lt( a, b ), a, b ); }
static inline vv_t vv_max( vv_t a, vv_t b ) { return vv_if( vv_gt( a, b ), a, b ); }
#endif
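
/* Usage sketch (hypothetical): clamp each lane of x to [lo,hi],
   assuming lo<=hi lane-wise. */

static inline vv_t vv_example_clamp( vv_t x, vv_t lo, vv_t hi ) { return vv_min( vv_max( x, lo ), hi ); }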

/* Conversion operations */

/* Summarizing:

   vv_to_vc(v)     returns [ !!v0 !!v0 !!v1 !!v1 ]

   vv_to_vf(v,f,0) returns [ (float)v0 (float)v1 f2 f3 ]
   vv_to_vf(v,f,1) returns [ f0 f1 (float)v0 (float)v1 ]

   vv_to_vi(v,i,0) returns [ (int)v0 (int)v1 i2 i3 ]
   vv_to_vi(v,i,1) returns [ i0 i1 (int)v0 (int)v1 ]

   vv_to_vu(v,u,0) returns [ (uint)v0 (uint)v1 u2 u3 ]
   vv_to_vu(v,u,1) returns [ u0 u1 (uint)v0 (uint)v1 ]

   vv_to_vd(v)     returns [ (double)v0 (double)v1 ]

   vv_to_vl(v)     returns [ (long)v0 (long)v1 ]

   The raw variants just treat the raw bits as the corresponding vector
   type.  For vv_to_vc_raw, the user promises vv contains a proper
   vector conditional (e.g. 0 or -1 in each lane).  The others are
   provided to facilitate doing advanced bit tricks on floating point
   values. */

#define vv_to_vc(a) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpeq_epi64( (a), _mm_setzero_si128() ) )

static inline vf_t vv_to_vf( vv_t v, vf_t f, int imm_hi ) {
  float f0 = (float)vv_extract( v, 0 );
  float f1 = (float)vv_extract( v, 1 );
  return imm_hi ? vf_insert( vf_insert( f, 2, f0 ), 3, f1 ) : vf_insert( vf_insert( f, 0, f0 ), 1, f1 ); /* Compile time */
}

static inline vi_t vv_to_vi( vv_t v, vi_t i, int imm_hi ) {
  vf_t _v = _mm_castsi128_ps( v ); /* [ v0l v0h v1l v1h ] */
  vf_t _i = _mm_castsi128_ps( i );
  if( imm_hi ) _v = _mm_shuffle_ps( _i, _v, _MM_SHUFFLE(2,0,1,0) ); /* Compile time */
  else         _v = _mm_shuffle_ps( _v, _i, _MM_SHUFFLE(3,2,2,0) );
  return _mm_castps_si128( _v );
}

static inline vu_t vv_to_vu( vv_t v, vu_t u, int imm_hi ) {
  vf_t _v = _mm_castsi128_ps( v ); /* [ v0l v0h v1l v1h ] */
  vf_t _u = _mm_castsi128_ps( u );
  if( imm_hi ) _v = _mm_shuffle_ps( _u, _v, _MM_SHUFFLE(2,0,1,0) ); /* Compile time */
  else         _v = _mm_shuffle_ps( _v, _u, _MM_SHUFFLE(3,2,2,0) );
  return _mm_castps_si128( _v );
}

static inline vd_t vv_to_vd( vv_t v ) {
  return _mm_setr_pd( (double)(ulong)_mm_extract_epi64( v, 0 ), (double)(ulong)_mm_extract_epi64( v, 1 ) );
}

#define vv_to_vl(a) (a)

#define vv_to_vc_raw(a) (a)
#define vv_to_vf_raw(a) _mm_castsi128_ps( (a) )
#define vv_to_vi_raw(a) (a)
#define vv_to_vu_raw(a) (a)
#define vv_to_vd_raw(a) _mm_castsi128_pd( (a) )
#define vv_to_vl_raw(a) (a)

/* Reduction operations */

static inline vv_t
vv_sum_all( vv_t x ) { /* Returns vv_bcast( sum( x ) ) */
  return vv_add( x, vv_permute( x, 1, 0 ) );
}

static inline vv_t
vv_min_all( vv_t x ) { /* Returns vv_bcast( min( x ) ) */
  return vv_min( x, vv_permute( x, 1, 0 ) );
}

static inline vv_t
vv_max_all( vv_t x ) { /* Returns vv_bcast( max( x ) ) */
  return vv_max( x, vv_permute( x, 1, 0 ) );
}
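
/* Usage sketch (hypothetical): horizontal sum of a 16-byte aligned
   array of ulongs where n is even (sums wrap mod 2^64). */

static inline ulong
vv_example_sum( ulong const * a, ulong n ) {
  vv_t acc = vv_zero();
  for( ulong i=0UL; i<n; i+=2UL ) acc = vv_add( acc, vv_ld( a+i ) );
  return vv_extract( vv_sum_all( acc ), 0 );
}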

/* Misc operations */

/* vv_gather(b,i,imm_i0,imm_i1) returns [ b[i(imm_i0)] b[i(imm_i1)] ]
   where b is a "ulong const *" and i is a vi_t and imm_i0,imm_i1 are
   compile time constants in 0:3.  We use a static inline here instead
   of a define to keep strict type checking while working around yet
   another Intel intrinsic type mismatch issue.  And we use a define to
   work around clang sadness with passing a compile time constant into
   a static inline. */

static inline vv_t _vv_gather( ulong const * b, vi_t i ) {
  return _mm_i32gather_epi64( (long long const *)b, i, 8 );
}

#define vv_gather(b,i,imm_i0,imm_i1) _vv_gather( (b), _mm_shuffle_epi32( (i), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )
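
/* Usage sketch (hypothetical): gather tbl[ i0 ] and tbl[ i2 ] given
   index vector i = [ i0 i1 i2 i3 ]. */

static inline vv_t vv_example_gather02( ulong const * tbl, vi_t i ) { return vv_gather( tbl, i, 0, 2 ); }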

/* vv_transpose_2x2 transposes the 2x2 matrix stored in vv_t r0,r1
   and stores the result in 2x2 matrix vv_t c0,c1.  All c0,c1 should be
   different for a well defined result.  Otherwise, in-place operation
   and/or using the same vv_t to specify multiple rows of r is fine. */

#define vv_transpose_2x2( r0,r1, c0,c1 ) do {                         \
    vv_t _vv_transpose_r0 = (r0); vv_t _vv_transpose_r1 = (r1);       \
    (c0) = _mm_unpacklo_epi64( _vv_transpose_r0, _vv_transpose_r1 );  \
    (c1) = _mm_unpackhi_epi64( _vv_transpose_r0, _vv_transpose_r1 );  \
  } while(0)
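
/* Worked example (hypothetical): with r0=[ 0 1 ] and r1=[ 2 3 ],
   vv_transpose_2x2( r0,r1, c0,c1 ) yields c0=[ 0 2 ] and c1=[ 1 3 ]. */

static inline void
vv_example_transpose( vv_t * c0, vv_t * c1 ) {
  vv_transpose_2x2( vv( 0UL, 1UL ), vv( 2UL, 3UL ), *c0, *c1 ); /* *c0=[ 0 2 ] *c1=[ 1 3 ] */
}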