Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_sse_h
2 : #error "Do not include this directly; use fd_sse.h"
3 : #endif
4 :
5 : /* Vector int API *****************************************************/
6 :
7 : /* A vi_t is a vector where each 32-bit wide lane holds a signed 32-bit
8 : twos-complement integer (an "int").
9 :
10 : These mirror the other APIs, e.g. vc and vf, as much as possible.
11 : Macros are preferred over static inlines when it is possible to do
12 : so robustly to reduce the risk of the compiler mucking it up. */
13 :
14 :
15 74121267 : #define vi_t __m128i
16 :
17 : /* Constructors */
18 :
19 : /* Given the int values, return ... */
20 :
21 589824 : #define vi(i0,i1,i2,i3) _mm_setr_epi32( (i0), (i1), (i2), (i3) ) /* [ i0 i1 i2 i3 ] */
22 :
23 : #define vi_bcast(i0) _mm_set1_epi32( (i0) ) /* [ i0 i0 i0 i0 ] */
24 :
25 : static inline vi_t /* [ i0 i1 i0 i1 ] */
26 196608 : vi_bcast_pair( int i0, int i1 ) {
27 196608 : return _mm_setr_epi32( i0, i1, i0, i1 );
28 196608 : }
29 :
30 : static inline vi_t /* [ i0 i0 i1 i1 ] */
31 196608 : vi_bcast_wide( int i0, int i1 ) {
32 196608 : return _mm_setr_epi32( i0, i0, i1, i1 );
33 196608 : }
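
/* Usage sketch (not part of the original header): the constructors
   above differ only in how they spread their arguments across lanes.

     vi_t a = vi( 1, 2, 3, 4 );       // [ 1 2 3 4 ]
     vi_t b = vi_bcast( 7 );          // [ 7 7 7 7 ]
     vi_t c = vi_bcast_pair( 5, 6 );  // [ 5 6 5 6 ]
     vi_t d = vi_bcast_wide( 5, 6 );  // [ 5 5 6 6 ] */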
34 :
35 : /* vi_permute returns [ i(imm_i0) i(imm_i1) i(imm_i2) i(imm_i3) ].
36 : imm_i* should be compile time constants in 0:3. */
37 :
38 : #define vi_permute(x,imm_i0,imm_i1,imm_i2,imm_i3) _mm_shuffle_epi32( (x), _MM_SHUFFLE( (imm_i3), (imm_i2), (imm_i1), (imm_i0) ) )
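
/* Usage sketch (illustrative, with a hypothetical x): given
   x = [ 10 11 12 13 ],

     vi_permute( x, 3,2,1,0 )  // [ 13 12 11 10 ] (lane reversal)
     vi_permute( x, 0,0,0,0 )  // [ 10 10 10 10 ] (broadcast lane 0) */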
39 :
40 : /* Predefined constants */
41 :
42 : #define vi_zero() _mm_setzero_si128() /* Return [ 0 0 0 0 ] */
43 71761971 : #define vi_one() _mm_set1_epi32( 1 ) /* Return [ 1 1 1 1 ] */
44 :
45 : /* Memory operations */
46 :
47 : /* vi_ld returns the 4 ints at the 16-byte aligned / 16-byte sized
48 : location p as a vector int. vi_ldu is the same but p does not have
49 : to be aligned. vi_st writes the vector int to the 16-byte aligned /
50 : 16-byte sized location p as 4 ints. vi_stu is the same but p does
51 : not have to be aligned. In all these, lane l will be at p[l]. FIXME:
52 : USE ATTRIBUTES ON P PASSED TO THESE?
53 :
54 : Note: gcc knows a __m128i may alias. */
55 :
56 71761971 : static inline vi_t vi_ld( int const * p ) { return _mm_load_si128( (__m128i const *)p ); }
57 71761971 : static inline void vi_st( int * p, vi_t i ) { _mm_store_si128( (__m128i *)p, i ); }
58 :
59 287047884 : static inline vi_t vi_ldu( void const * p ) { return _mm_loadu_si128( (__m128i const *)p ); }
60 287047884 : static inline void vi_stu( void * p, vi_t i ) { _mm_storeu_si128( (__m128i *)p, i ); }
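
/* Usage sketch (hypothetical src/dst buffers): round trip four ints
   through memory.  src and dst must be 16-byte aligned for vi_ld /
   vi_st; vi_ldu / vi_stu drop that requirement.

     int src[4] __attribute__((aligned(16))) = { 1, 2, 3, 4 };
     int dst[4] __attribute__((aligned(16)));
     vi_st( dst, vi_ld( src ) );  // dst = { 1, 2, 3, 4 } */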
61 :
62 : /* vi_ldif is an optimized equivalent to vi_notczero(c,vi_ldu(p)) (may
63 : have different behavior if c is not a proper vector conditional). It
64 : is provided for symmetry with the vi_stif operation. vi_stif stores
65 : x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
66 : Undefined behavior if c is not a proper vector conditional. */
67 :
68 : #define vi_ldif(c,p) _mm_maskload_epi32( (p),(c))
69 : #define vi_stif(c,p,x) _mm_maskstore_epi32((p),(c),(x))
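
/* Usage sketch (hypothetical x and dst): keep only the strictly
   positive lanes of x in dst, leaving the other dst lanes untouched.
   vi_gt below yields a proper vector conditional.

     vi_t x = vi( -1, 2, -3, 4 );
     vi_stif( vi_gt( x, vi_zero() ), dst, x );  // writes dst[1]=2 and dst[3]=4 only */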
70 :
71 : /* Element operations */
72 :
73 : /* vi_extract extracts the int in lane imm from the vector int as an int.
74 : vi_insert returns the vector int formed by replacing the value in
75 : lane imm of a with the provided int. imm should be a compile time
76 : constant in 0:3. vi_extract_variable and vi_insert_variable are
77 : slower but the lane n does not have to be known at compile time
78 : (should be in 0:3).
79 :
80 : Note: C99 TC3 allows type punning through a union. */
81 :
82 287047884 : #define vi_extract(a,imm) _mm_extract_epi32( (a), (imm) )
83 287047884 : #define vi_insert(a,imm,v) _mm_insert_epi32( (a), (v), (imm) )
84 :
85 : static inline int
86 287047884 : vi_extract_variable( vi_t a, int n ) {
87 287047884 : union { __m128i m[1]; int i[4]; } t[1];
88 287047884 : _mm_store_si128( t->m, a );
89 287047884 : return t->i[n];
90 287047884 : }
91 :
92 : static inline vi_t
93 287047884 : vi_insert_variable( vi_t a, int n, int v ) {
94 287047884 : union { __m128i m[1]; int i[4]; } t[1];
95 287047884 : _mm_store_si128( t->m, a );
96 287047884 : t->i[n] = v;
97 287047884 : return _mm_load_si128( t->m );
98 287047884 : }
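
/* Usage sketch: compile time vs run time lane access (n is a
   hypothetical run time value in 0:3).

     vi_t a = vi( 10, 11, 12, 13 );
     int  x = vi_extract( a, 2 );           // 12
     int  y = vi_extract_variable( a, n );  // lane n of a
     a = vi_insert( a, 0, 99 );             // [ 99 11 12 13 ] */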
99 :
100 : /* Given [a0 a1 a2 a3] and/or [b0 b1 b2 b3], return ... */
101 :
102 : /* Arithmetic operations */
103 :
104 : #define vi_neg(a) _mm_sub_epi32( _mm_setzero_si128(), (a) ) /* [ -a0 -a1 ... -a3 ] (twos complement handling) */
105 : #define vi_abs(a) _mm_abs_epi32( (a) ) /* [ |a0| |a1| ... |a3| ] (twos complement handling) */
106 :
107 : #define vi_min(a,b) _mm_min_epi32( (a), (b) ) /* [ min(a0,b0) min(a1,b1) ... min(a3,b3) ] */
108 : #define vi_max(a,b) _mm_max_epi32( (a), (b) ) /* [ max(a0,b0) max(a1,b1) ... max(a3,b3) ] */
109 : #define vi_add(a,b) _mm_add_epi32( (a), (b) ) /* [ a0 +b0 a1 +b1 ... a3 +b3 ] */
110 : #define vi_sub(a,b) _mm_sub_epi32( (a), (b) ) /* [ a0 -b0 a1 -b1 ... a3 -b3 ] */
111 : #define vi_mul(a,b) _mm_mullo_epi32( (a), (b) ) /* [ a0 *b0 a1 *b1 ... a3 *b3 ] */
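
/* Usage sketch (hypothetical x, lo, hi with lo<=hi): clamp each lane
   of x to [lo,hi] with the lanewise min/max above.

     vi_t clamped = vi_min( vi_max( x, vi_bcast( lo ) ), vi_bcast( hi ) ); */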
112 :
113 : /* Binary operations */
114 :
115 : /* Note: vi_shl/vi_shr/vi_shru is a left/signed right/unsigned right
116 : shift by imm bits; imm should be a compile time constant in 0:31.
117 : The variable variants are slower but do not require the shift amount
118 : to be known at compile time (should still be in 0:31). */
119 :
120 : #define vi_not(a) _mm_xor_si128( _mm_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a3 ] */
121 :
122 : #define vi_shl(a,imm) _mm_slli_epi32( (a), (imm) ) /* [ a0<<imm a1<<imm ... a3<<imm ] */
123 : #define vi_shr(a,imm) _mm_srai_epi32( (a), (imm) ) /* [ a0>>imm a1>>imm ... a3>>imm ] (treat a as signed) */
124 : #define vi_shru(a,imm) _mm_srli_epi32( (a), (imm) ) /* [ a0>>imm a1>>imm ... a3>>imm ] (treat a as unsigned) */
125 :
126 : #define vi_shl_variable(a,n) _mm_sll_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
127 : #define vi_shr_variable(a,n) _mm_sra_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
128 : #define vi_shru_variable(a,n) _mm_srl_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
129 :
130 : #define vi_shl_vector(a,b) _mm_sllv_epi32( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a3<<b3 ] */
131 : #define vi_shr_vector(a,b) _mm_srav_epi32( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a3>>b3 ] (treat a as signed) */
132 : #define vi_shru_vector(a,b) _mm_srlv_epi32( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a3>>b3 ] (treat a as unsigned) */
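
/* Usage sketch (hypothetical x, n, cnt): the three shift families
   differ only in how the shift count is supplied.

     vi_shl( x, 3 )            // count is a compile time constant
     vi_shr_variable( x, n )   // scalar count n known only at run time
     vi_shru_vector( x, cnt )  // per-lane counts from a vi_t cnt */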
133 :
134 : #define vi_and(a,b) _mm_and_si128( (a), (b) ) /* [ a0 &b0 a1& b1 ... a3& b3 ] */
135 : #define vi_andnot(a,b) _mm_andnot_si128( (a), (b) ) /* [ (~a0)&b0 (~a1)&b1 ... (~a3)&b3 ] */
136 20971520 : #define vi_or(a,b) _mm_or_si128( (a), (b) ) /* [ a0 |b0 a1 |b1 ... a3 |b3 ] */
137 : #define vi_xor(a,b) _mm_xor_si128( (a), (b) ) /* [ a0 ^b0 a1 ^b1 ... a3 ^b3 ] */
138 :
139 : /* vi_rol(x,n) returns vi( rotate_left (x0,n), rotate_left (x1,n), ... )
140 : vi_ror(x,n) returns vi( rotate_right(x0,n), rotate_right(x1,n), ... ) */
141 :
142 : #if FD_HAS_AVX512
143 : #define vi_rol(a,imm) _mm_rol_epi32( (a), (imm) )
144 : #define vi_ror(a,imm) _mm_ror_epi32( (a), (imm) )
145 : #else
146 4194304 : static inline vi_t vi_rol( vi_t a, int imm ) { return vi_or( vi_shl( a, imm & 31 ), vi_shru( a, (-imm) & 31 ) ); }
147 4194304 : static inline vi_t vi_ror( vi_t a, int imm ) { return vi_or( vi_shru( a, imm & 31 ), vi_shl( a, (-imm) & 31 ) ); }
148 : #endif
149 :
150 6291456 : static inline vi_t vi_rol_variable( vi_t a, int n ) { return vi_or( vi_shl_variable( a, n&31 ), vi_shru_variable( a, (-n)&31 ) ); }
151 6291456 : static inline vi_t vi_ror_variable( vi_t a, int n ) { return vi_or( vi_shru_variable( a, n&31 ), vi_shl_variable( a, (-n)&31 ) ); }
152 :
153 0 : static inline vi_t vi_rol_vector( vi_t a, vi_t b ) {
154 0 : vi_t m = vi_bcast( 31 );
155 0 : return vi_or( vi_shl_vector( a, vi_and( b, m ) ), vi_shru_vector( a, vi_and( vi_neg( b ), m ) ) );
156 0 : }
157 :
158 0 : static inline vi_t vi_ror_vector( vi_t a, vi_t b ) {
159 0 : vi_t m = vi_bcast( 31 );
160 0 : return vi_or( vi_shru_vector( a, vi_and( b, m ) ), vi_shl_vector( a, vi_and( vi_neg( b ), m ) ) );
161 0 : }
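
/* E.g. on a non-AVX512 target, vi_rol( x, 8 ) above expands to
   vi_or( vi_shl( x, 8 ), vi_shru( x, 24 ) ). */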
162 :
163 : /* Logical operations */
164 :
165 : #define vi_lnot(a) _mm_cmpeq_epi32( (a), _mm_setzero_si128() ) /* [ !a0 !a1 ... !a3 ] */
166 : #define vi_lnotnot(a) /* [ !!a0 !!a1 ... !!a3 ] */ \
167 : _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( (a), _mm_setzero_si128() ) )
168 :
169 : #define vi_eq(a,b) _mm_cmpeq_epi32( (a), (b) ) /* [ a0==b0 a1==b1 ... a3==b3 ] */
170 : #define vi_gt(a,b) _mm_cmpgt_epi32( (a), (b) ) /* [ a0> b0 a1> b1 ... a3> b3 ] */
171 : #define vi_lt(a,b) _mm_cmpgt_epi32( (b), (a) ) /* [ a0< b0 a1< b1 ... a3< b3 ] */
172 : #define vi_ne(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a3!=b3 ] */
173 : #define vi_ge(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpgt_epi32( (b), (a) ) ) /* [ a0>=b0 a1>=b1 ... a3>=b3 ] */
174 : #define vi_le(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpgt_epi32( (a), (b) ) ) /* [ a0<=b0 a1<=b1 ... a3<=b3 ] */
175 :
176 : /* Conditional operations */
177 :
178 : #define vi_czero(c,f) _mm_andnot_si128( (c), (f) ) /* [ c0? 0:f0 c1? 0:f1 ... c3? 0:f3 ] */
179 : #define vi_notczero(c,f) _mm_and_si128( (c), (f) ) /* [ c0?f0: 0 c1?f1: 0 ... c3?f3: 0 ] */
180 :
181 : #define vi_if(c,t,f) _mm_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c3?t3:f3 ] */
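
/* Usage sketch (hypothetical a, b): comparisons produce vector
   conditionals (0 / -1 per lane) that drive the conditional ops,
   e.g. a branchless per-lane absolute difference (usual twos
   complement overflow caveats apply).

     vi_t d       = vi_sub( a, b );
     vi_t absdiff = vi_if( vi_lt( a, b ), vi_neg( d ), d ); */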
182 :
183 : /* Conversion operations */
184 :
185 : /* Summarizing:
186 :
187 : vi_to_vc(a) returns [ !!a0 !!a1 ... !!a3 ]
188 :
189 : vi_to_vu(a) returns [ (uint)a0 (uint)a1 ... (uint)a3 ]
190 :
191 : vi_to_vf(a) returns [ (float)a0 (float)a1 ... (float)a3 ]
192 :
193 : vi_to_vd(a,imm_i0,imm_i1) returns [ (double)a(imm_i0) (double)a(imm_i1) ]
194 :
195 : vi_to_vl(a,imm_i0,imm_i1) returns [ (long)a(imm_i0) (long)a(imm_i1) ]
196 :
197 : vi_to_vv(a,imm_i0,imm_i1) returns [ (ulong)a(imm_i0) (ulong)a(imm_i1) ]
198 :
199 : where imm_i* should be a compile time constant in 0:3.
200 :
201 : The raw variants just treat the raw bits as the corresponding vector
202 : type. For vi_to_vc_raw, the user promises vi contains a proper
203 : vector conditional (i.e. 0 or -1 in each lane). vi_to_vf_raw is
204 : useful for doing advanced bit tricks on floating point values. The
205 : others are probably dubious but are provided for completeness. */
206 :
207 : #define vi_to_vc(a) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( (a), _mm_setzero_si128() ) )
208 : #define vi_to_vf(a) _mm_cvtepi32_ps( (a) )
209 : #define vi_to_vu(a) (a)
210 : #define vi_to_vd(a,imm_i0,imm_i1) _mm_cvtepi32_pd ( _mm_shuffle_epi32( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )
211 : #define vi_to_vl(a,imm_i0,imm_i1) _mm_cvtepi32_epi64( _mm_shuffle_epi32( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )
212 : #define vi_to_vv(a,imm_i0,imm_i1) _mm_cvtepi32_epi64( _mm_shuffle_epi32( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )
213 :
214 : #define vi_to_vc_raw(a) (a)
215 : #define vi_to_vf_raw(a) _mm_castsi128_ps( (a) )
216 : #define vi_to_vu_raw(a) (a)
217 : #define vi_to_vd_raw(a) _mm_castsi128_pd( (a) )
218 : #define vi_to_vl_raw(a) (a)
219 : #define vi_to_vv_raw(a) (a)
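
/* Usage sketch: the widening conversions take their two source lanes
   explicitly.  Given a = vi( 1, 2, 3, 4 ):

     vi_to_vf( a )        // [ 1.f 2.f 3.f 4.f ]
     vi_to_vd( a, 0, 2 )  // [ 1. 3. ]
     vi_to_vl( a, 1, 3 )  // [ 2L 4L ] */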
220 :
221 : /* Reduction operations */
222 :
223 : static inline vi_t
224 196608 : vi_sum_all( vi_t x ) { /* Returns vi_bcast( sum( x ) ) */
225 196608 : x = _mm_hadd_epi32( x, x ); /* x01 x23 ... */
226 196608 : return _mm_hadd_epi32( x, x ); /* xsum ... */
227 196608 : }
228 :
229 : static inline vi_t
230 196608 : vi_min_all( vi_t x ) { /* Returns vi_bcast( min( x ) ) */
231 196608 : __m128i y;
232 196608 : y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x2 x3 x0 x1 */
233 196608 : x = _mm_min_epi32( x, y ); /* x02 x13 ... */
234 196608 : y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x13 x02 ... */
235 196608 : x = _mm_min_epi32( x, y ); /* xmin ... */
236 196608 : return x;
237 196608 : }
238 :
239 : static inline vi_t
240 196608 : vi_max_all( vi_t x ) { /* Returns vi_bcast( max( x ) ) */
241 196608 : __m128i y;
242 196608 : y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x2 x3 x0 x1 */
243 196608 : x = _mm_max_epi32( x, y ); /* x02 x13 ... */
244 196608 : y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x13 x02 ... */
245 196608 : x = _mm_max_epi32( x, y ); /* xmax ... */
246 196608 : return x;
247 196608 : }
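
/* Usage sketch: the reductions broadcast their result to every lane,
   so the scalar is recovered with an extract.

     vi_t x  = vi( 1, 2, 3, 4 );
     int sum = vi_extract( vi_sum_all( x ), 0 );  // 10
     int mn  = vi_extract( vi_min_all( x ), 0 );  //  1
     int mx  = vi_extract( vi_max_all( x ), 0 );  //  4 */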
248 :
249 : /* Misc operations */
250 :
251 : /* vi_gather(b,i) returns [ b[i(0)] b[i(1)] ... b[i(3)] ] where b is a
252 : "int const *" and i is a vi_t. */
253 :
254 71761971 : #define vi_gather(b,i) _mm_i32gather_epi32( (b), (i), 4 )
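
/* Usage sketch (hypothetical table tbl): gather four entries selected
   by an index vector.

     static int const tbl[8] = { 0, 10, 20, 30, 40, 50, 60, 70 };
     vi_t v = vi_gather( tbl, vi( 1, 7, 2, 2 ) );  // [ 10 70 20 20 ] */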
255 :
256 : /* vi_transpose_4x4 transposes the 4x4 matrix stored in vi_t r0,r1,r2,r3
257 : and stores the result in 4x4 matrix vi_t c0,c1,c2,c3. All
258 : c0,c1,c2,c3 should be different for a well defined result.
259 : Otherwise, in-place operation and/or using the same vi_t to specify
260 : multiple rows of r is fine. */
261 :
262 196608 : #define vi_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do { \
263 196608 : vi_t _vi_transpose_r0 = (r0); vi_t _vi_transpose_r1 = (r1); vi_t _vi_transpose_r2 = (r2); vi_t _vi_transpose_r3 = (r3); \
264 196608 : vi_t _vi_transpose_t; \
265 196608 : /* Transpose 2x2 blocks */ \
266 196608 : _vi_transpose_t = _vi_transpose_r0; _vi_transpose_r0 = _mm_unpacklo_epi32( _vi_transpose_t, _vi_transpose_r2 ); \
267 196608 : /**/ _vi_transpose_r2 = _mm_unpackhi_epi32( _vi_transpose_t, _vi_transpose_r2 ); \
268 196608 : _vi_transpose_t = _vi_transpose_r1; _vi_transpose_r1 = _mm_unpacklo_epi32( _vi_transpose_t, _vi_transpose_r3 ); \
269 196608 : /**/ _vi_transpose_r3 = _mm_unpackhi_epi32( _vi_transpose_t, _vi_transpose_r3 ); \
270 196608 : /* Transpose 1x1 blocks */ \
271 196608 : /**/ (c0) = _mm_unpacklo_epi32( _vi_transpose_r0, _vi_transpose_r1 ); \
272 196608 : /**/ (c1) = _mm_unpackhi_epi32( _vi_transpose_r0, _vi_transpose_r1 ); \
273 196608 : /**/ (c2) = _mm_unpacklo_epi32( _vi_transpose_r2, _vi_transpose_r3 ); \
274 196608 : /**/ (c3) = _mm_unpackhi_epi32( _vi_transpose_r2, _vi_transpose_r3 ); \
275 196608 : } while(0)
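
/* Usage sketch: transposing four row vectors in place (the macro
   reads all its row arguments before writing any column, so reusing
   the same variables is fine).

     vi_t r0 = vi(  0,  1,  2,  3 );
     vi_t r1 = vi( 10, 11, 12, 13 );
     vi_t r2 = vi( 20, 21, 22, 23 );
     vi_t r3 = vi( 30, 31, 32, 33 );
     vi_transpose_4x4( r0,r1,r2,r3, r0,r1,r2,r3 );
     // r0=[ 0 10 20 30 ] r1=[ 1 11 21 31 ] r2=[ 2 12 22 32 ] r3=[ 3 13 23 33 ] */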