#ifndef HEADER_fd_src_util_simd_fd_sse_h
#error "Do not include this directly; use fd_sse.h"
#endif

/* TODO: the below is very much designed for a 32-bit SIMD lane world
   (with 64-bit SIMD lane support hacked on afterward).  Revamp these
   to be more general for 8, 16, 32 and 64 bit lanes. */
/* Vector conditional API *********************************************/

/* A vc_t is a vector conditional.  That is, it is a vector of integers
   where each 32-bit wide lane is either 0 (all zero bits), indicating
   the condition is false for that lane, or -1 (all one bits),
   indicating the condition is true for that lane.  This allows fast
   bit operations to mask other types of vectors.  If this API is used
   on vectors that aren't proper vector conditionals, results are
   undefined.  When vector conditionals are applied to vector doubles,
   longs and ulongs, adjacent lanes (0-1 / 2-3) should have identical
   values, otherwise results will be undefined.

   These mirror the other APIs as much as possible.  Macros are
   preferred over static inlines when it is possible to do it robustly
   to reduce the risk of the compiler mucking it up. */

#define vc_t __m128i

/* Constructors */

/* vc returns a vc_t corresponding to the c-style logical values c0:c3.
   This will always create a proper vector conditional regardless of
   how the logical values were presented.  That is, the provided values
   will be treated as c-style logical values such that zero/false will
   become zero/false in the vector conditional and non-zero/true will
   become -1/true.  Similarly for vc_bcast*.  Summarizing:

     vc(c0,c1,c2,c3)      returns [ c0 c1 c2 c3 ]
     vc_bcast(c0)         returns [ c0 c0 c0 c0 ]
     vc_bcast_pair(c0,c1) returns [ c0 c1 c0 c1 ]
     vc_bcast_wide(c0,c1) returns [ c0 c0 c1 c1 ] */

#define vc(c0,c1,c2,c3) _mm_setr_epi32( -!!(c0), -!!(c1), -!!(c2), -!!(c3) )

#if 0 /* Compiler sometimes tries to turn this into branches ... sigh */
#define vc_bcast(c0) _mm_set1_epi32( -!!(c0) )
#else
static inline __m128i
vc_bcast( int c0 ) {
  c0 = -!!c0; FD_COMPILER_FORGET( c0 );
  return _mm_set1_epi32( c0 );
}
#endif

static inline vc_t
vc_bcast_pair( int c0, int c1 ) {
  c0 = -!!c0; c1 = -!!c1;
  return _mm_setr_epi32( c0, c1, c0, c1 );
}

static inline vc_t
vc_bcast_wide( int c0, int c1 ) {
  c0 = -!!c0; c1 = -!!c1;
  return _mm_setr_epi32( c0, c0, c1, c1 );
}
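
/* A usage sketch (values illustrative, not part of this API): the
   constructors normalize arbitrary c-style logicals into proper 0/-1
   lanes, so mixed inputs compose safely with the bit operations below.

     vc_t c = vc( 1,0,7,0 );        // [ t f t f ] (any non-zero -> true)
     vc_t d = vc_bcast_wide( 1,0 ); // [ t t f f ]
     vc_t e = vc_and( c, d );       // [ t f f f ]
*/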

/* vc_permute(c,imm_i0,imm_i1,imm_i2,imm_i3) returns
   [ c(imm_i0) c(imm_i1) c(imm_i2) c(imm_i3) ].  imm_i* should be
   compile time constants in 0:3. */

#define vc_permute(c,imm_i0,imm_i1,imm_i2,imm_i3) _mm_shuffle_epi32( (c), _MM_SHUFFLE( (imm_i3), (imm_i2), (imm_i1), (imm_i0) ) )
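
/* For example (values illustrative):

     vc_t c = vc( 1,0,1,0 );            // [ t f t f ]
     vc_t r = vc_permute( c, 3,2,1,0 ); // [ f t f t ] (lane reversal)
*/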

/* Predefined constants. */

#define vc_false() _mm_setzero_si128()  /* vc_false() returns [ f f f f ] */
#define vc_true()  _mm_set1_epi32( -1 ) /* vc_true()  returns [ t t t t ] */

/* Memory operations */

/* vc_ld returns the 4 integers at the 16-byte aligned / 16-byte sized
   location p as a proper vector conditional (see above note about
   c-style logicals).  vc_ldu is the same but p does not have to be
   aligned.  In the fast variants, the caller promises that p already
   holds a proper vector conditional (e.g. 0/-1 for true/false).  vc_st
   writes the vector conditional c at the 16-byte aligned / 16-byte
   sized location p (0/-1 for true/false).  vc_stu is the same but p
   does not have to be aligned.  Lane l will be at p[l].  FIXME: USE
   ATTRIBUTES ON P PASSED TO THESE?

   Note: gcc knows that __m128i may alias. */

static inline vc_t
vc_ld( int const * p ) {
  return _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( _mm_load_si128( (__m128i const *)p ), _mm_setzero_si128() ) );
}
static inline vc_t vc_ld_fast( int const * p ) { return _mm_load_si128( (__m128i const *)p ); }
static inline void vc_st( int * p, vc_t c ) { _mm_store_si128( (__m128i *)p, c ); }

static inline vc_t
vc_ldu( void const * p ) {
  return _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( _mm_loadu_si128( (__m128i const *)p ), _mm_setzero_si128() ) );
}
static inline vc_t vc_ldu_fast( void const * p ) { return _mm_loadu_si128( (__m128i const *)p ); }
static inline void vc_stu( void * p, vc_t c ) { _mm_storeu_si128( (__m128i *)p, c ); }
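
/* A usage sketch (hypothetical buffer): vc_ld round trips arbitrary
   c-style logicals through memory; after a vc_st, the fast variants
   become safe because memory then holds only 0 / -1.

     int buf[4] __attribute__((aligned(16))) = { 0, 5, 0, -3 };
     vc_t c = vc_ld( buf ); // [ f t f t ] (any non-zero -> true)
     vc_st( buf, c );       // buf = { 0, -1, 0, -1 }
     c = vc_ld_fast( buf ); // now safe: buf holds a proper vc
*/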

/* vc_ldif is an optimized equivalent to vc_and(c,vc_ldu(p)).  Similarly
   for vc_ldif_fast (either may have different behavior if c is not a
   proper vector conditional).  vc_ldif_fast assumes p already holds a
   proper vector conditional.  These are provided for symmetry with the
   vc_stif operation.  vc_stif stores x(n) to p[n] if c(n) is true and
   leaves p[n] unchanged otherwise.  Undefined behavior if c is not a
   proper vector conditional. */

#define vc_ldif(c,p)      _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( _mm_maskload_epi32( (p), (c) ), _mm_setzero_si128() ) )
#define vc_ldif_fast(c,p) _mm_maskload_epi32( (p), (c) )
#define vc_stif(c,p,x)    _mm_maskstore_epi32( (p), (c), (x) )
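
/* A usage sketch (hypothetical values): vc_stif writes only the lanes
   selected by the conditional and leaves the rest of memory untouched.

     int dst[4] = { 10, 20, 30, 40 };
     vc_t c = vc( 1,0,0,1 );
     vc_stif( c, dst, _mm_setr_epi32( -1,-2,-3,-4 ) );
     // dst = { -1, 20, 30, -4 }
*/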

/* Element operations */

/* vc_extract extracts the value of lane imm from the vector conditional
   as an int 0 (false) or 1 (true).  vc_insert returns the vector
   conditional formed by replacing the value in lane imm of a with the
   provided c-style logical.  imm should be a compile time constant in
   0:3.  vc_extract_variable and vc_insert_variable are slower but the
   lane does not have to be known at compile time (it should still be
   in 0:3). */

#define vc_extract(c,imm)  ((_mm_movemask_ps( _mm_castsi128_ps( (c) ) ) >> (imm)) & 1)
#define vc_insert(a,imm,c) _mm_insert_epi32( (a), -!!(c), (imm) )

#define vc_extract_variable(c,n) ((_mm_movemask_ps( _mm_castsi128_ps( (c) ) ) >> (n) ) & 1)
#define vc_insert_variable(a,n,c) \
  _mm_cmpgt_epi32( _mm_and_si128( _mm_set1_epi32( (_mm_movemask_ps( _mm_castsi128_ps( (a) ) ) & (~(1<<(n)))) | ((!!(c))<<(n)) ), \
                                  _mm_setr_epi32( 1<<0, 1<<1, 1<<2, 1<<3 ) ), _mm_setzero_si128() )
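
/* For example (values illustrative):

     vc_t c = vc( 0,1,1,0 );      // [ f t t f ]
     int  b = vc_extract( c, 1 ); // b = 1
     c = vc_insert( c, 3, 1 );    // [ f t t t ]
*/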

/* Given [ a0 a1 a2 a3 ] and/or [ b0 b1 b2 b3 ], return ... */

/* Arithmetic operations */

/* Note: arithmetic and shift operations are not well defined for a vc_t
   as it isn't clear if the user would like to treat the vector
   conditional as 4 1-bit signed ints (0/-1), 4 1-bit unsigned ints
   (0/1), 4 GF(2) elements (f/t but sign is meaningless) or do cross
   lane motion of the condition.  Instead, the user should use
   vc_to_{vi,vl}[_raw] as necessary and use the appropriate binary,
   arithmetic, permute and/or shift operations there. */

/* Binary operations */

#define vc_not(a) _mm_xor_si128( _mm_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a3 ] */

#define vc_and(a,b)    _mm_and_si128(    (a), (b) ) /* [ a0 &b0 a1 &b1 ... a3 &b3 ] */
#define vc_or(a,b)     _mm_or_si128(     (a), (b) ) /* [ a0 |b0 a1 |b1 ... a3 |b3 ] */
#define vc_xor(a,b)    _mm_xor_si128(    (a), (b) ) /* [ a0 ^b0 a1 ^b1 ... a3 ^b3 ] */
#define vc_andnot(a,b) _mm_andnot_si128( (a), (b) ) /* [ (~a0)&b0 (~a1)&b1 ... (~a3)&b3 ] */

/* Logical operations */

/* Note: vc_{gt,lt,ge,le} are provided for completeness and treat
   true>false. */

#define vc_lnot(a)    _mm_xor_si128( _mm_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a3 ] */
#define vc_lnotnot(a) (a)                                        /* [  a0  a1 ...  a3 ] */

#define vc_eq(a,b) _mm_cmpeq_epi32(  (a), (b) ) /* [ a0==b0 a1==b1 ... a3==b3 ] */
#define vc_gt(a,b) _mm_andnot_si128( (b), (a) ) /* [ a0> b0 a1> b1 ... a3> b3 ] */
#define vc_lt(a,b) _mm_andnot_si128( (a), (b) ) /* [ a0< b0 a1< b1 ... a3< b3 ] */
#define vc_ne(a,b) _mm_xor_si128(    (a), (b) ) /* [ a0!=b0 a1!=b1 ... a3!=b3 ] */
#define vc_ge(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_andnot_si128( (a), (b) ) ) /* [ a0>=b0 a1>=b1 ... a3>=b3 ] */
#define vc_le(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_andnot_si128( (b), (a) ) ) /* [ a0<=b0 a1<=b1 ... a3<=b3 ] */
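
/* For example, with a = [ t f t f ] and b = [ t t f f ] (values
   illustrative):

     vc_gt( a, b ); // [ f f t f ] (true > false)
     vc_le( a, b ); // [ t t f t ]
*/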

/* Conditional operations */

/* FIXME: Define vc_czero / vc_notczero?  Equivalent to vc_andnot and
   vc_and but have arithmetic connotations. */

#define vc_if(c,t,f) _mm_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c3?t3:f3 ] */
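
/* A usage sketch (hypothetical values): vc_if gives a branch-free
   select between two vectors of the same width.

     vc_t    c = vc( 1,0,1,0 );
     __m128i t = _mm_setr_epi32( 1,2,3,4 );
     __m128i f = _mm_setr_epi32( 5,6,7,8 );
     __m128i r = vc_if( c, t, f ); // [ 1 6 3 8 ]
*/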

/* Conversion operations */

/* vc_to_{vf,vi,vu,vd,vl,vv} convert a proper vector conditional into a
   vector float/int/uint/double/long/ulong with f mapping to 0 and t
   mapping to 1 in each lane.

   vc_to_{vf,vi,vu,vd,vl,vv}_raw just treat the raw bits in the vector
   conditional as the corresponding vector type.  vc_to_{vi,vu}_raw map
   false(true) to 0(-1) and similarly for vc_to_{vl,vv}_raw when c has
   paired lanes.  vc_to_{vf,vd}_raw probably are not useful in practice
   but are provided for completeness; vc_to_vf_raw maps false(true) to
   0(-nan) and similarly for vc_to_vd_raw when c has paired lanes. */

#define vc_to_vf(a) _mm_and_ps( _mm_castsi128_ps( (a) ), _mm_set1_ps( 1.f ) )
#define vc_to_vi(a) _mm_and_si128( (a), _mm_set1_epi32( 1 ) )
#define vc_to_vu(a) _mm_and_si128( (a), _mm_set1_epi32( 1 ) )
#define vc_to_vd(a) _mm_and_pd( _mm_castsi128_pd( (a) ), _mm_set1_pd( 1. ) ) /* vc should have paired lanes */
#define vc_to_vl(a) _mm_and_si128( (a), _mm_set1_epi64x( 1L ) )              /* vc should have paired lanes */
#define vc_to_vv(a) _mm_and_si128( (a), _mm_set1_epi64x( 1L ) )              /* vc should have paired lanes */

#define vc_to_vf_raw(a) _mm_castsi128_ps( (a) )
#define vc_to_vi_raw(a) (a)
#define vc_to_vu_raw(a) (a)
#define vc_to_vd_raw(a) _mm_castsi128_pd( (a) )
#define vc_to_vl_raw(a) (a)
#define vc_to_vv_raw(a) (a)
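
/* A usage sketch (hypothetical values): converting to 0/1 ints lets a
   conditional participate in ordinary integer arithmetic, e.g. to
   accumulate a per-lane match count.

     vc_t    c   = vc( 1,0,1,1 );
     __m128i cnt = vc_to_vi( c );               // [ 1 0 1 1 ]
     cnt = _mm_add_epi32( cnt, vc_to_vi( c ) ); // [ 2 0 2 2 ]
*/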

/* Reduction operations */

/* vc_any/vc_all return logical true if any/all of the conditions in c
   are true. */

#define vc_any(c) (_mm_movemask_ps( _mm_castsi128_ps( (c) ) )!=0x0)
#define vc_all(c) (_mm_movemask_ps( _mm_castsi128_ps( (c) ) )==0xf)
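
/* For example (values illustrative):

     vc_t c = vc( 0,1,0,0 );
     vc_any( c ); // logical true (lane 1 is true)
     vc_all( c ); // logical false
*/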

/* Misc operations */

/* vc_pack returns an int where bit i equals 0(1) if lane i of c is
   false(true) for i in [0,4).  Vice versa for vc_unpack. */

#define vc_pack(c)   _mm_movemask_ps( _mm_castsi128_ps( (c) ) )
#define vc_unpack(b) _mm_cmpgt_epi32( _mm_and_si128( _mm_set1_epi32( (b) ), _mm_setr_epi32( 1<<0, 1<<1, 1<<2, 1<<3 ) ), \
                                      _mm_setzero_si128() )
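
/* For example (values illustrative), vc_pack / vc_unpack round trip:

     vc_t c = vc( 1,0,0,1 );
     int  b = vc_pack( c ); // b = 0x9 (bit i <- lane i)
     c = vc_unpack( b );    // [ t f f t ] again
*/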

/* vc_expand expands c0:c1 (imm_hi==0) or c2:c3 (imm_hi==1) into a
   paired lane conditional.  That is:

     vc_expand(c,0) returns [ c0 c0 c1 c1 ]
     vc_expand(c,1) returns [ c2 c2 c3 c3 ]

   Conversely:

     vc_narrow(a,b) returns [ a0 a2 b0 b2 ]

   which is useful for turning two paired lane conditionals into a
   single lane conditional.  U.B. if a, b, and/or c are not proper
   vector conditionals.  These are useful, for example, for vectorizing
   64-bit pointer arithmetic used in 32-bit lane SIMD. */

static inline vc_t vc_expand( vc_t c, int imm_hi ) {
  return _mm_cvtepi32_epi64( imm_hi ? _mm_shuffle_epi32( c, _MM_SHUFFLE(3,2,3,2) ) : c ); /* compile time */
}

#define vc_narrow(a,b) _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( (a) ), _mm_castsi128_ps( (b) ), _MM_SHUFFLE(2,0,2,0) ) )
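
/* A usage sketch (hypothetical values): expanding to paired lanes and
   narrowing back recovers the original conditional.

     vc_t c  = vc( 1,0,0,1 );       // [ c0 c1 c2 c3 ] = [ t f f t ]
     vc_t lo = vc_expand( c, 0 );   // [ t t f f ]
     vc_t hi = vc_expand( c, 1 );   // [ f f t t ]
     vc_t d  = vc_narrow( lo, hi ); // [ t f f t ] == c
*/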

/* vc_gather(b,i) returns [ -!!b[i(0)] -!!b[i(1)] ... -!!b[i(3)] ] where
   b is an "int const *" (0/non-zero map to false/true) and i is a vi_t.

   vc_gather_fast(b,i) returns [ b[i(0)] b[i(1)] ... b[i(3)] ] where b
   is an "int const *".  User promises b[i(:)] values are already either
   0 or -1.  i here is a vi_t. */

#if defined(__AVX2__)
#define vc_gather(b,i)      _mm_xor_si128( _mm_set1_epi32( -1 ), \
                                           _mm_cmpeq_epi32( _mm_i32gather_epi32( (b), (i), 4 ), _mm_setzero_si128() ) )
#define vc_gather_fast(b,i) _mm_i32gather_epi32( (b), (i), 4 )
#endif
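
/* A usage sketch (hypothetical array; vi here is assumed to be the
   vector int constructor from the companion vi_t API):

     int  flags[8] = { 0, 9, 0, 0, 3, 0, 0, 1 };
     vi_t idx      = vi( 1, 4, 0, 7 );
     vc_t c        = vc_gather( flags, idx ); // [ t t f t ]
*/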

/* vc_transpose_4x4 transposes the 4x4 matrix stored in vc_t r0,r1,r2,r3
   and stores the result in 4x4 matrix vc_t c0,c1,c2,c3.  All
   c0,c1,c2,c3 should be different for a well defined result.
   Otherwise, in-place operation and/or using the same vc_t to specify
   multiple rows of r is fine. */

#define vc_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do {                                                                   \
    vc_t _vc_transpose_r0 = (r0); vc_t _vc_transpose_r1 = (r1); vc_t _vc_transpose_r2 = (r2); vc_t _vc_transpose_r3 = (r3); \
    vc_t _vc_transpose_t;                                                                                                   \
    /* Transpose 2x2 blocks */                                                                                              \
    _vc_transpose_t = _vc_transpose_r0; _vc_transpose_r0 = _mm_unpacklo_epi32( _vc_transpose_t, _vc_transpose_r2 );         \
    /**/                                _vc_transpose_r2 = _mm_unpackhi_epi32( _vc_transpose_t, _vc_transpose_r2 );         \
    _vc_transpose_t = _vc_transpose_r1; _vc_transpose_r1 = _mm_unpacklo_epi32( _vc_transpose_t, _vc_transpose_r3 );         \
    /**/                                _vc_transpose_r3 = _mm_unpackhi_epi32( _vc_transpose_t, _vc_transpose_r3 );         \
    /* Transpose 1x1 blocks */                                                                                              \
    /**/ (c0) = _mm_unpacklo_epi32( _vc_transpose_r0, _vc_transpose_r1 );                                                   \
    /**/ (c1) = _mm_unpackhi_epi32( _vc_transpose_r0, _vc_transpose_r1 );                                                   \
    /**/ (c2) = _mm_unpacklo_epi32( _vc_transpose_r2, _vc_transpose_r3 );                                                   \
    /**/ (c3) = _mm_unpackhi_epi32( _vc_transpose_r2, _vc_transpose_r3 );                                                   \
  } while(0)
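
/* A usage sketch (hypothetical values): transposing a 4x4 identity
   matrix of conditionals yields the same rows back as columns.

     vc_t r0 = vc( 1,0,0,0 );
     vc_t r1 = vc( 0,1,0,0 );
     vc_t r2 = vc( 0,0,1,0 );
     vc_t r3 = vc( 0,0,0,1 );
     vc_t c0, c1, c2, c3;
     vc_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 );
     // c0==r0, c1==r1, c2==r2, c3==r3
*/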