Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx_h
2 : #error "Do not include this directly; use fd_avx.h"
3 : #endif
4 :
5 : /* Vector byte API *****************************************************/
6 :
7 : /* A wb_t is a vector where each 8-bit wide lane holds an unsigned 8-bit
8 : integer (a "uchar").
9 :
10 : These mirror the other APIs as much as possible. Macros are
11 : preferred over static inlines when it is possible to do it robustly
12 : to reduce the risk of the compiler mucking it up. */
13 :
#define wb_t __m256i

/* Constructors */

/* TODO: update older SIMD modules to follow the more general convention
   below. */

/* Given the uchar values, return ... */

/* wb( b0, b1, ..., b31 ) returns [ b0 b1 ... b31 ].  Every argument is
   cast to char before it is handed to _mm256_setr_epi8. */

#define wb(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10,b11,b12,b13,b14,b15, \
           b16,b17,b18,b19,b20,b21,b22,b23,b24,b25,b26,b27,b28,b29,b30,b31) \
  _mm256_setr_epi8( (char)( b0), (char)( b1), (char)( b2), (char)( b3),    \
                    (char)( b4), (char)( b5), (char)( b6), (char)( b7),    \
                    (char)( b8), (char)( b9), (char)(b10), (char)(b11),    \
                    (char)(b12), (char)(b13), (char)(b14), (char)(b15),    \
                    (char)(b16), (char)(b17), (char)(b18), (char)(b19),    \
                    (char)(b20), (char)(b21), (char)(b22), (char)(b23),    \
                    (char)(b24), (char)(b25), (char)(b26), (char)(b27),    \
                    (char)(b28), (char)(b29), (char)(b30), (char)(b31) )

/* wb_bcast( b0 ) returns [ b0 b0 ... b0 ] */

#define wb_bcast(b0) _mm256_set1_epi8( (char)(b0) )
31 :
32 : static inline wb_t /* [ b0 b1 b0 b1 ... b0 b1 ] */
33 1376550 : wb_bcast_pair( uchar b0, uchar b1 ) {
34 1376550 : return _mm256_setr_epi8( (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1),
35 1376550 : (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1),
36 1376550 : (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1),
37 1376550 : (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1) );
38 1376550 : }
39 :
40 : static inline wb_t /* [ b0 b1 b2 b3 b0 b1 b2 b3 ... b0 b1 b2 b3 ] */
41 589824 : wb_bcast_quad( uchar b0, uchar b1, uchar b2, uchar b3 ) {
42 589824 : return _mm256_setr_epi8( (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3),
43 589824 : (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3),
44 589824 : (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3),
45 589824 : (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3) );
46 589824 : }
47 :
48 : static inline wb_t /* [ b0 b1 ... b7 b0 b1 ... b7 b0 b1 ... b7 b0 b1 ... b7 ] */
49 196608 : wb_bcast_oct( uchar b0, uchar b1, uchar b2, uchar b3, uchar b4, uchar b5, uchar b6, uchar b7 ) {
50 196608 : return _mm256_setr_epi8( (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7),
51 196608 : (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7),
52 196608 : (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7),
53 196608 : (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7) );
54 196608 : }
55 :
56 : static inline wb_t /* [ b0 b1 ... b15 b0 b1 ... b15 ] */
57 : wb_bcast_hex( uchar b0, uchar b1, uchar b2, uchar b3, uchar b4, uchar b5, uchar b6, uchar b7,
58 196608 : uchar b8, uchar b9, uchar b10, uchar b11, uchar b12, uchar b13, uchar b14, uchar b15 ) {
59 196608 : return _mm256_setr_epi8( (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7),
60 196608 : (char)(b8), (char)(b9), (char)(b10), (char)(b11), (char)(b12), (char)(b13), (char)(b14), (char)(b15),
61 196608 : (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7),
62 196608 : (char)(b8), (char)(b9), (char)(b10), (char)(b11), (char)(b12), (char)(b13), (char)(b14), (char)(b15) );
63 196608 : }
64 :
65 : static inline wb_t /* [ b0 b0 ... b0 b1 b1 ... b1 ] */
66 196608 : wb_expand_pair( uchar b0, uchar b1 ) {
67 196608 : return _mm256_setr_epi8( (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0),
68 196608 : (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0),
69 196608 : (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1),
70 196608 : (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1) );
71 196608 : }
72 :
73 : static inline wb_t /* [ b0 b0 ... b0 b1 b1 ... b1 b2 b2 ... b2 b3 b3 ... b3 ] */
74 196608 : wb_expand_quad( uchar b0, uchar b1, uchar b2, uchar b3 ) {
75 196608 : return _mm256_setr_epi8( (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0),
76 196608 : (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1),
77 196608 : (char)(b2), (char)(b2), (char)(b2), (char)(b2), (char)(b2), (char)(b2), (char)(b2), (char)(b2),
78 196608 : (char)(b3), (char)(b3), (char)(b3), (char)(b3), (char)(b3), (char)(b3), (char)(b3), (char)(b3) );
79 196608 : }
80 :
81 : static inline wb_t /* [ b0 b0 b0 b0 b1 b1 b1 b1 ... b7 b7 b7 b7 ] */
82 196608 : wb_expand_oct( uchar b0, uchar b1, uchar b2, uchar b3, uchar b4, uchar b5, uchar b6, uchar b7 ) {
83 196608 : return _mm256_setr_epi8( (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b1), (char)(b1), (char)(b1), (char)(b1),
84 196608 : (char)(b2), (char)(b2), (char)(b2), (char)(b2), (char)(b3), (char)(b3), (char)(b3), (char)(b3),
85 196608 : (char)(b4), (char)(b4), (char)(b4), (char)(b4), (char)(b5), (char)(b5), (char)(b5), (char)(b5),
86 196608 : (char)(b6), (char)(b6), (char)(b6), (char)(b6), (char)(b7), (char)(b7), (char)(b7), (char)(b7) );
87 196608 : }
88 :
89 : static inline wb_t /* [ b0 b0 b1 b1 ... b15 b15 ] */
90 : wb_expand_hex( uchar b0, uchar b1, uchar b2, uchar b3, uchar b4, uchar b5, uchar b6, uchar b7,
91 196608 : uchar b8, uchar b9, uchar b10, uchar b11, uchar b12, uchar b13, uchar b14, uchar b15 ) {
92 196608 : return _mm256_setr_epi8( (char)( b0), (char)( b0), (char)( b1), (char)( b1), (char)( b2), (char)( b2), (char)( b3), (char)( b3),
93 196608 : (char)( b4), (char)( b4), (char)( b5), (char)( b5), (char)( b6), (char)( b6), (char)( b7), (char)( b7),
94 196608 : (char)( b8), (char)( b8), (char)( b9), (char)( b9), (char)(b10), (char)(b10), (char)(b11), (char)(b11),
95 196608 : (char)(b12), (char)(b12), (char)(b13), (char)(b13), (char)(b14), (char)(b14), (char)(b15), (char)(b15) );
96 196608 : }
97 :
98 : /* No general wb_permute due to cross-128-bit lane limitations in AVX.
99 : Useful cases are provided below. Given [ b0 b1 ... b31 ], return ... */
100 :
/* The wb_exch_adj family swaps adjacent groups of 1, 2, 4 or 8 bytes.
   Each is one _mm256_shuffle_epi8 whose index vector lists the same 16
   entry pattern for both 128-bit halves. */

#define wb_exch_adj(x) /* [ b1 b0 b3 b2 ... b31 b30 ] */ \
  _mm256_shuffle_epi8( (x), wb(  1,  0,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 15, 14,   \
                                 1,  0,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 15, 14 ) )

#define wb_exch_adj_pair(x) /* [ b2 b3 b0 b1 ... b30 b31 b28 b29 ] */ \
  _mm256_shuffle_epi8( (x), wb(  2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13,   \
                                 2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13 ) )

#define wb_exch_adj_quad(x) /* [ b4 b5 b6 b7 b0 b1 b2 b3 ... b28 b29 b30 b31 b24 b25 b26 b27 ] */ \
  _mm256_shuffle_epi8( (x), wb(  4,  5,  6,  7,  0,  1,  2,  3, 12, 13, 14, 15,  8,  9, 10, 11,   \
                                 4,  5,  6,  7,  0,  1,  2,  3, 12, 13, 14, 15,  8,  9, 10, 11 ) )

#define wb_exch_adj_oct(x) /* [ b8 b9 ... b15 b0 b1 ... b7 b24 b25 ... b31 b16 b17 ... b23 ] */ \
  _mm256_shuffle_epi8( (x), wb(  8,  9, 10, 11, 12, 13, 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,   \
                                 8,  9, 10, 11, 12, 13, 14, 15,  0,  1,  2,  3,  4,  5,  6,  7 ) )
116 :
117 : static inline wb_t /* [ b16 b17 ... b31 b0 b1 ... b15 ] */
118 196608 : wb_exch_adj_hex( wb_t x ) {
119 196608 : return _mm256_permute2f128_si256( x, x, 1 );
120 196608 : }
121 :
/* wb_bcast_even / wb_bcast_odd copy the even / odd byte of every
   adjacent byte pair into both bytes of that pair. */

#define wb_bcast_even(x) /* [ b0 b0 b2 b2 ... b30 b30 ] */ \
  _mm256_shuffle_epi8( (x), wb(  0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10, 10, 12, 12, 14, 14,   \
                                 0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10, 10, 12, 12, 14, 14 ) )

#define wb_bcast_odd(x) /* [ b1 b1 b3 b3 ... b31 b31 ] */ \
  _mm256_shuffle_epi8( (x), wb(  1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11, 11, 13, 13, 15, 15,   \
                                 1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11, 11, 13, 13, 15, 15 ) )
129 :
130 : /* Predefined constants */
131 :
/* wb_zero() returns [ 0 0 ... 0 ] */
#define wb_zero() _mm256_setzero_si256()

/* wb_one() returns [ 1 1 ... 1 ] */
#define wb_one() _mm256_set1_epi8( 1 )
134 :
135 : /* Memory operations */
136 :
137 : /* wb_ld returns the 32 uchars at the 32-byte aligned / 32-byte sized
138 : location p as a vector uchar. wb_ldu is the same but p does not have
139 : to be aligned. wb_st writes the vector uchar to the 32-byte aligned /
140 : 32-byte sized location p as 32 uchars. wb_stu is the same but p does not
141 : have to be aligned. In all these lane l will be at p[l]. FIXME: USE
142 : ATTRIBUTES ON P PASSED TO THESE?
143 :
144 : Note: gcc knows a __m256i may alias. */
145 :
146 15415434319 : static inline wb_t wb_ld( uchar const * p ) { return _mm256_load_si256( (__m256i const *)p ); }
147 20643981 : static inline void wb_st( uchar * p, wb_t i ) { _mm256_store_si256( (__m256i *)p, i ); }
148 :
149 3875194216 : static inline wb_t wb_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
150 1408716134 : static inline void wb_stu( void * p, wb_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
151 :
152 : /* Sadly, no maskload_epi8, so we can't provide a wb_ldif or wb_stif.
153 : TODO: consider emulating this? */
154 :
155 : /* Element operations */
156 :
157 : /* wb_extract extracts the uchar in lane imm from the vector uchar.
158 : wb_insert returns the vector uchar formed by replacing the value in
159 : lane imm of a wb_t with the provided uchar. imm should be a compile
160 : time constant in 0:31. wb_extract_variable and wb_insert_variable
161 : are slower, but the lane n does not have to be known at compile
162 : time (should still be in 0:31).
163 :
164 : Note: C99 TC3 allows type punning through a union. */
165 :
/* wb_extract: value of lane imm of a, as a uchar */
#define wb_extract(a,imm) \
  ((uchar)_mm256_extract_epi8( (a), (imm) ))

/* wb_insert: copy of a with lane imm replaced by (char)v */
#define wb_insert(a,imm,v) \
  _mm256_insert_epi8( (a), (char)(v), (imm) )
168 :
169 : static inline uchar
170 660603072 : wb_extract_variable( wb_t a, int n ) {
171 660603072 : union { __m256i m[1]; uchar i[32]; } t[1];
172 660603072 : _mm256_store_si256( t->m, a );
173 660603072 : return t->i[n];
174 660603072 : }
175 :
176 : static inline wb_t
177 660603072 : wb_insert_variable( wb_t a, int n, uchar v ) {
178 660603072 : union { __m256i m[1]; uchar i[32]; } t[1];
179 660603072 : _mm256_store_si256( t->m, a );
180 660603072 : t->i[n] = v;
181 660603072 : return _mm256_load_si256( t->m );
182 660603072 : }
183 :
184 : /* Given [a0 a1 ... a31] and/or [b0 b1 ... b31], return ... */
185 :
186 : /* Arithmetic operations */
187 :
/* wb_neg(a)   returns [ -a0 -a1 ... -a31 ] (twos complement handling)
   wb_abs(a)   returns [ |a0| |a1| ... |a31| ] (unsigned type, so identity)
   wb_min(a,b) returns [ min(a0,b0) min(a1,b1) ... min(a31,b31) ]
   wb_max(a,b) returns [ max(a0,b0) max(a1,b1) ... max(a31,b31) ]
   wb_add(a,b) returns [ a0+b0 a1+b1 ... a31+b31 ]
   wb_sub(a,b) returns [ a0-b0 a1-b1 ... a31-b31 ] */

#define wb_neg(a)   _mm256_sub_epi8( _mm256_setzero_si256(), (a) )
#define wb_abs(a)   (a)

#define wb_min(a,b) _mm256_min_epu8( (a), (b) )
#define wb_max(a,b) _mm256_max_epu8( (a), (b) )
#define wb_add(a,b) _mm256_add_epi8( (a), (b) )
#define wb_sub(a,b) _mm256_sub_epi8( (a), (b) )
195 :
196 : /* No wb_mul because there's no instruction for multiplying uchars. You
197 : can build one with two invocations to _mm256_mullo_epi16, but it won't
198 : be particularly fast. Multiplication by add and shift might be
199 : faster honestly. TODO: consider emulating for completeness? */
200 :
201 : /* Bit operations */
202 :
203 : /* Note: wb_shl/wb_shr is an unsigned left/right shift by imm bits; imm
204 : must be a compile time constant in 0:7. The variable variants are
205 : slower but do not require the shift amount to be known at compile
206 : time (should still be in 0:7).
207 :
208 : vector shift amount variants are omitted for the time being as these
209 : are rarely needed and there seems to be little support for it.
210 : Probably could be done via two 16-wide vector shifts for the even/odd
211 : lanes and some masking tricks. */
212 :
/* wb_not(a) returns [ ~a0 ~a1 ... ~a31 ] */

#define wb_not(a) _mm256_xor_si256( _mm256_set1_epi32( -1 ), (a) )

/* wb_shl(a,imm) returns [ a0<<imm a1<<imm ... a31<<imm ]
   wb_shr(a,imm) returns [ a0>>imm a1>>imm ... a31>>imm ]

   The shift itself is done on 16-bit lanes, so bits can cross between
   the two bytes of each 16-bit lane.  The result is then ANDed with a
   broadcast byte mask (0xFF shifted the same way) to clear them. */

#define wb_shl(a,imm) \
  wb_and( _mm256_slli_epi16( (a), (imm) ), wb_bcast( (uchar)(0xFFUL << (imm)) ) )
#define wb_shr(a,imm) \
  wb_and( _mm256_srli_epi16( (a), (imm) ), wb_bcast( (uchar)(0xFFUL >> (imm)) ) )

/* Same as wb_shl / wb_shr but the shift amount is passed through the
   low 64 bits of an otherwise zero __m128i.  Note that n appears twice
   in each expansion, so n is evaluated twice. */

#define wb_shl_variable(a,n)                                                       \
  wb_and( _mm256_sll_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ), \
          wb_bcast( (uchar)(0xFFUL << (n)) ) )
#define wb_shr_variable(a,n)                                                       \
  wb_and( _mm256_srl_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ), \
          wb_bcast( (uchar)(0xFFUL >> (n)) ) )

/* wb_and(a,b)    returns [ a0&b0    a1&b1    ... a31&b31    ]
   wb_andnot(a,b) returns [ (~a0)&b0 (~a1)&b1 ... (~a31)&b31 ]
   wb_or(a,b)     returns [ a0|b0    a1|b1    ... a31|b31    ]
   wb_xor(a,b)    returns [ a0^b0    a1^b1    ... a31^b31    ] */

#define wb_and(a,b)    _mm256_and_si256(    (a), (b) )
#define wb_andnot(a,b) _mm256_andnot_si256( (a), (b) )
#define wb_or(a,b)     _mm256_or_si256(     (a), (b) )
#define wb_xor(a,b)    _mm256_xor_si256(    (a), (b) )
227 :
228 1572864 : static inline wb_t wb_rol( wb_t a, int imm ) { return wb_or( wb_shl( a, imm & 7 ), wb_shr( a, (-imm) & 7 ) ); }
229 1572864 : static inline wb_t wb_ror( wb_t a, int imm ) { return wb_or( wb_shr( a, imm & 7 ), wb_shl( a, (-imm) & 7 ) ); }
230 :
231 1572864 : static inline wb_t wb_rol_variable( wb_t a, int n ) { return wb_or( wb_shl_variable( a, n&7 ), wb_shr_variable( a, (-n)&7 ) ); }
232 1572864 : static inline wb_t wb_ror_variable( wb_t a, int n ) { return wb_or( wb_shr_variable( a, n&7 ), wb_shl_variable( a, (-n)&7 ) ); }
233 :
234 : /* Logical operations */
235 :
/* wb_lnot(a)    returns [ !a0  !a1  ... !a31  ]
   wb_lnotnot(a) returns [ !!a0 !!a1 ... !!a31 ] */

#define wb_lnot(a)    _mm256_cmpeq_epi8( (a), _mm256_setzero_si256() )
#define wb_lnotnot(a) _mm256_xor_si256( _mm256_set1_epi32( -1 ), wb_lnot( (a) ) )

/* wb_eq(a,b) returns [ a0==b0 a1==b1 ... a31==b31 ]
   wb_gt(a,b) returns [ a0> b0 a1> b1 ... a31> b31 ]
   wb_lt(a,b) returns [ a0< b0 a1< b1 ... a31< b31 ]
   wb_ne(a,b) returns [ a0!=b0 a1!=b1 ... a31!=b31 ]
   wb_ge(a,b) returns [ a0>=b0 a1>=b1 ... a31>=b31 ]
   wb_le(a,b) returns [ a0<=b0 a1<=b1 ... a31<=b31 ]

   wb_gt subtracts (char)(1U<<7) from both operands before the signed
   byte compare; wb_ne, wb_ge and wb_le are the bitwise complement of
   an eq / gt result. */

#define wb_eq(a,b) _mm256_cmpeq_epi8( (a), (b) )

#define wb_gt(a,b)                                                                \
  _mm256_cmpgt_epi8( _mm256_sub_epi8( (a), _mm256_set1_epi8( (char)(1U<<7) ) ),   \
                     _mm256_sub_epi8( (b), _mm256_set1_epi8( (char)(1U<<7) ) ) )

#define wb_lt(a,b) wb_gt( (b), (a) )

#define wb_ne(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi8( (a), (b) ) )
#define wb_ge(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), wb_gt( (b), (a) ) )
#define wb_le(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), wb_gt( (a), (b) ) )
248 :
249 : /* Conditional operations */
250 :
/* wb_czero(c,f)    returns [ c0? 0:f0 c1? 0:f1 ... c31? 0:f31 ]
   wb_notczero(c,f) returns [ c0?f0: 0 c1?f1: 0 ... c31?f31: 0 ]
   wb_if(c,t,f)     returns [ c0?t0:f0 c1?t1:f1 ... c31?t31:f31 ]

   czero / notczero are a plain andnot / and of c with f; wb_if is a
   byte blend of f and t controlled by c. */

#define wb_czero(c,f)    _mm256_andnot_si256( (c), (f) )
#define wb_notczero(c,f) _mm256_and_si256(    (c), (f) )

#define wb_if(c,t,f)     _mm256_blendv_epi8( (f), (t), (c) )
255 :
256 : /* Conversion operations */
257 :
258 : /* Summarizing:
259 :
260 : wb_to_wc(a, 0) returns [ !!a0 !!a1 ... !!a7 ]
261 : wb_to_wc(a, 1) returns [ !!a8 !!a9 ... !!a15 ]
262 : wb_to_wc(a, 2) returns [ !!a16 !!a17 ... !!a23 ]
263 : wb_to_wc(a, 3) returns [ !!a24 !!a25 ... !!a31 ]
264 : // TODO: wc variants for 8, 16, and 64 wide SIMD conditionals
265 :
266 : wb_to_wf(a, 0) returns [ (float)a0 (float)a1 ... (float)a7 ]
267 : wb_to_wf(a, 1) returns [ (float)a8 (float)a9 ... (float)a15 ]
268 : wb_to_wf(a, 2) returns [ (float)a16 (float)a17 ... (float)a23 ]
269 : wb_to_wf(a, 3) returns [ (float)a24 (float)a25 ... (float)a31 ]
270 :
271 : wb_to_wi(a, 0) returns [ (int)a0 (int)a1 ... (int)a7 ]
272 : wb_to_wi(a, 1) returns [ (int)a8 (int)a9 ... (int)a15 ]
273 : wb_to_wi(a, 2) returns [ (int)a16 (int)a17 ... (int)a23 ]
274 : wb_to_wi(a, 3) returns [ (int)a24 (int)a25 ... (int)a31 ]
275 :
276 : wb_to_wu(a, 0) returns [ (uint)a0 (uint)a1 ... (uint)a7 ]
277 : wb_to_wu(a, 1) returns [ (uint)a8 (uint)a9 ... (uint)a15 ]
278 : wb_to_wu(a, 2) returns [ (uint)a16 (uint)a17 ... (uint)a23 ]
279 : wb_to_wu(a, 3) returns [ (uint)a24 (uint)a25 ... (uint)a31 ]
280 :
281 : wb_to_wd(a,0) returns [ (double)a0 (double)a1 (double)a2 (double)a3 ]
282 : wb_to_wd(a,1) returns [ (double)a4 (double)a5 (double)a6 (double)a7 ]
283 : ...
284 : wb_to_wd(a,7) returns [ (double)a28 (double)a29 (double)a30 (double)a31 ]
285 :
286 : wb_to_wl(a,0) returns [ (long)a0 (long)a1 (long)a2 (long)a3 ]
287 : wb_to_wl(a,1) returns [ (long)a4 (long)a5 (long)a6 (long)a7 ]
288 : ...
289 : wb_to_wl(a,7) returns [ (long)a28 (long)a29 (long)a30 (long)a31 ]
290 :
291 : wb_to_wv(a,0) returns [ (ulong)a0 (ulong)a1 (ulong)a2 (ulong)a3 ]
292 : wb_to_wv(a,1) returns [ (ulong)a4 (ulong)a5 (ulong)a6 (ulong)a7 ]
293 : ...
294 : wb_to_wv(a,7) returns [ (ulong)a28 (ulong)a29 (ulong)a30 (ulong)a31 ]
295 :
296 : where the above values should be compile time constants. */
297 :
298 : /* wb_expand_internal_{4, 8} selects the right group of {4,8} x 32 bits
299 : (zero extending it) */
300 :
301 : static inline __m256i
302 3145728 : wb_expand_internal_8( wb_t a, int imm ) {
303 3145728 : switch( imm ) {
304 786432 : case 0: return _mm256_cvtepu8_epi32( _mm256_extractf128_si256( a, 0 ) );
305 786432 : case 1: return _mm256_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 0 ), 8 ) );
306 786432 : case 2: return _mm256_cvtepu8_epi32( _mm256_extractf128_si256( a, 1 ) );
307 786432 : case 3: return _mm256_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 1 ), 8 ) );
308 3145728 : }
309 0 : return _mm256_setzero_si256(); /* Unreachable */
310 3145728 : }
311 :
312 : static inline __m128i
313 4718592 : wb_expand_internal_4( wb_t a, int imm ) {
314 4718592 : switch( imm ) {
315 589824 : case 0: return _mm_cvtepu8_epi32( _mm256_extractf128_si256( a, 0 ) );
316 589824 : case 1: return _mm_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 0 ), 4 ) );
317 589824 : case 2: return _mm_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 0 ), 8 ) );
318 589824 : case 3: return _mm_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 0 ), 12 ) );
319 589824 : case 4: return _mm_cvtepu8_epi32( _mm256_extractf128_si256( a, 1 ) );
320 589824 : case 5: return _mm_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 1 ), 4 ) );
321 589824 : case 6: return _mm_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 1 ), 8 ) );
322 589824 : case 7: return _mm_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 1 ), 12 ) );
323 4718592 : }
324 0 : return _mm_setzero_si128(); /* Unreachable */
325 4718592 : }
326 :
/* 8 lane targets: built on wb_expand_internal_8( a, imm ) */

#define wb_to_wc( a, imm )                                                      \
  _mm256_xor_si256( _mm256_set1_epi32( -1 ),                                    \
                    _mm256_cmpeq_epi32( wb_expand_internal_8( (a), (imm) ), _mm256_setzero_si256() ) )
#define wb_to_wf( a, imm ) _mm256_cvtepi32_ps( wb_expand_internal_8( (a), (imm) ) )
#define wb_to_wi( a, imm ) wb_expand_internal_8( (a), (imm) )
#define wb_to_wu( a, imm ) wb_expand_internal_8( (a), (imm) )

/* 4 lane targets: built on wb_expand_internal_4( a, imm ).  wb_to_wl
   and wb_to_wv could be slightly faster with _mm256_cvtepu8_epi64. */

#define wb_to_wd( a, imm ) _mm256_cvtepi32_pd   ( wb_expand_internal_4( (a), (imm) ) )
#define wb_to_wl( a, imm ) _mm256_cvtepu32_epi64( wb_expand_internal_4( (a), (imm) ) )
#define wb_to_wv( a, imm ) _mm256_cvtepu32_epi64( wb_expand_internal_4( (a), (imm) ) )

/* Raw variants: the bits of a are passed through unchanged; only the
   float and double targets need a cast intrinsic. */

#define wb_to_wc_raw(a) (a)
#define wb_to_wf_raw(a) _mm256_castsi256_ps( (a) )
#define wb_to_wi_raw(a) (a)
#define wb_to_wu_raw(a) (a)
#define wb_to_wd_raw(a) _mm256_castsi256_pd( (a) )
#define wb_to_wv_raw(a) (a)
#define wb_to_wl_raw(a) (a)
342 :
343 : /* Reduction operations */
344 :
345 : static inline wb_t
346 196608 : wb_sum_all( wb_t x ) { /* Returns wb_bcast( sum( x ) ) */
347 196608 : x = _mm256_sad_epu8( x, _mm256_setzero_si256() ); /* x[0-7] x[8-15] x[16-23] x[24-31] (each stored in 64 bits) */
348 196608 : x = _mm256_add_epi64( x, _mm256_permute2f128_si256( x, x, 1 ) ); /* x[0-7,16-23] x[8-15,24-31] x[0-7,16-23] x[8-15,24-31] (each stored in 64 bits) */
349 196608 : return _mm256_add_epi8( _mm256_shuffle_epi8( x, wb_bcast( 0 ) ) , _mm256_shuffle_epi8( x, wb_bcast( 8 ) ) ); /* Grab the low byte of each sum, broadcast it, then sum */
350 196608 : }
351 :
352 : static inline wb_t
353 196608 : wb_min_all( wb_t x ) { /* Returns wb_bcast( min( x ) ) */
354 196608 : x = _mm256_min_epu8( x, _mm256_permute2f128_si256( x, x, 1 ) ); /* x0,16 x1,17 .. x15,31 x0,16 x1,17 ... x15,31 */
355 196608 : x = _mm256_min_epu8( x, _mm256_shuffle_epi8( x, wb( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
356 196608 : 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ) ) ); /* x0,8,16,24 x1,9,17,25 .. x7,15,23,31 (repeats 3 more times) */
357 196608 : x = _mm256_min_epu8( x, _mm256_shuffle_epi8( x, wb( 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3,
358 196608 : 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 ) ) ); /* x0,4,8,12,16,20,24,28 .. x3,7,11,15,19,23,27,31 (repeats 7 more times)*/
359 196608 : x = _mm256_min_epu8( x, _mm256_shuffle_epi8( x, wb_bcast_quad( 2, 3, 0, 1 ) ) );
360 196608 : x = _mm256_min_epu8( x, _mm256_shuffle_epi8( x, wb_bcast_pair( 1, 0 ) ) );
361 196608 : return x;
362 196608 : }
363 :
364 : static inline wb_t
365 196608 : wb_max_all( wb_t x ) { /* Returns wb_bcast( max( x ) ) */
366 196608 : x = _mm256_max_epu8( x, _mm256_permute2f128_si256( x, x, 1 ) ); /* x0,16 x1,17 .. x15,31 x0,16 x1,17 ... x15,31 */
367 196608 : x = _mm256_max_epu8( x, _mm256_shuffle_epi8( x, wb( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
368 196608 : 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ) ) ); /* x0,8,16,24 x1,9,17,25 .. x7,15,23,31 (repeats 3 more times) */
369 196608 : x = _mm256_max_epu8( x, _mm256_shuffle_epi8( x, wb( 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3,
370 196608 : 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 ) ) ); /* x0,4,8,12,16,20,24,28 .. x3,7,11,15,19,23,27,31 (repeats 7 more times)*/
371 196608 : x = _mm256_max_epu8( x, _mm256_shuffle_epi8( x, wb_bcast_quad( 2, 3, 0, 1 ) ) );
372 196608 : x = _mm256_max_epu8( x, _mm256_shuffle_epi8( x, wb_bcast_pair( 1, 0 ) ) );
373 196608 : return x;
374 196608 : }
375 :
376 : /* Misc operations */
377 :
378 : /* TODO: These probably are actually part of the wc post generalization
379 : to different width SIMD types. */
380 :
381 : /* wb_{any, all} return 1 if any/all of the elements are non-zero. The
382 : _fast variants are suitable for use with the return value of any of
383 : the wb comparison functions (e.g. wb_gt ). */
384 :
/* The _fast forms test the byte mask produced by _mm256_movemask_epi8
   directly: any bit set for "any", all 32 bits set (-1 as an int) for
   "all".  The plain forms first compare x against zero with wb_ne. */

#define wb_any_fast( x ) ( 0 != _mm256_movemask_epi8( x ) )
#define wb_all_fast( x ) ( -1 == _mm256_movemask_epi8( x ) )

#define wb_any( x ) wb_any_fast( wb_ne( (x), wb_zero( ) ) )
#define wb_all( x ) wb_all_fast( wb_ne( (x), wb_zero( ) ) )
|