#ifndef HEADER_fd_src_util_simd_fd_sse_h
#error "Do not include this directly; use fd_sse.h"
#endif

/* Vector byte API *****************************************************/

/* A vb_t is a vector where each 8-bit wide lane holds an unsigned 8-bit
   integer (a "uchar").

   These mirror the other APIs as much as possible.  Macros are
   preferred over static inlines when it is possible to do it robustly
   to reduce the risk of the compiler mucking it up. */

#define vb_t __m128i

/* Constructors */

/* Given the uchar values, return ... */

#define vb(b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,b11,b12,b13,b14,b15) /* [ b0 b1 ... b15 ] */ \
  _mm_setr_epi8( (char)( b0), (char)( b1), (char)( b2), (char)( b3), (char)( b4), (char)( b5), (char)( b6), (char)( b7), \
                 (char)( b8), (char)( b9), (char)(b10), (char)(b11), (char)(b12), (char)(b13), (char)(b14), (char)(b15) )

#define vb_bcast(b0) _mm_set1_epi8( (char)(b0) ) /* [ b0 b0 ... b0 ] */

static inline vb_t /* [ b0 b1 b0 b1 ... b0 b1 ] */
vb_bcast_pair( uchar b0, uchar b1 ) {
  return _mm_setr_epi8( (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1),
                        (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1) );
}

static inline vb_t /* [ b0 b1 b2 b3 b0 b1 b2 b3 ... b0 b1 b2 b3 ] */
vb_bcast_quad( uchar b0, uchar b1, uchar b2, uchar b3 ) {
  return _mm_setr_epi8( (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3),
                        (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3) );
}

static inline vb_t /* [ b0 b1 ... b7 b0 b1 ... b7 ] */
vb_bcast_oct( uchar b0, uchar b1, uchar b2, uchar b3, uchar b4, uchar b5, uchar b6, uchar b7 ) {
  return _mm_setr_epi8( (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7),
                        (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7) );
}

static inline vb_t /* [ b0 b0 ... b0 b1 b1 ... b1 ] */
vb_expand_pair( uchar b0, uchar b1 ) {
  return _mm_setr_epi8( (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0),
                        (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1) );
}

static inline vb_t /* [ b0 b0 b0 b0 b1 b1 b1 b1 ... b3 b3 b3 b3 ] */
vb_expand_quad( uchar b0, uchar b1, uchar b2, uchar b3 ) {
  return _mm_setr_epi8( (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b1), (char)(b1), (char)(b1), (char)(b1),
                        (char)(b2), (char)(b2), (char)(b2), (char)(b2), (char)(b3), (char)(b3), (char)(b3), (char)(b3) );
}

static inline vb_t /* [ b0 b0 b1 b1 ... b7 b7 ] */
vb_expand_oct( uchar b0, uchar b1, uchar b2, uchar b3, uchar b4, uchar b5, uchar b6, uchar b7 ) {
  return _mm_setr_epi8( (char)(b0), (char)(b0), (char)(b1), (char)(b1), (char)(b2), (char)(b2), (char)(b3), (char)(b3),
                        (char)(b4), (char)(b4), (char)(b5), (char)(b5), (char)(b6), (char)(b6), (char)(b7), (char)(b7) );
}

#define vb_permute(x,i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15) /* [ x[i0] x[i1] ... x[i15] ] */ \
  _mm_shuffle_epi8( (x), vb( (i0), (i1), (i2),  (i3),  (i4),  (i5),  (i6),  (i7), \
                             (i8), (i9), (i10), (i11), (i12), (i13), (i14), (i15) ) )
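
/* A hypothetical usage sketch of vb_permute (vb_reverse_sketch is an
   illustrative name, not part of the API): reverse the 16 lanes of x. */

static inline vb_t
vb_reverse_sketch( vb_t x ) {
  return vb_permute( x, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );
}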

/* Useful cases are provided below.  Given [ b0 b1 b2 b3 b4 ... b15 ], return ... */

#define vb_exch_adj(x) /* [ b1 b0 b3 b2 ... b15 b14 ] */ \
  _mm_shuffle_epi8( (x), vb( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 ) )

#define vb_exch_adj_pair(x) /* [ b2 b3 b0 b1 .. b14 b15 b12 b13 ] */ \
  _mm_shuffle_epi8( (x), vb( 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ) )

#define vb_exch_adj_quad(x) /* [ b4 b5 b6 b7 b0 b1 b2 b3 .. b8 b9 b10 b11 ] */ \
  _mm_shuffle_epi8( (x), vb( 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 ) )

#define vb_exch_adj_oct(x) /* [ b8 b9 ... b15 b0 b1 ... b7 ] */ \
  _mm_shuffle_epi8( (x), vb( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ) )

#define vb_bcast_even(x) /* [ b0 b0 b2 b2 b4 b4 .. b12 b12 b14 b14 ] */ \
  _mm_shuffle_epi8( (x), vb( 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 ) )

#define vb_bcast_odd(x) /* [ b1 b1 b3 b3 b5 b5 .. b13 b13 b15 b15 ] */ \
  _mm_shuffle_epi8( (x), vb( 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 ) )

/* Predefined constants */

#define vb_zero() _mm_setzero_si128() /* Return [ 0 0 ... 0 ] */
#define vb_one()  _mm_set1_epi8( 1 )  /* Return [ 1 1 ... 1 ] */

/* Memory operations */

/* vb_ld returns the 16 uchars at the 16-byte aligned / 16-byte sized
   location p as a vector uchar.  vb_ldu is the same but p does not have
   to be aligned.  vb_st writes the vector uchar to the 16-byte aligned /
   16-byte sized location p as 16 uchars.  vb_stu is the same but p does
   not have to be aligned.  In all of these, lane l will be at p[l].
   FIXME: USE ATTRIBUTES ON P PASSED TO THESE?

   Note: gcc knows a __m128i may alias. */

static inline vb_t vb_ld( uchar const * p ) { return _mm_load_si128( (__m128i const *)p ); }
static inline void vb_st( uchar * p, vb_t i ) { _mm_store_si128( (__m128i *)p, i ); }

static inline vb_t vb_ldu( void const * p ) { return _mm_loadu_si128( (__m128i const *)p ); }
static inline void vb_stu( void * p, vb_t i ) { _mm_storeu_si128( (__m128i *)p, i ); }
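
/* A hypothetical usage sketch of the above (names illustrative, not
   part of the API): copy n bytes, n a positive multiple of 16, using
   the unaligned variants so neither pointer needs 16-byte alignment. */

static inline void
vb_memcpy_sketch( uchar * dst, uchar const * src, ulong n ) {
  for( ulong off=0UL; off<n; off+=16UL ) vb_stu( dst+off, vb_ldu( src+off ) );
}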

/* Sadly, no maskload_epi8, so we can't provide a vb_ldif or vb_stif.
   TODO: consider emulating this? */
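
/* One possible emulation sketch (vb_ldif_sketch is illustrative, not
   part of the API): select between a plain unaligned load and f.  Note
   this is weaker than a true masked load: all 16 bytes at p must be
   readable even for lanes where c is 0, as the load is unconditional. */

static inline vb_t
vb_ldif_sketch( vb_t c, uchar const * p, vb_t f ) {
  return _mm_blendv_epi8( f, _mm_loadu_si128( (__m128i const *)p ), c );
}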

/* Element operations */

/* vb_extract extracts the uchar in lane imm from the vector uchar.
   vb_insert returns the vector uchar formed by replacing the value in
   lane imm of a with the provided uchar.  imm should be a compile time
   constant in 0:15.  vb_extract_variable and vb_insert_variable are
   slower but the lane n does not have to be known at compile time
   (should still be in 0:15).

   Note: C99 TC3 allows type punning through a union. */

#define vb_extract(a,imm)  ((uchar)_mm_extract_epi8( (a), (imm) ))
#define vb_insert(a,imm,v) _mm_insert_epi8( (a), (char)(v), (imm) )

static inline uchar
vb_extract_variable( vb_t a, int n ) {
  union { __m128i m[1]; uchar i[16]; } t[1];
  _mm_store_si128( t->m, a );
  return t->i[n];
}

static inline vb_t
vb_insert_variable( vb_t a, int n, uchar v ) {
  union { __m128i m[1]; uchar i[16]; } t[1];
  _mm_store_si128( t->m, a );
  t->i[n] = v;
  return _mm_load_si128( t->m );
}

/* Given [ a0 a1 ... a15 ] and/or [ b0 b1 ... b15 ], return ... */

/* Arithmetic operations */

#define vb_neg(a) _mm_sub_epi8( _mm_setzero_si128(), (a) ) /* [ -a0 -a1 ... -a15 ] (twos complement handling) */
#define vb_abs(a) (a)                                      /* [ |a0| |a1| ... |a15| ] (unsigned type, so identity) */

#define vb_min(a,b) _mm_min_epu8( (a), (b) ) /* [ min(a0,b0) min(a1,b1) ... min(a15,b15) ] */
#define vb_max(a,b) _mm_max_epu8( (a), (b) ) /* [ max(a0,b0) max(a1,b1) ... max(a15,b15) ] */
#define vb_add(a,b) _mm_add_epi8( (a), (b) ) /* [ a0 +b0     a1 +b1     ... a15 +b15     ] */
#define vb_sub(a,b) _mm_sub_epi8( (a), (b) ) /* [ a0 -b0     a1 -b1     ... a15 -b15     ] */

/* No vb_mul because there's no instruction for multiplying uchars.  You
   can build one with two invocations to _mm_mullo_epi16, but it won't
   be particularly fast.  Multiplication by add and shift might honestly
   be faster.  TODO: consider emulating for completeness? */
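
/* A hypothetical emulation sketch along the above lines (vb_mul_sketch
   is illustrative, not part of the API): multiply the even and odd
   bytes as 16-bit lanes and merge the low bytes of the products. */

static inline vb_t
vb_mul_sketch( vb_t a, vb_t b ) {
  __m128i even = _mm_mullo_epi16( a, b );                 /* low byte of each 16-bit lane is a*b mod 256 for even lanes */
  __m128i odd  = _mm_mullo_epi16( _mm_srli_epi16( a, 8 ), /* move odd lanes into the low bytes and multiply */
                                  _mm_srli_epi16( b, 8 ) );
  return _mm_or_si128( _mm_and_si128( even, _mm_set1_epi16( (short)0x00FF ) ), /* keep the even products */
                       _mm_slli_epi16( odd, 8 ) );                             /* position the odd products */
}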

/* Bit operations */

/* Note: vb_shl/vb_shr is an unsigned left/right shift by imm bits; imm
   must be a compile time constant in 0:7.  The variable variants are
   slower but do not require the shift amount to be known at compile
   time (should still be in 0:7). */

#define vb_not(a) _mm_xor_si128( _mm_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a15 ] */

#define vb_shl(a,imm) vb_and( _mm_slli_epi16( (a), (imm) ), vb_bcast( (uchar)(0xFFUL << (imm)) ) ) /* [ a0<<imm a1<<imm ... a15<<imm ] */
#define vb_shr(a,imm) vb_and( _mm_srli_epi16( (a), (imm) ), vb_bcast( (uchar)(0xFFUL >> (imm)) ) ) /* [ a0>>imm a1>>imm ... a15>>imm ] (treat a as unsigned) */

#define vb_shl_variable(a,n) vb_and( _mm_sll_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ), \
                                     vb_bcast( (uchar)(0xFFUL << (n)) ) )
#define vb_shr_variable(a,n) vb_and( _mm_srl_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ), \
                                     vb_bcast( (uchar)(0xFFUL >> (n)) ) )

#define vb_and(a,b)    _mm_and_si128(    (a), (b) ) /* [ a0 &b0    a1 &b1    ... a15 &b15    ] */
#define vb_andnot(a,b) _mm_andnot_si128( (a), (b) ) /* [ (~a0)&b0  (~a1)&b1  ... (~a15)&b15 ] */
#define vb_or(a,b)     _mm_or_si128(     (a), (b) ) /* [ a0 |b0    a1 |b1    ... a15 |b15    ] */
#define vb_xor(a,b)    _mm_xor_si128(    (a), (b) ) /* [ a0 ^b0    a1 ^b1    ... a15 ^b15    ] */

static inline vb_t vb_rol( vb_t a, int imm ) { return vb_or( vb_shl( a, imm & 7 ), vb_shr( a, (-imm) & 7 ) ); }
static inline vb_t vb_ror( vb_t a, int imm ) { return vb_or( vb_shr( a, imm & 7 ), vb_shl( a, (-imm) & 7 ) ); }

static inline vb_t vb_rol_variable( vb_t a, int n ) { return vb_or( vb_shl_variable( a, n&7 ), vb_shr_variable( a, (-n)&7 ) ); }
static inline vb_t vb_ror_variable( vb_t a, int n ) { return vb_or( vb_shr_variable( a, n&7 ), vb_shl_variable( a, (-n)&7 ) ); }

/* Logical operations */

#define vb_lnot(a)    _mm_cmpeq_epi8( (a), _mm_setzero_si128() ) /* [ !a0 !a1 ... !a15 ] */
#define vb_lnotnot(a) /* [ !!a0 !!a1 ... !!a15 ] */ \
  _mm_xor_si128( _mm_set1_epi32( -1 ), vb_lnot( (a) ) )

#define vb_eq(a,b) _mm_cmpeq_epi8( (a), (b) )                                        /* [ a0==b0 a1==b1 ... a15==b15 ] */
#define vb_gt(a,b) _mm_cmpgt_epi8( _mm_sub_epi8( (a), _mm_set1_epi8( (char)(1U<<7) ) ), /* [ a0> b0 a1> b1 ... a15> b15 ] */ \
                                   _mm_sub_epi8( (b), _mm_set1_epi8( (char)(1U<<7) ) ) )
#define vb_lt(a,b) vb_gt( (b), (a) )                                                 /* [ a0< b0 a1< b1 ... a15< b15 ] */
#define vb_ne(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi8( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a15!=b15 ] */
#define vb_ge(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), vb_gt( (b), (a) ) )          /* [ a0>=b0 a1>=b1 ... a15>=b15 ] */
#define vb_le(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), vb_gt( (a), (b) ) )          /* [ a0<=b0 a1<=b1 ... a15<=b15 ] */
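
/* Note on vb_gt: _mm_cmpgt_epi8 is a signed compare, so both operands
   are biased by subtracting 0x80 first; this maps [0,255] onto
   [-128,127] order preservingly.  E.g. a lane with a0=0xFF (255) and
   b0=0x01 (1) biases to 0x7F (127) and 0x81 (-127), and 127 > -127
   agrees with the unsigned 255 > 1. */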

/* Conditional operations */

#define vb_czero(c,f)    _mm_andnot_si128( (c), (f) ) /* [ c0? 0:f0 c1? 0:f1 ... c15? 0:f15 ] */
#define vb_notczero(c,f) _mm_and_si128(    (c), (f) ) /* [ c0?f0: 0 c1?f1: 0 ... c15?f15: 0 ] */

#define vb_if(c,t,f) _mm_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c15?t15:f15 ] */

/* Conversion operations */

/* Summarizing:

   vb_to_vc(a, 0) returns [ !!a0  !!a1  !!a2  !!a3  ]
   vb_to_vc(a, 1) returns [ !!a4  !!a5  !!a6  !!a7  ]
   vb_to_vc(a, 2) returns [ !!a8  !!a9  !!a10 !!a11 ]
   vb_to_vc(a, 3) returns [ !!a12 !!a13 !!a14 !!a15 ]

   vb_to_vf(a, 0) returns [ (float)a0  (float)a1  (float)a2  (float)a3  ]
   vb_to_vf(a, 1) returns [ (float)a4  (float)a5  (float)a6  (float)a7  ]
   vb_to_vf(a, 2) returns [ (float)a8  (float)a9  (float)a10 (float)a11 ]
   vb_to_vf(a, 3) returns [ (float)a12 (float)a13 (float)a14 (float)a15 ]

   vb_to_vi(a, 0) returns [ (int)a0  (int)a1  (int)a2  (int)a3  ]
   vb_to_vi(a, 1) returns [ (int)a4  (int)a5  (int)a6  (int)a7  ]
   vb_to_vi(a, 2) returns [ (int)a8  (int)a9  (int)a10 (int)a11 ]
   vb_to_vi(a, 3) returns [ (int)a12 (int)a13 (int)a14 (int)a15 ]

   vb_to_vu(a, 0) returns [ (uint)a0  (uint)a1  (uint)a2  (uint)a3  ]
   vb_to_vu(a, 1) returns [ (uint)a4  (uint)a5  (uint)a6  (uint)a7  ]
   vb_to_vu(a, 2) returns [ (uint)a8  (uint)a9  (uint)a10 (uint)a11 ]
   vb_to_vu(a, 3) returns [ (uint)a12 (uint)a13 (uint)a14 (uint)a15 ]

   vb_to_vd(a,0) returns [ (double)a0  (double)a1  ]
   vb_to_vd(a,1) returns [ (double)a2  (double)a3  ]
   ...
   vb_to_vd(a,7) returns [ (double)a14 (double)a15 ]

   vb_to_vl(a,0) returns [ (long)a0  (long)a1  ]
   vb_to_vl(a,1) returns [ (long)a2  (long)a3  ]
   ...
   vb_to_vl(a,7) returns [ (long)a14 (long)a15 ]

   vb_to_vv(a,0) returns [ (ulong)a0  (ulong)a1  ]
   vb_to_vv(a,1) returns [ (ulong)a2  (ulong)a3  ]
   ...
   vb_to_vv(a,7) returns [ (ulong)a14 (ulong)a15 ]

   where the above values should be compile time constants. */

#define vb_to_vc( a, imm ) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 4*(imm) ) ), _mm_setzero_si128() ) )
#define vb_to_vf( a, imm ) _mm_cvtepi32_ps( _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 4*(imm) ) ) )
#define vb_to_vi( a, imm ) _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 4*(imm) ) )
#define vb_to_vu( a, imm ) _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 4*(imm) ) )
#define vb_to_vd( a, imm ) _mm_cvtepi32_pd( _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 2*(imm) ) ) )
#define vb_to_vl( a, imm ) _mm_cvtepu8_epi64( _mm_bsrli_si128( (a), 2*(imm) ) )
#define vb_to_vv( a, imm ) _mm_cvtepu8_epi64( _mm_bsrli_si128( (a), 2*(imm) ) )
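
/* A hypothetical usage sketch (names illustrative, not part of the
   API): widen all 16 lanes of a vb_t into four vectors of ints such
   that lane l of w[c] holds (int)a[4*c+l].  The chunk indices are
   literal constants, as the macros require. */

static inline void
vb_widen_sketch( vb_t a, __m128i w[4] ) {
  w[0] = vb_to_vi( a, 0 ); w[1] = vb_to_vi( a, 1 );
  w[2] = vb_to_vi( a, 2 ); w[3] = vb_to_vi( a, 3 );
}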

#define vb_to_vc_raw(a) (a)
#define vb_to_vf_raw(a) _mm_castsi128_ps( (a) )
#define vb_to_vi_raw(a) (a)
#define vb_to_vu_raw(a) (a)
#define vb_to_vd_raw(a) _mm_castsi128_pd( (a) )
#define vb_to_vl_raw(a) (a)
#define vb_to_vv_raw(a) (a)

/* Reduction operations */

static inline vb_t
vb_sum_all( vb_t x ) { /* Returns vb_bcast( sum( x ) ) */
  x = _mm_sad_epu8( x, _mm_setzero_si128() ); /* x[0-7] x[8-15] (each stored in 64 bits) */
  return _mm_add_epi8( _mm_shuffle_epi8( x, vb_bcast( 0 ) ), _mm_shuffle_epi8( x, vb_bcast( 8 ) ) ); /* Grab the low byte of each sum, broadcast it, then sum */
}

static inline vb_t
vb_min_all( vb_t x ) { /* Returns vb_bcast( min( x ) ) */
  x = _mm_min_epu8( x, _mm_shuffle_epi8( x, vb( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ) ) ); /* x0,8 x1,9 .. x7,15 (repeats 1 more time) */
  x = _mm_min_epu8( x, _mm_shuffle_epi8( x, vb( 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 ) ) );       /* x0,4,8,12 .. x3,7,11,15 (repeats 3 more times) */
  x = _mm_min_epu8( x, _mm_shuffle_epi8( x, vb_bcast_quad( 2, 3, 0, 1 ) ) );                                 /* x_even x_odd (repeats 7 more times) */
  x = _mm_min_epu8( x, _mm_shuffle_epi8( x, vb_bcast_pair( 1, 0 ) ) );                                       /* x_all (repeats 15 more times) */
  return x;
}

static inline vb_t
vb_max_all( vb_t x ) { /* Returns vb_bcast( max( x ) ) */
  x = _mm_max_epu8( x, _mm_shuffle_epi8( x, vb( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ) ) ); /* x0,8 x1,9 .. x7,15 (repeats 1 more time) */
  x = _mm_max_epu8( x, _mm_shuffle_epi8( x, vb( 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 ) ) );       /* x0,4,8,12 .. x3,7,11,15 (repeats 3 more times) */
  x = _mm_max_epu8( x, _mm_shuffle_epi8( x, vb_bcast_quad( 2, 3, 0, 1 ) ) );                                 /* x_even x_odd (repeats 7 more times) */
  x = _mm_max_epu8( x, _mm_shuffle_epi8( x, vb_bcast_pair( 1, 0 ) ) );                                       /* x_all (repeats 15 more times) */
  return x;
}
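
/* A hypothetical usage sketch (names illustrative, not part of the
   API): since the reductions broadcast their result to every lane, the
   scalar result can be read back from any lane, e.g. lane 0. */

static inline uchar
vb_min_16_sketch( uchar const * p ) { /* min of the 16 bytes at p (p need not be aligned) */
  return vb_extract( vb_min_all( vb_ldu( p ) ), 0 );
}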

/* Misc operations */

/* TODO: These are probably actually part of the vc API, post
   generalization to different width SIMD types. */

/* vb_{any,all} return 1 if any/all of the elements are non-zero.  The
   _fast variants are suitable for use with the return value of any of
   the vb comparison functions (e.g. vb_gt). */

#define vb_any_fast( x ) ( 0      != _mm_movemask_epi8( x ) )
#define vb_any( x )      vb_any_fast( vb_ne( (x), vb_zero() ) )
#define vb_all_fast( x ) ( 0xFFFF == _mm_movemask_epi8( x ) )
#define vb_all( x )      vb_all_fast( vb_ne( (x), vb_zero() ) )
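
/* A hypothetical usage sketch (names illustrative, not part of the
   API): vb_eq already returns a full comparison vector, so the _fast
   variant applies directly. */

static inline int
vb_contains_sketch( vb_t haystack, uchar needle ) { /* 1 if any lane of haystack equals needle */
  return vb_any_fast( vb_eq( haystack, vb_bcast( needle ) ) );
}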