Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx_h
2 : #error "Do not include this directly; use fd_avx.h"
3 : #endif
4 :
5 : /* Vector ushort API **************************************************/
6 :
7 : /* A wh_t is a vector where each 16-bit wide lane holds an unsigned
8 : 16-bit integer (a "ushort").
9 :
10 : These mirror the other APIs as much as possible. Macros are
11 : preferred over static inlines when it is possible to do it robustly
12 : to reduce the risk of the compiler mucking it up. */
13 :
/* wh_t is the vector ushort type: a 256-bit integer vector viewed as
   16 lanes of 16 bits each. */

#define wh_t __m256i
15 :
16 : /* Constructors */
17 :
18 : /* Given the ushort values, return ... */
19 :
/* wh(h0,...,h15) returns [ h0 h1 ... h15 ].  Each argument is cast to
   short and h0 lands in lane 0. */

#define wh(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11,h12,h13,h14,h15)                                          \
  _mm256_setr_epi16( (short)(h0),  (short)(h1),  (short)(h2),  (short)(h3),  (short)(h4),  (short)(h5),    \
                     (short)(h6),  (short)(h7),  (short)(h8),  (short)(h9),  (short)(h10), (short)(h11),   \
                     (short)(h12), (short)(h13), (short)(h14), (short)(h15) )
25 :
/* wh_bcast(h0) returns [ h0 h0 ... h0 ], i.e. h0 (cast to short)
   replicated into all 16 lanes. */

#define wh_bcast(h0) _mm256_set1_epi16( (short)(h0) )
27 :
28 : /* Predefined constants */
29 :
/* wh_zero() returns [ 0 0 ... 0 ] */
#define wh_zero() _mm256_setzero_si256()

/* wh_one() returns [ 1 1 ... 1 ] */
#define wh_one() _mm256_set1_epi16( 1 )
32 :
33 : /* Memory operations */
34 :
35 : /* wh_ld returns the 16 ushorts at the 32-byte aligned / 32-byte sized
36 : location p as a vector ushort. wh_ldu is the same but p does not
37 : have to be aligned. wh_st writes the vector ushort to the 32-byte
38 : aligned / 32-byte sized location p as 16 ushorts. wh_stu is the same
39 : but p does not have to be aligned. In all these lane l will be at
40 : p[l].
41 :
42 : Note: gcc knows a __m256i may alias. */
43 :
44 0 : static inline wh_t wh_ld( ushort const * p ) { return _mm256_load_si256( (__m256i const *)p ); }
45 0 : static inline void wh_st( ushort * p, wh_t i ) { _mm256_store_si256( (__m256i *)p, i ); }
46 :
47 0 : static inline wh_t wh_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
48 0 : static inline void wh_stu( void * p, wh_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
49 :
50 : /* Element operations */
51 :
52 : /* wh_extract extracts the ushort in lane imm from the vector ushort.
53 : wh_insert returns the vector ushort formed by replacing the value in
54 : lane imm of a with the provided ushort. imm should be a compile time
55 : constant in 0:15. wh_extract_variable and wh_insert_variable are
56 : slower but the lane n does not have to be known at compile time
57 : (should still be in 0:15).
58 :
59 : Note: C99 TC3 allows type punning through a union. */
60 :
/* wh_extract(a,imm) returns lane imm of a as a ushort. */
#define wh_extract(a,imm) ((ushort)_mm256_extract_epi16( (a), (imm) ))

/* wh_insert(a,imm,v) returns a with lane imm replaced by v (cast to
   short). */
#define wh_insert(a,imm,v) _mm256_insert_epi16( (a), (short)(v), (imm) )
63 :
64 : static inline ushort
65 0 : wh_extract_variable( wh_t a, int n ) {
66 0 : union { __m256i m[1]; ushort h[16]; } t[1];
67 0 : _mm256_store_si256( t->m, a );
68 0 : return (ushort)t->h[n];
69 0 : }
70 :
71 : static inline wh_t
72 0 : wh_insert_variable( wh_t a, int n, ushort v ) {
73 0 : union { __m256i m[1]; ushort h[16]; } t[1];
74 0 : _mm256_store_si256( t->m, a );
75 0 : t->h[n] = v;
76 0 : return _mm256_load_si256( t->m );
77 0 : }
78 :
79 : /* Arithmetic operations */
80 :
/* wh_neg(a) returns [ -a0 -a1 ... -a15 ], computed as 0-a per lane
   (twos complement wraparound). */
#define wh_neg(a) _mm256_sub_epi16( _mm256_setzero_si256(), (a) )

/* wh_abs(a) returns [ |a0| |a1| ... |a15| ]; expands to a unchanged. */
#define wh_abs(a) (a)
83 :
/* Lane-wise arithmetic.  wh_min/wh_max/wh_mulhi use the unsigned
   16-bit intrinsics.  wh_mul is an alias for wh_mullo. */

#define wh_min(a,b)   _mm256_min_epu16(   (a), (b) ) /* [ min(a0,b0)  min(a1,b1)  ... min(a15,b15)  ] */
#define wh_max(a,b)   _mm256_max_epu16(   (a), (b) ) /* [ max(a0,b0)  max(a1,b1)  ... max(a15,b15)  ] */
#define wh_add(a,b)   _mm256_add_epi16(   (a), (b) ) /* [ a0+b0       a1+b1       ... a15+b15       ] */
#define wh_sub(a,b)   _mm256_sub_epi16(   (a), (b) ) /* [ a0-b0       a1-b1       ... a15-b15       ] */
#define wh_mullo(a,b) _mm256_mullo_epi16( (a), (b) ) /* [ a0*b0       a1*b1       ... a15*b15       ] */
#define wh_mulhi(a,b) _mm256_mulhi_epu16( (a), (b) ) /* [ (a0*b0)>>16 (a1*b1)>>16 ... (a15*b15)>>16 ] */
#define wh_mul(a,b)   wh_mullo((a),(b))
91 :
92 : /* Binary operations */
93 :
94 : /* Note: wh_shl/wh_shr is an unsigned left/right shift by imm bits; imm
95 : must be a compile time constant in [0,15]. The variable variants are
96 : slower but do not require the shift amount to be known at compile
97 : time (should still be in [0,15]). */
98 :
/* wh_not(a) returns [ ~a0 ~a1 ... ~a15 ], formed by xoring a with an
   all-ones vector. */

#define wh_not(a) _mm256_xor_si256( (a), _mm256_set1_epi16( -1 ) )
100 :
/* wh_shl(a,imm) returns [ a0<<imm a1<<imm ... a15<<imm ] */
#define wh_shl(a,imm) _mm256_slli_epi16( (a), (imm) )

/* wh_shr(a,imm) returns [ a0>>imm a1>>imm ... a15>>imm ] (logical
   right shift) */
#define wh_shr(a,imm) _mm256_srli_epi16( (a), (imm) )
103 :
/* wh_shl_variable(a,n) / wh_shr_variable(a,n) shift every lane of a
   left / right (logical) by the same count n.  The count is passed to
   the shift intrinsic as a 128-bit vector whose low 64 bits hold n and
   whose upper 64 bits are zero. */

#define wh_shl_variable(a,n) _mm256_sll_epi16( (a), _mm_cvtsi64_si128( (long long)(n) ) )
#define wh_shr_variable(a,n) _mm256_srl_epi16( (a), _mm_cvtsi64_si128( (long long)(n) ) )
106 :
/* wh_shl_vector(a,b) returns [ a0<<b0 a1<<b1 ... a15<<b15 ] and
   wh_shr_vector(a,b) returns [ a0>>b0 a1>>b1 ... a15>>b15 ] (unsigned
   shifts; a lane whose count exceeds 15 becomes 0).

   _mm256_sllv_epi16 / _mm256_srlv_epi16 are AVX-512BW + AVX-512VL
   intrinsics, not AVX2, so they are only used when the target
   advertises both extensions.  Otherwise the same result is built from
   the AVX2 32-bit per-lane shifts: every 32-bit element carries an
   even ushort lane in its low half and an odd ushort lane in its high
   half; the two halves are shifted separately by their own counts and
   then recombined.  (The fallbacks are spelled with __m256i, which is
   what wh_t expands to.) */

#if defined(__AVX512BW__) && defined(__AVX512VL__)

#define wh_shl_vector(a,b) _mm256_sllv_epi16( (a), (b) )
#define wh_shr_vector(a,b) _mm256_srlv_epi16( (a), (b) )

#else

static inline __m256i
wh_shl_vector( __m256i a, __m256i b ) {
  __m256i const even_mask = _mm256_set1_epi32( 0x0000ffff );

  /* Even lanes: the low 16 bits of a<<cnt depend only on the low 16
     bits of a, so only the count needs isolating; bits that spilled
     into the high half are masked off afterwards. */
  __m256i even = _mm256_and_si256( _mm256_sllv_epi32( a, _mm256_and_si256( b, even_mask ) ), even_mask );

  /* Odd lanes: clear the low half so nothing shifts up into the odd
     lane; bits shifted past bit 31 are dropped by the 32-bit shift. */
  __m256i odd = _mm256_sllv_epi32( _mm256_andnot_si256( even_mask, a ), _mm256_srli_epi32( b, 16 ) );

  return _mm256_or_si256( even, odd );
}

static inline __m256i
wh_shr_vector( __m256i a, __m256i b ) {
  __m256i const even_mask = _mm256_set1_epi32( 0x0000ffff );

  /* Even lanes: clear the high half so nothing shifts down into the
     even lane. */
  __m256i even = _mm256_srlv_epi32( _mm256_and_si256( a, even_mask ), _mm256_and_si256( b, even_mask ) );

  /* Odd lanes: shift the whole element (zeros enter from the top) and
     discard whatever ended up in the low half. */
  __m256i odd = _mm256_andnot_si256( even_mask, _mm256_srlv_epi32( a, _mm256_srli_epi32( b, 16 ) ) );

  return _mm256_or_si256( even, odd );
}

#endif
109 :
/* Lane-wise bitwise operations.  Note wh_andnot complements its first
   argument. */

#define wh_and(a,b)    _mm256_and_si256(    (a), (b) ) /* [   a0 &b0    a1 &b1 ...   a15 &b15 ] */
#define wh_andnot(a,b) _mm256_andnot_si256( (a), (b) ) /* [ (~a0)&b0 (~a1)&b1 ... (~a15)&b15 ] */
#define wh_or(a,b)     _mm256_or_si256(     (a), (b) ) /* [   a0 |b0    a1 |b1 ...   a15 |b15 ] */
#define wh_xor(a,b)    _mm256_xor_si256(    (a), (b) ) /* [   a0 ^b0    a1 ^b1 ...   a15 ^b15 ] */
114 :
115 : /* wh_rol(x,n) returns wh( rotate_left (x0,n), rotate_left (x1,n), ... )
116 : wh_ror(x,n) returns wh( rotate_right(x0,n), rotate_right(x1,n), ... ) */
117 :
118 0 : static inline wh_t wh_rol( wh_t a, int imm ) { return wh_or( wh_shl( a, imm & 15 ), wh_shr( a, (-imm) & 15 ) ); }
119 0 : static inline wh_t wh_ror( wh_t a, int imm ) { return wh_or( wh_shr( a, imm & 15 ), wh_shl( a, (-imm) & 15 ) ); }
120 :
121 0 : static inline wh_t wh_rol_variable( wh_t a, int n ) { return wh_or( wh_shl_variable( a, n&15 ), wh_shr_variable( a, (-n)&15 ) ); }
122 0 : static inline wh_t wh_ror_variable( wh_t a, int n ) { return wh_or( wh_shr_variable( a, n&15 ), wh_shl_variable( a, (-n)&15 ) ); }
123 :
124 : /* Logical operations */
125 :
/* wh_eq(a,b) returns [ a0==b0 a1==b1 ... a15==b15 ] as produced by the
   16-bit compare-equal intrinsic. */
#define wh_eq(a,b) _mm256_cmpeq_epi16( (a), (b) )

/* wh_ne(a,b) returns [ a0!=b0 a1!=b1 ... a15!=b15 ], formed as the
   bitwise complement of the compare-equal result. */
#define wh_ne(a,b) _mm256_xor_si256( _mm256_cmpeq_epi16( (a), (b) ), _mm256_set1_epi16( -1 ) )
|