Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx_h
2 : #error "Do not include this directly; use fd_avx.h"
3 : #endif
4 :
5 : /* Vector short API ***************************************************/
6 :
7 : /* A ws_t is a vector where each 16-bit wide lane holds a signed 16-bit
8 : integer (a "short").
9 :
10 : These mirror the other APIs as much as possible. Macros are
11 : preferred over static inlines when it is possible to do it robustly
12 : to reduce the risk of the compiler mucking it up. */
13 :
14 564 : #define ws_t __m256i
15 :
16 : /* Constructors */
17 :
18 : /* Given the short values, return ... */
19 :
20 : #define ws(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9,h10,h11,h12,h13,h14,h15) /* [ h0 h1 ... h15 ] */ \
21 : _mm256_setr_epi16( (short)( h0), (short)( h1), (short)( h2), (short)( h3), \
22 : (short)( h4), (short)( h5), (short)( h6), (short)( h7), \
23 : (short)( h8), (short)( h9), (short)(h10), (short)(h11), \
24 : (short)(h12), (short)(h13), (short)(h14), (short)(h15) )
25 :
26 : #define ws_bcast(h0) _mm256_set1_epi16( (short)(h0) ) /* [ h0 h0 ... h0 ] */
27 :
28 : /* Predefined constants */
29 :
30 12 : #define ws_zero() _mm256_setzero_si256() /* Return [ 0 0 ... 0 ] */
31 : #define ws_one() _mm256_set1_epi16( 1 ) /* Return [ 1 1 ... 1 ] */
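
/* Usage sketch (not part of the original header; the function name is
   hypothetical): fills out[0..3] with one example of each constructor
   above, assuming out points to four writable ws_t slots. */

static inline void
ws_example_ctors( ws_t * out ) {
  out[0] = ws( 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 ); /* lane l holds l */
  out[1] = ws_bcast( (short)-3 );                        /* all lanes -3   */
  out[2] = ws_zero();                                    /* all lanes  0   */
  out[3] = ws_one();                                     /* all lanes  1   */
}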
32 :
33 : /* Memory operations */
34 :
35 : /* ws_ld returns the 16 shorts at the 32-byte aligned / 32-byte sized
36 : location p as a vector short. ws_ldu is the same but p does not
37 : have to be aligned. ws_st writes the vector short to the 32-byte
38 : aligned / 32-byte sized location p as 16 shorts. ws_stu is the same
39 : but p does not have to be aligned. In all these, lane l will be at
40 : p[l].
41 :
42 : Note: gcc knows a __m256i may alias. */
43 :
44 282 : static inline ws_t ws_ld( short const * p ) { return _mm256_load_si256( (__m256i const *)p ); }
45 0 : static inline void ws_st( short * p, ws_t i ) { _mm256_store_si256( (__m256i *)p, i ); }
46 :
47 0 : static inline ws_t ws_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
48 0 : static inline void ws_stu( void * p, ws_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
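
/* Usage sketch (not part of the original header; the name is
   hypothetical): copies 16 shorts from a possibly unaligned source to a
   destination that is assumed to be 32-byte aligned, as ws_st requires. */

static inline void
ws_example_copy16( short * dst,         /* assumed 32-byte aligned */
                   void const * src ) { /* no alignment assumed    */
  ws_st( dst, ws_ldu( src ) );
}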
49 :
50 : /* Element operations */
51 :
52 : /* ws_extract extracts the short in lane imm from the vector short.
53 : ws_insert returns the vector short formed by replacing the value in
54 : lane imm of a with the provided short. imm should be a compile time
55 : constant in 0:15. ws_extract_variable and ws_insert_variable are
56 : slower but the lane n does not have to be known at compile time
57 : (should still be in 0:15).
58 :
59 : Note: C99 TC3 allows type punning through a union. */
60 :
61 0 : #define ws_extract(a,imm) ((short)_mm256_extract_epi16( (a), (imm) ))
62 0 : #define ws_insert(a,imm,v) _mm256_insert_epi16( (a), (v), (imm) )
63 :
64 : static inline short
65 0 : ws_extract_variable( ws_t a, int n ) {
66 0 : union { __m256i m[1]; short h[16]; } t[1];
67 0 : _mm256_store_si256( t->m, a );
68 0 : return (short)t->h[n];
69 0 : }
70 :
71 : static inline ws_t
72 0 : ws_insert_variable( ws_t a, int n, short v ) {
73 0 : union { __m256i m[1]; short h[16]; } t[1];
74 0 : _mm256_store_si256( t->m, a );
75 0 : t->h[n] = v;
76 0 : return _mm256_load_si256( t->m );
77 0 : }
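
/* Usage sketch (hypothetical, not from the header): swaps lane 0 and
   lane 15 of a vector short via the compile-time extract/insert
   accessors above. */

static inline ws_t
ws_example_swap_ends( ws_t a ) {
  short lo = ws_extract( a,  0 );
  short hi = ws_extract( a, 15 );
  return ws_insert( ws_insert( a, 0, hi ), 15, lo );
}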
78 :
79 : /* Arithmetic operations */
80 :
81 : #define ws_neg(a) _mm256_sub_epi16( _mm256_setzero_si256(), (a) ) /* [ -a0 -a1 ... -a15 ] (twos complement handling) */
82 : #define ws_abs(a) _mm256_abs_epi16( (a) ) /* [ |a0| |a1| ... |a15| ] (twos complement handling) */
83 :
84 : #define ws_min(a,b) _mm256_min_epi16( (a), (b) ) /* [ min(a0,b0) min(a1,b1) ... min(a15,b15) ] */
85 : #define ws_max(a,b) _mm256_max_epi16( (a), (b) ) /* [ max(a0,b0) max(a1,b1) ... max(a15,b15) ] */
86 : #define ws_add(a,b) _mm256_add_epi16( (a), (b) ) /* [ a0 +b0 a1 +b1 ... a15+b15 ] */
87 : #define ws_sub(a,b) _mm256_sub_epi16( (a), (b) ) /* [ a0 -b0 a1 -b1 ... a15-b15 ] */
88 282 : #define ws_mullo(a,b) _mm256_mullo_epi16( (a), (b) ) /* [ a0*b0 a1*b1 ... a15*b15 ] */
89 : #define ws_mulhi(a,b) _mm256_mulhi_epi16( (a), (b) ) /* [ (a0*b0)>>16 (a1*b1)>>16 ... (a15*b15)>>16 ] */
90 : #define ws_mul(a,b) ws_mullo((a),(b))
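
/* Usage sketch (hypothetical, not from the header): clamps each lane of
   x to the lane-wise range [lo,hi] (lo<=hi assumed) using ws_min/ws_max. */

static inline ws_t
ws_example_clamp( ws_t x, ws_t lo, ws_t hi ) {
  return ws_max( lo, ws_min( x, hi ) );
}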
91 :
92 : /* Binary operations */
93 :
94 : /* Note: ws_shl/ws_shr/ws_shru is a left/signed right/unsigned right
95 : shift by imm bits; imm must be a compile time constant in [0,15].
96 : The variable variants are slower but do not require the shift amount
97 : to be known at compile time (should still be in [0,15]). */
98 :
99 : #define ws_not(a) _mm256_xor_si256( _mm256_set1_epi16( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a15 ] */
100 :
101 : #define ws_shl(a,imm) _mm256_slli_epi16( (a), (imm) ) /* [ a0<<imm a1<<imm ... a15<<imm ] */
102 : #define ws_shr(a,imm) _mm256_srai_epi16( (a), (imm) ) /* [ a0>>imm a1>>imm ... a15>>imm ] (treat a as signed) */
103 : #define ws_shru(a,imm) _mm256_srli_epi16( (a), (imm) ) /* [ a0>>imm a1>>imm ... a15>>imm ] (treat a as unsigned) */
104 :
105 : #define ws_shl_variable(a,n) _mm256_sll_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
106 : #define ws_shr_variable(a,n) _mm256_sra_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
107 : #define ws_shru_variable(a,n) _mm256_srl_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
108 :
109 : #define ws_shl_vector(a,b) _mm256_sllv_epi16( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a15<<b15 ] */
110 : #define ws_shr_vector(a,b) _mm256_srav_epi16( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a15>>b15 ] (treat a as signed) */
111 : #define ws_shru_vector(a,b) _mm256_srlv_epi16( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a15>>b15 ] (treat a as unsigned) */
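
/* Usage sketch (hypothetical, not from the header): broadcasts each
   lane's sign bit across the lane, yielding 0 for non-negative lanes
   and -1 for negative lanes, via the immediate arithmetic right shift. */

static inline ws_t
ws_example_sign_mask( ws_t a ) {
  return ws_shr( a, 15 );
}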
112 :
113 282 : #define ws_and(a,b) _mm256_and_si256( (a), (b) ) /* [ a0 &b0 a1 &b1 ... a15 &b15 ] */
114 : #define ws_andnot(a,b) _mm256_andnot_si256( (a), (b) ) /* [ (~a0)&b0 (~a1)&b1 ... (~a15)&b15 ] */
115 0 : #define ws_or(a,b) _mm256_or_si256( (a), (b) ) /* [ a0 |b0 a1 |b1 ... a15 |b15 ] */
116 : #define ws_xor(a,b) _mm256_xor_si256( (a), (b) ) /* [ a0 ^b0 a1 ^b1 ... a15 ^b15 ] */
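
/* Usage sketch (hypothetical, not from the header): lane-wise select
   using the and/andnot/or idiom.  c is expected to hold all ones or all
   zeros per lane (e.g. a comparison result); returns t where c is set
   and f elsewhere. */

static inline ws_t
ws_example_select( ws_t c, ws_t t, ws_t f ) {
  return ws_or( ws_and( c, t ), ws_andnot( c, f ) );
}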
117 :
118 : /* ws_rol(x,n) returns ws( rotate_left (x0,n), rotate_left (x1,n), ... )
119 : ws_ror(x,n) returns ws( rotate_right(x0,n), rotate_right(x1,n), ... ) */
120 :
121 0 : static inline ws_t ws_rol( ws_t a, int imm ) { return ws_or( ws_shl( a, imm & 15 ), ws_shru( a, (-imm) & 15 ) ); }
122 0 : static inline ws_t ws_ror( ws_t a, int imm ) { return ws_or( ws_shru( a, imm & 15 ), ws_shl( a, (-imm) & 15 ) ); }
123 :
124 0 : static inline ws_t ws_rol_variable( ws_t a, int n ) { return ws_or( ws_shl_variable( a, n&15 ), ws_shru_variable( a, (-n)&15 ) ); }
125 0 : static inline ws_t ws_ror_variable( ws_t a, int n ) { return ws_or( ws_shru_variable( a, n&15 ), ws_shl_variable( a, (-n)&15 ) ); }
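
/* Usage sketch (hypothetical, not from the header): byte-swaps every
   16-bit lane by rotating each lane 8 bits. */

static inline ws_t
ws_example_bswap16( ws_t a ) {
  return ws_rol( a, 8 );
}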
126 :
127 : /* Logical operations */
128 :
129 0 : #define ws_eq(a,b) _mm256_cmpeq_epi16( (a), (b) ) /* [ a0==b0 a1==b1 ... a15==b15 ] */
130 0 : #define ws_ne(a,b) _mm256_xor_si256( _mm256_set1_epi16( -1 ), _mm256_cmpeq_epi16( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a15!=b15 ] */
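
/* Usage sketch (hypothetical, not from the header): replaces every lane
   of x equal to sentinel with repl, combining the ws_eq mask with the
   and/andnot/or select idiom from the binary operations above. */

static inline ws_t
ws_example_replace( ws_t x, short sentinel, short repl ) {
  ws_t c = ws_eq( x, ws_bcast( sentinel ) );
  return ws_or( ws_and( c, ws_bcast( repl ) ), ws_andnot( c, x ) );
}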