Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx512_h
2 : #error "Do not include this directly; use fd_avx512.h"
3 : #endif
4 :
5 : /* TODO: REDUCE, EXTRACT, ADDITIONAL LANE OPS, ... */
6 :
7 : /* Vector long API ****************************************************/
8 :
9 : /* A wwl_t is a 512-bit vector where each adjacent pair of 32-bit wide
10 : lanes (e.g. 0-1 / 2-3 / ... / 14-15) holds a signed 64-bit
11 : twos-complement integer (a "long"), giving 8 long lanes in all.
12 :
13 : These mirror the other APIs as much as possible. Macros are
14 : preferred over static inlines when it is possible to do so robustly,
15 : to reduce the risk of the compiler mucking it up. */
16 :
17 44704251641 : #define wwl_t __m512i
18 :
19 : /* wwl(x0,x1,x2,x3,x4,x5,x6,x7) returns the wwl_t [x0 x1 ... x7] where
20 : x* are longs (x0 ends up in long lane 0 and x7 in long lane 7). */
21 :
22 4766143978 : #define wwl(x0,x1,x2,x3,x4,x5,x6,x7) _mm512_setr_epi64( (x0), (x1), (x2), (x3), (x4), (x5), (x6), (x7) )
23 :
24 481187968 : #define wwl_bcast(x) _mm512_set1_epi64( (x) ) /* wwl(x, x, ... x), x is a long */
25 :
26 : /* wwl_permute(p,x) returns:
27 : wwl( x(p(0)), x(p(1)), ... x(p(7)) ).
28 : As such p(*) should be longs in [0,7]. */
29 :
30 8262184719 : #define wwl_permute(p,x) _mm512_permutexvar_epi64( (p), (x) )
31 :
32 : /* wwl_select(p,x,y) concatenates the wwl_t's x and y into
33 : z = [ x0 x1 ... x7 y0 y1 ... y7 ]
34 : and then returns:
35 : wwl( z(p(0)), z(p(1)), ... z(p(7)) ).
36 : As such p(*) should be longs in [0,15]. */
37 :
38 903494052 : #define wwl_select(p,x,y) _mm512_permutex2var_epi64( (x), (p), (y) )
39 :
40 : /* Predefined constants (every long lane holds the same value) */
41 :
42 1701299719 : #define wwl_zero() _mm512_setzero_si512() /* wwl(0, 0, ... 0) */
43 : #define wwl_one() _mm512_set1_epi64( 1L ) /* wwl(1, 1, ... 1) */
44 :
45 : /* Memory operations */
46 : /* Note: wwl_{ld,st} assume m is 64-byte aligned while wwl_{ldu,stu}
47 : allow m to have arbitrary alignment. All four access the 8 consecutive longs (64 bytes) starting at m. */
48 :
49 1000000 : static inline wwl_t wwl_ld( long const * m ) { return _mm512_load_epi64( m ); } /* wwl( m[0], m[1], ... m[7] ) */
50 209000000 : static inline void wwl_st( long * m, wwl_t x ) { _mm512_store_epi64( m, x ); } /* does m[0] = x0, m[1] = x1, ... m[7] = x7 */
51 :
52 1000000 : static inline wwl_t wwl_ldu( void const * m ) { return _mm512_loadu_epi64( m ); } /* wwl( m[0], m[1], ... m[7] ), m treated as pointing to 8 longs */
53 1000000 : static inline void wwl_stu( void * m, wwl_t x ) { _mm512_storeu_epi64( m, x ); } /* does m[0] = x0, m[1] = x1, ... m[7] = x7, m treated as pointing to 8 longs */
54 :
55 : /* Arithmetic operations (all lane-wise on the 8 long lanes) */
56 :
57 : #define wwl_neg(x) _mm512_sub_epi64( _mm512_setzero_si512(), (x) ) /* wwl( -x0, -x1, ... -x7 ), twos complement (computed as 0-x) */
58 : #define wwl_abs(x) _mm512_abs_epi64( (x) ) /* wwl( |x0|, |x1|, ... |x7| ), twos complement */
59 :
60 : #define wwl_min(x,y) _mm512_min_epi64 ( (x), (y) ) /* wwl( min(x0,y0), min(x1,y1), ... min(x7,y7) ) */
61 : #define wwl_max(x,y) _mm512_max_epi64 ( (x), (y) ) /* wwl( max(x0,y0), max(x1,y1), ... max(x7,y7) ) */
62 11954437184 : #define wwl_add(x,y) _mm512_add_epi64 ( (x), (y) ) /* wwl( x0+y0, x1+y1, ... x7+y7 ) */
63 81224838 : #define wwl_sub(x,y) _mm512_sub_epi64 ( (x), (y) ) /* wwl( x0-y0, x1-y1, ... x7-y7 ) */
64 : #define wwl_mul(x,y) _mm512_mullo_epi64( (x), (y) ) /* wwl( x0*y0, x1*y1, ... x7*y7 ), low 64 bits of each product */
65 : #define wwl_mul_ll(x,y) _mm512_mul_epi32 ( (x), (y) ) /* wwl( x0l*y0l, x1l*y1l, ... x7l*y7l ), x*l / y*l are the low 32 bits of each lane as signed ints */
66 :
67 : /* Binary operations */
68 : /* Note: shifts assume n and/or y* are in [0,63]. Rotates work for
69 : arbitrary values. */
70 :
71 4000000 : #define wwl_not(x) _mm512_xor_epi64( _mm512_set1_epi64( -1L ), (x) ) /* wwl( ~x0, ~x1, ... ~x7 ) */
72 :
73 4147082109 : #define wwl_shl(x,n) _mm512_slli_epi64 ( (x), (uint)(n) ) /* wwl( x0<<n, x1<<n, ... x7<<n ) */
74 53482010 : #define wwl_shr(x,n) _mm512_srai_epi64 ( (x), (uint)(n) ) /* wwl( x0>>n, x1>>n, ... x7>>n ) (signed / sign extending right shift) */
75 693804066 : #define wwl_shru(x,n) _mm512_srli_epi64 ( (x), (uint)(n) ) /* wwl( x0>>n, x1>>n, ... x7>>n ) (unsigned right shift) */
76 2619881391 : #define wwl_shl_vector(x,y) _mm512_sllv_epi64 ( (x), (y) ) /* wwl( x0<<y0, x1<<y1, ... x7<<y7 ) */
77 26741005 : #define wwl_shr_vector(x,y) _mm512_srav_epi64 ( (x), (y) ) /* wwl( x0>>y0, x1>>y1, ... x7>>y7 ) (signed / sign extending right shift) */
78 380112726 : #define wwl_shru_vector(x,y) _mm512_srlv_epi64 ( (x), (y) ) /* wwl( x0>>y0, x1>>y1, ... x7>>y7 ) (unsigned right shift) */
79 1082009024 : #define wwl_and(x,y) _mm512_and_epi64 ( (x), (y) ) /* wwl( x0&y0, x1&y1, ... x7&y7 ) */
80 : #define wwl_andnot(x,y) _mm512_andnot_epi64( (x), (y) ) /* wwl( ~x0&y0, ~x1&y1, ... ~x7&y7 ) */
81 4000000 : #define wwl_or(x,y) _mm512_or_epi64 ( (x), (y) ) /* wwl( x0|y0, x1|y1, ... x7|y7 ) */
82 : #define wwl_xor(x,y) _mm512_xor_epi64 ( (x), (y) ) /* wwl( x0^y0, x1^y1, ... x7^y7 ) */
83 :
84 : /* wwl_rol(x,n) returns wwl( rotate_left (x0,n ), rotate_left (x1,n ), ... )
85 : wwl_ror(x,n) returns wwl( rotate_right(x0,n ), rotate_right(x1,n ), ... )
86 : wwl_rol_variable(x,n) returns wwl( rotate_left (x0,n ), rotate_left (x1,n ), ... )
87 : wwl_ror_variable(x,n) returns wwl( rotate_right(x0,n ), rotate_right(x1,n ), ... )
88 : wwl_rol_vector(x,y) returns wwl( rotate_left (x0,y0), rotate_left (x1,y1), ... )
89 : wwl_ror_vector(x,y) returns wwl( rotate_right(x0,y0), rotate_right(x1,y1), ... )
90 :
91 : The variable variants are slower but do not require the rotate amount
92 : to be known at compile time. In every variant the rotate amount is reduced mod 64 (masked with 63) before use. */
93 :
94 : #define wwl_rol(a,imm) _mm512_rol_epi64( (a), (imm)&63L )
95 : #define wwl_ror(a,imm) _mm512_ror_epi64( (a), (imm)&63L )
96 :
97 1000000 : static inline wwl_t wwl_rol_variable( wwl_t a, long n ) { return wwl_or( wwl_shl ( a, n & 63L ), wwl_shru( a, (-n) & 63L ) ); } /* (a<<k) | (a>>((-n)&63)) unsigned, k = n&63; k==0 yields a. NOTE(review): (-n) is signed overflow for n==LONG_MIN - confirm callers never pass it */
98 1000000 : static inline wwl_t wwl_ror_variable( wwl_t a, long n ) { return wwl_or( wwl_shru( a, n & 63L ), wwl_shl ( a, (-n) & 63L ) ); } /* (a>>k) unsigned | (a<<((-n)&63)), k = n&63; k==0 yields a. NOTE(review): (-n) is signed overflow for n==LONG_MIN - confirm callers never pass it */
99 :
100 1000000 : static inline wwl_t wwl_rol_vector( wwl_t a, wwl_t b ) { /* lane-wise rotate left of a by the per-lane counts in b */
101 1000000 : wwl_t m = wwl_bcast( 63L ); /* mask reducing each count to [0,63] */
102 1000000 : return wwl_or( wwl_shl_vector ( a, wwl_and( b, m ) ), wwl_shru_vector( a, wwl_and( wwl_neg( b ), m ) ) ); /* (a << (b&63)) | (a >> ((-b)&63)), right shift unsigned */
103 1000000 : }
104 :
105 1000000 : static inline wwl_t wwl_ror_vector( wwl_t a, wwl_t b ) { /* lane-wise rotate right of a by the per-lane counts in b */
106 1000000 : wwl_t m = wwl_bcast( 63L ); /* mask reducing each count to [0,63] */
107 1000000 : return wwl_or( wwl_shru_vector( a, wwl_and( b, m ) ), wwl_shl_vector ( a, wwl_and( wwl_neg( b ), m ) ) ); /* (a >> (b&63)) | (a << ((-b)&63)), right shift unsigned */
108 1000000 : }
109 :
110 : /* Comparison operations (lanes are compared as signed longs) */
111 : /* mask(c0,c1,...) means (((int)c0)<<0) | (((int)c1)<<1) | ... (i.e. bit n of the result is the outcome for lane n) */
112 :
113 5915522 : #define wwl_eq(x,y) ((int)_mm512_cmpeq_epi64_mask( (x), (y) )) /* mask( x0==y0, x1==y1, ... ) */
114 : #define wwl_gt(x,y) ((int)_mm512_cmpgt_epi64_mask( (x), (y) )) /* mask( x0> y0, x1> y1, ... ) */
115 : #define wwl_lt(x,y) ((int)_mm512_cmplt_epi64_mask( (x), (y) )) /* mask( x0< y0, x1< y1, ... ) */
116 : #define wwl_ne(x,y) ((int)_mm512_cmpneq_epi64_mask( (x), (y) )) /* mask( x0!=y0, x1!=y1, ... ) */
117 : #define wwl_ge(x,y) ((int)_mm512_cmpge_epi64_mask( (x), (y) )) /* mask( x0>=y0, x1>=y1, ... ) */
118 : #define wwl_le(x,y) ((int)_mm512_cmple_epi64_mask( (x), (y) )) /* mask( x0<=y0, x1<=y1, ... ) */
119 :
120 : #define wwl_lnot(x) wwl_eq( (x), wwl_zero() ) /* mask( !x0, !x1, ... ) */
121 : #define wwl_lnotnot(x) wwl_ne( (x), wwl_zero() ) /* mask( !!x0, !!x1, ... ) */
122 :
123 : /* Conditional operations */
124 : /* cn means bit n of c (c is narrowed to an 8-bit mask, so only bits 0-7 are used) */
125 :
126 1185277136 : #define wwl_if(c,x,y) _mm512_mask_blend_epi64 ( (__mmask8)(c), (y), (x) ) /* wwl( c0? x0 :y0, ... ) */
127 :
128 : #define wwl_add_if(c,x,y,z) _mm512_mask_add_epi64 ( (z), (__mmask8)(c), (x), (y) ) /* wwl( c0?(x0+y0):z0, ... ) */
129 : #define wwl_sub_if(c,x,y,z) _mm512_mask_sub_epi64 ( (z), (__mmask8)(c), (x), (y) ) /* wwl( c0?(x0-y0):z0, ... ) */
130 :
131 : #define wwl_and_if(c,x,y,z) _mm512_mask_and_epi64 ( (z), (__mmask8)(c), (x), (y) ) /* wwl( c0?(x0&y0):z0, ... ) */
132 : #define wwl_andnot_if(c,x,y,z) _mm512_mask_andnot_epi64( (z), (__mmask8)(c), (x), (y) ) /* wwl( c0?(~x0&y0):z0, ... ) */
133 : #define wwl_or_if(c,x,y,z) _mm512_mask_or_epi64 ( (z), (__mmask8)(c), (x), (y) ) /* wwl( c0?(x0|y0):z0, ... ) */
134 : #define wwl_xor_if(c,x,y,z) _mm512_mask_xor_epi64 ( (z), (__mmask8)(c), (x), (y) ) /* wwl( c0?(x0^y0):z0, ... ) */
135 :
136 : /* Conversions */
137 :
138 : /* wwl_to_wwi(x) returns [ (int)x0,0, (int)x1,0, ... (int)x7,0 ]
139 : wwl_to_wwu(x) returns [ (uint)x0,0, (uint)x1,0, ... (uint)x7,0 ]
140 : wwl_to_wwv(x) returns [ (ulong)x0, (ulong)x1, ... (ulong)x7 ]
141 : (the first two keep the low 32 bits of each long and zero the upper 32 bits; the _raw variants below return x unchanged) */
142 : #define wwl_to_wwi(x) wwl_and( (x), wwl_bcast( (long)UINT_MAX ) )
143 : #define wwl_to_wwu(x) wwl_and( (x), wwl_bcast( (long)UINT_MAX ) )
144 : #define wwl_to_wwv(x) (x)
145 :
146 : #define wwl_to_wwi_raw(x) (x)
147 : #define wwl_to_wwu_raw(x) (x)
148 : #define wwl_to_wwv_raw(x) (x)
149 :
150 : /* Misc operations */
151 :
152 : /* wwl_pack_halves(x,imm0,y,imm1) packs half of x and half of y into a
153 : wwl. imm0/imm1 select which half of x and y to pack. imm0 / imm1
154 : should be in [0,1]. That is, this returns:
155 :
156 : [ if( imm0, x(4:7), x(0:3) ) if( imm1, y(4:7), y(0:3) ) ]
157 :
158 : wwl_pack_h0_h1(x,y) does the wwl_pack_halves(x,0,y,1) case faster.
159 : Hat tip to Philip Taffet for pointing this out. */
160 :
161 3282586345 : #define wwl_pack_halves(x,imm0,y,imm1) _mm512_shuffle_i64x2( (x), (y), 68+10*(imm0)+160*(imm1) ) /* 68 (0x44) picks x(0:3) y(0:3); +10 / +160 switch x / y to the upper half */
162 395358648 : #define wwl_pack_h0_h1(x,y) _mm512_mask_blend_epi64( (__mmask8)0xF0, (x), (y) ) /* [ x(0:3) y(4:7) ] */
163 :
164 : /* wwl_madd52lo(a,b,c) returns LO64( a + LO52( LO52(b)*LO52(c) ) )
165 : wwl_madd52hi(a,b,c) returns LO64( a + HI52( LO52(b)*LO52(c) ) ) (both lane-wise) */
166 :
167 7440705733 : #define wwl_madd52lo(a,b,c) _mm512_madd52lo_epu64( (a), (b), (c) )
168 : #define wwl_madd52hi(a,b,c) _mm512_madd52hi_epu64( (a), (b), (c) )
169 :
170 : /* wwl_slide(x,y,imm) treats x as a FIFO with the oldest / newest
171 : element at lane 0 / 7. Returns the result of dequeuing x imm times
172 : and enqueuing the values y0 ... y{imm-1} in that order. imm should be
173 : in [0,7]. For example, in the imm==5 case, this returns:
174 : [ x5 x6 x7 y0 y1 y2 y3 y4 ]. */
175 :
176 2617205149 : #define wwl_slide(x,y,imm) _mm512_alignr_epi64( (y), (x), (imm) )
177 :
178 : /* wwl_unpack unpacks the wwl x into its long components x0,x1,...x7. x is evaluated once; x0 ... x7 must be assignable. */
179 :
180 1000000 : #define wwl_unpack( x, x0,x1,x2,x3,x4,x5,x6,x7 ) do { \
181 1000000 : __m512i _wwl_unpack_x = (x); \
182 1000000 : __m256i _wwl_unpack_xl = _mm512_extracti64x4_epi64( _wwl_unpack_x, 0 ); \
183 1000000 : __m256i _wwl_unpack_xh = _mm512_extracti64x4_epi64( _wwl_unpack_x, 1 ); \
184 1000000 : (x0) = _mm256_extract_epi64( _wwl_unpack_xl, 0 ); \
185 1000000 : (x1) = _mm256_extract_epi64( _wwl_unpack_xl, 1 ); \
186 1000000 : (x2) = _mm256_extract_epi64( _wwl_unpack_xl, 2 ); \
187 1000000 : (x3) = _mm256_extract_epi64( _wwl_unpack_xl, 3 ); \
188 1000000 : (x4) = _mm256_extract_epi64( _wwl_unpack_xh, 0 ); \
189 1000000 : (x5) = _mm256_extract_epi64( _wwl_unpack_xh, 1 ); \
190 1000000 : (x6) = _mm256_extract_epi64( _wwl_unpack_xh, 2 ); \
191 1000000 : (x7) = _mm256_extract_epi64( _wwl_unpack_xh, 3 ); \
192 1000000 : } while(0)
193 :
194 : /* wwl_transpose_8x8 sets wwl_t's c0,c1,...c7 to the columns of an 8x8
195 : long matrix given the rows of the matrix in wwl_t's r0,r1,...r7.
196 : In-place operation fine (all rows are copied to temporaries before any column is written). */
197 :
198 1000000 : #define wwl_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 ) do { \
199 1000000 : wwl_t _wwl_transpose_r0 = (r0); wwl_t _wwl_transpose_r1 = (r1); \
200 1000000 : wwl_t _wwl_transpose_r2 = (r2); wwl_t _wwl_transpose_r3 = (r3); \
201 1000000 : wwl_t _wwl_transpose_r4 = (r4); wwl_t _wwl_transpose_r5 = (r5); \
202 1000000 : wwl_t _wwl_transpose_r6 = (r6); wwl_t _wwl_transpose_r7 = (r7); \
203 1000000 : \
204 1000000 : /* Outer 4x4 transpose of 2x2 blocks */ \
205 1000000 : wwl_t _wwl_transpose_t0 = _mm512_shuffle_i64x2( _wwl_transpose_r0, _wwl_transpose_r2, 0x88 ); \
206 1000000 : wwl_t _wwl_transpose_t1 = _mm512_shuffle_i64x2( _wwl_transpose_r1, _wwl_transpose_r3, 0x88 ); \
207 1000000 : wwl_t _wwl_transpose_t2 = _mm512_shuffle_i64x2( _wwl_transpose_r0, _wwl_transpose_r2, 0xdd ); \
208 1000000 : wwl_t _wwl_transpose_t3 = _mm512_shuffle_i64x2( _wwl_transpose_r1, _wwl_transpose_r3, 0xdd ); \
209 1000000 : wwl_t _wwl_transpose_t4 = _mm512_shuffle_i64x2( _wwl_transpose_r4, _wwl_transpose_r6, 0x88 ); \
210 1000000 : wwl_t _wwl_transpose_t5 = _mm512_shuffle_i64x2( _wwl_transpose_r5, _wwl_transpose_r7, 0x88 ); \
211 1000000 : wwl_t _wwl_transpose_t6 = _mm512_shuffle_i64x2( _wwl_transpose_r4, _wwl_transpose_r6, 0xdd ); \
212 1000000 : wwl_t _wwl_transpose_t7 = _mm512_shuffle_i64x2( _wwl_transpose_r5, _wwl_transpose_r7, 0xdd ); \
213 1000000 : \
214 1000000 : /**/ _wwl_transpose_r0 = _mm512_shuffle_i64x2( _wwl_transpose_t0, _wwl_transpose_t4, 0x88 ); \
215 1000000 : /**/ _wwl_transpose_r1 = _mm512_shuffle_i64x2( _wwl_transpose_t1, _wwl_transpose_t5, 0x88 ); \
216 1000000 : /**/ _wwl_transpose_r2 = _mm512_shuffle_i64x2( _wwl_transpose_t2, _wwl_transpose_t6, 0x88 ); \
217 1000000 : /**/ _wwl_transpose_r3 = _mm512_shuffle_i64x2( _wwl_transpose_t3, _wwl_transpose_t7, 0x88 ); \
218 1000000 : /**/ _wwl_transpose_r4 = _mm512_shuffle_i64x2( _wwl_transpose_t0, _wwl_transpose_t4, 0xdd ); \
219 1000000 : /**/ _wwl_transpose_r5 = _mm512_shuffle_i64x2( _wwl_transpose_t1, _wwl_transpose_t5, 0xdd ); \
220 1000000 : /**/ _wwl_transpose_r6 = _mm512_shuffle_i64x2( _wwl_transpose_t2, _wwl_transpose_t6, 0xdd ); \
221 1000000 : /**/ _wwl_transpose_r7 = _mm512_shuffle_i64x2( _wwl_transpose_t3, _wwl_transpose_t7, 0xdd ); \
222 1000000 : \
223 1000000 : /* Inner 2x2 transpose of 1x1 blocks */ \
224 1000000 : /**/ (c0) = _mm512_unpacklo_epi64( _wwl_transpose_r0, _wwl_transpose_r1 ); \
225 1000000 : /**/ (c1) = _mm512_unpackhi_epi64( _wwl_transpose_r0, _wwl_transpose_r1 ); \
226 1000000 : /**/ (c2) = _mm512_unpacklo_epi64( _wwl_transpose_r2, _wwl_transpose_r3 ); \
227 1000000 : /**/ (c3) = _mm512_unpackhi_epi64( _wwl_transpose_r2, _wwl_transpose_r3 ); \
228 1000000 : /**/ (c4) = _mm512_unpacklo_epi64( _wwl_transpose_r4, _wwl_transpose_r5 ); \
229 1000000 : /**/ (c5) = _mm512_unpackhi_epi64( _wwl_transpose_r4, _wwl_transpose_r5 ); \
230 1000000 : /**/ (c6) = _mm512_unpacklo_epi64( _wwl_transpose_r6, _wwl_transpose_r7 ); \
231 1000000 : /**/ (c7) = _mm512_unpackhi_epi64( _wwl_transpose_r6, _wwl_transpose_r7 ); \
232 1000000 : } while(0)
|