#ifndef HEADER_fd_src_util_simd_fd_avx512_h
#error "Do not include this directly; use fd_avx512.h"
#endif

/* TODO: REDUCE, EXTRACT, ADDITIONAL LANE OPS, ... */

/* Vector ulong API ***************************************************/

/* A wwv_t is a vector where each adjacent pair of 32-bit wide lanes
   (e.g. 0-1 / 2-3 / 4-5 / 6-7) holds an unsigned 64-bit integer (a
   "ulong").

   These mirror the other APIs as much as possible.  Macros are
   preferred over static inlines when it is possible to do it robustly
   to reduce the risk of the compiler mucking it up. */

#define wwv_t __m512i

/* Constructors */

/* wwv(x0,x1,x2,x3,x4,x5,x6,x7) returns the wwv_t [x0 x1 ... x7] where
   x* are ulongs */

#define wwv(x0,x1,x2,x3,x4,x5,x6,x7) \
  _mm512_setr_epi64( (long)(x0), (long)(x1), (long)(x2), (long)(x3), (long)(x4), (long)(x5), (long)(x6), (long)(x7) )

#define wwv_bcast(x) _mm512_set1_epi64( (long)(x) ) /* wwv(x, x, ... x) */
/* wwv_permute(p,x) returns:
     wwv( x(p(0)), x(p(1)), ... x(p(7)) ).
   As such p(*) should be ulongs in [0,7]. */

#define wwv_permute(p,x) _mm512_permutexvar_epi64( (p), (x) )
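
/* For example (an illustrative sketch, not part of the API), reversing
   the lane order of x:

     wwv_t rev = wwv_permute( wwv( 7UL,6UL,5UL,4UL,3UL,2UL,1UL,0UL ), x );

   yields [ x7 x6 x5 x4 x3 x2 x1 x0 ]. */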

/* wwv_select(p,x,y) concatenates the wwv_t's x and y into
     z = [ x0 x1 ... x7 y0 y1 ... y7 ]
   and then returns:
     wwv( z(p(0)), z(p(1)), ... z(p(7)) ).
   As such p(*) should be ulongs in [0,15]. */

#define wwv_select(p,x,y) _mm512_permutex2var_epi64( (x), (p), (y) )
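
/* For example (an illustrative sketch, not part of the API),
   interleaving the low halves of x and y:

     wwv_t lo = wwv_select( wwv( 0UL,8UL,1UL,9UL,2UL,10UL,3UL,11UL ), x, y );

   yields [ x0 y0 x1 y1 x2 y2 x3 y3 ]. */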

/* Predefined constants */

#define wwv_zero() _mm512_setzero_si512()  /* wwv(0, 0, ... 0) */
#define wwv_one()  _mm512_set1_epi64( 1L ) /* wwv(1, 1, ... 1) */

/* Memory operations */
/* Note: wwv_{ld,st} assume m is 64-byte aligned while wwv_{ldu,stu}
   allow m to have arbitrary alignment */

static inline wwv_t wwv_ld( ulong const * m ) { return _mm512_load_epi64( m ); }  /* wwv( m[0], m[1], ... m[7] ) */
static inline void  wwv_st( ulong * m, wwv_t x ) { _mm512_store_epi64( m, x ); }  /* does m[0] = x0, m[1] = x1, ... m[7] = x7 */

static inline wwv_t wwv_ldu( void const * m ) { return _mm512_loadu_epi64( m ); } /* wwv( m[0], m[1], ... m[7] ) */
static inline void  wwv_stu( void * m, wwv_t x ) { _mm512_storeu_epi64( m, x ); } /* does m[0] = x0, m[1] = x1, ... m[7] = x7 */
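
/* For example (an illustrative sketch, not part of the API), a round
   trip through a 64-byte aligned buffer:

     ulong buf[8] __attribute__((aligned(64)));
     wwv_st( buf, wwv_bcast( 42UL ) ); // buf[*] = 42
     wwv_t v = wwv_ld( buf );          // v = [ 42 42 ... 42 ]

   Use wwv_ldu / wwv_stu when the pointer might not be 64-byte
   aligned. */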

/* Arithmetic operations */

#define wwv_neg(x) _mm512_sub_epi64( _mm512_setzero_si512(), (x) ) /* wwv( -x0, -x1, ... -x7 ) */
#define wwv_abs(x) (x)                                             /* wwv(  x0,  x1, ...  x7 ) */

#define wwv_min(x,y)    _mm512_min_epu64  ( (x), (y) ) /* wwv( min(x0,y0), min(x1,y1), ... min(x7,y7) ) */
#define wwv_max(x,y)    _mm512_max_epu64  ( (x), (y) ) /* wwv( max(x0,y0), max(x1,y1), ... max(x7,y7) ) */
#define wwv_add(x,y)    _mm512_add_epi64  ( (x), (y) ) /* wwv( x0+y0, x1+y1, ... x7+y7 ) */
#define wwv_sub(x,y)    _mm512_sub_epi64  ( (x), (y) ) /* wwv( x0-y0, x1-y1, ... x7-y7 ) */
#define wwv_mul(x,y)    _mm512_mullo_epi64( (x), (y) ) /* wwv( x0*y0, x1*y1, ... x7*y7 ) */
#define wwv_mul_ll(x,y) _mm512_mul_epu32  ( (x), (y) ) /* wwv( x0l*y0l, x1l*y1l, ... x7l*y7l ) where x*l is the low 32 bits of lane x* */
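
/* For example (an illustrative sketch, not part of the API),
   wwv_mul_ll is a full 32x32 -> 64 bit multiply of the low halves of
   each lane:

     wwv_t p = wwv_mul_ll( wwv_bcast( (1UL<<40) | 3UL ), wwv_bcast( 5UL ) );

   yields 15 in every lane (the bit at position 40 is ignored), whereas
   wwv_mul computes the low 64 bits of the full 64x64 bit product. */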

/* Binary operations */
/* Note: shifts assume n and y* are in [0,63].  Rotates work for
   arbitrary values. */

#define wwv_not(x) _mm512_xor_epi64( _mm512_set1_epi64( -1L ), (x) )

#define wwv_shl(x,n)        _mm512_slli_epi64  ( (x), (uint)(n) ) /* wwv( x0<<n, x1<<n, ... x7<<n ) */
#define wwv_shr(x,n)        _mm512_srli_epi64  ( (x), (uint)(n) ) /* wwv( x0>>n, x1>>n, ... x7>>n ) */
#define wwv_shl_vector(x,y) _mm512_sllv_epi64  ( (x), (y) )       /* wwv( x0<<y0, x1<<y1, ... x7<<y7 ) */
#define wwv_shr_vector(x,y) _mm512_srlv_epi64  ( (x), (y) )       /* wwv( x0>>y0, x1>>y1, ... x7>>y7 ) */
#define wwv_and(x,y)        _mm512_and_epi64   ( (x), (y) )       /* wwv(  x0&y0,  x1&y1, ...  x7&y7 ) */
#define wwv_andnot(x,y)     _mm512_andnot_epi64( (x), (y) )       /* wwv( ~x0&y0, ~x1&y1, ... ~x7&y7 ) */
#define wwv_or(x,y)         _mm512_or_epi64    ( (x), (y) )       /* wwv(  x0|y0,  x1|y1, ...  x7|y7 ) */
#define wwv_xor(x,y)        _mm512_xor_epi64   ( (x), (y) )       /* wwv(  x0^y0,  x1^y1, ...  x7^y7 ) */

/* wwv_rol(x,n)          returns wwv( rotate_left (x0,n ), rotate_left (x1,n ), ... )
   wwv_ror(x,n)          returns wwv( rotate_right(x0,n ), rotate_right(x1,n ), ... )
   wwv_rol_variable(x,n) returns wwv( rotate_left (x0,n ), rotate_left (x1,n ), ... )
   wwv_ror_variable(x,n) returns wwv( rotate_right(x0,n ), rotate_right(x1,n ), ... )
   wwv_rol_vector(x,y)   returns wwv( rotate_left (x0,y0), rotate_left (x1,y1), ... )
   wwv_ror_vector(x,y)   returns wwv( rotate_right(x0,y0), rotate_right(x1,y1), ... )

   The variable variants are slower but do not require the rotate
   amount to be known at compile time. */

#define wwv_rol(a,imm) _mm512_rol_epi64( (a), (imm)&63 )
#define wwv_ror(a,imm) _mm512_ror_epi64( (a), (imm)&63 )

static inline wwv_t wwv_rol_variable( wwv_t a, ulong n ) { return wwv_or( wwv_shl( a, n & 63UL ), wwv_shr( a, (-n) & 63UL ) ); }
static inline wwv_t wwv_ror_variable( wwv_t a, ulong n ) { return wwv_or( wwv_shr( a, n & 63UL ), wwv_shl( a, (-n) & 63UL ) ); }

static inline wwv_t wwv_rol_vector( wwv_t a, wwv_t b ) {
  wwv_t m = wwv_bcast( 63UL );
  return wwv_or( wwv_shl_vector( a, wwv_and( b, m ) ), wwv_shr_vector( a, wwv_and( wwv_neg( b ), m ) ) );
}

static inline wwv_t wwv_ror_vector( wwv_t a, wwv_t b ) {
  wwv_t m = wwv_bcast( 63UL );
  return wwv_or( wwv_shr_vector( a, wwv_and( b, m ) ), wwv_shl_vector( a, wwv_and( wwv_neg( b ), m ) ) );
}
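
/* Note on the above: the & 63 reduces an arbitrary rotate amount mod
   64 and (-n) & 63 == (64-n) & 63 supplies the complementary shift.
   In particular, n==0 maps both shift amounts to 0 (rather than an
   out-of-range shift by 64) and the OR still yields a. */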

/* wwv_bswap(x) returns wwv( bswap(x0), bswap(x1), ... ) */

#define wwv_bswap( x ) _mm512_shuffle_epi8( (x), _mm512_set_epi8( 8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7, \
                                                                  8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7, \
                                                                  8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7, \
                                                                  8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7 ) )
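
/* For example (an illustrative sketch, not part of the API), loading
   big endian (e.g. network byte order) ulongs from an arbitrarily
   aligned buffer into host little endian lanes:

     wwv_t v = wwv_bswap( wwv_ldu( p ) );

   where p is a hypothetical pointer to 64 bytes of big endian data. */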

/* Comparison operations */
/* mask(c0,c1,...) means (((int)c0)<<0) | (((int)c1)<<1) | ... */

#define wwv_eq(x,y) ((int)_mm512_cmpeq_epu64_mask(  (x), (y) )) /* mask( x0==y0, x1==y1, ... ) */
#define wwv_gt(x,y) ((int)_mm512_cmpgt_epu64_mask(  (x), (y) )) /* mask( x0> y0, x1> y1, ... ) */
#define wwv_lt(x,y) ((int)_mm512_cmplt_epu64_mask(  (x), (y) )) /* mask( x0< y0, x1< y1, ... ) */
#define wwv_ne(x,y) ((int)_mm512_cmpneq_epu64_mask( (x), (y) )) /* mask( x0!=y0, x1!=y1, ... ) */
#define wwv_ge(x,y) ((int)_mm512_cmpge_epu64_mask(  (x), (y) )) /* mask( x0>=y0, x1>=y1, ... ) */
#define wwv_le(x,y) ((int)_mm512_cmple_epu64_mask(  (x), (y) )) /* mask( x0<=y0, x1<=y1, ... ) */

#define wwv_lnot(x)    wwv_eq( (x), wwv_zero() ) /* mask(  !x0,  !x1, ... ) */
#define wwv_lnotnot(x) wwv_ne( (x), wwv_zero() ) /* mask( !!x0, !!x1, ... ) */

/* Conditional operations */
/* cn means bit n of c */

#define wwv_if(c,x,y)          _mm512_mask_blend_epi64 ( (__mmask8)(c), (y), (x) )      /* wwv( c0? x0    :y0, ... ) */
#define wwv_add_if(c,x,y,z)    _mm512_mask_add_epi64   ( (z), (__mmask8)(c), (x), (y) ) /* wwv( c0?(x0+y0):z0, ... ) */
#define wwv_sub_if(c,x,y,z)    _mm512_mask_sub_epi64   ( (z), (__mmask8)(c), (x), (y) ) /* wwv( c0?(x0-y0):z0, ... ) */

#define wwv_and_if(c,x,y,z)    _mm512_mask_and_epi64   ( (z), (__mmask8)(c), (x), (y) ) /* wwv( c0?( x0&y0):z0, ... ) */
#define wwv_andnot_if(c,x,y,z) _mm512_mask_andnot_epi64( (z), (__mmask8)(c), (x), (y) ) /* wwv( c0?(~x0&y0):z0, ... ) */
#define wwv_or_if(c,x,y,z)     _mm512_mask_or_epi64    ( (z), (__mmask8)(c), (x), (y) ) /* wwv( c0?( x0|y0):z0, ... ) */
#define wwv_xor_if(c,x,y,z)    _mm512_mask_xor_epi64   ( (z), (__mmask8)(c), (x), (y) ) /* wwv( c0?( x0^y0):z0, ... ) */
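
/* For example (an illustrative sketch, not part of the API), a
   branchless per lane saturating decrement built from a comparison
   mask and a conditional subtract:

     int   c = wwv_gt( x, wwv_zero() );          // lanes with x* > 0
     wwv_t y = wwv_sub_if( c, x, wwv_one(), x ); // x*-1 there, 0 stays 0 */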

/* Conversions */

/* wwv_to_wwi(x) returns [  (int)x0,0,  (int)x1,0, ...  (int)x7,0 ]
   wwv_to_wwu(x) returns [ (uint)x0,0, (uint)x1,0, ... (uint)x7,0 ]
   wwv_to_wwl(x) returns [ (long)x0,   (long)x1,   ... (long)x7   ] */

#define wwv_to_wwi(x) wwv_and( (x), wwv_bcast( (ulong)UINT_MAX ) )
#define wwv_to_wwu(x) wwv_and( (x), wwv_bcast( (ulong)UINT_MAX ) )
#define wwv_to_wwl(x) (x)

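/* The _raw variants return the raw underlying vector with no
   conversion operations (the bits are reinterpreted as is). */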
#define wwv_to_wwi_raw(x) (x)
#define wwv_to_wwu_raw(x) (x)
#define wwv_to_wwl_raw(x) (x)

/* Misc operations */

/* wwv_pack_halves(x,imm0,y,imm1) packs half of x and half of y into a
   wwv.  imm0/imm1 select which half of x and y to pack.  imm0 / imm1
   should be in [0,1].  That is, this returns:

     [ if( imm0, x(4:7), x(0:3) ) if( imm1, y(4:7), y(0:3) ) ]

   wwv_pack_h0_h1(x,y) does the wwv_pack_halves(x,0,y,1) case faster.
   Hat tip to Philip Taffet for pointing this out. */

#define wwv_pack_halves(x,imm0,y,imm1) _mm512_shuffle_i64x2( (x), (y), 68+10*(imm0)+160*(imm1) )
#define wwv_pack_h0_h1(x,y)            _mm512_mask_blend_epi64( (__mmask8)0xF0, (x), (y) )
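
/* A sketch of where the magic immediate comes from:
   _mm512_shuffle_i64x2 selects four 128-bit lanes, two from x via imm
   bits [1:0] and [3:2] and two from y via bits [5:4] and [7:6], where
   halves correspond to 128-bit lane pairs {0,1} (low) and {2,3}
   (high).  The imm0==imm1==0 case is 0 + (1<<2) + (0<<4) + (1<<6) =
   68, taking the high half of x instead adds (2 + (3<<2)) - 4 = 10,
   and taking the high half of y instead adds (2<<4) + (3<<6) - 64 =
   160. */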

/* wwv_madd52lo(a,b,c) returns LO64( a + LO52( LO52(b)*LO52(c) ) )
   wwv_madd52hi(a,b,c) returns LO64( a + HI52( LO52(b)*LO52(c) ) ) */

#define wwv_madd52lo(a,b,c) _mm512_madd52lo_epu64( (a), (b), (c) )
#define wwv_madd52hi(a,b,c) _mm512_madd52hi_epu64( (a), (b), (c) )
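
/* For example (an illustrative sketch, not part of the API), splitting
   the full 104-bit products of two vectors of 52-bit limbs b and c
   into their low and high 52-bit halves:

     wwv_t lo = wwv_madd52lo( wwv_zero(), b, c ); // bits [ 0, 52) of b*c
     wwv_t hi = wwv_madd52hi( wwv_zero(), b, c ); // bits [52,104) of b*c

   This is the usual building block for vectorized wide limb arithmetic
   (e.g. big integer or field element multiplication). */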

/* wwv_slide(x,y,imm) treats x as a FIFO with the oldest / newest
   element at lane 0 / 7.  Returns the result of dequeuing x imm times
   and enqueuing the values y0 ... y{imm-1} in that order.  imm should
   be in [0,7].  For example, the imm==5 case returns:
     [ x5 x6 x7 y0 y1 y2 y3 y4 ]. */

#define wwv_slide(x,y,imm) _mm512_alignr_epi64( (y), (x), (imm) )

/* wwv_unpack unpacks the wwv x into its ulong components x0,x1,...x7. */

#define wwv_unpack( x, x0,x1,x2,x3,x4,x5,x6,x7 ) do {                       \
    __m512i _wwv_unpack_x  = (x);                                           \
    __m256i _wwv_unpack_xl = _mm512_extracti64x4_epi64( _wwv_unpack_x, 0 ); \
    __m256i _wwv_unpack_xh = _mm512_extracti64x4_epi64( _wwv_unpack_x, 1 ); \
    (x0) = (ulong)_mm256_extract_epi64( _wwv_unpack_xl, 0 );                \
    (x1) = (ulong)_mm256_extract_epi64( _wwv_unpack_xl, 1 );                \
    (x2) = (ulong)_mm256_extract_epi64( _wwv_unpack_xl, 2 );                \
    (x3) = (ulong)_mm256_extract_epi64( _wwv_unpack_xl, 3 );                \
    (x4) = (ulong)_mm256_extract_epi64( _wwv_unpack_xh, 0 );                \
    (x5) = (ulong)_mm256_extract_epi64( _wwv_unpack_xh, 1 );                \
    (x6) = (ulong)_mm256_extract_epi64( _wwv_unpack_xh, 2 );                \
    (x7) = (ulong)_mm256_extract_epi64( _wwv_unpack_xh, 3 );                \
  } while(0)
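
/* For example (an illustrative sketch, not part of the API):

     ulong x0,x1,x2,x3,x4,x5,x6,x7;
     wwv_unpack( v, x0,x1,x2,x3,x4,x5,x6,x7 );

   leaves lane i of v in xi. */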

/* wwv_transpose_8x8 sets wwv_t's c0,c1,...c7 to the columns of an 8x8
   ulong matrix given the rows of the matrix in wwv_t's r0,r1,...r7.
   In-place operation fine. */

#define wwv_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 ) do {                \
    wwv_t _wwv_transpose_r0 = (r0); wwv_t _wwv_transpose_r1 = (r1);                               \
    wwv_t _wwv_transpose_r2 = (r2); wwv_t _wwv_transpose_r3 = (r3);                               \
    wwv_t _wwv_transpose_r4 = (r4); wwv_t _wwv_transpose_r5 = (r5);                               \
    wwv_t _wwv_transpose_r6 = (r6); wwv_t _wwv_transpose_r7 = (r7);                               \
                                                                                                  \
    /* Outer 4x4 transpose of 2x2 blocks */                                                       \
    wwv_t _wwv_transpose_t0 = _mm512_shuffle_i64x2( _wwv_transpose_r0, _wwv_transpose_r2, 0x88 ); \
    wwv_t _wwv_transpose_t1 = _mm512_shuffle_i64x2( _wwv_transpose_r1, _wwv_transpose_r3, 0x88 ); \
    wwv_t _wwv_transpose_t2 = _mm512_shuffle_i64x2( _wwv_transpose_r0, _wwv_transpose_r2, 0xdd ); \
    wwv_t _wwv_transpose_t3 = _mm512_shuffle_i64x2( _wwv_transpose_r1, _wwv_transpose_r3, 0xdd ); \
    wwv_t _wwv_transpose_t4 = _mm512_shuffle_i64x2( _wwv_transpose_r4, _wwv_transpose_r6, 0x88 ); \
    wwv_t _wwv_transpose_t5 = _mm512_shuffle_i64x2( _wwv_transpose_r5, _wwv_transpose_r7, 0x88 ); \
    wwv_t _wwv_transpose_t6 = _mm512_shuffle_i64x2( _wwv_transpose_r4, _wwv_transpose_r6, 0xdd ); \
    wwv_t _wwv_transpose_t7 = _mm512_shuffle_i64x2( _wwv_transpose_r5, _wwv_transpose_r7, 0xdd ); \
                                                                                                  \
    /**/  _wwv_transpose_r0 = _mm512_shuffle_i64x2( _wwv_transpose_t0, _wwv_transpose_t4, 0x88 ); \
    /**/  _wwv_transpose_r1 = _mm512_shuffle_i64x2( _wwv_transpose_t1, _wwv_transpose_t5, 0x88 ); \
    /**/  _wwv_transpose_r2 = _mm512_shuffle_i64x2( _wwv_transpose_t2, _wwv_transpose_t6, 0x88 ); \
    /**/  _wwv_transpose_r3 = _mm512_shuffle_i64x2( _wwv_transpose_t3, _wwv_transpose_t7, 0x88 ); \
    /**/  _wwv_transpose_r4 = _mm512_shuffle_i64x2( _wwv_transpose_t0, _wwv_transpose_t4, 0xdd ); \
    /**/  _wwv_transpose_r5 = _mm512_shuffle_i64x2( _wwv_transpose_t1, _wwv_transpose_t5, 0xdd ); \
    /**/  _wwv_transpose_r6 = _mm512_shuffle_i64x2( _wwv_transpose_t2, _wwv_transpose_t6, 0xdd ); \
    /**/  _wwv_transpose_r7 = _mm512_shuffle_i64x2( _wwv_transpose_t3, _wwv_transpose_t7, 0xdd ); \
                                                                                                  \
    /* Inner 2x2 transpose of 1x1 blocks */                                                       \
    /**/  (c0) = _mm512_unpacklo_epi64( _wwv_transpose_r0, _wwv_transpose_r1 );                   \
    /**/  (c1) = _mm512_unpackhi_epi64( _wwv_transpose_r0, _wwv_transpose_r1 );                   \
    /**/  (c2) = _mm512_unpacklo_epi64( _wwv_transpose_r2, _wwv_transpose_r3 );                   \
    /**/  (c3) = _mm512_unpackhi_epi64( _wwv_transpose_r2, _wwv_transpose_r3 );                   \
    /**/  (c4) = _mm512_unpacklo_epi64( _wwv_transpose_r4, _wwv_transpose_r5 );                   \
    /**/  (c5) = _mm512_unpackhi_epi64( _wwv_transpose_r4, _wwv_transpose_r5 );                   \
    /**/  (c6) = _mm512_unpacklo_epi64( _wwv_transpose_r6, _wwv_transpose_r7 );                   \
    /**/  (c7) = _mm512_unpackhi_epi64( _wwv_transpose_r6, _wwv_transpose_r7 );                   \
  } while(0)
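
/* For example (an illustrative sketch, not part of the API),
   transposing a row major 8x8 ulong matrix m (64-byte aligned) in
   place:

     wwv_t r0 = wwv_ld( m      ); wwv_t r1 = wwv_ld( m +  8 );
     wwv_t r2 = wwv_ld( m + 16 ); wwv_t r3 = wwv_ld( m + 24 );
     wwv_t r4 = wwv_ld( m + 32 ); wwv_t r5 = wwv_ld( m + 40 );
     wwv_t r6 = wwv_ld( m + 48 ); wwv_t r7 = wwv_ld( m + 56 );
     wwv_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, r0,r1,r2,r3,r4,r5,r6,r7 );
     wwv_st( m,      r0 ); wwv_st( m +  8, r1 );
     wwv_st( m + 16, r2 ); wwv_st( m + 24, r3 );
     wwv_st( m + 32, r4 ); wwv_st( m + 40, r5 );
     wwv_st( m + 48, r6 ); wwv_st( m + 56, r7 ); */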