Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx512_h
2 : #error "Do not include this directly; use fd_avx512.h"
3 : #endif /* HEADER_fd_src_util_simd_fd_avx512_h is presumably defined by fd_avx512.h before it includes this file */
4 :
5 : /* TODO: REDUCE, EXTRACT, ADDITIONAL LANE OPS, ... */
6 : /* TODO: USE INT FOR THE SCALAR N ROL/ROR (AND IN OTHER ROL/ROR)? */
7 : /* TODO: BACKPORT UNPACKS TO AVX AND SSE? */
8 :
9 : /* Vector uint API ***************************************************/
10 :
11 : /* A wwu_t is a vector where each 32-bit wide lane holds an unsigned
12 : 32-bit integer (a "uint"). As wwu_t is an alias for __m512i, there are 16 such lanes (written x0,x1,...xf below).
13 :
14 : These mirror the other APIs as much as possible. Macros are
15 : preferred over static inlines when it is possible to do it robustly
16 : to reduce the risk of the compiler mucking it up. */
17 :
18 10412550177 : #define wwu_t __m512i
19 :
20 : /* Constructors */
21 :
22 : /* wwu(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf)
23 : returns the wwu_t [x0 x1 ... xf] where x* are uints. Each argument is cast to int because _mm512_setr_epi32 takes ints. */
24 :
25 : #define wwu(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf) \
26 36376122 : _mm512_setr_epi32( (int)(x0), (int)(x1), (int)(x2), (int)(x3), (int)(x4), (int)(x5), (int)(x6), (int)(x7), \
27 36376122 : (int)(x8), (int)(x9), (int)(xa), (int)(xb), (int)(xc), (int)(xd), (int)(xe), (int)(xf) )
28 :
29 118251302 : #define wwu_bcast(x) _mm512_set1_epi32( (int)(x) ) /* wwu(x, x, ... x), x is likewise cast to int */
30 :
31 : /* wwu_permute(p,x) returns:
32 : wwu( x(p(0)), x(p(1)), ... x(p(15)) ).
33 : As such p(*) should be uints in [0,15]. Here p is itself a vector of lane indices and x(i) denotes lane i of x. */
34 :
35 : #define wwu_permute(p,x) _mm512_permutexvar_epi32( (p), (x) )
36 :
37 : /* wwu_select(p,x,y) concatenates the wwu_t's x and y into
38 : z = [ x0 x1 ... xf y0 y1 ... yf ]
39 : and then returns:
40 : wwu( z(p(0)), z(p(1)), ... z(p(15)) ).
41 : As such p(*) should be uints in [0,31]. Note the intrinsic takes the index vector p as its middle argument. */
42 :
43 41246456 : #define wwu_select(p,x,y) _mm512_permutex2var_epi32( (x), (p), (y) )
44 :
45 : /* Predefined constants (each expands to an intrinsic call; no storage is declared) */
46 :
47 51534677 : #define wwu_zero() _mm512_setzero_si512() /* wwu(0, 0, ... 0), every lane is 0 */
48 : #define wwu_one() _mm512_set1_epi32( 1 ) /* wwu(1, 1, ... 1), every lane is 1 */
49 :
50 : /* Memory operations */
51 : /* Note: wwu_{ld,st} assume m is 64-byte aligned while wwu_{ldu,stu}
52 : allow m to have arbitrary alignment. The aligned variants take uint pointers; the unaligned variants take void pointers. All four move 16 uints. */
53 :
54 42502423 : static inline wwu_t wwu_ld( uint const * m ) { return _mm512_load_epi32( m ); } /* wwu( m[0], m[1], ... m[15] ) */
55 171000000 : static inline void wwu_st( uint * m, wwu_t x ) { _mm512_store_epi32( m, x ); } /* does m[0] = x0, m[1] = x1, ... m[15] = xf */
56 :
57 935957008 : static inline wwu_t wwu_ldu( void const * m ) { return _mm512_loadu_epi32( m ); } /* wwu( m[0], m[1], ... m[15]) */
58 1000000 : static inline void wwu_stu( void * m, wwu_t x ) { _mm512_storeu_epi32( m, x ); } /* does m[0] = x0, m[1] = x1, ... m[15] = xf */
59 :
60 : /* Arithmetic operations (all lane-wise on the 16 uint lanes) */
61 :
62 : #define wwu_neg(x) _mm512_sub_epi32( _mm512_setzero_si512(), (x) ) /* wwu( -x0, -x1, ... -xf ), computed as 0-x per lane */
63 : #define wwu_abs(x) (x) /* wwu( x0, x1, ... xf ), identity as lanes are unsigned */
64 :
65 : #define wwu_min(x,y) _mm512_min_epu32 ( (x), (y) ) /* wwu( min(x0,y0), min(x1,y1), ... min(xf,yf) ) */
66 : #define wwu_max(x,y) _mm512_max_epu32 ( (x), (y) ) /* wwu( max(x0,y0), max(x1,y1), ... max(xf,yf) ) */
67 17764183152 : #define wwu_add(x,y) _mm512_add_epi32 ( (x), (y) ) /* wwu( x0+y0, x1+y1, ... xf+yf ) */
68 : #define wwu_sub(x,y) _mm512_sub_epi32 ( (x), (y) ) /* wwu( x0-y0, x1-y1, ... xf-yf ) */
69 : #define wwu_mul(x,y) _mm512_mullo_epi32( (x), (y) ) /* wwu( x0*y0, x1*y1, ... xf*yf ), low 32 bits of each product */
70 :
71 : /* Binary operations */
72 : /* Note: shifts assume n and/or y* are in [0,31]. Rotates work for
73 : arbitrary values */
74 :
75 4000000 : #define wwu_not(x) _mm512_xor_epi32( _mm512_set1_epi32( -1 ), (x) ) /* wwu( ~x0, ~x1, ... ~xf ), via xor with all ones */
76 :
77 : #define wwu_shl(x,n) _mm512_slli_epi32 ( (x), (uint)(n) ) /* wwu( x0<<n, x1<<n, ... xf<<n ) */
78 0 : #define wwu_shr(x,n) _mm512_srli_epi32 ( (x), (uint)(n) ) /* wwu( x0>>n, x1>>n, ... xf>>n ) */
79 958951476 : #define wwu_shl_vector(x,y) _mm512_sllv_epi32 ( (x), (y) ) /* wwu( x0<<y0, x1<<y1, ... xf<<yf ) */
80 : #define wwu_shr_vector(x,y) _mm512_srlv_epi32 ( (x), (y) ) /* wwu( x0>>y0, x1>>y1, ... xf>>yf ) */
81 1546539368 : #define wwu_and(x,y) _mm512_and_epi32 ( (x), (y) ) /* wwu( x0&y0, x1&y1, ... xf&yf ) */
82 : #define wwu_andnot(x,y) _mm512_andnot_epi32( (x), (y) ) /* wwu( ~x0&y0, ~x1&y1, ... ~xf&yf ) */
83 1550539368 : #define wwu_or(x,y) _mm512_or_epi32 ( (x), (y) ) /* wwu( x0|y0, x1|y1, ... xf|yf ) */
84 1067063630 : #define wwu_xor(x,y) _mm512_xor_epi32 ( (x), (y) ) /* wwu( x0^y0, x1^y1, ... xf^yf ) */
85 :
86 : /* wwu_rol(x,n) returns wwu( rotate_left (x0,n ), rotate_left (x1,n ), ... )
87 : wwu_ror(x,n) returns wwu( rotate_right(x0,n ), rotate_right(x1,n ), ... )
88 : wwu_rol_variable(x,n) returns wwu( rotate_left (x0,n ), rotate_left (x1,n ), ... )
89 : wwu_ror_variable(x,n) returns wwu( rotate_right(x0,n ), rotate_right(x1,n ), ... )
90 : wwu_rol_vector(x,y) returns wwu( rotate_left (x0,y0), rotate_left (x1,y1), ... )
91 : wwu_ror_vector(x,y) returns wwu( rotate_right(x0,y0), rotate_right(x1,y1), ... )
92 :
93 : The variable variants are slower but do not require the shift amount
94 : to be known at compile time. All variants reduce the rotate amount mod 32; the variable and vector variants are built from a shift by the amount OR-ed with the opposite shift by the negated amount. */
95 :
96 4000000 : #define wwu_rol(a,imm) _mm512_rol_epi32( (a), (imm)&31U ) /* imm is reduced mod 32 */
97 4000000 : #define wwu_ror(a,imm) _mm512_ror_epi32( (a), (imm)&31U ) /* imm is reduced mod 32 */
98 :
99 1000000 : static inline wwu_t wwu_rol_variable( wwu_t a, uint n ) { return wwu_or( wwu_shl( a, n & 31U ), wwu_shr( a, (-n) & 31U ) ); } /* (-n)&31U is the complementary count (32-n) mod 32 */
100 1000000 : static inline wwu_t wwu_ror_variable( wwu_t a, uint n ) { return wwu_or( wwu_shr( a, n & 31U ), wwu_shl( a, (-n) & 31U ) ); } /* (-n)&31U is the complementary count (32-n) mod 32 */
101 :
102 1000000 : static inline wwu_t wwu_rol_vector( wwu_t a, wwu_t b ) {
103 1000000 : wwu_t m = wwu_bcast( 31U ); /* per lane mask that reduces rotate amounts mod 32 */
104 1000000 : return wwu_or( wwu_shl_vector( a, wwu_and( b, m ) ), wwu_shr_vector( a, wwu_and( wwu_neg( b ), m ) ) );
105 1000000 : }
106 :
107 1000000 : static inline wwu_t wwu_ror_vector( wwu_t a, wwu_t b ) {
108 1000000 : wwu_t m = wwu_bcast( 31U ); /* per lane mask that reduces rotate amounts mod 32 */
109 1000000 : return wwu_or( wwu_shr_vector( a, wwu_and( b, m ) ), wwu_shl_vector( a, wwu_and( wwu_neg( b ), m ) ) );
110 1000000 : }
111 :
112 : /* wwu_bswap(x) returns wwu( bswap(x0), bswap(x1), ... ), i.e. the byte order within each 32-bit lane is reversed using a byte shuffle with a fixed control vector */
113 :
114 : #define wwu_bswap( x ) _mm512_shuffle_epi8( (x), _mm512_set_epi8( 12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3, \
115 : 12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3, \
116 : 12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3, \
117 : 12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3 ) )
118 :
119 : /* Comparison operations */
120 : /* mask(c0,c1,...) means (((int)c0)<<0) | (((int)c1)<<1) | ... Each result is the lane mask of an unsigned 32-bit compare intrinsic, cast to int. */
121 :
122 : #define wwu_eq(x,y) ((int)_mm512_cmpeq_epu32_mask( (x), (y) )) /* mask( x0==y0, x1==y1, ... ) */
123 : #define wwu_gt(x,y) ((int)_mm512_cmpgt_epu32_mask( (x), (y) )) /* mask( x0> y0, x1> y1, ... ) */
124 : #define wwu_lt(x,y) ((int)_mm512_cmplt_epu32_mask( (x), (y) )) /* mask( x0< y0, x1< y1, ... ) */
125 : #define wwu_ne(x,y) ((int)_mm512_cmpneq_epu32_mask( (x), (y) )) /* mask( x0!=y0, x1!=y1, ... ) */
126 : #define wwu_ge(x,y) ((int)_mm512_cmpge_epu32_mask( (x), (y) )) /* mask( x0>=y0, x1>=y1, ... ) */
127 : #define wwu_le(x,y) ((int)_mm512_cmple_epu32_mask( (x), (y) )) /* mask( x0<=y0, x1<=y1, ... ) */
128 :
129 : #define wwu_lnot(x) wwu_eq( (x), wwu_zero() ) /* mask( !x0, !x1, ... ), i.e. lanes equal to zero */
130 : #define wwu_lnotnot(x) wwu_ne( (x), wwu_zero() ) /* mask( !!x0, !!x1, ... ), i.e. lanes not equal to zero */
131 :
132 : /* Conditional operations */
133 : /* cn means bit n of c. c is cast to __mmask16, so only bits 0-15 are used. */
134 :
135 2000000 : #define wwu_if(c,x,y) _mm512_mask_blend_epi32 ( (__mmask16)(c), (y), (x) ) /* wwu( c0? x0 :y0, ... ), note y and x are swapped in the intrinsic call */
136 :
137 467478504 : #define wwu_add_if(c,x,y,z) _mm512_mask_add_epi32 ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0+y0):z0, ... ) */
138 : #define wwu_sub_if(c,x,y,z) _mm512_mask_sub_epi32 ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0-y0):z0, ... ) */
139 :
140 : #define wwu_and_if(c,x,y,z) _mm512_mask_and_epi32 ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0&y0):z0, ... ) */
141 : #define wwu_andnot_if(c,x,y,z) _mm512_mask_andnot_epi32( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(~x0&y0):z0, ... ) */
142 : #define wwu_or_if(c,x,y,z) _mm512_mask_or_epi32 ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0|y0):z0, ... ) */
143 : #define wwu_xor_if(c,x,y,z) _mm512_mask_xor_epi32 ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0^y0):z0, ... ) */
144 :
145 : /* Conversions */
146 :
147 : /* wwu_to_wwi( x ) returns wwi( (int)x0, (int)x1, ... (int)x15 )
148 :
149 : wwu_to_wwl( x, 0 ) returns wwl( (long)x0, (long)x2, ... (long)x14 )
150 : wwu_to_wwl( x, 1 ) returns wwl( (long)x1, (long)x3, ... (long)x15 )
151 :
152 : wwu_to_wwv( x, 0 ) returns wwv( (ulong)x0, (ulong)x2, ... (ulong)x14 )
153 : wwu_to_wwv( x, 1 ) returns wwv( (ulong)x1, (ulong)x3, ... (ulong)x15 )
154 :
155 : The wwl / wwv variants evaluate x once into a temporary and rely on wwl_shl, wwl_shru, wwv_shl and wwv_shr, which are not defined in this section. The *_raw variants return x unchanged. TODO: consider _mm512_cvtepu32_* intrinsics? */
156 :
157 : #define wwu_to_wwi( x ) (x)
158 : #define wwu_to_wwl( x, odd ) /* trinary should be compile time */ \
159 : (__extension__({ wwl_t _wwu_to_wwl_tmp = (x); wwl_shru( (odd) ? _wwu_to_wwl_tmp : wwl_shl( _wwu_to_wwl_tmp, 32 ), 32 ); }))
160 : #define wwu_to_wwv( x, odd ) /* trinary should be compile time */ \
161 : (__extension__({ wwv_t _wwu_to_wwv_tmp = (x); wwv_shr ( (odd) ? _wwu_to_wwv_tmp : wwv_shl( _wwu_to_wwv_tmp, 32 ), 32 ); }))
162 :
163 : #define wwu_to_wwi_raw(x) (x)
164 : #define wwu_to_wwl_raw(x) (x)
165 : #define wwu_to_wwv_raw(x) (x)
166 :
167 : /* Misc operations */
168 :
169 : /* wwu_pack_halves(x,imm0,y,imm1) packs half of x and half of y into a
170 : wwu. imm0/imm1 select which half of x and y to pack. imm0 / imm1
171 : should be in [0,1]. That is, this returns:
172 :
173 : [ if( imm0, x(8:15), x(0:7) ) if( imm1, y(8:15), y(0:7) ) ]
174 :
175 : wwu_pack_h0_h1(x,y) does the wwu_pack_halves(x,0,y,1) case faster.
176 : Hat tip to Philip Taffet for pointing this out. */
177 :
178 : #define wwu_pack_halves(x,imm0,y,imm1) _mm512_shuffle_i32x4( (x), (y), 68+10*(imm0)+160*(imm1) ) /* 68==0x44 picks x(0:7) and y(0:7); +10 / +160 switch x / y to the upper half */
179 : #define wwu_pack_h0_h1(x,y) _mm512_mask_blend_epi32( (__mmask16)0xFF00, (x), (y) ) /* lanes 0-7 from x, lanes 8-15 from y */
180 :
181 : /* wwu_slide(x,y,imm) treats x as a FIFO with the oldest / newest
182 : element at lane 0 / 15. Returns the result of dequeuing x imm times
183 : and enqueuing the values y0 ... y{imm-1} in that order. imm should be
184 : in [0,15]. For example, in the imm==5 case, returns:
185 : [ x5 x6 ... xf y0 y1 y2 y3 y4 ]. */
186 :
187 : #define wwu_slide(x,y,imm) _mm512_alignr_epi32( (y), (x), (imm) )
188 :
189 : /* wwu_unpack unpacks the wwu x into its uint components x0,x1,...xf. x is evaluated once; x0,...xf must be assignable. */
190 :
191 1000000 : #define wwu_unpack( x, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf ) do { \
192 1000000 : __m512i _wwu_unpack_x = (x); \
193 1000000 : __m256i _wwu_unpack_xl = _mm512_extracti32x8_epi32( _wwu_unpack_x, 0 ); \
194 1000000 : __m256i _wwu_unpack_xh = _mm512_extracti32x8_epi32( _wwu_unpack_x, 1 ); \
195 1000000 : (x0) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 0 ); \
196 1000000 : (x1) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 1 ); \
197 1000000 : (x2) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 2 ); \
198 1000000 : (x3) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 3 ); \
199 1000000 : (x4) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 4 ); \
200 1000000 : (x5) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 5 ); \
201 1000000 : (x6) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 6 ); \
202 1000000 : (x7) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 7 ); \
203 1000000 : (x8) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 0 ); \
204 1000000 : (x9) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 1 ); \
205 1000000 : (xa) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 2 ); \
206 1000000 : (xb) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 3 ); \
207 1000000 : (xc) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 4 ); \
208 1000000 : (xd) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 5 ); \
209 1000000 : (xe) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 6 ); \
210 1000000 : (xf) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 7 ); \
211 1000000 : } while(0)
212 :
213 : /* wwu_transpose_16x16 sets wwu_t's c0,c1,...cf to the columns of a
214 : 16x16 uint matrix given the rows of the matrix in wwu_t's
215 : r0,r1,...rf. In-place operation fine: every r* is copied into a local before any c* is assigned. */
216 :
217 : #define wwu_transpose_16x16( r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,ra,rb,rc,rd,re,rf, \
218 59434813 : c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf ) do { \
219 59434813 : wwu_t _wwu_transpose_r0 = (r0); wwu_t _wwu_transpose_r1 = (r1); \
220 59434813 : wwu_t _wwu_transpose_r2 = (r2); wwu_t _wwu_transpose_r3 = (r3); \
221 59434813 : wwu_t _wwu_transpose_r4 = (r4); wwu_t _wwu_transpose_r5 = (r5); \
222 59434813 : wwu_t _wwu_transpose_r6 = (r6); wwu_t _wwu_transpose_r7 = (r7); \
223 59434813 : wwu_t _wwu_transpose_r8 = (r8); wwu_t _wwu_transpose_r9 = (r9); \
224 59434813 : wwu_t _wwu_transpose_ra = (ra); wwu_t _wwu_transpose_rb = (rb); \
225 59434813 : wwu_t _wwu_transpose_rc = (rc); wwu_t _wwu_transpose_rd = (rd); \
226 59434813 : wwu_t _wwu_transpose_re = (re); wwu_t _wwu_transpose_rf = (rf); \
227 59434813 : \
228 59434813 : /* Outer 4x4 transpose of 4x4 blocks */ \
229 59434813 : wwu_t _wwu_transpose_t0 = _mm512_shuffle_i32x4( _wwu_transpose_r0, _wwu_transpose_r4, 0x88 ); \
230 59434813 : wwu_t _wwu_transpose_t1 = _mm512_shuffle_i32x4( _wwu_transpose_r1, _wwu_transpose_r5, 0x88 ); \
231 59434813 : wwu_t _wwu_transpose_t2 = _mm512_shuffle_i32x4( _wwu_transpose_r2, _wwu_transpose_r6, 0x88 ); \
232 59434813 : wwu_t _wwu_transpose_t3 = _mm512_shuffle_i32x4( _wwu_transpose_r3, _wwu_transpose_r7, 0x88 ); \
233 59434813 : wwu_t _wwu_transpose_t4 = _mm512_shuffle_i32x4( _wwu_transpose_r0, _wwu_transpose_r4, 0xdd ); \
234 59434813 : wwu_t _wwu_transpose_t5 = _mm512_shuffle_i32x4( _wwu_transpose_r1, _wwu_transpose_r5, 0xdd ); \
235 59434813 : wwu_t _wwu_transpose_t6 = _mm512_shuffle_i32x4( _wwu_transpose_r2, _wwu_transpose_r6, 0xdd ); \
236 59434813 : wwu_t _wwu_transpose_t7 = _mm512_shuffle_i32x4( _wwu_transpose_r3, _wwu_transpose_r7, 0xdd ); \
237 59434813 : wwu_t _wwu_transpose_t8 = _mm512_shuffle_i32x4( _wwu_transpose_r8, _wwu_transpose_rc, 0x88 ); \
238 59434813 : wwu_t _wwu_transpose_t9 = _mm512_shuffle_i32x4( _wwu_transpose_r9, _wwu_transpose_rd, 0x88 ); \
239 59434813 : wwu_t _wwu_transpose_ta = _mm512_shuffle_i32x4( _wwu_transpose_ra, _wwu_transpose_re, 0x88 ); \
240 59434813 : wwu_t _wwu_transpose_tb = _mm512_shuffle_i32x4( _wwu_transpose_rb, _wwu_transpose_rf, 0x88 ); \
241 59434813 : wwu_t _wwu_transpose_tc = _mm512_shuffle_i32x4( _wwu_transpose_r8, _wwu_transpose_rc, 0xdd ); \
242 59434813 : wwu_t _wwu_transpose_td = _mm512_shuffle_i32x4( _wwu_transpose_r9, _wwu_transpose_rd, 0xdd ); \
243 59434813 : wwu_t _wwu_transpose_te = _mm512_shuffle_i32x4( _wwu_transpose_ra, _wwu_transpose_re, 0xdd ); \
244 59434813 : wwu_t _wwu_transpose_tf = _mm512_shuffle_i32x4( _wwu_transpose_rb, _wwu_transpose_rf, 0xdd ); \
245 59434813 : \
246 59434813 : /**/ _wwu_transpose_r0 = _mm512_shuffle_i32x4( _wwu_transpose_t0, _wwu_transpose_t8, 0x88 ); \
247 59434813 : /**/ _wwu_transpose_r1 = _mm512_shuffle_i32x4( _wwu_transpose_t1, _wwu_transpose_t9, 0x88 ); \
248 59434813 : /**/ _wwu_transpose_r2 = _mm512_shuffle_i32x4( _wwu_transpose_t2, _wwu_transpose_ta, 0x88 ); \
249 59434813 : /**/ _wwu_transpose_r3 = _mm512_shuffle_i32x4( _wwu_transpose_t3, _wwu_transpose_tb, 0x88 ); \
250 59434813 : /**/ _wwu_transpose_r4 = _mm512_shuffle_i32x4( _wwu_transpose_t4, _wwu_transpose_tc, 0x88 ); \
251 59434813 : /**/ _wwu_transpose_r5 = _mm512_shuffle_i32x4( _wwu_transpose_t5, _wwu_transpose_td, 0x88 ); \
252 59434813 : /**/ _wwu_transpose_r6 = _mm512_shuffle_i32x4( _wwu_transpose_t6, _wwu_transpose_te, 0x88 ); \
253 59434813 : /**/ _wwu_transpose_r7 = _mm512_shuffle_i32x4( _wwu_transpose_t7, _wwu_transpose_tf, 0x88 ); \
254 59434813 : /**/ _wwu_transpose_r8 = _mm512_shuffle_i32x4( _wwu_transpose_t0, _wwu_transpose_t8, 0xdd ); \
255 59434813 : /**/ _wwu_transpose_r9 = _mm512_shuffle_i32x4( _wwu_transpose_t1, _wwu_transpose_t9, 0xdd ); \
256 59434813 : /**/ _wwu_transpose_ra = _mm512_shuffle_i32x4( _wwu_transpose_t2, _wwu_transpose_ta, 0xdd ); \
257 59434813 : /**/ _wwu_transpose_rb = _mm512_shuffle_i32x4( _wwu_transpose_t3, _wwu_transpose_tb, 0xdd ); \
258 59434813 : /**/ _wwu_transpose_rc = _mm512_shuffle_i32x4( _wwu_transpose_t4, _wwu_transpose_tc, 0xdd ); \
259 59434813 : /**/ _wwu_transpose_rd = _mm512_shuffle_i32x4( _wwu_transpose_t5, _wwu_transpose_td, 0xdd ); \
260 59434813 : /**/ _wwu_transpose_re = _mm512_shuffle_i32x4( _wwu_transpose_t6, _wwu_transpose_te, 0xdd ); \
261 59434813 : /**/ _wwu_transpose_rf = _mm512_shuffle_i32x4( _wwu_transpose_t7, _wwu_transpose_tf, 0xdd ); \
262 59434813 : \
263 59434813 : /* Inner 4x4 transpose of 1x1 blocks */ \
264 59434813 : /**/ _wwu_transpose_t0 = _mm512_unpacklo_epi32( _wwu_transpose_r0, _wwu_transpose_r2 ); \
265 59434813 : /**/ _wwu_transpose_t1 = _mm512_unpacklo_epi32( _wwu_transpose_r1, _wwu_transpose_r3 ); \
266 59434813 : /**/ _wwu_transpose_t2 = _mm512_unpackhi_epi32( _wwu_transpose_r0, _wwu_transpose_r2 ); \
267 59434813 : /**/ _wwu_transpose_t3 = _mm512_unpackhi_epi32( _wwu_transpose_r1, _wwu_transpose_r3 ); \
268 59434813 : /**/ _wwu_transpose_t4 = _mm512_unpacklo_epi32( _wwu_transpose_r4, _wwu_transpose_r6 ); \
269 59434813 : /**/ _wwu_transpose_t5 = _mm512_unpacklo_epi32( _wwu_transpose_r5, _wwu_transpose_r7 ); \
270 59434813 : /**/ _wwu_transpose_t6 = _mm512_unpackhi_epi32( _wwu_transpose_r4, _wwu_transpose_r6 ); \
271 59434813 : /**/ _wwu_transpose_t7 = _mm512_unpackhi_epi32( _wwu_transpose_r5, _wwu_transpose_r7 ); \
272 59434813 : /**/ _wwu_transpose_t8 = _mm512_unpacklo_epi32( _wwu_transpose_r8, _wwu_transpose_ra ); \
273 59434813 : /**/ _wwu_transpose_t9 = _mm512_unpacklo_epi32( _wwu_transpose_r9, _wwu_transpose_rb ); \
274 59434813 : /**/ _wwu_transpose_ta = _mm512_unpackhi_epi32( _wwu_transpose_r8, _wwu_transpose_ra ); \
275 59434813 : /**/ _wwu_transpose_tb = _mm512_unpackhi_epi32( _wwu_transpose_r9, _wwu_transpose_rb ); \
276 59434813 : /**/ _wwu_transpose_tc = _mm512_unpacklo_epi32( _wwu_transpose_rc, _wwu_transpose_re ); \
277 59434813 : /**/ _wwu_transpose_td = _mm512_unpacklo_epi32( _wwu_transpose_rd, _wwu_transpose_rf ); \
278 59434813 : /**/ _wwu_transpose_te = _mm512_unpackhi_epi32( _wwu_transpose_rc, _wwu_transpose_re ); \
279 59434813 : /**/ _wwu_transpose_tf = _mm512_unpackhi_epi32( _wwu_transpose_rd, _wwu_transpose_rf ); \
280 59434813 : \
281 59434813 : /**/ (c0) = _mm512_unpacklo_epi32( _wwu_transpose_t0, _wwu_transpose_t1 ); \
282 59434813 : /**/ (c1) = _mm512_unpackhi_epi32( _wwu_transpose_t0, _wwu_transpose_t1 ); \
283 59434813 : /**/ (c2) = _mm512_unpacklo_epi32( _wwu_transpose_t2, _wwu_transpose_t3 ); \
284 59434813 : /**/ (c3) = _mm512_unpackhi_epi32( _wwu_transpose_t2, _wwu_transpose_t3 ); \
285 59434813 : /**/ (c4) = _mm512_unpacklo_epi32( _wwu_transpose_t4, _wwu_transpose_t5 ); \
286 59434813 : /**/ (c5) = _mm512_unpackhi_epi32( _wwu_transpose_t4, _wwu_transpose_t5 ); \
287 59434813 : /**/ (c6) = _mm512_unpacklo_epi32( _wwu_transpose_t6, _wwu_transpose_t7 ); \
288 59434813 : /**/ (c7) = _mm512_unpackhi_epi32( _wwu_transpose_t6, _wwu_transpose_t7 ); \
289 59434813 : /**/ (c8) = _mm512_unpacklo_epi32( _wwu_transpose_t8, _wwu_transpose_t9 ); \
290 59434813 : /**/ (c9) = _mm512_unpackhi_epi32( _wwu_transpose_t8, _wwu_transpose_t9 ); \
291 59434813 : /**/ (ca) = _mm512_unpacklo_epi32( _wwu_transpose_ta, _wwu_transpose_tb ); \
292 59434813 : /**/ (cb) = _mm512_unpackhi_epi32( _wwu_transpose_ta, _wwu_transpose_tb ); \
293 59434813 : /**/ (cc) = _mm512_unpacklo_epi32( _wwu_transpose_tc, _wwu_transpose_td ); \
294 59434813 : /**/ (cd) = _mm512_unpackhi_epi32( _wwu_transpose_tc, _wwu_transpose_td ); \
295 59434813 : /**/ (ce) = _mm512_unpacklo_epi32( _wwu_transpose_te, _wwu_transpose_tf ); \
296 59434813 : /**/ (cf) = _mm512_unpackhi_epi32( _wwu_transpose_te, _wwu_transpose_tf ); \
297 59434813 : } while(0)
298 :
299 : /* wwu_transpose_2x8x8 transposes the 2 8x8 matrices whose rows are
300 : held in the lower and upper halves of wwu_t's r0,r1...r7 and
301 : stores the result in c0,c1...c7. In-place operation fine: every r* is copied into a local before any c* is assigned. Expands to a single do { ... } while(0) statement (the caller supplies the trailing semicolon) so it composes safely with if / else. */
302 :
303 : #define wwu_transpose_2x8x8( r0,r1,r2,r3,r4,r5,r6,r7, \
304 5155807 : c0,c1,c2,c3,c4,c5,c6,c7 ) do { \
305 5155807 : wwu_t _wwu_transpose_r0 = (r0); wwu_t _wwu_transpose_r1 = (r1); \
306 5155807 : wwu_t _wwu_transpose_r2 = (r2); wwu_t _wwu_transpose_r3 = (r3); \
307 5155807 : wwu_t _wwu_transpose_r4 = (r4); wwu_t _wwu_transpose_r5 = (r5); \
308 5155807 : wwu_t _wwu_transpose_r6 = (r6); wwu_t _wwu_transpose_r7 = (r7); \
309 5155807 : \
310 5155807 : /* Outer 2x2 transpose of 4x4 blocks */ \
311 5155807 : /* No _mm256_permute2f128_si128 equiv? sigh ... probably a better method possible here */ \
312 5155807 : wwu_t _wwu_transpose_p = wwu( 0U, 1U, 2U, 3U,16U,17U,18U,19U, 8U, 9U,10U,11U,24U,25U,26U,27U); \
313 5155807 : wwu_t _wwu_transpose_q = wwu( 4U, 5U, 6U, 7U,20U,21U,22U,23U,12U,13U,14U,15U,28U,29U,30U,31U); \
314 5155807 : wwu_t _wwu_transpose_t0 = wwu_select( _wwu_transpose_p, _wwu_transpose_r0, _wwu_transpose_r4 ); \
315 5155807 : wwu_t _wwu_transpose_t1 = wwu_select( _wwu_transpose_p, _wwu_transpose_r1, _wwu_transpose_r5 ); \
316 5155807 : wwu_t _wwu_transpose_t2 = wwu_select( _wwu_transpose_p, _wwu_transpose_r2, _wwu_transpose_r6 ); \
317 5155807 : wwu_t _wwu_transpose_t3 = wwu_select( _wwu_transpose_p, _wwu_transpose_r3, _wwu_transpose_r7 ); \
318 5155807 : wwu_t _wwu_transpose_t4 = wwu_select( _wwu_transpose_q, _wwu_transpose_r0, _wwu_transpose_r4 ); \
319 5155807 : wwu_t _wwu_transpose_t5 = wwu_select( _wwu_transpose_q, _wwu_transpose_r1, _wwu_transpose_r5 ); \
320 5155807 : wwu_t _wwu_transpose_t6 = wwu_select( _wwu_transpose_q, _wwu_transpose_r2, _wwu_transpose_r6 ); \
321 5155807 : wwu_t _wwu_transpose_t7 = wwu_select( _wwu_transpose_q, _wwu_transpose_r3, _wwu_transpose_r7 ); \
322 5155807 : \
323 5155807 : /* Inner 4x4 transpose of 1x1 blocks */ \
324 5155807 : /**/ _wwu_transpose_r0 = _mm512_unpacklo_epi32( _wwu_transpose_t0, _wwu_transpose_t2 ); \
325 5155807 : /**/ _wwu_transpose_r1 = _mm512_unpacklo_epi32( _wwu_transpose_t1, _wwu_transpose_t3 ); \
326 5155807 : /**/ _wwu_transpose_r2 = _mm512_unpackhi_epi32( _wwu_transpose_t0, _wwu_transpose_t2 ); \
327 5155807 : /**/ _wwu_transpose_r3 = _mm512_unpackhi_epi32( _wwu_transpose_t1, _wwu_transpose_t3 ); \
328 5155807 : /**/ _wwu_transpose_r4 = _mm512_unpacklo_epi32( _wwu_transpose_t4, _wwu_transpose_t6 ); \
329 5155807 : /**/ _wwu_transpose_r5 = _mm512_unpacklo_epi32( _wwu_transpose_t5, _wwu_transpose_t7 ); \
330 5155807 : /**/ _wwu_transpose_r6 = _mm512_unpackhi_epi32( _wwu_transpose_t4, _wwu_transpose_t6 ); \
331 5155807 : /**/ _wwu_transpose_r7 = _mm512_unpackhi_epi32( _wwu_transpose_t5, _wwu_transpose_t7 ); \
332 5155807 : \
333 5155807 : /**/ (c0) = _mm512_unpacklo_epi32( _wwu_transpose_r0, _wwu_transpose_r1 ); \
334 5155807 : /**/ (c1) = _mm512_unpackhi_epi32( _wwu_transpose_r0, _wwu_transpose_r1 ); \
335 5155807 : /**/ (c2) = _mm512_unpacklo_epi32( _wwu_transpose_r2, _wwu_transpose_r3 ); \
336 5155807 : /**/ (c3) = _mm512_unpackhi_epi32( _wwu_transpose_r2, _wwu_transpose_r3 ); \
337 5155807 : /**/ (c4) = _mm512_unpacklo_epi32( _wwu_transpose_r4, _wwu_transpose_r5 ); \
338 5155807 : /**/ (c5) = _mm512_unpackhi_epi32( _wwu_transpose_r4, _wwu_transpose_r5 ); \
339 5155807 : /**/ (c6) = _mm512_unpacklo_epi32( _wwu_transpose_r6, _wwu_transpose_r7 ); \
340 5155807 : /**/ (c7) = _mm512_unpackhi_epi32( _wwu_transpose_r6, _wwu_transpose_r7 ); \
341 5155807 : } while(0)
|