Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx512_h
2 : #error "Do not include this directly; use fd_avx512.h"
3 : #endif
4 :
/* TODO: REDUCE, EXTRACT, ADDITIONAL LANE OPS, ... */

/* Vector int API ****************************************************/

/* A wwi_t is a vector where each 32-bit wide lane holds a signed twos
   complement 32-bit integer (an "int").

   These mirror the other APIs as much as possible.  Macros are
   preferred over static inlines when it is possible to do it robustly
   to reduce the risk of the compiler mucking it up. */

#define wwi_t __m512i
17 :
/* Constructors */

/* wwi(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf)
   returns the wwi_t [x0 x1 ... xf] where x* are ints */

#define wwi(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf) \
  _mm512_setr_epi32( (x0), (x1), (x2), (x3), (x4), (x5), (x6), (x7), (x8), (x9), (xa), (xb), (xc), (xd), (xe), (xf) )

#define wwi_bcast(x) _mm512_set1_epi32( (x) ) /* wwi(x, x, ... x) */

/* wwi_permute(p,x) returns:
     wwi( x(p(0)), x(p(1)), ... x(p(15)) ).
   As such p(*) should be ints in [0,15]. */

#define wwi_permute(p,x) _mm512_permutexvar_epi32( (p), (x) )

/* wwi_select(p,x,y) concatenates the wwi_t's x and y into
     z = [ x0 x1 ... xf y0 y1 ... yf ]
   and then returns:
     wwi( z(p(0)), z(p(1)), ... z(p(15)) ).
   As such p(*) should be ints in [0,31].  (Note that p is the first
   macro argument but the middle operand of the underlying intrinsic.) */

#define wwi_select(p,x,y) _mm512_permutex2var_epi32( (x), (p), (y) )
41 :
/* Predefined constants */

#define wwi_zero() _mm512_setzero_si512() /* wwi(0, 0, ... 0) */
#define wwi_one()  _mm512_set1_epi32( 1 ) /* wwi(1, 1, ... 1) */
46 :
/* Memory operations */
/* Note: wwi_{ld,st} assume m is 64-byte aligned while wwi_{ldu,stu}
   allow m to have arbitrary alignment.  (The aligned variants take
   int pointers; the unaligned variants deliberately take void
   pointers as the memory need not even be int aligned.) */

static inline wwi_t wwi_ld( int const * m ) { return _mm512_load_epi32( m ); } /* wwi( m[0], m[1], ... m[15] ) */
static inline void wwi_st( int * m, wwi_t x ) { _mm512_store_epi32( m, x ); }  /* does m[0] = x0, m[1] = x1, ... m[15] = xf */

static inline wwi_t wwi_ldu( void const * m ) { return _mm512_loadu_epi32( m ); } /* wwi( m[0], m[1], ... m[15]) */
static inline void wwi_stu( void * m, wwi_t x ) { _mm512_storeu_epi32( m, x ); }  /* does m[0] = x0, m[1] = x1, ... m[15] = xf */
56 :
/* Arithmetic operations */

#define wwi_neg(x) _mm512_sub_epi32( _mm512_setzero_si512(), (x) ) /* wwi( -x0, -x1, ... -xf ) (twos complement negation) */
#define wwi_abs(x) _mm512_abs_epi32( (x) )                         /* wwi( |x0|, |x1|, ... |xf| ) */

#define wwi_min(x,y) _mm512_min_epi32 ( (x), (y) )   /* wwi( min(x0,y0), min(x1,y1), ... min(xf,yf) ) */
#define wwi_max(x,y) _mm512_max_epi32 ( (x), (y) )   /* wwi( max(x0,y0), max(x1,y1), ... max(xf,yf) ) */
#define wwi_add(x,y) _mm512_add_epi32 ( (x), (y) )   /* wwi( x0+y0, x1+y1, ... xf+yf ) */
#define wwi_sub(x,y) _mm512_sub_epi32 ( (x), (y) )   /* wwi( x0-y0, x1-y1, ... xf-yf ) */
#define wwi_mul(x,y) _mm512_mullo_epi32( (x), (y) )  /* wwi( x0*y0, x1*y1, ... xf*yf ) (low 32 bits of each product) */
67 :
/* Binary operations */
/* Note: shifts assumes n and or y* in [0,31].  Rotates work for
   arbitrary values */

#define wwi_not(x) _mm512_xor_epi32( _mm512_set1_epi32( -1 ), (x) ) /* wwi( ~x0, ~x1, ... ~xf ) */

#define wwi_shl(x,n)  _mm512_slli_epi32 ( (x), (uint)(n) ) /* wwi( x0<<n, x1<<n, ... xf<<n ) */
#define wwi_shr(x,n)  _mm512_srai_epi32 ( (x), (uint)(n) ) /* wwi( x0>>n, x1>>n, ... xf>>n ) (arithmetic, sign extending) */
#define wwi_shru(x,n) _mm512_srli_epi32 ( (x), (uint)(n) ) /* wwi( x0>>n, x1>>n, ... xf>>n ) (unsigned right shift) */
#define wwi_shl_vector(x,y)  _mm512_sllv_epi32 ( (x), (y) ) /* wwi( x0<<y0, x1<<y1, ... xf<<yf ) */
#define wwi_shr_vector(x,y)  _mm512_srav_epi32 ( (x), (y) ) /* wwi( x0>>y0, x1>>y1, ... xf>>yf ) (arithmetic, sign extending) */
#define wwi_shru_vector(x,y) _mm512_srlv_epi32 ( (x), (y) ) /* wwi( x0>>y0, x1>>y1, ... xf>>yf ) (unsigned right shift) */
#define wwi_and(x,y)    _mm512_and_epi32   ( (x), (y) ) /* wwi( x0&y0, x1&y1, ... xf&yf ) */
#define wwi_andnot(x,y) _mm512_andnot_epi32( (x), (y) ) /* wwi( ~x0&y0, ~x1&y1, ... ~xf&yf ) (note: first arg is complemented) */
#define wwi_or(x,y)     _mm512_or_epi32    ( (x), (y) ) /* wwi( x0|y0, x1|y1, ... xf|yf ) */
#define wwi_xor(x,y)    _mm512_xor_epi32   ( (x), (y) ) /* wwi( x0^y0, x1^y1, ... xf^yf ) */
84 :
/* wwi_rol(x,n)          returns wwi( rotate_left (x0,n ), rotate_left (x1,n ), ... )
   wwi_ror(x,n)          returns wwi( rotate_right(x0,n ), rotate_right(x1,n ), ... )
   wwi_rol_variable(x,n) returns wwi( rotate_left (x0,n ), rotate_left (x1,n ), ... )
   wwi_ror_variable(x,n) returns wwi( rotate_right(x0,n ), rotate_right(x1,n ), ... )
   wwi_rol_vector(x,y)   returns wwi( rotate_left (x0,y0), rotate_left (x1,y1), ... )
   wwi_ror_vector(x,y)   returns wwi( rotate_right(x0,y0), rotate_right(x1,y1), ... )

   The variable variants are slower but do not require the shift amount
   to be known at compile time.  (All variants reduce the rotate count
   mod 32, so arbitrary int counts are fine.) */

#define wwi_rol(a,imm) _mm512_rol_epi32( (a), (imm)&31 )
#define wwi_ror(a,imm) _mm512_ror_epi32( (a), (imm)&31 )
97 :
98 1000000 : static inline wwi_t wwi_rol_variable( wwi_t a, int n ) { return wwi_or( wwi_shl ( a, n & 31 ), wwi_shru( a, (-n) & 31 ) ); }
99 1000000 : static inline wwi_t wwi_ror_variable( wwi_t a, int n ) { return wwi_or( wwi_shru( a, n & 31 ), wwi_shl ( a, (-n) & 31 ) ); }
100 :
101 :
102 1000000 : static inline wwi_t wwi_rol_vector( wwi_t a, wwi_t b ) {
103 1000000 : wwi_t m = wwi_bcast( 31 );
104 1000000 : return wwi_or( wwi_shl_vector ( a, wwi_and( b, m ) ), wwi_shru_vector( a, wwi_and( wwi_neg( b ), m ) ) );
105 1000000 : }
106 :
107 1000000 : static inline wwi_t wwi_ror_vector( wwi_t a, wwi_t b ) {
108 1000000 : wwi_t m = wwi_bcast( 31 );
109 1000000 : return wwi_or( wwi_shru_vector( a, wwi_and( b, m ) ), wwi_shl_vector ( a, wwi_and( wwi_neg( b ), m ) ) );
110 1000000 : }
111 :
/* Comparison operations */
/* mask(c0,c1,...) means (((int)c0)<<0) | (((int)c1)<<1) | ...
   That is, the 16-bit lane mask produced by the comparison is returned
   widened to an int with bit n holding the result for lane n. */

#define wwi_eq(x,y) ((int)_mm512_cmpeq_epi32_mask(  (x), (y) )) /* mask( x0==y0, x1==y1, ... ) */
#define wwi_gt(x,y) ((int)_mm512_cmpgt_epi32_mask(  (x), (y) )) /* mask( x0> y0, x1> y1, ... ) */
#define wwi_lt(x,y) ((int)_mm512_cmplt_epi32_mask(  (x), (y) )) /* mask( x0< y0, x1< y1, ... ) */
#define wwi_ne(x,y) ((int)_mm512_cmpneq_epi32_mask( (x), (y) )) /* mask( x0!=y0, x1!=y1, ... ) */
#define wwi_ge(x,y) ((int)_mm512_cmpge_epi32_mask(  (x), (y) )) /* mask( x0>=y0, x1>=y1, ... ) */
#define wwi_le(x,y) ((int)_mm512_cmple_epi32_mask(  (x), (y) )) /* mask( x0<=y0, x1<=y1, ... ) */

#define wwi_lnot(x)    wwi_eq( (x), wwi_zero() ) /* mask( !x0, !x1, ... ) */
#define wwi_lnotnot(x) wwi_ne( (x), wwi_zero() ) /* mask( !!x0, !!x1, ... ) */
124 :
/* Conditional operations */
/* cn means bit n of c (i.e. the comparison result for lane n) */

#define wwi_if(c,x,y) _mm512_mask_blend_epi32( (__mmask16)(c), (y), (x) ) /* wwi( c0? x0 :y0, ... ) */

#define wwi_add_if(c,x,y,z) _mm512_mask_add_epi32( (z), (__mmask16)(c), (x), (y) ) /* wwi( c0?(x0+y0):z0, ... ) */
#define wwi_sub_if(c,x,y,z) _mm512_mask_sub_epi32( (z), (__mmask16)(c), (x), (y) ) /* wwi( c0?(x0-y0):z0, ... ) */

#define wwi_and_if(c,x,y,z)    _mm512_mask_and_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwi( c0?( x0&y0):z0, ... ) */
#define wwi_andnot_if(c,x,y,z) _mm512_mask_andnot_epi32( (z), (__mmask16)(c), (x), (y) ) /* wwi( c0?(~x0&y0):z0, ... ) */
#define wwi_or_if(c,x,y,z)     _mm512_mask_or_epi32    ( (z), (__mmask16)(c), (x), (y) ) /* wwi( c0?( x0|y0):z0, ... ) */
#define wwi_xor_if(c,x,y,z)    _mm512_mask_xor_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwi( c0?( x0^y0):z0, ... ) */
137 :
/* Conversions */

/* wwi_to_wwu( x ) returns wwi( (uint)x0, (uint)x1, ... (uint)x15 )

   wwi_to_wwl( x, 0 ) returns wwl( (long)x0, (long)x2, ... (long)x14 )
   wwi_to_wwl( x, 1 ) returns wwl( (long)x1, (long)x3, ... (long)x15 )

   wwi_to_wwv( x, 0 ) returns wwv( (ulong)x0, (ulong)x2, ... (ulong)x14 )
   wwi_to_wwv( x, 1 ) returns wwv( (ulong)x1, (ulong)x3, ... (ulong)x15 )

   Implementation: the selected 32-bit halves are positioned at the top
   of each 64-bit lane (for the even case, via a left shift by 32) and
   then an _arithmetic_ right shift by 32 (wwl_shr, deliberately used
   even for the wwv variant) sign extends them, matching C's conversion
   of a negative int to long / ulong.

   TODO: consider _mm512_cvtepi32_* intrinsics? */

#define wwi_to_wwu( x ) (x)
#define wwi_to_wwl( x, odd ) /* trinary should be compile time */ \
  (__extension__({ wwl_t _wwi_to_wwl_tmp = (x); wwl_shr( (odd) ? _wwi_to_wwl_tmp : wwl_shl( _wwi_to_wwl_tmp, 32 ), 32 ); }))
#define wwi_to_wwv( x, odd ) /* trinary should be compile time (yes, wwl_shr) */ \
  (__extension__({ wwv_t _wwi_to_wwv_tmp = (x); wwl_shr( (odd) ? _wwi_to_wwv_tmp : wwv_shl( _wwi_to_wwv_tmp, 32 ), 32 ); }))

/* The _raw variants reinterpret the vector bits with no per lane
   conversion (free at runtime). */

#define wwi_to_wwu_raw(x) (x)
#define wwi_to_wwl_raw(x) (x)
#define wwi_to_wwv_raw(x) (x)
159 :
/* Misc operations */

/* wwi_pack_halves(x,imm0,y,imm1) packs half of x and half of y into a
   wwi.  imm0/imm1 select which half of x and y to pack.  imm0 / imm1
   should be in [0,1].  That is, this returns:

     [ if( imm0, x(8:15), x(0:7) ) if( imm1, y(8:15), y(0:7) ) ]

   (68+10*imm0+160*imm1 encodes the four 128-bit lane selections of the
   underlying shuffle for the four imm0/imm1 combinations.)

   wwi_pack_h0_h1(x,y) does the wwi_pack_halves(x,0,y,1) case faster.
   Hat tip to Philip Taffet for pointing this out. */

#define wwi_pack_halves(x,imm0,y,imm1) _mm512_shuffle_i32x4( (x), (y), 68+10*(imm0)+160*(imm1) )
#define wwi_pack_h0_h1(x,y) _mm512_mask_blend_epi32( (__mmask16)0xFF00, (x), (y) )

/* wwi_slide(x,y,imm) treats x as a FIFO with the oldest / newest
   element at lane 0 / 15.  Returns the result of dequeing x imm times
   and enqueing the values y0 ... y{imm-1} in that order.  imm should be
   in [0,15].  For example, with imm==5 case, returns:
     [ x5 x6 ... xf y0 y1 y2 y3 y4 ]. */

#define wwi_slide(x,y,imm) _mm512_alignr_epi32( (y), (x), (imm) )
181 :
/* wwi_unpack unpacks the wwi_t x into its int components x0,x1,...xf.
   x0...xf should be int lvalues. */

#define wwi_unpack( x, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf ) do {   \
    __m512i _wwi_unpack_x  = (x);                                               \
    __m256i _wwi_unpack_xl = _mm512_extracti32x8_epi32( _wwi_unpack_x, 0 );     \
    __m256i _wwi_unpack_xh = _mm512_extracti32x8_epi32( _wwi_unpack_x, 1 );     \
    (x0) = _mm256_extract_epi32( _wwi_unpack_xl, 0 );                           \
    (x1) = _mm256_extract_epi32( _wwi_unpack_xl, 1 );                           \
    (x2) = _mm256_extract_epi32( _wwi_unpack_xl, 2 );                           \
    (x3) = _mm256_extract_epi32( _wwi_unpack_xl, 3 );                           \
    (x4) = _mm256_extract_epi32( _wwi_unpack_xl, 4 );                           \
    (x5) = _mm256_extract_epi32( _wwi_unpack_xl, 5 );                           \
    (x6) = _mm256_extract_epi32( _wwi_unpack_xl, 6 );                           \
    (x7) = _mm256_extract_epi32( _wwi_unpack_xl, 7 );                           \
    (x8) = _mm256_extract_epi32( _wwi_unpack_xh, 0 );                           \
    (x9) = _mm256_extract_epi32( _wwi_unpack_xh, 1 );                           \
    (xa) = _mm256_extract_epi32( _wwi_unpack_xh, 2 );                           \
    (xb) = _mm256_extract_epi32( _wwi_unpack_xh, 3 );                           \
    (xc) = _mm256_extract_epi32( _wwi_unpack_xh, 4 );                           \
    (xd) = _mm256_extract_epi32( _wwi_unpack_xh, 5 );                           \
    (xe) = _mm256_extract_epi32( _wwi_unpack_xh, 6 );                           \
    (xf) = _mm256_extract_epi32( _wwi_unpack_xh, 7 );                           \
  } while(0)
205 :
/* wwi_transpose_16x16 sets wwi_t's c0,c1,...cf to the columns of a
   16x16 int matrix given the rows of the matrix in wwi_t's r0,r1,...rf.
   In-place operation fine.  Strategy: first transpose the outer 4x4
   grid of 4x4 blocks with 128-bit lane shuffles, then transpose the
   elements within each 4x4 block with 32-bit unpacks. */

#define wwi_transpose_16x16( r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,ra,rb,rc,rd,re,rf,                           \
                             c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf ) do {                     \
    wwi_t _wwi_transpose_r0 = (r0); wwi_t _wwi_transpose_r1 = (r1);                                     \
    wwi_t _wwi_transpose_r2 = (r2); wwi_t _wwi_transpose_r3 = (r3);                                     \
    wwi_t _wwi_transpose_r4 = (r4); wwi_t _wwi_transpose_r5 = (r5);                                     \
    wwi_t _wwi_transpose_r6 = (r6); wwi_t _wwi_transpose_r7 = (r7);                                     \
    wwi_t _wwi_transpose_r8 = (r8); wwi_t _wwi_transpose_r9 = (r9);                                     \
    wwi_t _wwi_transpose_ra = (ra); wwi_t _wwi_transpose_rb = (rb);                                     \
    wwi_t _wwi_transpose_rc = (rc); wwi_t _wwi_transpose_rd = (rd);                                     \
    wwi_t _wwi_transpose_re = (re); wwi_t _wwi_transpose_rf = (rf);                                     \
                                                                                                        \
    /* Outer 4x4 transpose of 4x4 blocks */                                                             \
    wwi_t _wwi_transpose_t0 = _mm512_shuffle_i32x4( _wwi_transpose_r0, _wwi_transpose_r4, 0x88 );       \
    wwi_t _wwi_transpose_t1 = _mm512_shuffle_i32x4( _wwi_transpose_r1, _wwi_transpose_r5, 0x88 );       \
    wwi_t _wwi_transpose_t2 = _mm512_shuffle_i32x4( _wwi_transpose_r2, _wwi_transpose_r6, 0x88 );       \
    wwi_t _wwi_transpose_t3 = _mm512_shuffle_i32x4( _wwi_transpose_r3, _wwi_transpose_r7, 0x88 );       \
    wwi_t _wwi_transpose_t4 = _mm512_shuffle_i32x4( _wwi_transpose_r0, _wwi_transpose_r4, 0xdd );       \
    wwi_t _wwi_transpose_t5 = _mm512_shuffle_i32x4( _wwi_transpose_r1, _wwi_transpose_r5, 0xdd );       \
    wwi_t _wwi_transpose_t6 = _mm512_shuffle_i32x4( _wwi_transpose_r2, _wwi_transpose_r6, 0xdd );       \
    wwi_t _wwi_transpose_t7 = _mm512_shuffle_i32x4( _wwi_transpose_r3, _wwi_transpose_r7, 0xdd );       \
    wwi_t _wwi_transpose_t8 = _mm512_shuffle_i32x4( _wwi_transpose_r8, _wwi_transpose_rc, 0x88 );       \
    wwi_t _wwi_transpose_t9 = _mm512_shuffle_i32x4( _wwi_transpose_r9, _wwi_transpose_rd, 0x88 );       \
    wwi_t _wwi_transpose_ta = _mm512_shuffle_i32x4( _wwi_transpose_ra, _wwi_transpose_re, 0x88 );       \
    wwi_t _wwi_transpose_tb = _mm512_shuffle_i32x4( _wwi_transpose_rb, _wwi_transpose_rf, 0x88 );       \
    wwi_t _wwi_transpose_tc = _mm512_shuffle_i32x4( _wwi_transpose_r8, _wwi_transpose_rc, 0xdd );       \
    wwi_t _wwi_transpose_td = _mm512_shuffle_i32x4( _wwi_transpose_r9, _wwi_transpose_rd, 0xdd );       \
    wwi_t _wwi_transpose_te = _mm512_shuffle_i32x4( _wwi_transpose_ra, _wwi_transpose_re, 0xdd );       \
    wwi_t _wwi_transpose_tf = _mm512_shuffle_i32x4( _wwi_transpose_rb, _wwi_transpose_rf, 0xdd );       \
                                                                                                        \
    /**/ _wwi_transpose_r0 = _mm512_shuffle_i32x4( _wwi_transpose_t0, _wwi_transpose_t8, 0x88 );        \
    /**/ _wwi_transpose_r1 = _mm512_shuffle_i32x4( _wwi_transpose_t1, _wwi_transpose_t9, 0x88 );        \
    /**/ _wwi_transpose_r2 = _mm512_shuffle_i32x4( _wwi_transpose_t2, _wwi_transpose_ta, 0x88 );        \
    /**/ _wwi_transpose_r3 = _mm512_shuffle_i32x4( _wwi_transpose_t3, _wwi_transpose_tb, 0x88 );        \
    /**/ _wwi_transpose_r4 = _mm512_shuffle_i32x4( _wwi_transpose_t4, _wwi_transpose_tc, 0x88 );        \
    /**/ _wwi_transpose_r5 = _mm512_shuffle_i32x4( _wwi_transpose_t5, _wwi_transpose_td, 0x88 );        \
    /**/ _wwi_transpose_r6 = _mm512_shuffle_i32x4( _wwi_transpose_t6, _wwi_transpose_te, 0x88 );        \
    /**/ _wwi_transpose_r7 = _mm512_shuffle_i32x4( _wwi_transpose_t7, _wwi_transpose_tf, 0x88 );        \
    /**/ _wwi_transpose_r8 = _mm512_shuffle_i32x4( _wwi_transpose_t0, _wwi_transpose_t8, 0xdd );        \
    /**/ _wwi_transpose_r9 = _mm512_shuffle_i32x4( _wwi_transpose_t1, _wwi_transpose_t9, 0xdd );        \
    /**/ _wwi_transpose_ra = _mm512_shuffle_i32x4( _wwi_transpose_t2, _wwi_transpose_ta, 0xdd );        \
    /**/ _wwi_transpose_rb = _mm512_shuffle_i32x4( _wwi_transpose_t3, _wwi_transpose_tb, 0xdd );        \
    /**/ _wwi_transpose_rc = _mm512_shuffle_i32x4( _wwi_transpose_t4, _wwi_transpose_tc, 0xdd );        \
    /**/ _wwi_transpose_rd = _mm512_shuffle_i32x4( _wwi_transpose_t5, _wwi_transpose_td, 0xdd );        \
    /**/ _wwi_transpose_re = _mm512_shuffle_i32x4( _wwi_transpose_t6, _wwi_transpose_te, 0xdd );        \
    /**/ _wwi_transpose_rf = _mm512_shuffle_i32x4( _wwi_transpose_t7, _wwi_transpose_tf, 0xdd );        \
                                                                                                        \
    /* Inner 4x4 transpose of 1x1 blocks */                                                             \
    /**/ _wwi_transpose_t0 = _mm512_unpacklo_epi32( _wwi_transpose_r0, _wwi_transpose_r2 );             \
    /**/ _wwi_transpose_t1 = _mm512_unpacklo_epi32( _wwi_transpose_r1, _wwi_transpose_r3 );             \
    /**/ _wwi_transpose_t2 = _mm512_unpackhi_epi32( _wwi_transpose_r0, _wwi_transpose_r2 );             \
    /**/ _wwi_transpose_t3 = _mm512_unpackhi_epi32( _wwi_transpose_r1, _wwi_transpose_r3 );             \
    /**/ _wwi_transpose_t4 = _mm512_unpacklo_epi32( _wwi_transpose_r4, _wwi_transpose_r6 );             \
    /**/ _wwi_transpose_t5 = _mm512_unpacklo_epi32( _wwi_transpose_r5, _wwi_transpose_r7 );             \
    /**/ _wwi_transpose_t6 = _mm512_unpackhi_epi32( _wwi_transpose_r4, _wwi_transpose_r6 );             \
    /**/ _wwi_transpose_t7 = _mm512_unpackhi_epi32( _wwi_transpose_r5, _wwi_transpose_r7 );             \
    /**/ _wwi_transpose_t8 = _mm512_unpacklo_epi32( _wwi_transpose_r8, _wwi_transpose_ra );             \
    /**/ _wwi_transpose_t9 = _mm512_unpacklo_epi32( _wwi_transpose_r9, _wwi_transpose_rb );             \
    /**/ _wwi_transpose_ta = _mm512_unpackhi_epi32( _wwi_transpose_r8, _wwi_transpose_ra );             \
    /**/ _wwi_transpose_tb = _mm512_unpackhi_epi32( _wwi_transpose_r9, _wwi_transpose_rb );             \
    /**/ _wwi_transpose_tc = _mm512_unpacklo_epi32( _wwi_transpose_rc, _wwi_transpose_re );             \
    /**/ _wwi_transpose_td = _mm512_unpacklo_epi32( _wwi_transpose_rd, _wwi_transpose_rf );             \
    /**/ _wwi_transpose_te = _mm512_unpackhi_epi32( _wwi_transpose_rc, _wwi_transpose_re );             \
    /**/ _wwi_transpose_tf = _mm512_unpackhi_epi32( _wwi_transpose_rd, _wwi_transpose_rf );             \
                                                                                                        \
    /**/ (c0) = _mm512_unpacklo_epi32( _wwi_transpose_t0, _wwi_transpose_t1 );                          \
    /**/ (c1) = _mm512_unpackhi_epi32( _wwi_transpose_t0, _wwi_transpose_t1 );                          \
    /**/ (c2) = _mm512_unpacklo_epi32( _wwi_transpose_t2, _wwi_transpose_t3 );                          \
    /**/ (c3) = _mm512_unpackhi_epi32( _wwi_transpose_t2, _wwi_transpose_t3 );                          \
    /**/ (c4) = _mm512_unpacklo_epi32( _wwi_transpose_t4, _wwi_transpose_t5 );                          \
    /**/ (c5) = _mm512_unpackhi_epi32( _wwi_transpose_t4, _wwi_transpose_t5 );                          \
    /**/ (c6) = _mm512_unpacklo_epi32( _wwi_transpose_t6, _wwi_transpose_t7 );                          \
    /**/ (c7) = _mm512_unpackhi_epi32( _wwi_transpose_t6, _wwi_transpose_t7 );                          \
    /**/ (c8) = _mm512_unpacklo_epi32( _wwi_transpose_t8, _wwi_transpose_t9 );                          \
    /**/ (c9) = _mm512_unpackhi_epi32( _wwi_transpose_t8, _wwi_transpose_t9 );                          \
    /**/ (ca) = _mm512_unpacklo_epi32( _wwi_transpose_ta, _wwi_transpose_tb );                          \
    /**/ (cb) = _mm512_unpackhi_epi32( _wwi_transpose_ta, _wwi_transpose_tb );                          \
    /**/ (cc) = _mm512_unpacklo_epi32( _wwi_transpose_tc, _wwi_transpose_td );                          \
    /**/ (cd) = _mm512_unpackhi_epi32( _wwi_transpose_tc, _wwi_transpose_td );                          \
    /**/ (ce) = _mm512_unpacklo_epi32( _wwi_transpose_te, _wwi_transpose_tf );                          \
    /**/ (cf) = _mm512_unpackhi_epi32( _wwi_transpose_te, _wwi_transpose_tf );                          \
  } while(0)
291 :
/* wwi_transpose_2x8x8 transposes the 2 8x8 int matrices whose rows are
   held in the lower and upper halves of wwi_t's r0,r1...r7 and
   stores the result in c0,c1...c7.  In-place operation fine.

   The body is wrapped in do { ... } while(0) so the macro expands to a
   single statement and composes safely with unbraced if/else.  (The
   previous bare "{ ... } while(0)" form left a stray empty while loop
   after the block and broke if/else composition.) */

#define wwi_transpose_2x8x8( r0,r1,r2,r3,r4,r5,r6,r7,                                                   \
                             c0,c1,c2,c3,c4,c5,c6,c7 ) do {                                             \
    wwi_t _wwi_transpose_r0 = (r0); wwi_t _wwi_transpose_r1 = (r1);                                     \
    wwi_t _wwi_transpose_r2 = (r2); wwi_t _wwi_transpose_r3 = (r3);                                     \
    wwi_t _wwi_transpose_r4 = (r4); wwi_t _wwi_transpose_r5 = (r5);                                     \
    wwi_t _wwi_transpose_r6 = (r6); wwi_t _wwi_transpose_r7 = (r7);                                     \
                                                                                                        \
    /* Outer 2x2 transpose of 4x4 blocks */                                                             \
    /* No _mm256_permute2f128_si128 equiv? sigh ... probably a better method possible here */           \
    wwi_t _wwi_transpose_p  = wwi( 0, 1, 2, 3,16,17,18,19, 8, 9,10,11,24,25,26,27);                     \
    wwi_t _wwi_transpose_q  = wwi( 4, 5, 6, 7,20,21,22,23,12,13,14,15,28,29,30,31);                     \
    wwi_t _wwi_transpose_t0 = wwi_select( _wwi_transpose_p, _wwi_transpose_r0, _wwi_transpose_r4 );     \
    wwi_t _wwi_transpose_t1 = wwi_select( _wwi_transpose_p, _wwi_transpose_r1, _wwi_transpose_r5 );     \
    wwi_t _wwi_transpose_t2 = wwi_select( _wwi_transpose_p, _wwi_transpose_r2, _wwi_transpose_r6 );     \
    wwi_t _wwi_transpose_t3 = wwi_select( _wwi_transpose_p, _wwi_transpose_r3, _wwi_transpose_r7 );     \
    wwi_t _wwi_transpose_t4 = wwi_select( _wwi_transpose_q, _wwi_transpose_r0, _wwi_transpose_r4 );     \
    wwi_t _wwi_transpose_t5 = wwi_select( _wwi_transpose_q, _wwi_transpose_r1, _wwi_transpose_r5 );     \
    wwi_t _wwi_transpose_t6 = wwi_select( _wwi_transpose_q, _wwi_transpose_r2, _wwi_transpose_r6 );     \
    wwi_t _wwi_transpose_t7 = wwi_select( _wwi_transpose_q, _wwi_transpose_r3, _wwi_transpose_r7 );     \
                                                                                                        \
    /* Inner 4x4 transpose of 1x1 blocks */                                                             \
    /**/ _wwi_transpose_r0 = _mm512_unpacklo_epi32( _wwi_transpose_t0, _wwi_transpose_t2 );             \
    /**/ _wwi_transpose_r1 = _mm512_unpacklo_epi32( _wwi_transpose_t1, _wwi_transpose_t3 );             \
    /**/ _wwi_transpose_r2 = _mm512_unpackhi_epi32( _wwi_transpose_t0, _wwi_transpose_t2 );             \
    /**/ _wwi_transpose_r3 = _mm512_unpackhi_epi32( _wwi_transpose_t1, _wwi_transpose_t3 );             \
    /**/ _wwi_transpose_r4 = _mm512_unpacklo_epi32( _wwi_transpose_t4, _wwi_transpose_t6 );             \
    /**/ _wwi_transpose_r5 = _mm512_unpacklo_epi32( _wwi_transpose_t5, _wwi_transpose_t7 );             \
    /**/ _wwi_transpose_r6 = _mm512_unpackhi_epi32( _wwi_transpose_t4, _wwi_transpose_t6 );             \
    /**/ _wwi_transpose_r7 = _mm512_unpackhi_epi32( _wwi_transpose_t5, _wwi_transpose_t7 );             \
                                                                                                        \
    /**/ (c0) = _mm512_unpacklo_epi32( _wwi_transpose_r0, _wwi_transpose_r1 );                          \
    /**/ (c1) = _mm512_unpackhi_epi32( _wwi_transpose_r0, _wwi_transpose_r1 );                          \
    /**/ (c2) = _mm512_unpacklo_epi32( _wwi_transpose_r2, _wwi_transpose_r3 );                          \
    /**/ (c3) = _mm512_unpackhi_epi32( _wwi_transpose_r2, _wwi_transpose_r3 );                          \
    /**/ (c4) = _mm512_unpacklo_epi32( _wwi_transpose_r4, _wwi_transpose_r5 );                          \
    /**/ (c5) = _mm512_unpackhi_epi32( _wwi_transpose_r4, _wwi_transpose_r5 );                          \
    /**/ (c6) = _mm512_unpacklo_epi32( _wwi_transpose_r6, _wwi_transpose_r7 );                          \
    /**/ (c7) = _mm512_unpackhi_epi32( _wwi_transpose_r6, _wwi_transpose_r7 );                          \
  } while(0)
|