Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx_h
2 : #error "Do not include this directly; use fd_avx.h"
3 : #endif
4 :
5 : /* Vector uint API ****************************************************/
6 :
7 : /* A wu_t is a vector where each 32-bit wide lane holds an unsigned
8 : 32-bit integer (a "uint"). The operations below mirror the wc and
9 : wf APIs as much as possible.
10 :
11 : More generally, they mirror the other vector APIs. Macros are
12 : preferred over static inlines when it is possible to do it robustly
13 : to reduce the risk of the compiler mucking it up. */
14 :
#define wu_t __m256i

/* Constructors */

/* wu(x0,...,x7) returns the vector uint [ x0 x1 x2 x3 x4 x5 x6 x7 ]
   (x0 ends up in lane 0).  Each argument is cast to int only to match
   the parameter type of the underlying intrinsic. */

#define wu(x0,x1,x2,x3,x4,x5,x6,x7)                                 \
  _mm256_setr_epi32( (int)(x0), (int)(x1), (int)(x2), (int)(x3),    \
                     (int)(x4), (int)(x5), (int)(x6), (int)(x7) )

/* wu_bcast(x) returns [ x x x x x x x x ] */

#define wu_bcast(x) _mm256_set1_epi32( (int)(x) )
25 :
26 : static inline wu_t /* [ u0 u1 u0 u1 u0 u1 u0 u1 ] */
27 196608 : wu_bcast_pair( uint u0, uint u1 ) {
28 196608 : int i0 = (int)u0; int i1 = (int)u1;
29 196608 : return _mm256_setr_epi32( i0, i1, i0, i1, i0, i1, i0, i1 );
30 196608 : }
31 :
32 : static inline wu_t /* [ u0 u0 u0 u0 u1 u1 u1 u1 ] */
33 196608 : wu_bcast_lohi( uint u0, uint u1 ) {
34 196608 : int i0 = (int)u0; int i1 = (int)u1;
35 196608 : return _mm256_setr_epi32( i0, i0, i0, i0, i1, i1, i1, i1 );
36 196608 : }
37 :
38 : static inline wu_t /* [ u0 u1 u2 u3 u0 u1 u2 u3 ] */
39 196608 : wu_bcast_quad( uint u0, uint u1, uint u2, uint u3 ) {
40 196608 : int i0 = (int)u0; int i1 = (int)u1; int i2 = (int)u2; int i3 = (int)u3;
41 196608 : return _mm256_setr_epi32( i0, i1, i2, i3, i0, i1, i2, i3 );
42 196608 : }
43 :
44 : static inline wu_t /* [ u0 u0 u1 u1 u2 u2 u3 u3 ] */
45 196608 : wu_bcast_wide( uint u0, uint u1, uint u2, uint u3 ) {
46 196608 : int i0 = (int)u0; int i1 = (int)u1; int i2 = (int)u2; int i3 = (int)u3;
47 196608 : return _mm256_setr_epi32( i0, i0, i1, i1, i2, i2, i3, i3 );
48 196608 : }
49 :
50 : /* No general wu_permute due to cross-128-bit lane limitations in AVX.
51 : Useful cases are provided below. Given [ u0 u1 u2 u3 u4 u5 u6 u7 ],
52 : return ... */
53 :
/* In the four macros below, v is [ u0 u1 u2 u3 u4 u5 u6 u7 ].  Each one
   applies the same 4-lane shuffle to both 128-bit halves of v.  The
   casts to and from the float vector type only reinterpret the bits so
   that the float permute instruction can be used. */

/* wu_bcast_even(v) returns [ u0 u0 u2 u2 u4 u4 u6 u6 ] */

#define wu_bcast_even(v) \
  _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (v) ), _MM_SHUFFLE(2,2,0,0) ) )

/* wu_bcast_odd(v) returns [ u1 u1 u3 u3 u5 u5 u7 u7 ] */

#define wu_bcast_odd(v) \
  _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (v) ), _MM_SHUFFLE(3,3,1,1) ) )

/* wu_exch_adj(v) returns [ u1 u0 u3 u2 u5 u4 u7 u6 ] */

#define wu_exch_adj(v) \
  _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (v) ), _MM_SHUFFLE(2,3,0,1) ) )

/* wu_exch_adj_pair(v) returns [ u2 u3 u0 u1 u6 u7 u4 u5 ] */

#define wu_exch_adj_pair(v) \
  _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (v) ), _MM_SHUFFLE(1,0,3,2) ) )
65 :
66 : static inline wu_t
67 196608 : wu_exch_adj_quad( wu_t x ) { /* [ u4 u5 u6 u7 u0 u1 u2 u3 ] */
68 196608 : return _mm256_permute2f128_si256( x, x, 1 );
69 196608 : }
70 :
71 : /* Predefined constants */
72 :
/* wu_zero() returns [ 0U 0U 0U 0U 0U 0U 0U 0U ] */

#define wu_zero() _mm256_setzero_si256()

/* wu_one() returns [ 1U 1U 1U 1U 1U 1U 1U 1U ] */

#define wu_one() _mm256_set1_epi32( 1 )
75 :
76 : /* Memory operations */
77 :
78 : /* wu_ld returns the 8 uints at the 32-byte aligned / 32-byte sized
79 : location p as a vector uint. wu_ldu is the same but p does not have
80 : to be aligned. wu_st writes the vector uint to the 32-byte aligned /
81 : 32-byte sized location p as 8 uints. wu_stu is the same but p does
82 : not have to be aligned. In all these lane l will be at p[l]. FIXME:
83 : USE ATTRIBUTES ON P PASSED TO THESE?
84 :
85 : Note: gcc knows a __m256i may alias. */
86 :
87 58589955 : static inline wu_t wu_ld( uint const * p ) { return _mm256_load_si256( (__m256i const *)p ); }
88 670286259 : static inline void wu_st( uint * p, wu_t i ) { _mm256_store_si256( (__m256i *)p, i ); }
89 :
90 5214795846 : static inline wu_t wu_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
91 652572955 : static inline void wu_stu( void * p, wu_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
92 :
93 : /* wu_ldif is an optimized equivalent to wu_notczero(c,wu_ldu(p)) (may
94 : have different behavior if c is not a proper vector conditional). It
95 : is provided for symmetry with the wu_stif operation. wu_stif stores
96 : x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
97 : Undefined behavior if c is not a proper vector conditional. */
98 :
/* wu_ldif(cond,ptr) is a masked load from ptr with cond as the
   per-lane mask.  Note the argument order is swapped relative to the
   underlying intrinsic (pointer first there, condition first here). */

#define wu_ldif(cond,ptr) _mm256_maskload_epi32( (ptr), (cond) )

/* wu_stif(cond,ptr,val) is a masked store of val to ptr with cond as
   the per-lane mask (same argument order convention as wu_ldif). */

#define wu_stif(cond,ptr,val) _mm256_maskstore_epi32( (ptr), (cond), (val) )
101 :
102 : /* Element operations */
103 :
104 : /* wu_extract extracts the uint in lane imm from the vector uint.
105 : wu_insert returns the vector uint formed by replacing the value in
106 : lane imm of a with the provided uint. imm should be a compile time
107 : constant in 0:7. wu_extract_variable and wu_insert_variable are
108 : slower but the lane n does not have to be known at compile time
109 : (should still be in 0:7).
110 :
111 : Note: C99 TC3 allows type punning through a union. */
112 :
/* wu_extract(vec,lane) returns the uint held in lane `lane` of vec.
   The intrinsic yields an int, which is cast back to uint. */

#define wu_extract(vec,lane) ((uint)_mm256_extract_epi32( (vec), (lane) ))

/* wu_insert(vec,lane,val) returns vec with lane `lane` replaced by
   val.  val is cast to int to match the intrinsic's parameter type. */

#define wu_insert(vec,lane,val) _mm256_insert_epi32( (vec), (int)(val), (lane) )
115 :
116 : static inline uint
117 468719640 : wu_extract_variable( wu_t a, int n ) {
118 468719640 : union { __m256i m[1]; uint u[8]; } t[1];
119 468719640 : _mm256_store_si256( t->m, a );
120 468719640 : return t->u[n];
121 468719640 : }
122 :
123 : static inline wu_t
124 468719640 : wu_insert_variable( wu_t a, int n, uint v ) {
125 468719640 : union { __m256i m[1]; uint u[8]; } t[1];
126 468719640 : _mm256_store_si256( t->m, a );
127 468719640 : t->u[n] = v;
128 468719640 : return _mm256_load_si256( t->m );
129 468719640 : }
130 :
131 : /* Given [a0 a1 a2 a3 a4 a5 a6 a7] and/or [b0 b1 b2 b3 b4 b5 b6 b7],
132 : return ... */
133 :
134 : /* Arithmetic operations */
135 :
/* wu_neg(x) returns [ -x0 -x1 ... -x7 ], computed as 0-x per lane
   (twos complement handling). */

#define wu_neg(x) _mm256_sub_epi32( _mm256_setzero_si256(), (x) )

/* wu_abs(x) returns [ |x0| |x1| ... |x7| ]; the input is returned
   unchanged. */

#define wu_abs(x) (x)

/* Lane-wise unsigned min / max and 32-bit add / subtract / multiply
   (low 32 bits of each product):

     wu_min(x,y) returns [ min(x0,y0) min(x1,y1) ... min(x7,y7) ]
     wu_max(x,y) returns [ max(x0,y0) max(x1,y1) ... max(x7,y7) ]
     wu_add(x,y) returns [ x0+y0      x1+y1      ... x7+y7      ]
     wu_sub(x,y) returns [ x0-y0      x1-y1      ... x7-y7      ]
     wu_mul(x,y) returns [ x0*y0      x1*y1      ... x7*y7      ] */

#define wu_min(x,y) _mm256_min_epu32( (x), (y) )
#define wu_max(x,y) _mm256_max_epu32( (x), (y) )
#define wu_add(x,y) _mm256_add_epi32( (x), (y) )
#define wu_sub(x,y) _mm256_sub_epi32( (x), (y) )
#define wu_mul(x,y) _mm256_mullo_epi32( (x), (y) )
144 :
145 : /* Binary operations */
146 :
147 : /* Note: wu_shl/wu_shr is an unsigned left/right shift by imm bits; imm
148 : must be a compile time constant in [0,31]. The variable variants are
149 : slower but do not require the shift amount to be known at compile
150 : time (should still be in [0,31]). */
151 :
/* wu_not(x) returns [ ~x0 ~x1 ... ~x7 ] (xor against all ones) */

#define wu_not(x) _mm256_xor_si256( _mm256_set1_epi32( -1 ), (x) )

/* Shift every lane by the same immediate bit count:

     wu_shl(x,cnt) returns [ x0<<cnt x1<<cnt ... x7<<cnt ]
     wu_shr(x,cnt) returns [ x0>>cnt x1>>cnt ... x7>>cnt ] */

#define wu_shl(x,cnt) _mm256_slli_epi32( (x), (cnt) )
#define wu_shr(x,cnt) _mm256_srli_epi32( (x), (cnt) )

/* Shift every lane by the same run time bit count.  The count is
   placed in the low 64 bits of an otherwise zero 128-bit register, as
   the intrinsic takes its count from a vector operand. */

#define wu_shl_variable(x,cnt) _mm256_sll_epi32( (x), _mm_insert_epi64( _mm_setzero_si128(), (cnt), 0 ) )
#define wu_shr_variable(x,cnt) _mm256_srl_epi32( (x), _mm_insert_epi64( _mm_setzero_si128(), (cnt), 0 ) )

/* Shift each lane by its own bit count:

     wu_shl_vector(x,y) returns [ x0<<y0 x1<<y1 ... x7<<y7 ]
     wu_shr_vector(x,y) returns [ x0>>y0 x1>>y1 ... x7>>y7 ] */

#define wu_shl_vector(x,y) _mm256_sllv_epi32( (x), (y) )
#define wu_shr_vector(x,y) _mm256_srlv_epi32( (x), (y) )

/* Bitwise combinations:

     wu_and(x,y)    returns [ x0&y0    x1&y1    ... x7&y7    ]
     wu_andnot(x,y) returns [ (~x0)&y0 (~x1)&y1 ... (~x7)&y7 ]
     wu_or(x,y)     returns [ x0|y0    x1|y1    ... x7|y7    ]
     wu_xor(x,y)    returns [ x0^y0    x1^y1    ... x7^y7    ] */

#define wu_and(x,y)    _mm256_and_si256(    (x), (y) )
#define wu_andnot(x,y) _mm256_andnot_si256( (x), (y) )
#define wu_or(x,y)     _mm256_or_si256(     (x), (y) )
#define wu_xor(x,y)    _mm256_xor_si256(    (x), (y) )
167 :
168 : /* wu_rol(x,n) returns wu( rotate_left (x0,n), rotate_left (x1,n), ... )
169 : wu_ror(x,n) returns wu( rotate_right(x0,n), rotate_right(x1,n), ... ) */
170 :
171 : #if FD_HAS_AVX512
172 2061326256 : #define wu_rol(a,imm) _mm256_rol_epi32( (a), (imm) )
173 : #define wu_ror(a,imm) _mm256_ror_epi32( (a), (imm) )
174 : #else
175 >14070*10^7 : static inline wu_t wu_rol( wu_t a, int imm ) { return wu_or( wu_shl( a, imm & 31 ), wu_shr( a, (-imm) & 31 ) ); }
176 4194304 : static inline wu_t wu_ror( wu_t a, int imm ) { return wu_or( wu_shr( a, imm & 31 ), wu_shl( a, (-imm) & 31 ) ); }
177 : #endif
178 :
179 6291456 : static inline wu_t wu_rol_variable( wu_t a, int n ) { return wu_or( wu_shl_variable( a, n&31 ), wu_shr_variable( a, (-n)&31 ) ); }
180 6291456 : static inline wu_t wu_ror_variable( wu_t a, int n ) { return wu_or( wu_shr_variable( a, n&31 ), wu_shl_variable( a, (-n)&31 ) ); }
181 :
182 0 : static inline wu_t wu_rol_vector( wu_t a, wi_t b ) {
183 0 : wi_t m = wi_bcast( 31 );
184 0 : return wu_or( wu_shl_vector( a, wi_and( b, m ) ), wu_shr_vector( a, wi_and( wi_neg( b ), m ) ) );
185 0 : }
186 :
187 0 : static inline wu_t wu_ror_vector( wu_t a, wi_t b ) {
188 0 : wi_t m = wi_bcast( 31 );
189 0 : return wu_or( wu_shr_vector( a, wi_and( b, m ) ), wu_shl_vector( a, wi_and( wi_neg( b ), m ) ) );
190 0 : }
191 :
192 4200960748 : static inline wu_t wu_bswap( wu_t a ) {
193 4200960748 : wu_t m = wu_bcast( 0x00FF00FFU ); /* Probably hoisted */
194 4200960748 : wu_t t = wu_rol( a, 16 ); /* Swap E/O 16-bit pairs */
195 4200960748 : return wu_or( wu_andnot( m, wu_shl( t, 8 ) ), wu_and( m, wu_shr( t, 8 ) ) ); /* Swap E/O 8-bit pairs */
196 4200960748 : }
197 :
198 : /* Logical operations */
199 :
200 : /* As noted below in the wu_to_{wf,wd} converters, Intel clearly has
201 : the hardware to do a _mm256_cmpgt_epu32 given that _mm256_cmpgt_epi32
202 : exists but doesn't expose it in the ISA pre AVX-512. Sigh ... twos
203 : complement bit tricks to the rescue for wu_{gt,lt,ge,le}. */
204 :
/* wu_lnot(x)    returns [  !x0  !x1 ...  !x7 ] (compare against zero)
   wu_lnotnot(x) returns [ !!x0 !!x1 ... !!x7 ] (complement of the above) */

#define wu_lnot(x) _mm256_cmpeq_epi32( (x), _mm256_setzero_si256() )
#define wu_lnotnot(x) \
  _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( (x), _mm256_setzero_si256() ) )

/* wu_eq(x,y) returns [ x0==y0 x1==y1 ... x7==y7 ] */

#define wu_eq(x,y) _mm256_cmpeq_epi32( (x), (y) )

/* wu_gt(x,y) returns [ x0>y0 x1>y1 ... x7>y7 ] as an unsigned compare.
   Only a signed 32-bit compare intrinsic is used, so 2^31 is
   subtracted from both operands first; this flips the top bit of each
   lane and makes the signed compare order the values as unsigned. */

#define wu_gt(x,y)                                                                      \
  _mm256_cmpgt_epi32( _mm256_sub_epi32( (x), _mm256_set1_epi32( (int)(1U<<31) ) ),      \
                      _mm256_sub_epi32( (y), _mm256_set1_epi32( (int)(1U<<31) ) ) )

/* The remaining comparisons are derived from wu_gt / the equality
   compare, complementing the result (xor with all ones) where needed:

     wu_lt(x,y) returns [ x0< y0 x1< y1 ... x7< y7 ]
     wu_ne(x,y) returns [ x0!=y0 x1!=y1 ... x7!=y7 ]
     wu_ge(x,y) returns [ x0>=y0 x1>=y1 ... x7>=y7 ]
     wu_le(x,y) returns [ x0<=y0 x1<=y1 ... x7<=y7 ] */

#define wu_lt(x,y) wu_gt( (y), (x) )
#define wu_ne(x,y) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( (x), (y) ) )
#define wu_ge(x,y) _mm256_xor_si256( _mm256_set1_epi32( -1 ), wu_gt( (y), (x) ) )
#define wu_le(x,y) _mm256_xor_si256( _mm256_set1_epi32( -1 ), wu_gt( (x), (y) ) )
217 :
218 : /* Conditional operations */
219 :
/* wu_czero(cond,val)    returns [ c0?0U:v0 c1?0U:v1 ... c7?0U:v7 ]
   wu_notczero(cond,val) returns [ c0?v0:0U c1?v1:0U ... c7?v7:0U ]

   Both are plain bitwise masks of val by cond (and-not / and). */

#define wu_czero(cond,val)    _mm256_andnot_si256( (cond), (val) )
#define wu_notczero(cond,val) _mm256_and_si256(    (cond), (val) )

/* wu_if(cond,yes,no) returns [ c0?yes0:no0 c1?yes1:no1 ... c7?yes7:no7 ].
   Note the byte-wise blend intrinsic takes the "false" operand first. */

#define wu_if(cond,yes,no) _mm256_blendv_epi8( (no), (yes), (cond) )
224 :
225 : /* Conversion operations */
226 :
227 : /* Summarizing:
228 :
229 : wu_to_wc(a) returns [ !!a0 !!a1 ... !!a7 ]
230 :
231 : wu_to_wf(a) returns [ (float)a0 (float)a1 ... (float)a7 ]
232 :
233 : wu_to_wi(a) returns [ (int)a0 (int)a1 ... (int)a7 ]
234 :
235 : wu_to_wd(a,0) returns [ (double)a0 (double)a1 (double)a2 (double)a3 ]
236 : wu_to_wd(a,1) returns [ (double)a4 (double)a5 (double)a6 (double)a7 ]
237 :
238 : wu_to_wl(a,0) returns [ (long)a0 (long)a1 (long)a2 (long)a3 ]
239 : wu_to_wl(a,1) returns [ (long)a4 (long)a5 (long)a6 (long)a7 ]
240 :
241 : wu_to_wv(a,0) returns [ (ulong)a0 (ulong)a1 (ulong)a2 (ulong)a3 ]
242 : wu_to_wv(a,1) returns [ (ulong)a4 (ulong)a5 (ulong)a6 (ulong)a7 ]
243 :
244 : where the second argument (imm_hi) should be a compile time constant.
245 :
246 : For wu_to_{wd,wl,wv}, the permutation used for the conversion is less
247 : flexible due to cross 128-bit lane limitations in AVX. If imm_hi==0,
248 : the conversion is done to lanes 0:3. Otherwise, the conversion is
249 : done to lanes 4:7.
250 :
251 : The raw variants just treat the raw bits as the corresponding vector
252 : type. For wu_to_wc_raw, the user promises wu contains a proper
253 : vector conditional (e.g. 0 or -1 in each lane). wu_to_wf_raw is
254 : useful for doing advanced bit tricks on floating point values. The
255 : others are probably dubious but are provided for completeness. */
256 :
/* wu_to_wc(x) returns [ !!x0 !!x1 ... !!x7 ]: each lane is compared
   against zero and the result is complemented, so non-zero lanes
   become all ones and zero lanes stay zero. */

#define wu_to_wc(x) \
  _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( (x), _mm256_setzero_si256() ) )

/* wu_to_wi(x) returns [ (int)x0 (int)x1 ... (int)x7 ]; the bits are
   passed through unchanged. */

#define wu_to_wi(x) (x)
260 :
261 786432 : static inline __m256d wu_to_wd( wu_t u, int imm_hi ) { /* FIXME: workaround wd_t isn't declared at this point */
262 :
263 : /* Note: Given that _mm256_cvtepi32_pd exists, Intel clearly has the
264 : hardware under the hood to support a _mm256_cvtepu32_pd but didn't
265 : bother to expose it pre AVX-512 ... sigh (all too typical
266 : unfortunately). We can do a mix of twos complement and floating
267 : point hacks to emulate it without spilling. */
268 :
269 786432 : __m128i i = imm_hi ? _mm256_extractf128_si256( u, 1 ) : _mm256_extractf128_si256( u, 0 ); // u if u<2^31, u-2^32 o.w
270 786432 : __m128i c = _mm_cmpgt_epi32( _mm_setzero_si128(), i ); // 0 if u<2^31, -1 o.w
271 786432 : __m256d d = _mm256_cvtepi32_pd( i ); // u if u<2^31, u-2^32 o.w, exact
272 786432 : __m256d ds = _mm256_add_pd( d, _mm256_set1_pd( (double)(1UL<<32) ) ); // u+2^32 if u<2^31, u o.w, exact
273 786432 : __m256i cl = _mm256_cvtepi32_epi64( c ); // 0L if u<2^31, -1L o.w
274 786432 : return _mm256_blendv_pd( d, ds, _mm256_castsi256_pd( cl ) ); // u
275 786432 : }
276 :
277 196608 : static inline wf_t wu_to_wf( wu_t a ) {
278 :
279 : /* See note above re ISA dubiousness. Note that we can't do the same
280 : trick as wu_to_wd due to single precision roundoff limitations (the
281 : _mm256_cvtepi32_pd equivalent would not be exact such that add to
282 : correct the twos complement mangling would add a possible second
283 : roundoff error ... this would result in slightly different values
284 : occasionally when u is >~ 2^31). We instead convert the two
285 : halves to double (exact), convert the double to float (single
286 : roundoff error) and then concat the two float halves to make a
287 : correctly rounded implementation. */
288 :
289 196608 : return _mm256_setr_m128( _mm256_cvtpd_ps( wu_to_wd( a, 0 ) ), _mm256_cvtpd_ps( wu_to_wd( a, 1 ) ) );
290 196608 : }
291 :
/* wu_to_wl(v,hi) / wu_to_wv(v,hi) zero extend four uint lanes of v to
   64 bits: lanes 0:3 when hi is zero, lanes 4:7 otherwise (hi is
   normalized to 0 or 1 with !!).  Both expand to the same intrinsic
   sequence. */

#define wu_to_wl(v,hi) _mm256_cvtepu32_epi64( _mm256_extractf128_si256( (v), !!(hi) ) )
#define wu_to_wv(v,hi) _mm256_cvtepu32_epi64( _mm256_extractf128_si256( (v), !!(hi) ) )

/* Raw conversions reinterpret the 256 bits of v as another vector
   type.  The integer flavored targets pass v through as is; the
   floating point targets use the bit cast intrinsics. */

#define wu_to_wc_raw(v) (v)
#define wu_to_wf_raw(v) _mm256_castsi256_ps( (v) )
#define wu_to_wi_raw(v) (v)
#define wu_to_wd_raw(v) _mm256_castsi256_pd( (v) )
#define wu_to_wl_raw(v) (v)
#define wu_to_wv_raw(v) (v)
301 :
302 : /* Reduction operations */
303 :
304 : static inline wu_t
305 196608 : wu_sum_all( wu_t x ) { /* Returns wu_bcast( sum( x ) ) */
306 196608 : x = _mm256_add_epi32( x, _mm256_permute2f128_si256( x, x, 1 ) ); /* x04 x15 x26 x37 ... */
307 196608 : x = _mm256_hadd_epi32( x, x ); /* x0145 x2367 ... */
308 196608 : return _mm256_hadd_epi32( x, x ); /* xsum ... */
309 196608 : }
310 :
311 : static inline wu_t
312 196608 : wu_min_all( wu_t x ) { /* Returns wu_bcast( min( x ) ) */
313 196608 : __m256i y = _mm256_permute2f128_si256( x, x, 1 ); /* x4 x5 x6 x7 x0 x1 x2 x3 */
314 196608 : x = _mm256_min_epu32( x, y ); /* x04 x15 x26 x37 ... */
315 196608 : y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x26 x37 x04 x15 ... */
316 196608 : x = _mm256_min_epu32( x, y ); /* x0246 x1357 ... */
317 196608 : y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x1357 x0246 ... */
318 196608 : x = _mm256_min_epu32( x, y ); /* xmin ... */
319 196608 : return x;
320 196608 : }
321 :
322 : static inline wu_t
323 196608 : wu_max_all( wu_t x ) { /* Returns wu_bcast( max( x ) ) */
324 196608 : __m256i y = _mm256_permute2f128_si256( x, x, 1 ); /* x4 x5 x6 x7 x0 x1 x2 x3 */
325 196608 : x = _mm256_max_epu32( x, y ); /* x04 x15 x26 x37 ... */
326 196608 : y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x26 x37 x04 x15 ... */
327 196608 : x = _mm256_max_epu32( x, y ); /* x0246 x1357 ... */
328 196608 : y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x1357 x0246 ... */
329 196608 : x = _mm256_max_epu32( x, y ); /* xmax ... */
330 196608 : return x;
331 196608 : }
332 :
333 : /* Misc operations */
334 :
335 : /* wu_gather(b,i) returns [ b[i(0)] b[i(1)] ... b[i(7)] ] where b is a
336 : "uint const *" and i is a wi_t. We use a static inline here instead
337 : of a define to keep strict type checking while working around yet
338 : another Intel intrinsic type mismatch issue. */
339 :
340 58589955 : static inline wu_t wu_gather( uint const * b, wi_t i ) {
341 58589955 : return _mm256_i32gather_epi32( (int const *)b, (i), 4 );
342 58589955 : }
343 :
344 : /* wu_transpose_8x8 transposes the 8x8 matrix stored in wu_t r0,r1,...r7
345 : and stores the result in 8x8 matrix wu_t c0,c1,...c7. All
346 : c0,c1,...c7 should be different for a well defined result.
347 : Otherwise, in-place operation and/or using the same wu_t to specify
348 : multiple rows of r is fine. */
349 :
350 552803150 : #define wu_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 ) do { \
351 552803150 : wu_t _wu_transpose_r0 = (r0); wu_t _wu_transpose_r1 = (r1); wu_t _wu_transpose_r2 = (r2); wu_t _wu_transpose_r3 = (r3); \
352 552803150 : wu_t _wu_transpose_r4 = (r4); wu_t _wu_transpose_r5 = (r5); wu_t _wu_transpose_r6 = (r6); wu_t _wu_transpose_r7 = (r7); \
353 552803150 : wu_t _wu_transpose_t; /* scratch copy of the row that is about to be overwritten */ \
354 552803150 : /* Transpose 4x4 blocks: row i gets the low 128-bit halves of rows i and i+4, row i+4 gets their high halves */ \
355 552803150 : _wu_transpose_t = _wu_transpose_r0; _wu_transpose_r0 = _mm256_permute2f128_si256( _wu_transpose_t, _wu_transpose_r4, 0x20 ); \
356 552803150 : /**/ _wu_transpose_r4 = _mm256_permute2f128_si256( _wu_transpose_t, _wu_transpose_r4, 0x31 ); \
357 552803150 : _wu_transpose_t = _wu_transpose_r1; _wu_transpose_r1 = _mm256_permute2f128_si256( _wu_transpose_t, _wu_transpose_r5, 0x20 ); \
358 552803150 : /**/ _wu_transpose_r5 = _mm256_permute2f128_si256( _wu_transpose_t, _wu_transpose_r5, 0x31 ); \
359 552803150 : _wu_transpose_t = _wu_transpose_r2; _wu_transpose_r2 = _mm256_permute2f128_si256( _wu_transpose_t, _wu_transpose_r6, 0x20 ); \
360 552803150 : /**/ _wu_transpose_r6 = _mm256_permute2f128_si256( _wu_transpose_t, _wu_transpose_r6, 0x31 ); \
361 552803150 : _wu_transpose_t = _wu_transpose_r3; _wu_transpose_r3 = _mm256_permute2f128_si256( _wu_transpose_t, _wu_transpose_r7, 0x20 ); \
362 552803150 : /**/ _wu_transpose_r7 = _mm256_permute2f128_si256( _wu_transpose_t, _wu_transpose_r7, 0x31 ); \
363 552803150 : /* Transpose 2x2 blocks: interleave the 32-bit lanes of rows j and j+2 (unpacklo result -> row j, unpackhi result -> row j+2) */ \
364 552803150 : _wu_transpose_t = _wu_transpose_r0; _wu_transpose_r0 = _mm256_unpacklo_epi32( _wu_transpose_t, _wu_transpose_r2 ); \
365 552803150 : /**/ _wu_transpose_r2 = _mm256_unpackhi_epi32( _wu_transpose_t, _wu_transpose_r2 ); \
366 552803150 : _wu_transpose_t = _wu_transpose_r1; _wu_transpose_r1 = _mm256_unpacklo_epi32( _wu_transpose_t, _wu_transpose_r3 ); \
367 552803150 : /**/ _wu_transpose_r3 = _mm256_unpackhi_epi32( _wu_transpose_t, _wu_transpose_r3 ); \
368 552803150 : _wu_transpose_t = _wu_transpose_r4; _wu_transpose_r4 = _mm256_unpacklo_epi32( _wu_transpose_t, _wu_transpose_r6 ); \
369 552803150 : /**/ _wu_transpose_r6 = _mm256_unpackhi_epi32( _wu_transpose_t, _wu_transpose_r6 ); \
370 552803150 : _wu_transpose_t = _wu_transpose_r5; _wu_transpose_r5 = _mm256_unpacklo_epi32( _wu_transpose_t, _wu_transpose_r7 ); \
371 552803150 : /**/ _wu_transpose_r7 = _mm256_unpackhi_epi32( _wu_transpose_t, _wu_transpose_r7 ); \
372 552803150 : /* Transpose 1x1 blocks: interleave adjacent row pairs; outputs are only written here, after all inputs were copied to locals above */ \
373 552803150 : /**/ (c0) = _mm256_unpacklo_epi32( _wu_transpose_r0, _wu_transpose_r1 ); \
374 552803150 : /**/ (c1) = _mm256_unpackhi_epi32( _wu_transpose_r0, _wu_transpose_r1 ); \
375 552803150 : /**/ (c2) = _mm256_unpacklo_epi32( _wu_transpose_r2, _wu_transpose_r3 ); \
376 552803150 : /**/ (c3) = _mm256_unpackhi_epi32( _wu_transpose_r2, _wu_transpose_r3 ); \
377 552803150 : /**/ (c4) = _mm256_unpacklo_epi32( _wu_transpose_r4, _wu_transpose_r5 ); \
378 552803150 : /**/ (c5) = _mm256_unpackhi_epi32( _wu_transpose_r4, _wu_transpose_r5 ); \
379 552803150 : /**/ (c6) = _mm256_unpacklo_epi32( _wu_transpose_r6, _wu_transpose_r7 ); \
380 552803150 : /**/ (c7) = _mm256_unpackhi_epi32( _wu_transpose_r6, _wu_transpose_r7 ); \
381 552803150 : } while(0)
|