Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx_h
2 : #error "Do not include this directly; use fd_avx.h"
3 : #endif
4 :
5 : /* Vector int API *****************************************************/
6 :
7 : /* A wi_t is a vector where each 32-bit wide lane holds a signed 32-bit
8 : twos-complement integer (an "int").
9 :
10 :
11 : These mirror the other APIs (e.g. wc and wf) as much as possible. Macros are
12 : preferred over static inlines when it is possible to do it robustly
13 : to reduce the risk of the compiler mucking it up. */
14 :
15 74908419 : #define wi_t __m256i
16 :
17 : /* Constructors */
18 :
19 : /* Given the int values, return ... */
20 :
21 : #define wi(i0,i1,i2,i3,i4,i5,i6,i7) /* [ i0 i1 i2 i3 i4 i5 i6 i7 ] */ \
22 490146828 : _mm256_setr_epi32( (i0), (i1), (i2), (i3), (i4), (i5), (i6), (i7) )
23 :
24 : #define wi_bcast(i0) _mm256_set1_epi32( (i0) ) /* [ i0 i0 i0 i0 i0 i0 i0 i0 ] */
25 :
26 : static inline wi_t /* [ i0 i1 i0 i1 i0 i1 i0 i1 ] */
27 196608 : wi_bcast_pair( int i0, int i1 ) {
28 196608 : return _mm256_setr_epi32( i0, i1, i0, i1, i0, i1, i0, i1 );
29 196608 : }
30 :
31 : static inline wi_t /* [ i0 i0 i0 i0 i1 i1 i1 i1 ] */
32 196608 : wi_bcast_lohi( int i0, int i1 ) {
33 196608 : return _mm256_setr_epi32( i0, i0, i0, i0, i1, i1, i1, i1 );
34 196608 : }
35 :
36 : static inline wi_t /* [ i0 i1 i2 i3 i0 i1 i2 i3 ] */
37 196608 : wi_bcast_quad( int i0, int i1, int i2, int i3 ) {
38 196608 : return _mm256_setr_epi32( i0, i1, i2, i3, i0, i1, i2, i3 );
39 196608 : }
40 :
41 : static inline wi_t /* [ i0 i0 i1 i1 i2 i2 i3 i3 ] */
42 196608 : wi_bcast_wide( int i0, int i1, int i2, int i3 ) {
43 196608 : return _mm256_setr_epi32( i0, i0, i1, i1, i2, i2, i3, i3 );
44 196608 : }
45 :
46 : /* No general wi_permute due to cross-128-bit lane limitations in AVX.
47 : Useful cases are provided below. Given [ i0 i1 i2 i3 i4 i5 i6 i7 ],
48 : return ... */
49 :
50 : #define wi_bcast_even(x) /* [ i0 i0 i2 i2 i4 i4 i6 i6 ] */ \
51 : _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (x) ), _MM_SHUFFLE(2,2,0,0) ) )
52 :
53 : #define wi_bcast_odd(x) /* [ i1 i1 i3 i3 i5 i5 i7 i7 ] */ \
54 : _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (x) ), _MM_SHUFFLE(3,3,1,1) ) )
55 :
56 : #define wi_exch_adj(x) /* [ i1 i0 i3 i2 i5 i4 i7 i6 ] */ \
57 : _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (x) ), _MM_SHUFFLE(2,3,0,1) ) )
58 :
59 : #define wi_exch_adj_pair(x) /* [ i2 i3 i0 i1 i6 i7 i4 i5 ] */ \
60 : _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (x) ), _MM_SHUFFLE(1,0,3,2) ) )
61 :
62 : static inline wi_t
63 196608 : wi_exch_adj_quad( wi_t x ) { /* [ i4 i5 i6 i7 i0 i1 i2 i3 ] */
64 196608 : return _mm256_permute2f128_si256( x, x, 1 );
65 196608 : }
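
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: lane-level effect of the above constructors and
   shuffles.

     wi_t x = wi( 0,1,2,3,4,5,6,7 );    // [ 0 1 2 3 4 5 6 7 ]
     wi_t w = wi_bcast_wide( 0,1,2,3 ); // [ 0 0 1 1 2 2 3 3 ]
     wi_t e = wi_bcast_even( x );       // [ 0 0 2 2 4 4 6 6 ]
     wi_t o = wi_bcast_odd ( x );       // [ 1 1 3 3 5 5 7 7 ]
     wi_t a = wi_exch_adj  ( x );       // [ 1 0 3 2 5 4 7 6 ]
     wi_t q = wi_exch_adj_quad( x );    // [ 4 5 6 7 0 1 2 3 ]
*/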
66 :
67 : /* Predefined constants */
68 :
69 : #define wi_zero() _mm256_setzero_si256() /* Return [ 0 0 0 0 0 0 0 0 ] */
70 70976259 : #define wi_one() _mm256_set1_epi32( 1 ) /* Return [ 1 1 1 1 1 1 1 1 ] */
71 :
72 : /* Memory operations */
73 :
74 : /* wi_ld returns the 8 ints at the 32-byte aligned / 32-byte sized
75 : location p as a vector int. wi_ldu is the same but p does not have
76 : to be aligned. wi_st writes the vector int to the 32-byte aligned /
77 : 32-byte sized location p as 8 ints. wi_stu is the same but p does
78 : not have to be aligned. In all these, lane l will be at p[l]. FIXME:
79 : USE ATTRIBUTES ON P PASSED TO THESE?
80 :
81 : Note: gcc knows a __m256i may alias. */
82 :
83 70976259 : static inline wi_t wi_ld( int const * p ) { return _mm256_load_si256( (__m256i const *)p ); }
84 70976259 : static inline void wi_st( int * p, wi_t i ) { _mm256_store_si256( (__m256i *)p, i ); }
85 :
86 567810072 : static inline wi_t wi_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
87 567810072 : static inline void wi_stu( void * p, wi_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
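
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: a round trip through the unaligned variants (wi_ld /
   wi_st would additionally require src / dst to be 32-byte aligned,
   e.g. via a compiler specific alignment attribute).

     int src[8] = { 0,1,2,3,4,5,6,7 };
     int dst[8];
     wi_t x = wi_ldu( src );            // lane l of x == src[l]
     wi_stu( dst, x );                  // dst[l]      == src[l]
*/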
88 :
89 : /* wi_ldif is an optimized equivalent to wi_notczero(c,wi_ldu(p)) (may
90 : have different behavior if c is not a proper vector conditional). It
91 : is provided for symmetry with the wi_stif operation. wi_stif stores
92 : x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
93 : Undefined behavior if c is not a proper vector conditional. */
94 :
95 : #define wi_ldif(c,p) _mm256_maskload_epi32( (p),(c))
96 : #define wi_stif(c,p,x) _mm256_maskstore_epi32((p),(c),(x))
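
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: update only the lanes selected by a proper vector
   conditional (wi_gt is defined under logical operations below).

     int d[8] = { 0,0,0,0,0,0,0,0 };
     wi_t x = wi( 1,2,3,4,5,6,7,8 );
     wi_t c = wi_gt( x, wi_bcast( 4 ) );  // [ 0 0 0 0 -1 -1 -1 -1 ]
     wi_stif( c, d, x );                  // d is now { 0,0,0,0,5,6,7,8 }
*/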
97 :
98 : /* Element operations */
99 :
100 : /* wi_extract extracts the int in lane imm from the vector int as an int.
101 : wi_insert returns the vector int formed by replacing the value in
102 : lane imm of a with the provided int. imm should be a compile time
103 : constant in 0:7. wi_extract_variable and wi_insert_variable are
104 : slower but the lane n does not have to be known at compile time
105 : (should still be in 0:7).
106 :
107 : Note: C99 TC3 allows type punning through a union. */
108 :
109 567810072 : #define wi_extract(a,imm) _mm256_extract_epi32( (a), (imm) )
110 567810072 : #define wi_insert(a,imm,v) _mm256_insert_epi32( (a), (v), (imm) )
111 :
112 : static inline int
113 567810072 : wi_extract_variable( wi_t a, int n ) {
114 567810072 : union { __m256i m[1]; int i[8]; } t[1];
115 567810072 : _mm256_store_si256( t->m, a );
116 567810072 : return t->i[n];
117 567810072 : }
118 :
119 : static inline wi_t
120 567810072 : wi_insert_variable( wi_t a, int n, int v ) {
121 567810072 : union { __m256i m[1]; int i[8]; } t[1];
122 567810072 : _mm256_store_si256( t->m, a );
123 567810072 : t->i[n] = v;
124 567810072 : return _mm256_load_si256( t->m );
125 567810072 : }
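
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: the imm forms need a compile time lane index, the
   _variable forms do not.

     wi_t x  = wi( 10,11,12,13,14,15,16,17 );
     int  v5 = wi_extract( x, 5 );                 // 15
     x       = wi_insert ( x, 2, -1 );             // [ 10 11 -1 13 14 15 16 17 ]
     int  n  = 7;                                  // lane known only at run time
     int  v7 = wi_extract_variable( x, n );        // 17
*/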
126 :
127 : /* Given [a0 a1 a2 a3 a4 a5 a6 a7] and/or [b0 b1 b2 b3 b4 b5 b6 b7],
128 : return ... */
129 :
130 : /* Arithmetic operations */
131 :
132 : #define wi_neg(a) _mm256_sub_epi32( _mm256_setzero_si256(), (a) ) /* [ -a0 -a1 ... -a7 ] (twos complement handling) */
133 : #define wi_abs(a) _mm256_abs_epi32( (a) ) /* [ |a0| |a1| ... |a7| ] (twos complement handling) */
134 :
135 : #define wi_min(a,b) _mm256_min_epi32( (a), (b) ) /* [ min(a0,b0) min(a1,b1) ... min(a7,b7) ] */
136 : #define wi_max(a,b) _mm256_max_epi32( (a), (b) ) /* [ max(a0,b0) max(a1,b1) ... max(a7,b7) ] */
137 : #define wi_add(a,b) _mm256_add_epi32( (a), (b) ) /* [ a0 +b0 a1 +b1 ... a7 +b7 ] */
138 : #define wi_sub(a,b) _mm256_sub_epi32( (a), (b) ) /* [ a0 -b0 a1 -b1 ... a7 -b7 ] */
139 : #define wi_mul(a,b) _mm256_mullo_epi32( (a), (b) ) /* [ a0 *b0 a1 *b1 ... a7 *b7 ] */
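
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: wi_mul keeps the low 32 bits of each lane's full
   64-bit product (i.e. lane results wrap mod 2^32).

     wi_t a = wi_bcast( 0x10001 );
     wi_t p = wi_mul( a, a );           // every lane 0x20001, the low 32 bits
                                        // of the full product 0x100020001
*/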
140 :
141 : /* Binary operations */
142 :
143 : /* Note: wi_shl/wi_shr/wi_shru is a left/signed right/unsigned right
144 : shift by imm bits; imm must be a compile time constant in 0:31. The
145 : variable variants are slower but do not require the shift amount to
146 : be known at compile time (should still be in 0:31). */
147 :
148 : #define wi_not(a) _mm256_xor_si256( _mm256_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a7 ] */
149 :
150 : #define wi_shl(a,imm) _mm256_slli_epi32( (a), (imm) ) /* [ a0<<imm a1<<imm ... a7<<imm ] */
151 : #define wi_shr(a,imm) _mm256_srai_epi32( (a), (imm) ) /* [ a0>>imm a1>>imm ... a7>>imm ] (treat a as signed)*/
152 : #define wi_shru(a,imm) _mm256_srli_epi32( (a), (imm) ) /* [ a0>>imm a1>>imm ... a7>>imm ] (treat a as unsigned) */
153 :
154 : #define wi_shl_variable(a,n) _mm256_sll_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
155 : #define wi_shr_variable(a,n) _mm256_sra_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
156 : #define wi_shru_variable(a,n) _mm256_srl_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
157 :
158 : #define wi_shl_vector(a,b) _mm256_sllv_epi32( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a7<<b7 ] */
159 : #define wi_shr_vector(a,b) _mm256_srav_epi32( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a7>>b7 ] (treat a as signed) */
160 : #define wi_shru_vector(a,b) _mm256_srlv_epi32( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a7>>b7 ] (treat a as unsigned) */
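
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: the signed and unsigned right shifts differ only for
   negative lanes.

     wi_t x = wi_bcast( -8 );           // 0xfffffff8 in every lane
     wi_t s = wi_shr ( x, 2 );          // every lane -2         (sign fill)
     wi_t u = wi_shru( x, 2 );          // every lane 0x3ffffffe (zero fill)
*/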
161 :
162 : #define wi_and(a,b) _mm256_and_si256( (a), (b) ) /* [ a0 &b0 a1 &b1 ... a7 &b7 ] */
163 : #define wi_andnot(a,b) _mm256_andnot_si256( (a), (b) ) /* [ (~a0)&b0 (~a1)&b1 ... (~a7)&b7 ] */
164 20971520 : #define wi_or(a,b) _mm256_or_si256( (a), (b) ) /* [ a0 |b0 a1 |b1 ... a7 |b7 ] */
165 : #define wi_xor(a,b) _mm256_xor_si256( (a), (b) ) /* [ a0 ^b0 a1 ^b1 ... a7 ^b7 ] */
166 :
167 : /* wi_rol(x,n) returns wi( rotate_left (x0,n), rotate_left (x1,n), ... )
168 : wi_ror(x,n) returns wi( rotate_right(x0,n), rotate_right(x1,n), ... ) */
169 :
170 : #if FD_HAS_AVX512
171 : #define wi_rol(a,imm) _mm256_rol_epi32( (a), (imm) )
172 : #define wi_ror(a,imm) _mm256_ror_epi32( (a), (imm) )
173 : #else
174 4194304 : static inline wi_t wi_rol( wi_t a, int imm ) { return wi_or( wi_shl( a, imm & 31 ), wi_shru( a, (-imm) & 31 ) ); }
175 4194304 : static inline wi_t wi_ror( wi_t a, int imm ) { return wi_or( wi_shru( a, imm & 31 ), wi_shl( a, (-imm) & 31 ) ); }
176 : #endif
177 :
178 6291456 : static inline wi_t wi_rol_variable( wi_t a, int n ) { return wi_or( wi_shl_variable( a, n&31 ), wi_shru_variable( a, (-n)&31 ) ); }
179 6291456 : static inline wi_t wi_ror_variable( wi_t a, int n ) { return wi_or( wi_shru_variable( a, n&31 ), wi_shl_variable( a, (-n)&31 ) ); }
180 :
181 0 : static inline wi_t wi_rol_vector( wi_t a, wi_t b ) {
182 0 : wi_t m = wi_bcast( 31 );
183 0 : return wi_or( wi_shl_vector( a, wi_and( b, m ) ), wi_shru_vector( a, wi_and( wi_neg( b ), m ) ) );
184 0 : }
185 :
186 0 : static inline wi_t wi_ror_vector( wi_t a, wi_t b ) {
187 0 : wi_t m = wi_bcast( 31 );
188 0 : return wi_or( wi_shru_vector( a, wi_and( b, m ) ), wi_shl_vector( a, wi_and( wi_neg( b ), m ) ) );
189 0 : }
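
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: lane-wise rotates as used in many hash and cipher
   kernels.

     wi_t x = wi_bcast( (int)0x80000001 );  // msb and lsb set in every lane
     wi_t l = wi_rol( x, 1 );               // every lane 0x00000003
     wi_t r = wi_ror( x, 1 );               // every lane 0xc0000000
*/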
190 :
191 : /* Logical operations */
192 :
193 : #define wi_lnot(a) _mm256_cmpeq_epi32( (a), _mm256_setzero_si256() ) /* [ !a0 !a1 ... !a7 ] */
194 : #define wi_lnotnot(a) /* [ !!a0 !!a1 ... !!a7 ] */ \
195 : _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( (a), _mm256_setzero_si256() ) )
196 :
197 : #define wi_eq(a,b) _mm256_cmpeq_epi32( (a), (b) ) /* [ a0==b0 a1==b1 ... a7==b7 ] */
198 : #define wi_gt(a,b) _mm256_cmpgt_epi32( (a), (b) ) /* [ a0> b0 a1> b1 ... a7> b7 ] */
199 : #define wi_lt(a,b) _mm256_cmpgt_epi32( (b), (a) ) /* [ a0< b0 a1< b1 ... a7< b7 ] */
200 : #define wi_ne(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a7!=b7 ] */
201 : #define wi_ge(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpgt_epi32( (b), (a) ) ) /* [ a0>=b0 a1>=b1 ... a7>=b7 ] */
202 : #define wi_le(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpgt_epi32( (a), (b) ) ) /* [ a0<=b0 a1<=b1 ... a7<=b7 ] */
203 :
204 : /* Conditional operations */
205 :
206 : #define wi_czero(c,f) _mm256_andnot_si256( (c), (f) ) /* [ c0? 0:f0 c1? 0:f1 ... c7? 0:f7 ] */
207 : #define wi_notczero(c,f) _mm256_and_si256( (c), (f) ) /* [ c0?f0: 0 c1?f1: 0 ... c7?f7: 0 ] */
208 :
209 : #define wi_if(c,t,f) _mm256_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c7?t7:f7 ] */
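
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: the comparisons above yield proper vector
   conditionals (0 / -1 per lane) that compose with the conditional
   operations, e.g. a lane-wise clamp to non-negative values.

     wi_t x  = wi( -3,-2,-1,0,1,2,3,4 );
     wi_t lo = wi_zero();
     wi_t y  = wi_if( wi_lt( x, lo ), lo, x );  // [ 0 0 0 0 1 2 3 4 ]
     // equivalently: y = wi_max( x, lo );
*/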
210 :
211 : /* Conversion operations */
212 :
213 : /* Summarizing:
214 :
215 : wi_to_wc(a) returns [ !!a0 !!a1 ... !!a7 ]
216 :
217 : wi_to_wu(a) returns [ (uint)a0 (uint)a1 ... (uint)a7 ]
218 :
219 : wi_to_wf(a) returns [ (float)a0 (float)a1 ... (float)a7 ]
220 :
221 : wi_to_wd(a,0) returns [ (double)a0 (double)a1 (double)a2 (double)a3 ]
222 : wi_to_wd(a,1) returns [ (double)a4 (double)a5 (double)a6 (double)a7 ]
223 :
224 : wi_to_wl(a,0) returns [ (long)a0 (long)a1 (long)a2 (long)a3 ]
225 : wi_to_wl(a,1) returns [ (long)a4 (long)a5 (long)a6 (long)a7 ]
226 :
227 : wi_to_wv(a,0) returns [ (ulong)a0 (ulong)a1 (ulong)a2 (ulong)a3 ]
228 : wi_to_wv(a,1) returns [ (ulong)a4 (ulong)a5 (ulong)a6 (ulong)a7 ]
229 :
230 : where imm_hi should be a compile time constant.
231 :
232 : For wi_to_{wd,wl,wv}, the permutation used for the conversion is less
233 : flexible due to cross 128-bit lane limitations in AVX. If imm_hi==0,
234 : the conversion is done to lanes 0:3. Otherwise, the conversion is
235 : done to lanes 4:7.
236 :
237 : The raw variants just treat the raw bits as the corresponding vector
238 : type. For wi_to_wc_raw, the user promises wi contains a proper
239 : vector conditional (e.g. 0 or -1 in each lane). wi_to_wf_raw is
240 : useful for doing advanced bit tricks on floating point values. The
241 : others are probably dubious but are provided for completeness. */
242 :
243 : #define wi_to_wc(a) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( (a), _mm256_setzero_si256() ) )
244 : #define wi_to_wf(a) _mm256_cvtepi32_ps( (a) )
245 : #define wi_to_wu(a) (a)
246 : #define wi_to_wd(a,imm_hi) _mm256_cvtepi32_pd( _mm256_extractf128_si256( (a), !!(imm_hi) ) )
247 : #define wi_to_wl(a,imm_hi) _mm256_cvtepi32_epi64( _mm256_extractf128_si256( (a), !!(imm_hi) ) )
248 : #define wi_to_wv(a,imm_hi) _mm256_cvtepi32_epi64( _mm256_extractf128_si256( (a), !!(imm_hi) ) )
249 :
250 : #define wi_to_wc_raw(a) (a)
251 : #define wi_to_wf_raw(a) _mm256_castsi256_ps( (a) )
252 : #define wi_to_wu_raw(a) (a)
253 : #define wi_to_wd_raw(a) _mm256_castsi256_pd( (a) )
254 : #define wi_to_wl_raw(a) (a)
255 : #define wi_to_wv_raw(a) (a)
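
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: the widening conversions operate on half of the
   lanes at a time, selected by imm_hi (results shown here with the raw
   Intel types for the wd / wf vectors).

     wi_t    x    = wi( 0,1,2,3,4,5,6,7 );
     __m256d d_lo = wi_to_wd( x, 0 );   // [ 0. 1. 2. 3. ]
     __m256d d_hi = wi_to_wd( x, 1 );   // [ 4. 5. 6. 7. ]
     __m256  f    = wi_to_wf( x );      // [ 0.f 1.f ... 7.f ]
*/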
256 :
257 : /* Reduction operations */
258 :
259 : static inline wi_t
260 196608 : wi_sum_all( wi_t x ) { /* Returns wi_bcast( sum( x ) ) */
261 196608 : x = _mm256_add_epi32( x, _mm256_permute2f128_si256( x, x, 1 ) ); /* x04 x15 x26 x37 ... */
262 196608 : x = _mm256_hadd_epi32( x, x ); /* x0145 x2367 ... */
263 196608 : return _mm256_hadd_epi32( x, x ); /* xsum ... */
264 196608 : }
265 :
266 : static inline wi_t
267 196608 : wi_min_all( wi_t x ) { /* Returns wi_bcast( min( x ) ) */
268 196608 : __m256i y = _mm256_permute2f128_si256( x, x, 1 ); /* x4 x5 x6 x7 x0 x1 x2 x3 */
269 196608 : x = _mm256_min_epi32( x, y ); /* x04 x15 x26 x37 ... */
270 196608 : y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x26 x37 x04 x15 ... */
271 196608 : x = _mm256_min_epi32( x, y ); /* x0246 x1357 ... */
272 196608 : y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x1357 x0246 ... */
273 196608 : x = _mm256_min_epi32( x, y ); /* xmin ... */
274 196608 : return x;
275 196608 : }
276 :
277 : static inline wi_t
278 196608 : wi_max_all( wi_t x ) { /* Returns wi_bcast( max( x ) ) */
279 196608 : __m256i y = _mm256_permute2f128_si256( x, x, 1 ); /* x4 x5 x6 x7 x0 x1 x2 x3 */
280 196608 : x = _mm256_max_epi32( x, y ); /* x04 x15 x26 x37 ... */
281 196608 : y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x26 x37 x04 x15 ... */
282 196608 : x = _mm256_max_epi32( x, y ); /* x0246 x1357 ... */
283 196608 : y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x1357 x0246 ... */
284 196608 : x = _mm256_max_epi32( x, y ); /* xmax ... */
285 196608 : return x;
286 196608 : }
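
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: the reductions broadcast their result to every lane;
   extract any lane to get the scalar.

     wi_t x   = wi( 1,2,3,4,5,6,7,8 );
     int  sum = wi_extract( wi_sum_all( x ), 0 );  // 36
     int  mn  = wi_extract( wi_min_all( x ), 0 );  //  1
     int  mx  = wi_extract( wi_max_all( x ), 0 );  //  8
*/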
287 :
288 : /* Misc operations */
289 :
290 : /* wi_gather(b,i) returns [ b[i(0)] b[i(1)] ... b[i(7)] ] where b is a
291 : "int const *" and i is a wi_t. */
292 :
293 70976259 : #define wi_gather(b,i) _mm256_i32gather_epi32( (b), (i), 4 )
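
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: gather 8 ints from arbitrary indices of an array in
   one operation (tbl and idx here are hypothetical example data).

     static int const tbl[16] = { 0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150 };
     wi_t idx = wi( 0,2,4,6,8,10,12,14 );
     wi_t g   = wi_gather( tbl, idx );  // [ 0 20 40 60 80 100 120 140 ]
*/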
294 :
295 : /* wi_transpose_8x8 transposes the 8x8 matrix stored in wi_t r0,r1,...r7
296 : and stores the result in 8x8 matrix wi_t c0,c1,...c7. All
297 : c0,c1,...c7 should be different for a well defined result. Aside
298 : from that, in-place operation and/or using the same wi_t to specify
299 : multiple rows of r is fine. */
300 :
301 196608 : #define wi_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 ) do { \
302 196608 : wi_t _wi_transpose_r0 = (r0); wi_t _wi_transpose_r1 = (r1); wi_t _wi_transpose_r2 = (r2); wi_t _wi_transpose_r3 = (r3); \
303 196608 : wi_t _wi_transpose_r4 = (r4); wi_t _wi_transpose_r5 = (r5); wi_t _wi_transpose_r6 = (r6); wi_t _wi_transpose_r7 = (r7); \
304 196608 : wi_t _wi_transpose_t; \
305 196608 : /* Transpose 4x4 blocks */ \
306 196608 : _wi_transpose_t = _wi_transpose_r0; _wi_transpose_r0 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r4, 0x20 ); \
307 196608 : /**/ _wi_transpose_r4 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r4, 0x31 ); \
308 196608 : _wi_transpose_t = _wi_transpose_r1; _wi_transpose_r1 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r5, 0x20 ); \
309 196608 : /**/ _wi_transpose_r5 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r5, 0x31 ); \
310 196608 : _wi_transpose_t = _wi_transpose_r2; _wi_transpose_r2 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r6, 0x20 ); \
311 196608 : /**/ _wi_transpose_r6 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r6, 0x31 ); \
312 196608 : _wi_transpose_t = _wi_transpose_r3; _wi_transpose_r3 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r7, 0x20 ); \
313 196608 : /**/ _wi_transpose_r7 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r7, 0x31 ); \
314 196608 : /* Transpose 2x2 blocks */ \
315 196608 : _wi_transpose_t = _wi_transpose_r0; _wi_transpose_r0 = _mm256_unpacklo_epi32( _wi_transpose_t, _wi_transpose_r2 ); \
316 196608 : /**/ _wi_transpose_r2 = _mm256_unpackhi_epi32( _wi_transpose_t, _wi_transpose_r2 ); \
317 196608 : _wi_transpose_t = _wi_transpose_r1; _wi_transpose_r1 = _mm256_unpacklo_epi32( _wi_transpose_t, _wi_transpose_r3 ); \
318 196608 : /**/ _wi_transpose_r3 = _mm256_unpackhi_epi32( _wi_transpose_t, _wi_transpose_r3 ); \
319 196608 : _wi_transpose_t = _wi_transpose_r4; _wi_transpose_r4 = _mm256_unpacklo_epi32( _wi_transpose_t, _wi_transpose_r6 ); \
320 196608 : /**/ _wi_transpose_r6 = _mm256_unpackhi_epi32( _wi_transpose_t, _wi_transpose_r6 ); \
321 196608 : _wi_transpose_t = _wi_transpose_r5; _wi_transpose_r5 = _mm256_unpacklo_epi32( _wi_transpose_t, _wi_transpose_r7 ); \
322 196608 : /**/ _wi_transpose_r7 = _mm256_unpackhi_epi32( _wi_transpose_t, _wi_transpose_r7 ); \
323 196608 : /* Transpose 1x1 blocks */ \
324 196608 : /**/ (c0) = _mm256_unpacklo_epi32( _wi_transpose_r0, _wi_transpose_r1 ); \
325 196608 : /**/ (c1) = _mm256_unpackhi_epi32( _wi_transpose_r0, _wi_transpose_r1 ); \
326 196608 : /**/ (c2) = _mm256_unpacklo_epi32( _wi_transpose_r2, _wi_transpose_r3 ); \
327 196608 : /**/ (c3) = _mm256_unpackhi_epi32( _wi_transpose_r2, _wi_transpose_r3 ); \
328 196608 : /**/ (c4) = _mm256_unpacklo_epi32( _wi_transpose_r4, _wi_transpose_r5 ); \
329 196608 : /**/ (c5) = _mm256_unpackhi_epi32( _wi_transpose_r4, _wi_transpose_r5 ); \
330 196608 : /**/ (c6) = _mm256_unpacklo_epi32( _wi_transpose_r6, _wi_transpose_r7 ); \
331 196608 : /**/ (c7) = _mm256_unpackhi_epi32( _wi_transpose_r6, _wi_transpose_r7 ); \
332 196608 : } while(0)
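
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: transposing a row-major 8x8 int matrix m (a
   hypothetical example array) so that column j of m lands in cj.

     int const * m = ...;               // points to 64 ints, row-major 8x8
     wi_t r0 = wi_ldu( m      ); wi_t r1 = wi_ldu( m +  8 );
     wi_t r2 = wi_ldu( m + 16 ); wi_t r3 = wi_ldu( m + 24 );
     wi_t r4 = wi_ldu( m + 32 ); wi_t r5 = wi_ldu( m + 40 );
     wi_t r6 = wi_ldu( m + 48 ); wi_t r7 = wi_ldu( m + 56 );
     wi_t c0,c1,c2,c3,c4,c5,c6,c7;
     wi_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 );
     // c0 now holds m[0*8+0], m[1*8+0], ..., m[7*8+0], etc.
*/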