Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx_h
2 : #error "Do not include this directly; use fd_avx.h"
3 : #endif
4 :
5 : /* TODO: the below is very much designed for a 32-bit SIMD lane world
6 : (with 64-bit SIMD lane support hacked on afterward). Revamp these to
7 : be more general for 8, 16, 32 and 64 bit lanes. */
8 :
9 : /* Vector conditional API *********************************************/
10 :
11 : /* A wc_t is a vector conditional. That is, it is a vector of integers
12 : where each 32-bit wide lane is either 0 (all zero bits), indicating
13 : the condition is false for that lane or -1 (all one bits), indicating
14 : the condition is true for that lane. This allows fast bit
15 : operations to mask other types of vectors. If this API is used on
16 : vectors that aren't proper vector conditionals, results are
17 : undefined. When vector conditionals are applied to vector doubles,
18 : longs and ulongs, adjacent lanes (0-1 / 2-3 / 4-5 / 6-7) should have
19 : identical values, otherwise results will be undefined.
20 :
21 : These mirror the other APIs as much as possible. Macros are
22 : preferred over static inlines when it is possible to do it robustly
23 : to reduce the risk of the compiler mucking it up. */
24 :
25 13349756931 : #define wc_t __m256i /* 256-bit integer vector viewed as 8 32-bit conditional lanes */
26 :
27 : /* Constructors */
28 :
29 : /* wc returns a wc_t corresponding to the c-style logical values c0:c7.
30 : This will always create a proper vector conditional regardless of how
31 : the logical values were presented. That is, the provided values
32 : will be treated as c-style logical values such that zero/false will
33 : become zero/false in the vector and non-zero/true will become -1/true
34 : in the vector conditional. Similarly for wc_bcast*. Summarizing:
35 :
36 : wc(c0,c1,c2,c3,c4,c5,c6,c7) return [c0 c1 c2 c3 c4 c5 c6 c7]
37 : wc_bcast(c0) return [c0 c0 c0 c0 c0 c0 c0 c0]
38 : wc_bcast_pair(c0,c1) return [c0 c1 c0 c1 c0 c1 c0 c1]
39 : wc_bcast_lohi(c0,c1) return [c0 c0 c0 c0 c1 c1 c1 c1]
40 : wc_bcast_quad(c0,c1,c2,c3) return [c0 c1 c2 c3 c0 c1 c2 c3]
41 : wc_bcast_wide(c0,c1,c2,c3) return [c0 c0 c1 c1 c2 c2 c3 c3] */
42 :
43 591354 : #define wc(c0,c1,c2,c3,c4,c5,c6,c7) _mm256_setr_epi32( -!!(c0), -!!(c1), -!!(c2), -!!(c3), -!!(c4), -!!(c5), -!!(c6), -!!(c7) ) /* -!!(x) maps 0 to 0 and any non-zero x to -1; setr puts c0 in lane 0 */
44 :
45 : #if 0 /* Compiler sometimes tries to turn this into branches ... sigh */
46 : #define wc_bcast(c0) _mm256_set1_epi32( -!!(c0) )
47 : #else
48 : static inline __m256i
49 6885 : wc_bcast( int c0 ) {
50 6885 : c0 = -!!c0; FD_COMPILER_FORGET( c0 ); /* normalize to 0/-1; FORGET presumably hides the value's origin from the optimizer (see the #if 0 note) - confirm against its definition */
51 6885 : return _mm256_set1_epi32( c0 ); /* replicate the normalized value into all 8 lanes */
52 6885 : }
53 : #endif
54 :
55 : static inline wc_t
56 765 : wc_bcast_pair( int c0, int c1 ) { /* [ c0 c1 c0 c1 c0 c1 c0 c1 ] */
57 765 : int const even = -(c0!=0); int const odd = -(c1!=0); /* c-style logical -> 0/-1 */
58 765 : return _mm256_setr_epi32( even, odd, even, odd, even, odd, even, odd );
59 765 : }
60 :
61 : static inline wc_t
62 765 : wc_bcast_lohi( int c0, int c1 ) { /* [ c0 c0 c0 c0 c1 c1 c1 c1 ] */
63 765 : int const lo = -(c0!=0); int const hi = -(c1!=0); /* c-style logical -> 0/-1 */
64 765 : return _mm256_setr_epi32( lo, lo, lo, lo, hi, hi, hi, hi );
65 765 : }
66 :
67 : static inline wc_t
68 765 : wc_bcast_quad( int c0, int c1, int c2, int c3 ) { /* [ c0 c1 c2 c3 c0 c1 c2 c3 ] */
69 765 : int const m0 = -(c0!=0); int const m1 = -(c1!=0); int const m2 = -(c2!=0); int const m3 = -(c3!=0); /* c-style logical -> 0/-1 */
70 765 : return _mm256_setr_epi32( m0, m1, m2, m3, m0, m1, m2, m3 );
71 765 : }
72 :
73 : static inline wc_t
74 592884 : wc_bcast_wide( int c0, int c1, int c2, int c3 ) { /* [ c0 c0 c1 c1 c2 c2 c3 c3 ] */
75 592884 : int const m0 = -(c0!=0); int const m1 = -(c1!=0); int const m2 = -(c2!=0); int const m3 = -(c3!=0); /* c-style logical -> 0/-1 */
76 592884 : return _mm256_setr_epi32( m0, m0, m1, m1, m2, m2, m3, m3 );
77 592884 : }
78 :
79 : /* No general wc_permute due to cross-128-bit lane limitations in AVX.
80 : Useful cases are provided below. Given [ c0 c1 c2 c3 c4 c5 c6 c7 ],
81 : return ... */
82 :
83 : #define wc_bcast_even(c) /* [ c0 c0 c2 c2 c4 c4 c6 c6 ] (casts only reinterpret bits; _MM_SHUFFLE lists source lanes for destination lanes 3,2,1,0, applied to both 128-bit halves) */ \
84 : _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (c) ), _MM_SHUFFLE(2,2,0,0) ) )
85 :
86 : #define wc_bcast_odd(c) /* [ c1 c1 c3 c3 c5 c5 c7 c7 ] (odd lane of each adjacent pair duplicated) */ \
87 : _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (c) ), _MM_SHUFFLE(3,3,1,1) ) )
88 :
89 : #define wc_exch_adj(c) /* [ c1 c0 c3 c2 c5 c4 c7 c6 ] (swap lanes within each adjacent pair) */ \
90 : _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (c) ), _MM_SHUFFLE(2,3,0,1) ) )
91 :
92 : #define wc_exch_adj_pair(c) /* [ c2 c3 c0 c1 c6 c7 c4 c5 ] (swap adjacent pairs within each 128-bit half) */ \
93 : _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (c) ), _MM_SHUFFLE(1,0,3,2) ) )
94 :
95 : static inline wc_t
96 765 : wc_exch_adj_quad( wc_t c ) { /* [ c4 c5 c6 c7 c0 c1 c2 c3 ] */
97 765 : return _mm256_permute2f128_si256( c, c, 1 ); /* selector 1: result low half <- high half of c, result high half <- low half of c */
98 765 : }
99 :
100 : /* Predefined constants */
101 :
102 : #define wc_false() _mm256_setzero_si256() /* Return [ f f f f f f f f ] (every lane 0) */
103 : #define wc_true() _mm256_set1_epi32( -1 ) /* Return [ t t t t t t t t ] (every lane -1) */
104 :
105 : /* Memory operations */
106 :
107 : /* wc_ld returns the 8 integers at the 32-byte aligned / 32-byte sized
108 : location p as a proper vector conditional (see above note about
109 : c-style logicals). wc_ldu is the same but p does not have to be
110 : aligned. In the fast variants, the caller promises that p already
111 : holds a proper vector conditional (i.e. 0/-1 for false/true). wc_st
112 : writes the vector conditional c at the 32-byte aligned / 32-byte sized
113 : location p (0/-1 for false/true). wc_stu is the same but p does not
114 : have to be aligned. Lane l will be at p[l]. FIXME: USE ATTRIBUTES
115 : ON P PASSED TO THESE?
116 :
117 : Note: gcc knows that __m256i may alias. */
118 :
119 : static inline wc_t
120 11809491 : wc_ld( int const * p ) {
121 11809491 : __m256i const is_zero = _mm256_cmpeq_epi32( _mm256_load_si256( (__m256i const *)p ), _mm256_setzero_si256() ); /* -1 where p[l]==0 */
122 11809491 : return _mm256_xor_si256( _mm256_set1_epi32( -1 ), is_zero ); /* invert: non-zero -> -1 (true), zero -> 0 (false) */
123 11809491 : }
124 11809491 : static inline wc_t wc_ld_fast( int const * p ) { return _mm256_load_si256( (__m256i const *)p ); } /* aligned load, no 0/-1 normalization */
125 11809491 : static inline void wc_st( int * p, wc_t c ) { _mm256_store_si256( (__m256i *)p, c ); } /* aligned store of the raw lanes */
126 :
127 : static inline wc_t
128 94475928 : wc_ldu( void const * p ) {
129 94475928 : __m256i const is_zero = _mm256_cmpeq_epi32( _mm256_loadu_si256( (__m256i const *)p ), _mm256_setzero_si256() ); /* -1 where the loaded lane is 0 */
130 94475928 : return _mm256_xor_si256( _mm256_set1_epi32( -1 ), is_zero ); /* invert: non-zero -> -1 (true), zero -> 0 (false) */
131 94475928 : }
132 94475928 : static inline wc_t wc_ldu_fast( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); } /* unaligned load, no 0/-1 normalization */
133 94475928 : static inline void wc_stu( void * p, wc_t c ) { _mm256_storeu_si256( (__m256i *)p, c ); } /* unaligned store of the raw lanes */
134 :
135 : /* wc_ldif is an optimized equivalent to wc_and(c,wc_ldu(p)). Similarly
136 : for wc_ldif_fast (either may have different behavior if c is not a
137 : proper vector conditional). wc_ldif_fast assumes p already holds a
138 : proper vector conditional. These are provided for symmetry with the
139 : wc_stif operation. wc_stif stores x(n) to p[n] if c(n) is true and
140 : leaves p[n] unchanged otherwise. Undefined behavior if c is not a
141 : proper vector conditional. */
142 :
143 : #define wc_ldif(c,p) _mm256_xor_si256( _mm256_set1_epi32(-1), _mm256_cmpeq_epi32( _mm256_maskload_epi32( (p), (c) ), \
144 : _mm256_setzero_si256()) ) /* lanes masked off by c load as 0 (false); loaded lanes are normalized to 0/-1 by the ==0 test plus inversion */
145 : #define wc_ldif_fast(c,p) _mm256_maskload_epi32((p),(c)) /* masked load, loaded lanes taken as-is (no 0/-1 normalization) */
146 : #define wc_stif(c,p,x) _mm256_maskstore_epi32((p),(c),(x)) /* masked store of x to p under mask c */
147 :
148 : /* Element operations */
149 :
150 : /* wc_extract extracts the value of lane imm from the vector conditional
151 : as an int 0 (false) or 1 (true). wc_insert returns the vector
152 : conditional formed by replacing the value in lane imm of a with the
153 : provided c-style logical. imm should be a compile time constant in
154 : 0:7. wc_extract_variable and wc_insert_variable are slower but
155 : the lane does not have to be a compile-time known static value (should
156 : still be in 0:7). */
157 :
158 94475928 : #define wc_extract(c,imm) ((_mm256_movemask_ps( _mm256_castsi256_ps( (c) ) ) >> (imm)) & 1) /* movemask packs each lane's sign bit into bits 0:7; select bit imm */
159 94475928 : #define wc_insert(a,imm,c) _mm256_insert_epi32( (a), -!!(c), (imm) ) /* c is normalized to 0/-1 before insertion */
160 :
161 94475928 : #define wc_extract_variable(c,n) ((_mm256_movemask_ps( _mm256_castsi256_ps( (c) ) ) >> (n) ) & 1) /* same expression as wc_extract; n need not be a compile-time constant */
162 : #define wc_insert_variable(a,n,c) /* pack a to 8 bits, replace bit n with !!c, unpack again; n is expanded twice so it should be side-effect free */ \
163 94475928 : _mm256_cmpgt_epi32( _mm256_and_si256( _mm256_set1_epi32( (_mm256_movemask_ps( _mm256_castsi256_ps( (a) ) ) & (~(1<<(n)))) | \
164 94475928 : ((!!(c))<<(n)) ), /* n parenthesized so expressions like i&3 shift by the intended amount */ \
165 94475928 : _mm256_setr_epi32( 1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7 ) ), /* lane i keeps only bit i */ \
166 94475928 : _mm256_setzero_si256() ) /* lane i becomes -1 iff bit i is set */
167 :
168 : /* Given [ a0 a1 a2 a3 a4 a5 a6 a7 ] and/or [ b0 b1 b2 b3 b4 b5 b6 b7 ],
169 : return ... */
170 :
171 : /* Arithmetic operations */
172 :
173 : /* Note: arithmetic and shift operations are not well defined for a wc_t
174 : as it isn't clear if the user would like to treat the vector conditional
175 : as 8 1-bit signed ints (0/-1), 8 1-bit unsigned ints (0/1) or
176 : 8 GF2 elements (f/t but sign is meaningless) or do cross lane motion
177 : of the condition. Instead, the user should use wc_to_{wi,wl}[_raw]
178 : as necessary and use the appropriate binary, arithmetic, permute
179 : and/or shift operations there. */
180 :
181 : /* Binary operations */
182 :
183 : #define wc_not(a) _mm256_xor_si256( _mm256_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a7 ] (xor with all ones) */
184 :
185 : #define wc_and(a,b) _mm256_and_si256( (a),(b)) /* [ a0 &b0 a1 &b1 ... a7 &b7 ] */
186 : #define wc_or(a,b) _mm256_or_si256( (a),(b)) /* [ a0 |b0 a1 |b1 ... a7 |b7 ] */
187 : #define wc_xor(a,b) _mm256_xor_si256( (a),(b)) /* [ a0 ^b0 a1 ^b1 ... a7 ^b7 ] */
188 12540008284 : #define wc_andnot(a,b) _mm256_andnot_si256((a),(b)) /* [ (~a0)&b0 (~a1)&b1 ... (~a7)&b7 ] (the first operand is the one complemented) */
189 :
190 : /* Logical operations */
191 :
192 : /* Note: wc_{gt,lt,ge,le} are provided for completeness and treat
193 : true>false. */
194 :
195 : #define wc_lnot(a) _mm256_xor_si256( _mm256_set1_epi32( -1 ), (a) ) /* [ !a0 !a1 ... !a7 ] (same expansion as wc_not) */
196 : #define wc_lnotnot(a) (a) /* [ !!a0 !!a1 ... !!a7 ] (identity on a proper conditional) */
197 :
198 : #define wc_eq(a,b) _mm256_cmpeq_epi32( (a),(b)) /* [ a0==b0 a1==b1 ... a7==b7 ] */
199 : #define wc_gt(a,b) _mm256_andnot_si256((b),(a)) /* [ a0> b0 a1> b1 ... a7> b7 ] (a true and b false) */
200 : #define wc_lt(a,b) _mm256_andnot_si256((a),(b)) /* [ a0< b0 a1< b1 ... a7< b7 ] (a false and b true) */
201 : #define wc_ne(a,b) _mm256_xor_si256( (a),(b)) /* [ a0!=b0 a1!=b1 ... a7!=b7 ] */
202 : #define wc_ge(a,b) /* [ a0>=b0 a1>=b1 ... a7>=b7 ] (complement of the wc_lt expression) */ \
203 : _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_andnot_si256( (a), (b) ) )
204 : #define wc_le(a,b) /* [ a0<=b0 a1<=b1 ... a7<=b7 ] (complement of the wc_gt expression) */ \
205 : _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_andnot_si256( (b), (a) ) )
206 :
207 : /* Conditional operations */
208 :
209 : /* FIXME: Define wc_czero / wc_notczero? Equivalent TO wc_andnot and
210 : wc_and but have arithmetic connotations. */
211 :
212 : #define wc_if(c,t,f) _mm256_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c7?t7:f7 ] (note the blend takes f before t) */
213 :
214 : /* Conversion operations */
215 :
216 : /* wc_to_{wf,wi,wu,wd,wl,wv} convert a proper vector conditional into a
217 : vector float/int/uint/double/long/ulong with f mapping to 0 and t mapping
218 : to 1 in each lane.
219 :
220 : wc_to_{wf,wi,wu,wd,wl,wv}_raw just treat the raw bits in the vector
221 : conditional as the corresponding vector type. wc_to_{wi,wu}_raw map
222 : false(true) to 0(-1) and similarly for wc_to_{wl,wv}_raw when c has
223 : paired lanes. wc_to_{wf,wd}_raw probably are not useful in practice
224 : but are provided for completeness; wc_to_wf_raw maps false(true) to
225 : 0(-nan) and similarly for wc_to_wd_raw when c has paired lanes. */
226 :
227 : #define wc_to_wf(a) _mm256_and_ps( _mm256_castsi256_ps( (a) ), _mm256_set1_ps( 1.f ) ) /* all-ones lanes keep the bits of 1.f, zero lanes stay 0.f */
228 : #define wc_to_wi(a) _mm256_and_si256( (a), _mm256_set1_epi32( 1 ) ) /* keep only bit 0 of each 32-bit lane */
229 : #define wc_to_wu(a) _mm256_and_si256( (a), _mm256_set1_epi32( 1 ) ) /* same expansion as wc_to_wi */
230 : #define wc_to_wd(a) _mm256_and_pd( _mm256_castsi256_pd( (a) ), _mm256_set1_pd( 1. ) ) /* wc should have paired lanes; mask against the bits of 1. */
231 : #define wc_to_wl(a) _mm256_and_si256( (a), _mm256_set1_epi64x( 1L ) ) /* wc should have paired lanes; keep only bit 0 of each 64-bit lane */
232 : #define wc_to_wv(a) _mm256_and_si256( (a), _mm256_set1_epi64x( 1L ) ) /* wc should have paired lanes; same expansion as wc_to_wl */
233 :
234 : #define wc_to_wf_raw(a) _mm256_castsi256_ps( (a) ) /* bit reinterpretation only */
235 : #define wc_to_wi_raw(a) (a) /* identity */
236 : #define wc_to_wu_raw(a) (a) /* identity */
237 : #define wc_to_wd_raw(a) _mm256_castsi256_pd( (a) ) /* bit reinterpretation only */
238 : #define wc_to_wl_raw(a) (a) /* identity */
239 : #define wc_to_wv_raw(a) (a) /* identity */
240 :
241 : /* Reduction operations */
242 :
243 : /* wc_any/wc_all return logical true if any/all conditions in c are true */
244 :
245 929728008 : #define wc_any(c) (_mm256_movemask_ps( _mm256_castsi256_ps( (c) ) )!=0x00) /* at least one lane sign bit set */
246 2488808815 : #define wc_all(c) (_mm256_movemask_ps( _mm256_castsi256_ps( (c) ) )==0xff) /* all 8 lane sign bits set */
247 :
248 : /* Misc operations */
249 :
250 : /* wc_pack returns an int where bit i equals 0(1) if lane i of c is
251 : false(true) for i in [0,8). Vice versa for wc_unpack. */
252 :
253 2548098084 : #define wc_pack(c) _mm256_movemask_ps( _mm256_castsi256_ps( (c) ) ) /* bit i <- sign bit of lane i */
254 20530622 : #define wc_unpack(b) _mm256_cmpgt_epi32( _mm256_and_si256( _mm256_set1_epi32( (b) ), /* broadcast b, then isolate bit i in lane i */ \
255 20530622 : _mm256_setr_epi32( 1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7 ) ), \
256 20530622 : _mm256_setzero_si256() ) /* lane i is -1 iff bit i of b was set */
257 :
258 : /* wc_expand expands c0:c3 (imm_hi==0) or c4:c7 (imm_hi==1) into a
259 : paired lane conditional. That is:
260 :
261 : wc_expand(c,0) returns [ c0 c0 c1 c1 c2 c2 c3 c3 ]
262 : wc_expand(c,1) returns [ c4 c4 c5 c5 c6 c6 c7 c7 ]
263 :
264 : Conversely:
265 :
266 : wc_narrow(a,b) returns [ a0 a2 a4 a6 b0 b2 b4 b6 ]
267 :
268 : which is useful for turning two paired lane conditionals into a
269 : single lane conditional. U.B. if a, b, and/or c are not proper
270 : vector conditionals. These are useful, for example, for vectorizing
271 : 64-bit pointer arithmetic used in 32-bit lane SIMD. */
272 :
273 1530 : #define wc_expand(c,imm_hi) _mm256_cvtepi32_epi64( _mm256_extractf128_si256( (c), (imm_hi) ) ) /* sign extension turns each selected 32-bit 0/-1 lane into a 64-bit 0/-1 lane */
274 :
275 229939051 : static inline wc_t wc_narrow( wc_t a, wc_t b ) {
276 229939051 : __m128 a_lo = _mm_castsi128_ps( _mm256_extractf128_si256( a, 0 ) ); /* 32-bit lanes 0:3 of a */
277 229939051 : __m128 a_hi = _mm_castsi128_ps( _mm256_extractf128_si256( a, 1 ) ); /* 32-bit lanes 4:7 of a */
278 229939051 : __m128 b_lo = _mm_castsi128_ps( _mm256_extractf128_si256( b, 0 ) ); /* 32-bit lanes 0:3 of b */
279 229939051 : __m128 b_hi = _mm_castsi128_ps( _mm256_extractf128_si256( b, 1 ) ); /* 32-bit lanes 4:7 of b */
280 229939051 : __m128i a_even = _mm_castps_si128( _mm_shuffle_ps( a_lo, a_hi, _MM_SHUFFLE(2,0,2,0) ) ); /* [ a0 a2 a4 a6 ] */
281 229939051 : __m128i b_even = _mm_castps_si128( _mm_shuffle_ps( b_lo, b_hi, _MM_SHUFFLE(2,0,2,0) ) ); return _mm256_setr_m128i( a_even, b_even ); /* [ a0 a2 a4 a6 b0 b2 b4 b6 ] */
282 229939051 : }
283 :
284 : /* wc_gather(b,i) returns [ -!!b[i(0)] -!!b[i(1)] ... -!!b[i(7)] ] where
285 : b is an "int const *" (0/non-zero map to false/true) and i is a wi_t.
286 :
287 : wc_gather_fast(b,i) returns [ b[i(0)] b[i(1)] ... b[i(7)] ] where b is
288 : an "int const *". User promises b[i(:)] values are already either 0
289 : or -1. i here is a wi_t. */
290 :
291 11809491 : #define wc_gather(b,i) _mm256_xor_si256( _mm256_set1_epi32( -1 ), /* invert the ==0 test so non-zero -> -1 and zero -> 0 */ \
292 11809491 : _mm256_cmpeq_epi32( _mm256_i32gather_epi32( (b), (i), 4 ), _mm256_setzero_si256() ) )
293 11809491 : #define wc_gather_fast(b,i) _mm256_i32gather_epi32( (b), (i), 4 ) /* gathered values used as-is; the scale argument is 4 bytes per index step */
294 :
295 : /* wc_transpose_8x8 transposes the 8x8 matrix stored in wc_t r0,r1,...r7
296 : and stores the result in 8x8 matrix wc_t c0,c1,...c7. All
297 : c0,c1,...c7 should be different for a well defined result.
298 : Otherwise, in-place operation and/or using the same wc_t to specify
299 : multiple rows of r is fine. */
300 :
301 765 : #define wc_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 ) do { \
302 765 : wc_t _wc_transpose_r0 = (r0); wc_t _wc_transpose_r1 = (r1); wc_t _wc_transpose_r2 = (r2); wc_t _wc_transpose_r3 = (r3); \
303 765 : wc_t _wc_transpose_r4 = (r4); wc_t _wc_transpose_r5 = (r5); wc_t _wc_transpose_r6 = (r6); wc_t _wc_transpose_r7 = (r7); \
304 765 : wc_t _wc_transpose_t; \
305 765 : /* Transpose 4x4 blocks: row k gets the low 128-bit halves of rows k and k+4, row k+4 gets their high halves */ \
306 765 : _wc_transpose_t = _wc_transpose_r0; _wc_transpose_r0 = _mm256_permute2f128_si256( _wc_transpose_t, _wc_transpose_r4, 0x20 ); \
307 765 : /**/ _wc_transpose_r4 = _mm256_permute2f128_si256( _wc_transpose_t, _wc_transpose_r4, 0x31 ); \
308 765 : _wc_transpose_t = _wc_transpose_r1; _wc_transpose_r1 = _mm256_permute2f128_si256( _wc_transpose_t, _wc_transpose_r5, 0x20 ); \
309 765 : /**/ _wc_transpose_r5 = _mm256_permute2f128_si256( _wc_transpose_t, _wc_transpose_r5, 0x31 ); \
310 765 : _wc_transpose_t = _wc_transpose_r2; _wc_transpose_r2 = _mm256_permute2f128_si256( _wc_transpose_t, _wc_transpose_r6, 0x20 ); \
311 765 : /**/ _wc_transpose_r6 = _mm256_permute2f128_si256( _wc_transpose_t, _wc_transpose_r6, 0x31 ); \
312 765 : _wc_transpose_t = _wc_transpose_r3; _wc_transpose_r3 = _mm256_permute2f128_si256( _wc_transpose_t, _wc_transpose_r7, 0x20 ); \
313 765 : /**/ _wc_transpose_r7 = _mm256_permute2f128_si256( _wc_transpose_t, _wc_transpose_r7, 0x31 ); \
314 765 : /* Transpose 2x2 blocks: interleave the 32-bit lanes of rows k and k+2 within each 128-bit half (saved copy in _t keeps the old row k) */ \
315 765 : _wc_transpose_t = _wc_transpose_r0; _wc_transpose_r0 = _mm256_unpacklo_epi32( _wc_transpose_t, _wc_transpose_r2 ); \
316 765 : /**/ _wc_transpose_r2 = _mm256_unpackhi_epi32( _wc_transpose_t, _wc_transpose_r2 ); \
317 765 : _wc_transpose_t = _wc_transpose_r1; _wc_transpose_r1 = _mm256_unpacklo_epi32( _wc_transpose_t, _wc_transpose_r3 ); \
318 765 : /**/ _wc_transpose_r3 = _mm256_unpackhi_epi32( _wc_transpose_t, _wc_transpose_r3 ); \
319 765 : _wc_transpose_t = _wc_transpose_r4; _wc_transpose_r4 = _mm256_unpacklo_epi32( _wc_transpose_t, _wc_transpose_r6 ); \
320 765 : /**/ _wc_transpose_r6 = _mm256_unpackhi_epi32( _wc_transpose_t, _wc_transpose_r6 ); \
321 765 : _wc_transpose_t = _wc_transpose_r5; _wc_transpose_r5 = _mm256_unpacklo_epi32( _wc_transpose_t, _wc_transpose_r7 ); \
322 765 : /**/ _wc_transpose_r7 = _mm256_unpackhi_epi32( _wc_transpose_t, _wc_transpose_r7 ); \
323 765 : /* Transpose 1x1 blocks: interleave rows k and k+1 the same way and write the results to the output arguments */ \
324 765 : /**/ (c0) = _mm256_unpacklo_epi32( _wc_transpose_r0, _wc_transpose_r1 ); \
325 765 : /**/ (c1) = _mm256_unpackhi_epi32( _wc_transpose_r0, _wc_transpose_r1 ); \
326 765 : /**/ (c2) = _mm256_unpacklo_epi32( _wc_transpose_r2, _wc_transpose_r3 ); \
327 765 : /**/ (c3) = _mm256_unpackhi_epi32( _wc_transpose_r2, _wc_transpose_r3 ); \
328 765 : /**/ (c4) = _mm256_unpacklo_epi32( _wc_transpose_r4, _wc_transpose_r5 ); \
329 765 : /**/ (c5) = _mm256_unpackhi_epi32( _wc_transpose_r4, _wc_transpose_r5 ); \
330 765 : /**/ (c6) = _mm256_unpacklo_epi32( _wc_transpose_r6, _wc_transpose_r7 ); \
331 765 : /**/ (c7) = _mm256_unpackhi_epi32( _wc_transpose_r6, _wc_transpose_r7 ); \
332 765 : } while(0)
|