Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx_h
2 : #error "Do not include this directly; use fd_avx.h"
3 : #endif
4 :
5 : /* Vector ulong API ***************************************************/
6 :
7 : /* A wv_t is a vector where each adjacent pair of 32-bit wide lanes
8 : (e.g. 0-1 / 2-3 / 4-5 / 6-7) holds an unsigned 64-bit integer (a
9 : "ulong").
10 :
11 : These mirror the other APIs as much as possible. Macros are
12 : preferred over static inlines when it is possible to do it robustly
13 : to reduce the risk of the compiler mucking it up. */
14 :
15 2111751311 : #define wv_t __m256i
16 :
17 : /* Constructors */
18 :
19 : /* Given the ulong values, return ... */
20 :
21 69922449 : #define wv(v0,v1,v2,v3) _mm256_setr_epi64x( (long)(v0), (long)(v1), (long)(v2), (long)(v3) ) /* [ v0 v1 v2 v3 ] */
22 :
23 166973900 : #define wv_bcast(v0) _mm256_set1_epi64x( (long)(v0) ) /* [ v0 v0 v0 v0 ] */
24 :
25 : static inline wv_t /* [ v0 v1 v0 v1 ] */
26 196608 : wv_bcast_pair( ulong v0, ulong v1 ) {
27 196608 : return _mm256_setr_epi64x( (long)v0, (long)v1, (long)v0, (long)v1 );
28 196608 : }
29 :
30 : static inline wv_t /* [ v0 v0 v1 v1 ] */
31 196608 : wv_bcast_wide( ulong v0, ulong v1 ) {
32 196608 : return _mm256_setr_epi64x( (long)v0, (long)v0, (long)v1, (long)v1 );
33 196608 : }
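
/* A minimal usage sketch of the constructors above (assumes fd_avx.h is
   included on an FD_HAS_AVX target; the wv_example_* helper name is
   hypothetical and not part of this API): */

static inline void
wv_example_constructors( void ) {
  wv_t a = wv( 1UL, 2UL, 3UL, 4UL );  /* a = [ 1 2 3 4 ] */
  wv_t b = wv_bcast( 7UL );           /* b = [ 7 7 7 7 ] */
  wv_t c = wv_bcast_pair( 5UL, 6UL ); /* c = [ 5 6 5 6 ] */
  wv_t d = wv_bcast_wide( 5UL, 6UL ); /* d = [ 5 5 6 6 ] */
  (void)a; (void)b; (void)c; (void)d;
}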
34 :
35 : /* wv_permute returns [ x(imm_i0) x(imm_i1) x(imm_i2) x(imm_i3) ].
36 : imm_i* should be compile time constants in 0:3. */
37 :
38 : #if FD_USING_CLANG /* Sigh ... clang is sad and can't handle passing compile time const expressions through a static inline */
39 :
40 : static inline wv_t
41 2162688 : wv_permute( wv_t x, int imm_i0, int imm_i1, int imm_i2, int imm_i3 ) {
42 2162688 : union { ulong u[4]; __m256i v[1]; } t, u;
43 2162688 : _mm256_store_si256( t.v, x );
44 2162688 : u.u[0] = t.u[ imm_i0 ];
45 2162688 : u.u[1] = t.u[ imm_i1 ];
46 2162688 : u.u[2] = t.u[ imm_i2 ];
47 2162688 : u.u[3] = t.u[ imm_i3 ];
48 2162688 : return _mm256_load_si256( u.v );
49 2162688 : }
50 :
51 : #else
52 :
53 : #define wv_permute(x,imm_i0,imm_i1,imm_i2,imm_i3) _mm256_permute4x64_epi64( (x), (imm_i0)+4*(imm_i1)+16*(imm_i2)+64*(imm_i3) )
54 :
55 : #endif
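
/* A minimal usage sketch (hypothetical helper, assuming an FD_HAS_AVX
   build): reversing the lane order with compile time constant imm_i*
   selectors. */

static inline wv_t
wv_example_reverse( wv_t x ) {
  return wv_permute( x, 3, 2, 1, 0 ); /* [ x0 x1 x2 x3 ] -> [ x3 x2 x1 x0 ] */
}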
56 :
57 : /* Predefined constants */
58 :
59 37910702 : #define wv_zero() _mm256_setzero_si256() /* Return [ 0UL 0UL 0UL 0UL ] */
60 109904643 : #define wv_one() _mm256_set1_epi64x( 1L ) /* Return [ 1UL 1UL 1UL 1UL ] */
61 :
62 : /* Memory operations */
63 :
64 : /* wv_ld return the 4 ulongs at the 32-byte aligned / 32-byte sized
65 : location p as a vector ulong. wv_ldu is the same but p does not have
66 : to be aligned. wv_st writes the vector ulong to the 32-byte aligned
67 : / 32-byte sized location p as 4 ulongs. wv_stu is the same but p
68 : does not have to be aligned. In all these, 64-bit lane l will be at
69 : p[l]. FIXME: USE ATTRIBUTES ON P PASSED TO THESE?
70 :
71 : Note: gcc knows a __m256i may alias. */
72 :
73 12262066639 : static inline wv_t wv_ld( ulong const * p ) { return _mm256_load_si256( (__m256i const *)p ); }
74 12783221219 : static inline void wv_st( ulong * p, wv_t i ) { _mm256_store_si256( (__m256i *)p, i ); }
75 :
76 869924428 : static inline wv_t wv_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
77 471511169 : static inline void wv_stu( void * p, wv_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
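
/* A minimal usage sketch (hypothetical helpers): copying 4 ulongs with
   the unaligned and aligned variants.  The aligned variant requires dst
   and src to be 32-byte aligned. */

static inline void
wv_example_copy4( ulong * dst, ulong const * src ) { /* any alignment */
  wv_stu( dst, wv_ldu( src ) );                      /* dst[0:3] = src[0:3] */
}

static inline void
wv_example_copy4_aligned( ulong * dst, ulong const * src ) { /* 32-byte aligned */
  wv_st( dst, wv_ld( src ) );                                /* dst[0:3] = src[0:3] */
}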
78 :
79 : /* wv_ldif is an optimized equivalent to wv_notczero(c,wv_ldu(p)) (may
80 : have different behavior if c is not a proper vector conditional). It
81 : is provided for symmetry with the wv_stif operation. wv_stif stores
82 : x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
83 : Undefined behavior if c is not a proper vector conditional. */
84 :
85 : #define wv_ldif(c,p) _mm256_maskload_epi64( (p),(c))
86 : #define wv_stif(c,p,x) _mm256_maskstore_epi64((p),(c),(x))
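
/* A minimal usage sketch (hypothetical helper, uses wv_gt defined
   below): store only the lanes whose value exceeds a threshold, leaving
   the other p[n] untouched.  The cast reflects that the underlying
   maskstore intrinsic takes a long long pointer. */

static inline void
wv_example_store_if_gt( ulong * p, wv_t x, ulong thresh ) {
  wv_t c = wv_gt( x, wv_bcast( thresh ) ); /* proper vector conditional */
  wv_stif( c, (long long *)p, x );         /* p[n] = x[n] iff x[n]>thresh */
}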
87 :
88 : /* Element operations */
89 :
90 : /* wv_extract extracts the ulong in lane imm from the vector ulong as a
91 : ulong. wv_insert returns the vector ulong formed by replacing the
92 : value in lane imm of a with the provided ulong. imm should be a
93 : compile time known in 0:3. wv_extract_variable and
94 : wv_insert_variable are the slower but the lane n does not have to be
95 : known at compile time (should still be in 0:3).
96 :
97 : Note: C99 TC3 allows type punning through a union. */
98 :
99 2371582272 : #define wv_extract(a,imm) ((ulong)_mm256_extract_epi64( (a), (imm) ))
100 :
101 439618572 : #define wv_insert(a,imm,v) _mm256_insert_epi64( (a), (long)(v), (imm) )
102 :
103 : static inline ulong
104 439618572 : wv_extract_variable( wv_t a, int n ) {
105 439618572 : union { __m256i m[1]; ulong u[4]; } t[1];
106 439618572 : _mm256_store_si256( t->m, a );
107 439618572 : return t->u[n];
108 439618572 : }
109 :
110 : static inline wv_t
111 439618572 : wv_insert_variable( wv_t a, int n, ulong v ) {
112 439618572 : union { __m256i m[1]; ulong u[4]; } t[1];
113 439618572 : _mm256_store_si256( t->m, a );
114 439618572 : t->u[n] = v;
115 439618572 : return _mm256_load_si256( t->m );
116 439618572 : }
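
/* A minimal usage sketch (hypothetical helpers): wv_extract needs a
   compile time lane index; the _variable forms take a runtime index in
   0:3. */

static inline ulong
wv_example_lane_sum( wv_t a ) {
  return wv_extract( a, 0 ) + wv_extract( a, 1 ) +
         wv_extract( a, 2 ) + wv_extract( a, 3 );
}

static inline wv_t
wv_example_set_lane( wv_t a, int n, ulong v ) { /* n in 0:3 at runtime */
  return wv_insert_variable( a, n, v );
}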
117 :
118 : /* Given [a0 a1 a2 a3] and/or [b0 b1 b2 b3], return ... */
119 :
120 : /* Arithmetic operations */
121 :
122 : #define wv_neg(a) _mm256_sub_epi64( _mm256_setzero_si256(), (a) ) /* [ -a0 -a1 ... -a3 ] */
123 : #define wv_abs(a) (a) /* [ |a0| |a1| ... |a3| ] */
124 :
125 : /* Note: _mm256_{min,max}_epu64 are missing pre AVX-512. We emulate
126 : these on pre AVX-512 targets below (and use the AVX-512 versions if
127 : possible). Likewise, there is no _mm256_mullo_epi64 pre AVX-512.
128 : Since this is not cheap to emulate, we do not provide a wv_mul for
129 : the time being (we could consider exposing it on AVX-512 targets
130 : though). There is a 64L*64L->64 multiply (where the lower 32-bits of
131 : the inputs will be zero extended to 64-bits beforehand) though and
132 : that is very useful. So we do provide that. */
133 :
134 10026808226 : #define wv_add(a,b) _mm256_add_epi64( (a), (b) ) /* [ a0 +b0 a1 +b1 ... a3 +b3 ] */
135 12005939136 : #define wv_sub(a,b) _mm256_sub_epi64( (a), (b) ) /* [ a0 -b0 a1 -b1 ... a3 -b3 ] */
136 : //#define wv_mul(a,b) _mm256_mullo_epi64( (a), (b) ) /* [ a0 *b0 a1 *b1 ... a3 *b3 ] */
137 : #define wv_mul_ll(a,b) _mm256_mul_epu32( (a), (b) ) /* [ a0l*b0l a1l*b1l ... a3l *b3l ] */
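
/* A minimal sketch of the wv_mul_ll semantics (hypothetical helper):
   each lane of the result is the full 64-bit product of the lower 32
   bits of the corresponding input lanes, i.e. per lane
   (a & 0xffffffffUL)*(b & 0xffffffffUL).  E.g. a0=0x100000003UL and
   b0=5UL yield 15UL in lane 0 (the upper 32 bits of a0 are ignored). */

static inline ulong
wv_example_mul_ll_lane0( ulong a0, ulong b0 ) {
  return wv_extract( wv_mul_ll( wv_bcast( a0 ), wv_bcast( b0 ) ), 0 );
}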
138 :
139 : /* Binary operations */
140 :
141 : /* Note: wv_shl/wv_shr is a left/right shift by imm bits; imm should be
142 : a compile time constant in 0:63. The variable variants are slower
143 : but do not require the shift amount to be known at compile time
144 : (should still be in 0:63). */
145 :
146 : #define wv_not(a) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), (a) ) /* [ ~a0 ~a1 ... ~a3 ] */
147 :
148 : #define wv_shl(a,imm) _mm256_slli_epi64( (a), (imm) ) /* [ a0<<imm a1<<imm ... a3<<imm ] */
149 : #define wv_shr(a,imm) _mm256_srli_epi64( (a), (imm) ) /* [ a0>>imm a1>>imm ... a3>>imm ] */
150 :
151 : #define wv_shl_variable(a,n) _mm256_sll_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
152 : #define wv_shr_variable(a,n) _mm256_srl_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
153 :
154 : #define wv_shl_vector(a,b) _mm256_sllv_epi64( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a3<<b3 ] */
155 : #define wv_shr_vector(a,b) _mm256_srlv_epi64( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a3>>b3 ] */
156 :
157 : #define wv_and(a,b) _mm256_and_si256( (a), (b) ) /* [ a0 &b0 a1& b1 ... a3& b3 ] */
158 27102604 : #define wv_andnot(a,b) _mm256_andnot_si256( (a), (b) ) /* [ (~a0)&b0 (~a1)&b1 ... (~a3)&b3 ] */
159 16510082528 : #define wv_or(a,b) _mm256_or_si256( (a), (b) ) /* [ a0 |b0 a1 |b1 ... a3 |b3 ] */
160 : #define wv_xor(a,b) _mm256_xor_si256( (a), (b) ) /* [ a0 ^b0 a1 ^b1 ... a3 ^b3 ] */
161 :
162 : /* wv_rol(x,n) returns wv( rotate_left (x0,n), rotate_left (x1,n), ... )
163 : wv_ror(x,n) returns wv( rotate_right(x0,n), rotate_right(x1,n), ... ) */
164 :
165 : #if FD_HAS_AVX512
166 : #define wv_rol(a,imm) _mm256_rol_epi64( (a), (imm) )
167 : #define wv_ror(a,imm) _mm256_ror_epi64( (a), (imm) )
168 : #else
169 384824008 : static inline wv_t wv_rol( wv_t a, int imm ) { return wv_or( wv_shl( a, imm & 63 ), wv_shr( a, (-imm) & 63 ) ); }
170 16061886656 : static inline wv_t wv_ror( wv_t a, int imm ) { return wv_or( wv_shr( a, imm & 63 ), wv_shl( a, (-imm) & 63 ) ); }
171 : #endif
172 :
173 12582912 : static inline wv_t wv_rol_variable( wv_t a, int n ) { return wv_or( wv_shl_variable( a, n&63 ), wv_shr_variable( a, (-n)&63 ) ); }
174 12582912 : static inline wv_t wv_ror_variable( wv_t a, int n ) { return wv_or( wv_shr_variable( a, n&63 ), wv_shl_variable( a, (-n)&63 ) ); }
175 :
176 0 : static inline wv_t wv_rol_vector( wv_t a, wl_t b ) {
177 0 : wl_t m = wl_bcast( 63L );
178 0 : return wv_or( wv_shl_vector( a, wl_and( b, m ) ), wv_shr_vector( a, wl_and( wl_neg( b ), m ) ) );
179 0 : }
180 :
181 0 : static inline wv_t wv_ror_vector( wv_t a, wl_t b ) {
182 0 : wl_t m = wl_bcast( 63L );
183 0 : return wv_or( wv_shr_vector( a, wl_and( b, m ) ), wv_shl_vector( a, wl_and( wl_neg( b ), m ) ) );
184 0 : }
185 :
186 28682184 : #define wv_bswap(a) wu_to_wv_raw( wu_bswap( wv_to_wu_raw( wv_rol( (a), 32 ) ) ) )
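
/* A minimal scalar reference for one lane of wv_rol (hypothetical
   helper): rotate left by n for n in 0:63.  wv_ror is the analogous
   right rotate and wv_bswap behaves like a 64-bit byte swap applied to
   each lane. */

static inline ulong
wv_example_scalar_rol( ulong x, int n ) {
  return (x << (n & 63)) | (x >> ((-n) & 63)); /* n==0 yields x */
}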
187 :
188 : /* Logical operations */
189 :
190 : /* As noted below in the converters, Intel clearly has the hardware to
191 : do a _mm256_cmpgt_epu64 given that _mm256_cmpgt_epi64 exists but
192 : doesn't expose it in the ISA pre AVX-512. Sigh ... twos complement
193 : bit tricks to the rescue for wv_{gt,lt,ge,le}. */
194 :
195 : #define wv_lnot(a) _mm256_cmpeq_epi64( (a), _mm256_setzero_si256() ) /* [ !a0 !a1 ... !a3 ] */
196 : #define wv_lnotnot(a) /* [ !!a0 !!a1 ... !!a3 ] */ \
197 : _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpeq_epi64( (a), _mm256_setzero_si256() ) )
198 :
199 : #define wv_eq(a,b) _mm256_cmpeq_epi64( (a), (b) ) /* [ a0==b0 a1==b1 ... a3==b3 ] */
200 : #define wv_gt(a,b) /* [ a0> b0 a1> b1 ... a3> b3 ] */ \
201 : _mm256_cmpgt_epi64( _mm256_sub_epi64( (a), _mm256_set1_epi64x( (long)(1UL<<63) ) ), \
202 : _mm256_sub_epi64( (b), _mm256_set1_epi64x( (long)(1UL<<63) ) ) )
203 : #define wv_lt(a,b) wv_gt( (b), (a) ) /* [ a0< b0 a1< b1 ... a3< b3 ] */
204 : #define wv_ne(a,b) _mm256_xor_si256( _mm256_set1_epi64x(-1L), _mm256_cmpeq_epi64( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a3!=b3 ] */
205 : #define wv_ge(a,b) _mm256_xor_si256( _mm256_set1_epi64x(-1L), wv_gt( (b), (a) ) ) /* [ a0>=b0 a1>=b1 ... a3>=b3 ] */
206 : #define wv_le(a,b) _mm256_xor_si256( _mm256_set1_epi64x(-1L), wv_gt( (a), (b) ) ) /* [ a0<=b0 a1<=b1 ... a3<=b3 ] */
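
/* The bias trick above relies on the identity that subtracting 2^63
   from both operands maps unsigned order onto signed order.  A minimal
   scalar reference for one lane (hypothetical helper; like the macros
   above, it assumes the usual two's complement wraparound): */

static inline int
wv_example_scalar_gt( ulong a, ulong b ) {
  return (long)(a - (1UL<<63)) > (long)(b - (1UL<<63)); /* == (a>b) */
}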
207 :
208 : /* Conditional operations */
209 :
210 : #define wv_czero(c,f) _mm256_andnot_si256( (c), (f) ) /* [ c0?0UL:f0 c1?0UL:f1 ... c3?0UL:f3 ] */
211 36405338 : #define wv_notczero(c,f) _mm256_and_si256( (c), (f) ) /* [ c0?f0:0UL c1?f1:0UL ... c3?f3:0UL ] */
212 :
213 966768282 : #define wv_if(c,t,f) _mm256_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c3?t3:f3 ] */
214 :
215 : #if defined(__AVX512F__) && defined(__AVX512VL__) /* See note above */
216 131072 : #define wv_min(a,b) _mm256_min_epu64( (a), (b) )
217 131072 : #define wv_max(a,b) _mm256_max_epu64( (a), (b) )
218 : #else
219 393216 : static inline wv_t wv_min( wv_t a, wv_t b ) { return wv_if( wv_lt( a, b ), a, b ); }
220 393216 : static inline wv_t wv_max( wv_t a, wv_t b ) { return wv_if( wv_gt( a, b ), a, b ); }
221 : #endif
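
/* A minimal usage sketch (hypothetical helper): clamping each lane to
   [lo,hi] with the min/max operations above (assumes lo<=hi). */

static inline wv_t
wv_example_clamp( wv_t x, ulong lo, ulong hi ) {
  return wv_min( wv_max( x, wv_bcast( lo ) ), wv_bcast( hi ) );
}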
222 :
223 : /* Conversion operations */
224 :
225 : /* Summarizing:
226 :
227 : wv_to_wc(v)     returns [ !!v0 !!v0 !!v1 !!v1 ... !!v3 !!v3 ]
228 :
229 : wv_to_wf(v,f,0) returns [ (float)v0 (float)v1 (float)v2 (float)v3 f4 f5 f6 f7 ]
230 : wv_to_wf(v,f,1) returns [ f0 f1 f2 f3 (float)v0 (float)v1 (float)v2 (float)v3 ]
231 :
232 : wv_to_wi(v,i,0) returns [ (int)v0 (int)v1 (int)v2 (int)v3 i4 i5 i6 i7 ]
233 : wv_to_wi(v,i,1) returns [ i0 i1 i2 i3 (int)v0 (int)v1 (int)v2 (int)v3 ]
234 :
235 : wv_to_wu(v,u,0) returns [ (uint)v0 (uint)v1 (uint)v2 (uint)v3 u4 u5 u6 u7 ]
236 : wv_to_wu(v,u,1) returns [ u0 u1 u2 u3 (uint)v0 (uint)v1 (uint)v2 (uint)v3 ]
237 :
238 : wv_to_wd(v)     returns [ (double)v0 (double)v1 (double)v2 (double)v3 ]
239 :
240 : wv_to_wl(v)     returns [ (long)v0 (long)v1 (long)v2 (long)v3 ]
241 :
242 : The raw variants just treat the raw bits as the corresponding vector
243 : type. For wv_to_wc_raw, the user promises wv contains a proper
244 : vector conditional (e.g. 0 or -1 in each lane). The others are
245 : provided to facilitate doing advanced bit tricks on floating point
246 : values. */
247 :
248 519396263 : #define wv_to_wc(a) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpeq_epi64( (a), _mm256_setzero_si256() ) )
249 :
250 393216 : static inline wf_t wv_to_wf( wv_t v, wf_t f, int imm_hi ) {
251 393216 : union { ulong u[4]; __m256i v[1]; } t[1];
252 393216 : union { float f[4]; __m128 v[1]; } u[1];
253 393216 : _mm256_store_si256( t->v, v );
254 393216 : u->f[0] = (float)t->u[0];
255 393216 : u->f[1] = (float)t->u[1];
256 393216 : u->f[2] = (float)t->u[2];
257 393216 : u->f[3] = (float)t->u[3];
258 393216 : __m128 w = _mm_load_ps( u->f );
259 393216 : return imm_hi ? _mm256_insertf128_ps( f, w, 1 ) : _mm256_insertf128_ps( f, w, 0 ); /* compile time */
260 393216 : }
261 :
262 393216 : static inline wi_t wv_to_wi( wv_t v, wi_t i, int imm_hi ) {
263 393216 : __m128 v01 = _mm_castsi128_ps( _mm256_extractf128_si256( v, 0 ) ); /* [ v0l v0h v1l v1h ] */
264 393216 : __m128 v23 = _mm_castsi128_ps( _mm256_extractf128_si256( v, 1 ) ); /* [ v2l v2h v3l v3h ] */
265 393216 : __m128i w = _mm_castps_si128( _mm_shuffle_ps( v01, v23, _MM_SHUFFLE(2,0,2,0) ) );
266 393216 : return imm_hi ? _mm256_insertf128_si256( i, w, 1 ) : _mm256_insertf128_si256( i, w, 0 ); /* compile time */
267 393216 : }
268 :
269 393216 : static inline wu_t wv_to_wu( wv_t v, wu_t u, int imm_hi ) {
270 393216 : __m128 v01 = _mm_castsi128_ps( _mm256_extractf128_si256( v, 0 ) ); /* [ v0l v0h v1l v1h ] */
271 393216 : __m128 v23 = _mm_castsi128_ps( _mm256_extractf128_si256( v, 1 ) ); /* [ v2l v2h v3l v3h ] */
272 393216 : __m128i w = _mm_castps_si128( _mm_shuffle_ps( v01, v23, _MM_SHUFFLE(2,0,2,0) ) );
273 393216 : return imm_hi ? _mm256_insertf128_si256( u, w, 1 ) : _mm256_insertf128_si256( u, w, 0 ); /* compile time */
274 393216 : }
275 :
276 : /* FIXME: IS IT FASTER TO USE INSERT / EXTRACT HERE? */
277 196608 : static inline wd_t wv_to_wd( wv_t v ) {
278 196608 : union { ulong u[4]; __m256i v[1]; } t[1];
279 196608 : union { double d[4]; __m256d v[1]; } u[1];
280 196608 : _mm256_store_si256( t->v, v );
281 196608 : u->d[0] = (double)t->u[0];
282 196608 : u->d[1] = (double)t->u[1];
283 196608 : u->d[2] = (double)t->u[2];
284 196608 : u->d[3] = (double)t->u[3];
285 196608 : return _mm256_load_pd( u->d );
286 196608 : }
287 :
288 : #define wv_to_wl(a) (a)
289 :
290 : #define wv_to_wc_raw(a) (a)
291 : #define wv_to_wf_raw(a) _mm256_castsi256_ps( (a) )
292 : #define wv_to_wi_raw(a) (a)
293 : #define wv_to_wu_raw(a) (a)
294 : #define wv_to_wd_raw(a) _mm256_castsi256_pd( (a) )
295 : #define wv_to_wl_raw(a) (a)
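
/* A minimal usage sketch (hypothetical helper): packing eight ulongs
   (truncated to uint) into one wu_t by filling the low and high halves
   of u in turn. */

static inline wu_t
wv_example_narrow( wv_t vlo, wv_t vhi, wu_t u ) {
  u = wv_to_wu( vlo, u, 0 ); /* u = [ (uint)vlo0..3  u4..u7        ] */
  u = wv_to_wu( vhi, u, 1 ); /* u = [ (uint)vlo0..3  (uint)vhi0..3 ] */
  return u;
}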
296 :
297 : /* Reduction operations */
298 :
299 : static inline wv_t
300 196608 : wv_sum_all( wv_t x ) { /* Returns wv_bcast( sum( x ) ) */
301 196608 : x = _mm256_add_epi64( x, _mm256_permute2f128_si256( x, x, 1 ) );
302 196608 : return _mm256_add_epi64( x, _mm256_castpd_si256( _mm256_permute_pd( _mm256_castsi256_pd( x ), 5 ) ) );
303 196608 : }
304 :
305 : static inline wv_t
306 196608 : wv_min_all( wv_t x ) { /* Returns wv_bcast( min( x ) ) */
307 196608 : x = wv_min( x, _mm256_permute2f128_si256( x, x, 1 ) );
308 196608 : return wv_min( x, _mm256_castpd_si256( _mm256_permute_pd( _mm256_castsi256_pd( x ), 5 ) ) );
309 196608 : }
310 :
311 : static inline wv_t
312 196608 : wv_max_all( wv_t x ) { /* Returns wv_bcast( max( x ) ) */
313 196608 : x = wv_max( x, _mm256_permute2f128_si256( x, x, 1 ) );
314 196608 : return wv_max( x, _mm256_castpd_si256( _mm256_permute_pd( _mm256_castsi256_pd( x ), 5 ) ) );
315 196608 : }
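
/* A minimal usage sketch (hypothetical helper): the reductions above
   broadcast their result, e.g. wv_sum_all( wv(1UL,2UL,3UL,4UL) ) is
   wv_bcast(10UL).  Extracting any lane gives the scalar reduction. */

static inline ulong
wv_example_sum( wv_t x ) {
  return wv_extract( wv_sum_all( x ), 0 );
}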
316 :
317 : /* Misc operations */
318 :
319 : /* wv_gather(b,i,imm_hi) returns
320 : [ b[i(0)] b[i(1)] b[i(2)] b[i(3)] ] if imm_hi is 0 and
321 : [ b[i(4)] b[i(5)] b[i(6)] b[i(7)] ] o.w.
322 : where b is a "ulong const*", i is wi_t and imm_hi is a compile time
323 : constant. We use a static inline here instead of a define to keep
324 : strict type checking while working around yet another Intel intrinsic
325 : type mismatch issue. */
326 :
327 219809286 : static inline wv_t wv_gather( ulong const * b, wi_t i, int imm_hi ) {
328 : /* A compile time branch, but older versions of GCC can't handle the
329 : ternary operator with -O0 */
330 219809286 : if( imm_hi ) return _mm256_i32gather_epi64( (long long const *)b, _mm256_extractf128_si256( i, 1 ), 8 );
331 109904643 : else return _mm256_i32gather_epi64( (long long const *)b, _mm256_extractf128_si256( i, 0 ), 8 );
332 219809286 : }
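
/* A minimal usage sketch (hypothetical helper): gather four table
   entries from b using the low four 32-bit indices of idx. */

static inline wv_t
wv_example_lookup_lo( ulong const * b, wi_t idx ) {
  return wv_gather( b, idx, 0 ); /* [ b[idx0] b[idx1] b[idx2] b[idx3] ] */
}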
333 :
334 : /* wv_transpose_4x4 transposes the 4x4 matrix stored in wv_t r0,r1,r2,r3
335 : and stores the result in 4x4 matrix wv_t c0,c1,c2,c3. All
336 : c0,c1,c2,c3 should be different for a well defined result.
337 : Otherwise, in-place operation and/or using the same wv_t to specify
338 : multiple rows of r is fine. */
339 :
340 100392968 : #define wv_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do { \
341 100392968 : wv_t _wv_transpose_r0 = (r0); wv_t _wv_transpose_r1 = (r1); wv_t _wv_transpose_r2 = (r2); wv_t _wv_transpose_r3 = (r3); \
342 100392968 : wv_t _wv_transpose_t; \
343 100392968 : /* Transpose 2x2 blocks */ \
344 100392968 : _wv_transpose_t = _wv_transpose_r0; _wv_transpose_r0 = _mm256_permute2f128_si256( _wv_transpose_t, _wv_transpose_r2, 0x20 ); \
345 100392968 : /**/ _wv_transpose_r2 = _mm256_permute2f128_si256( _wv_transpose_t, _wv_transpose_r2, 0x31 ); \
346 100392968 : _wv_transpose_t = _wv_transpose_r1; _wv_transpose_r1 = _mm256_permute2f128_si256( _wv_transpose_t, _wv_transpose_r3, 0x20 ); \
347 100392968 : /**/ _wv_transpose_r3 = _mm256_permute2f128_si256( _wv_transpose_t, _wv_transpose_r3, 0x31 ); \
348 100392968 : /* Transpose 1x1 blocks */ \
349 100392968 : /**/ (c0) = _mm256_unpacklo_epi64( _wv_transpose_r0, _wv_transpose_r1 ); \
350 100392968 : /**/ (c1) = _mm256_unpackhi_epi64( _wv_transpose_r0, _wv_transpose_r1 ); \
351 100392968 : /**/ (c2) = _mm256_unpacklo_epi64( _wv_transpose_r2, _wv_transpose_r3 ); \
352 100392968 : /**/ (c3) = _mm256_unpackhi_epi64( _wv_transpose_r2, _wv_transpose_r3 ); \
353 100392968 : } while(0)
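
/* A minimal usage sketch (hypothetical helper): transposing a 4x4 ulong
   matrix stored row major in 32-byte aligned memory. */

static inline void
wv_example_transpose( ulong * t, ulong const * m ) { /* both 32-byte aligned */
  wv_t r0 = wv_ld( m ); wv_t r1 = wv_ld( m+4 ); wv_t r2 = wv_ld( m+8 ); wv_t r3 = wv_ld( m+12 );
  wv_t c0; wv_t c1; wv_t c2; wv_t c3;
  wv_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 );
  wv_st( t, c0 ); wv_st( t+4, c1 ); wv_st( t+8, c2 ); wv_st( t+12, c3 );
}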
|