Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx_h
2 : #error "Do not include this directly; use fd_avx.h"
3 : #endif
4 :
5 : /* Vector long API ****************************************************/
6 :
7 : /* A wl_t is a vector where each adjacent pair of 32-bit wide lanes
8 : (e.g. 0-1 / 2-3 / 4-5 / 6-7) holds a signed 64-bit twos-complement
9 : integer (a "long").
10 :
11 : These mirror the other APIs as much as possible. Macros are
12 : preferred over static inlines when it is possible to do it robustly
13 : to reduce the risk of the compiler mucking it up. */
14 :
15 3153798330 : #define wl_t __m256i
16 :
17 : /* Constructors */
18 :
19 : /* Given the long values, return ... */
20 :
21 866925 : #define wl(l0,l1,l2,l3) _mm256_setr_epi64x( (l0), (l1), (l2), (l3) ) /* [ l0 l1 l2 l3 ] */
22 :
23 3004533147 : #define wl_bcast(l0) _mm256_set1_epi64x( (l0) ) /* [ l0 l0 l0 l0 ] */
24 :
25 : static inline wl_t /* [ l0 l1 l0 l1 ] */
26 196608 : wl_bcast_pair( long l0, long l1 ) {
27 196608 : return _mm256_setr_epi64x( l0, l1, l0, l1 );
28 196608 : }
29 :
30 : static inline wl_t /* [ l0 l0 l1 l1 ] */
31 196608 : wl_bcast_wide( long l0, long l1 ) {
32 196608 : return _mm256_setr_epi64x( l0, l0, l1, l1 );
33 196608 : }
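
/* Illustrative sketch, not part of this header: basic use of the
   constructors above. Assumes fd_avx.h has been included on an AVX2
   target; the function name is hypothetical. */

static inline wl_t
example_wl_constructors( void ) {
  wl_t a = wl( 1L, 2L, 3L, 4L );    /* [ 1L 2L 3L 4L ] */
  wl_t b = wl_bcast( 7L );          /* [ 7L 7L 7L 7L ] */
  wl_t c = wl_bcast_pair( 8L, 9L ); /* [ 8L 9L 8L 9L ] */
  wl_t d = wl_bcast_wide( 8L, 9L ); /* [ 8L 8L 9L 9L ] */
  return _mm256_add_epi64( _mm256_add_epi64( a, b ), _mm256_add_epi64( c, d ) );
}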
34 :
35 : /* wl_permute returns [ l(imm_i0) l(imm_i1) l(imm_i2) l(imm_i3) ].
36 : imm_i* should be compile time constants in 0:3. */
37 :
38 : #if FD_USING_CLANG /* Sigh ... clang is sad and can't handle passing compile time const expressions through a static inline */
39 :
40 : static inline wl_t
41 2162688 : wl_permute( wl_t x, int imm_i0, int imm_i1, int imm_i2, int imm_i3 ) {
42 2162688 : union { long l[4]; __m256i v[1]; } t, u;
43 2162688 : _mm256_store_si256( t.v, x );
44 2162688 : u.l[0] = t.l[ imm_i0 ];
45 2162688 : u.l[1] = t.l[ imm_i1 ];
46 2162688 : u.l[2] = t.l[ imm_i2 ];
47 2162688 : u.l[3] = t.l[ imm_i3 ];
48 2162688 : return _mm256_load_si256( u.v );
49 2162688 : }
50 :
51 : #else
52 :
53 : #define wl_permute(x,imm_i0,imm_i1,imm_i2,imm_i3) _mm256_permute4x64_epi64( (x), (imm_i0)+4*(imm_i1)+16*(imm_i2)+64*(imm_i3) )
54 :
55 : #endif
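
/* Illustrative sketch, not part of this header: reversing the lane
   order with wl_permute (the lane indices below are compile time
   constants in 0:3); the function name is hypothetical. */

static inline wl_t
example_wl_reverse( wl_t x ) { /* [ x0 x1 x2 x3 ] -> [ x3 x2 x1 x0 ] */
  return wl_permute( x, 3, 2, 1, 0 );
}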
56 :
57 : /* Predefined constants */
58 :
59 : #define wl_zero() _mm256_setzero_si256() /* Return [ 0L 0L 0L 0L ] */
60 134873859 : #define wl_one() _mm256_set1_epi64x( 1L ) /* Return [ 1L 1L 1L 1L ] */
61 :
62 : /* Memory operations */
63 :
64 : /* wl_ld returns the 4 longs at the 32-byte aligned / 32-byte sized
65 : location p as a vector long. wl_ldu is the same but p does not have
66 : to be aligned. wl_st writes the vector long to the 32-byte aligned /
67 : 32-byte sized location p as 4 longs. wl_stu is the same but p does
68 : not have to be aligned. In all these, 64-bit lane l will be at p[l].
69 : FIXME: USE ATTRIBUTES ON P PASSED TO THESE?
70 :
71 : Note: gcc knows a __m256i may alias. */
72 :
73 12141644382 : static inline wl_t wl_ld( long const * p ) { return _mm256_load_si256( (__m256i const *)p ); }
74 134873859 : static inline void wl_st( long * p, wl_t i ) { _mm256_store_si256( (__m256i *)p, i ); }
75 :
76 12547213149 : static inline wl_t wl_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
77 539495436 : static inline void wl_stu( void * p, wl_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
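
/* Illustrative sketch, not part of this header: a load / store round
   trip. The aligned-attribute syntax is the usual GCC/clang spelling
   and the function name is hypothetical; any 32-byte aligned long[4]
   works for wl_ld / wl_st (use wl_ldu / wl_stu otherwise). */

static inline long
example_wl_ld_st( void ) {
  long buf[4] __attribute__((aligned(32))) = { 10L, 11L, 12L, 13L };
  wl_t x = wl_ld( buf ); /* [ 10L 11L 12L 13L ] */
  wl_st( buf, x );       /* lane l written back to buf[l] */
  return buf[2];         /* 12L */
}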
78 :
79 : /* wl_ldif is an optimized equivalent to wl_notczero(c,wl_ldu(p)) (may
80 : have different behavior if c is not a proper vector conditional). It
81 : is provided for symmetry with the wl_stif operation. wl_stif stores
82 : x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
83 : Undefined behavior if c is not a proper vector conditional. */
84 :
85 : #define wl_ldif(c,p) _mm256_maskload_epi64( (p), (c) )
86 : #define wl_stif(c,p,x) _mm256_maskstore_epi64( (p), (c), (x) )
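
/* Illustrative sketch, not part of this header: store only lanes 0 and
   2 of x to p, leaving p[1] and p[3] untouched. c is a proper vector
   conditional (each lane 0L or -1L); the function name is hypothetical
   and the cast matches the long long * the underlying intrinsic
   expects. */

static inline void
example_wl_stif( long * p, wl_t x ) {
  wl_t c = wl( -1L, 0L, -1L, 0L ); /* keep lanes 0 and 2 */
  wl_stif( c, (long long *)p, x );
}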
87 :
88 : /* Element operations */
89 :
90 : /* wl_extract extracts the long in lane imm from the vector long as a
91 : long. wl_insert returns the vector long formed by replacing the
92 : value in lane imm of a with the provided long. imm should be a
93 : compile time constant in 0:3. wl_extract_variable and
94 : wl_insert_variable are slower but the lane n does not have to be
95 : known at compile time (should still be in 0:3).
96 :
97 : Note: C99 TC3 allows type punning through a union. */
98 :
99 539495436 : #define wl_extract(a,imm) _mm256_extract_epi64( (a), (imm) )
100 :
101 539495436 : #define wl_insert(a,imm,v) _mm256_insert_epi64( (a), (v), (imm) )
102 :
103 : static inline long
104 539495436 : wl_extract_variable( wl_t a, int n ) {
105 539495436 : union { __m256i m[1]; long l[4]; } t[1];
106 539495436 : _mm256_store_si256( t->m, a );
107 539495436 : return t->l[n];
108 539495436 : }
109 :
110 : static inline wl_t
111 539495436 : wl_insert_variable( wl_t a, int n, long v ) {
112 539495436 : union { __m256i m[1]; long l[4]; } t[1];
113 539495436 : _mm256_store_si256( t->m, a );
114 539495436 : t->l[n] = v;
115 539495436 : return _mm256_load_si256( t->m );
116 539495436 : }
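
/* Illustrative sketch, not part of this header: lane access. The imm
   forms need compile time lane indices; the *_variable forms do not.
   The function name is hypothetical. */

static inline long
example_wl_lanes( wl_t a, int n ) {
  wl_t b = wl_insert( a, 2, 42L );        /* lane 2 of b is 42L */
  long  l = wl_extract( b, 2 );           /* 42L */
  return l + wl_extract_variable( a, n ); /* n need not be a constant */
}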
117 :
118 : /* Given [a0 a1 a2 a3] and/or [b0 b1 b2 b3], return ... */
119 :
120 : /* Arithmetic operations */
121 :
122 : #define wl_neg(a) _mm256_sub_epi64( _mm256_setzero_si256(), (a) ) /* [ -a0 -a1 ... -a3 ] (twos complement handling) */
123 :
124 : /* Note: _mm256_{abs,min,max}_epi64 are missing pre AVX-512. We emulate
125 : these below (and use the AVX-512 versions if possible). Likewise,
126 : there is no _mm256_mullo_epi64 pre AVX-512. Since this is not cheap to
127 : emulate, we do not provide a wl_mul for the time being (we could
128 : consider exposing it on AVX-512 targets though). There is, however,
129 : a signed multiply of the sign-extended lower 32 bits of each lane
130 : that produces full 64-bit products, and that is very useful. So we
131 : do provide that as wl_mul_ll. */
132 :
133 : #define wl_add(a,b) _mm256_add_epi64( (a), (b) ) /* [ a0 +b0 a1 +b1 ... a3 +b3 ] */
134 3325548 : #define wl_sub(a,b) _mm256_sub_epi64( (a), (b) ) /* [ a0 -b0 a1 -b1 ... a3 -b3 ] */
135 : //#define wl_mul(a,b) _mm256_mullo_epi64( (a), (b) ) /* [ a0 *b0 a1 *b1 ... a3 *b3 ] */
136 : #define wl_mul_ll(a,b) _mm256_mul_epi32( (a), (b) ) /* [ a0l*b0l a1l*b1l ... a3l*b3l ] */
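
/* Illustrative sketch, not part of this header: wl_mul_ll multiplies
   the sign extended low 32 bits of each lane. Values and the function
   name are hypothetical. */

static inline wl_t
example_wl_mul_ll( void ) {
  wl_t a = wl_bcast( 0x100000003L ); /* low 32 bits of every lane are 3 */
  wl_t b = wl_bcast( 5L );
  return wl_add( wl_mul_ll( a, b ), wl_one() ); /* [ 16L 16L 16L 16L ] */
}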
137 :
138 : /* Binary operations */
139 :
140 : /* Note: wl_shl/wl_shr/wl_shru is a left/signed right/unsigned right
141 : shift by imm bits; imm should be a compile time constant in 0:63.
142 : The variable variants are slower but do not require the shift amount
143 : to be known at compile time (should still be in 0:63). Also, AVX is
144 : missing _mm256_sra*_epi64 intrinsics. We emulate these below. */
145 :
146 : #define wl_not(a) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), (a) ) /* [ ~a0 ~a1 ... ~a3 ] */
147 :
148 277101 : #define wl_shl(a,imm) _mm256_slli_epi64( (a), (imm) ) /* [ a0<<imm a1<<imm ... a3<<imm ] */
149 : //#define wl_shr(a,imm) _mm256_srai_epi64( (a), (imm) ) /* [ a0>>imm a1>>imm ... a3>>imm ] (treat a as signed)*/
150 3602649 : #define wl_shru(a,imm) _mm256_srli_epi64( (a), (imm) ) /* [ a0>>imm a1>>imm ... a3>>imm ] (treat a as unsigned) */
151 :
152 : #define wl_shl_variable(a,n) _mm256_sll_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
153 : //#define wl_shr_variable(a,n) _mm256_sra_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
154 : #define wl_shru_variable(a,n) _mm256_srl_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
155 :
156 : #define wl_shl_vector(a,b) _mm256_sllv_epi64( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a3<<b3 ] */
157 : //#define wl_shr_vector(a,b) _mm256_srav_epi64( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a3>>b3 ] (treat a as signed) */
158 277101 : #define wl_shru_vector(a,b) _mm256_srlv_epi64( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a3>>b3 ] (treat a as unsigned) */
159 :
160 : #define wl_and(a,b) _mm256_and_si256( (a), (b) ) /* [ a0 &b0 a1 &b1 ... a3 &b3 ] */
161 : #define wl_andnot(a,b) _mm256_andnot_si256( (a), (b) ) /* [ (~a0)&b0 (~a1)&b1 ... (~a3)&b3 ] */
162 41943040 : #define wl_or(a,b) _mm256_or_si256( (a), (b) ) /* [ a0 |b0 a1 |b1 ... a3 |b3 ] */
163 : #define wl_xor(a,b) _mm256_xor_si256( (a), (b) ) /* [ a0 ^b0 a1 ^b1 ... a3 ^b3 ] */
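
/* Illustrative sketch, not part of this header: extracting byte 1
   (bits 8:15) of each lane with a shift and a mask; the function name
   is hypothetical. */

static inline wl_t
example_wl_byte1( wl_t x ) {
  return wl_and( wl_shru( x, 8 ), wl_bcast( 0xffL ) );
}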
164 :
165 : /* wl_rol(x,n) returns wl( rotate_left (x0,n), rotate_left (x1,n), ... )
166 : wl_ror(x,n) returns wl( rotate_right(x0,n), rotate_right(x1,n), ... ) */
167 :
168 : #if FD_HAS_AVX512
169 : #define wl_rol(a,imm) _mm256_rol_epi64( (a), (imm) )
170 : #define wl_ror(a,imm) _mm256_ror_epi64( (a), (imm) )
171 : #else
172 8388608 : static inline wl_t wl_rol( wl_t a, int imm ) { return wl_or( wl_shl( a, imm & 63 ), wl_shru( a, (-imm) & 63 ) ); }
173 8388608 : static inline wl_t wl_ror( wl_t a, int imm ) { return wl_or( wl_shru( a, imm & 63 ), wl_shl( a, (-imm) & 63 ) ); }
174 : #endif
175 :
176 12582912 : static inline wl_t wl_rol_variable( wl_t a, int n ) { return wl_or( wl_shl_variable( a, n&63 ), wl_shru_variable( a, (-n)&63 ) ); }
177 12582912 : static inline wl_t wl_ror_variable( wl_t a, int n ) { return wl_or( wl_shru_variable( a, n&63 ), wl_shl_variable( a, (-n)&63 ) ); }
178 :
179 0 : static inline wl_t wl_rol_vector( wl_t a, wl_t b ) {
180 0 : wl_t m = wl_bcast( 63L );
181 0 : return wl_or( wl_shl_vector( a, wl_and( b, m ) ), wl_shru_vector( a, wl_and( wl_neg( b ), m ) ) );
182 0 : }
183 :
184 0 : static inline wl_t wl_ror_vector( wl_t a, wl_t b ) {
185 0 : wl_t m = wl_bcast( 63L );
186 0 : return wl_or( wl_shru_vector( a, wl_and( b, m ) ), wl_shl_vector( a, wl_and( wl_neg( b ), m ) ) );
187 0 : }
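
/* Illustrative sketch, not part of this header: a rotate-and-xor mixing
   step of the sort used in many 64-bit hash rounds. The rotate amount
   and function name are hypothetical. */

static inline wl_t
example_wl_mix( wl_t a, wl_t b ) {
  return wl_xor( wl_rol( a, 32 ), b ); /* rotate each lane left 32, then xor with b */
}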
188 :
189 : /* Logical operations */
190 :
191 : #define wl_lnot(a) _mm256_cmpeq_epi64( (a), _mm256_setzero_si256() ) /* [ !a0 !a1 ... !a3 ] */
192 : #define wl_lnotnot(a) /* [ !!a0 !!a1 ... !!a3 ] */ \
193 : _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpeq_epi64( (a), _mm256_setzero_si256() ) )
194 :
195 277101 : #define wl_eq(a,b) _mm256_cmpeq_epi64( (a), (b) ) /* [ a0==b0 a1==b1 ... a3==b3 ] */
196 277101 : #define wl_gt(a,b) _mm256_cmpgt_epi64( (a), (b) ) /* [ a0> b0 a1> b1 ... a3> b3 ] */
197 25165824 : #define wl_lt(a,b) _mm256_cmpgt_epi64( (b), (a) ) /* [ a0< b0 a1< b1 ... a3< b3 ] */
198 : #define wl_ne(a,b) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpeq_epi64( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a3!=b3 ] */
199 : #define wl_ge(a,b) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpgt_epi64( (b), (a) ) ) /* [ a0>=b0 a1>=b1 ... a3>=b3 ] */
200 : #define wl_le(a,b) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpgt_epi64( (a), (b) ) ) /* [ a0<=b0 a1<=b1 ... a3<=b3 ] */
201 :
202 : /* Conditional operations */
203 :
204 : #define wl_czero(c,f) _mm256_andnot_si256( (c), (f) ) /* [ c0?0L:f0 c1?0L:f1 ... c3?0L:f3 ] */
205 : #define wl_notczero(c,f) _mm256_and_si256( (c), (f) ) /* [ c0?f0:0L c1?f1:0L ... c3?f3:0L ] */
206 :
207 917504 : #define wl_if(c,t,f) _mm256_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c3?t3:f3 ] */
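
/* Illustrative sketch, not part of this header: a lanewise absolute
   difference built from the comparison and select ops above; the
   function name is hypothetical. */

static inline wl_t
example_wl_absdiff( wl_t a, wl_t b ) {
  return wl_if( wl_gt( a, b ), wl_sub( a, b ), wl_sub( b, a ) );
}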
208 :
209 : #if defined(__AVX512F__) && defined(__AVX512VL__) /* See note above */
210 : #define wl_abs(a) _mm256_abs_epi64( (a) )
211 131072 : #define wl_min(a,b) _mm256_min_epi64( (a), (b) )
212 131072 : #define wl_max(a,b) _mm256_max_epi64( (a), (b) )
213 : #else
214 131072 : static inline wl_t wl_abs( wl_t a ) { return wl_if( wl_lt( a, wl_zero() ), wl_neg( a ), a ); }
215 393216 : static inline wl_t wl_min( wl_t a, wl_t b ) { return wl_if( wl_lt( a, b ), a, b ); }
216 393216 : static inline wl_t wl_max( wl_t a, wl_t b ) { return wl_if( wl_gt( a, b ), a, b ); }
217 : #endif
218 :
219 12582912 : static inline wl_t wl_shr( wl_t a, int imm ) {
220 12582912 : wc_t c = wl_lt( a, wl_zero() ); /* Note that wc_t is binary compat with wl_t */
221 12582912 : return _mm256_xor_si256( _mm256_srli_epi64( _mm256_xor_si256( a, c ), imm ), c );
222 12582912 : }
223 12582912 : static inline wl_t wl_shr_variable( wl_t a, int n ) {
224 12582912 : wc_t c = wl_lt( a, wl_zero() ); /* Note that wc_t is binary compat with wl_t */
225 12582912 : return _mm256_xor_si256( _mm256_srl_epi64( _mm256_xor_si256( a, c ), _mm_insert_epi64( _mm_setzero_si128(), n, 0 ) ), c );
226 12582912 : }
227 0 : static inline wl_t wl_shr_vector( wl_t a, wl_t n ) {
228 0 : wc_t c = wl_lt( a, wl_zero() ); /* Note that wc_t is binary compat with wl_t */
229 0 : return _mm256_xor_si256( _mm256_srlv_epi64( _mm256_xor_si256( a, c ), n ), c );
230 0 : }
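
/* Illustrative sketch, not part of this header: the wl_shr emulations
   above use the identity (a >> n) == (((a ^ c) >>logical n) ^ c) where
   c is -1L when a is negative and 0L otherwise (flip, shift in zeros,
   flip back shifts in copies of the sign bit). A scalar equivalent
   with hypothetical names: */

static inline long
example_scalar_sar( long a, int n ) {
  unsigned long c = a<0L ? ~0UL : 0UL;             /* 0 or all one bits     */
  unsigned long u = (((unsigned long)a) ^ c) >> n; /* logical shift         */
  return (long)( u ^ c );                          /* == arithmetic a >> n  */
}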
231 :
232 : /* Conversion operations */
233 :
234 : /* Summarizing:
235 :
236 : wl_to_wc(l) returns [ !!l0 !!l0 !!l1 !!l1 ... !!l3 !!l3 ]
237 :
238 : wl_to_wf(l,f,0) returns [ (float)l0 (float)l1 (float)l2 (float)l3 f4 f5 f6 f7 ]
239 : wl_to_wf(l,f,1) returns [ f0 f1 f2 f3 (float)l0 (float)l1 (float)l2 (float)l3 ]
240 :
241 : wl_to_wi(l,i,0) returns [ (int)l0 (int)l1 (int)l2 (int)l3 i4 i5 i6 i7 ]
242 : wl_to_wi(l,i,1) returns [ i0 i1 i2 i3 (int)l0 (int)l1 (int)l2 (int)l3 ]
243 :
244 : wl_to_wu(l,u,0) returns [ (uint)l0 (uint)l1 (uint)l2 (uint)l3 u4 u5 u6 u7 ]
245 : wl_to_wu(l,u,1) returns [ u0 u1 u2 u3 (uint)l0 (uint)l1 (uint)l2 (uint)l3 ]
246 :
247 : wl_to_wd(l) returns [ (double)l0 (double)l1 (double)l2 (double)l3 ]
248 :
249 : wl_to_wv(l) returns [ (ulong)l0 (ulong)l1 (ulong)l2 (ulong)l3 ]
250 :
251 : The raw variants just treat the raw bits as the corresponding vector
252 : type. For wl_to_wc_raw, the user promises wl contains a proper
253 : vector conditional (e.g. 0 or -1 in each lane). The others are
254 : provided to facilitate doing advanced bit tricks on floating point
255 : values. */
256 :
257 : #define wl_to_wc(a) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpeq_epi64( (a), _mm256_setzero_si256() ) )
258 :
259 393216 : static inline wf_t wl_to_wf( wl_t l, wf_t f, int imm_hi ) {
260 393216 : union { long l[4]; __m256i v[1]; } t[1];
261 393216 : union { float f[4]; __m128 v[1]; } u[1];
262 393216 : _mm256_store_si256( t->v, l );
263 393216 : u->f[0] = (float)t->l[0];
264 393216 : u->f[1] = (float)t->l[1];
265 393216 : u->f[2] = (float)t->l[2];
266 393216 : u->f[3] = (float)t->l[3];
267 393216 : __m128 v = _mm_load_ps( u->f );
268 393216 : return imm_hi ? _mm256_insertf128_ps( f, v, 1 ) : _mm256_insertf128_ps( f, v, 0 ); /* compile time */
269 393216 : }
270 :
271 393216 : static inline wl_t wl_to_wi( wl_t l, wi_t i, int imm_hi ) {
272 393216 : __m128 v01 = _mm_castsi128_ps( _mm256_extractf128_si256( l, 0 ) ); /* [ l0l l0h l1l l1h ] */
273 393216 : __m128 v23 = _mm_castsi128_ps( _mm256_extractf128_si256( l, 1 ) ); /* [ l2l l2h l3l l3h ] */
274 393216 : __m128i v = _mm_castps_si128( _mm_shuffle_ps( v01, v23, _MM_SHUFFLE(2,0,2,0) ) );
275 393216 : return imm_hi ? _mm256_insertf128_si256( i, v, 1 ) : _mm256_insertf128_si256( i, v, 0 ); /* compile time */
276 393216 : }
277 :
278 393216 : static inline wu_t wl_to_wu( wl_t l, wu_t u, int imm_hi ) {
279 393216 : __m128 v01 = _mm_castsi128_ps( _mm256_extractf128_si256( l, 0 ) ); /* [ l0l l0h l1l l1h ] */
280 393216 : __m128 v23 = _mm_castsi128_ps( _mm256_extractf128_si256( l, 1 ) ); /* [ l2l l2h l3l l3h ] */
281 393216 : __m128i v = _mm_castps_si128( _mm_shuffle_ps( v01, v23, _MM_SHUFFLE(2,0,2,0) ) );
282 393216 : return imm_hi ? _mm256_insertf128_si256( u, v, 1 ) : _mm256_insertf128_si256( u, v, 0 ); /* compile time */
283 393216 : }
284 :
285 : /* FIXME: IS IT FASTER TO USE INSERT / EXTRACT HERE? */
286 196608 : static inline wd_t wl_to_wd( wl_t l ) {
287 196608 : union { long l[4]; __m256i v[1]; } t[1];
288 196608 : union { double d[4]; __m256d v[1]; } u[1];
289 196608 : _mm256_store_si256( t->v, l );
290 196608 : u->d[0] = (double)t->l[0];
291 196608 : u->d[1] = (double)t->l[1];
292 196608 : u->d[2] = (double)t->l[2];
293 196608 : u->d[3] = (double)t->l[3];
294 196608 : return _mm256_load_pd( u->d );
295 196608 : }
296 :
297 : #define wl_to_wv(a) (a)
298 :
299 : #define wl_to_wc_raw(a) (a)
300 : #define wl_to_wf_raw(a) _mm256_castsi256_ps( (a) )
301 : #define wl_to_wi_raw(a) (a)
302 : #define wl_to_wu_raw(a) (a)
303 : #define wl_to_wd_raw(a) _mm256_castsi256_pd( (a) )
304 : #define wl_to_wv_raw(a) (a)
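
/* Illustrative sketch, not part of this header: converting a vector
   long to double precision and halving each lane; the function name is
   hypothetical. */

static inline wd_t
example_wl_to_double( wl_t x ) {
  wd_t d = wl_to_wd( x );                           /* [ (double)x0 ... (double)x3 ] */
  return _mm256_mul_pd( d, _mm256_set1_pd( 0.5 ) ); /* halve each lane */
}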
305 :
306 : /* Reduction operations */
307 :
308 : static inline wl_t
309 196608 : wl_sum_all( wl_t x ) { /* Returns wl_bcast( sum( x ) ) */
310 196608 : x = _mm256_add_epi64( x, _mm256_permute2f128_si256( x, x, 1 ) );
311 196608 : return _mm256_add_epi64( x, _mm256_castpd_si256( _mm256_permute_pd( _mm256_castsi256_pd( x ), 5 ) ) );
312 196608 : }
313 :
314 : static inline wl_t
315 196608 : wl_min_all( wl_t x ) { /* Returns wl_bcast( min( x ) ) */
316 196608 : x = wl_min( x, _mm256_permute2f128_si256( x, x, 1 ) );
317 196608 : return wl_min( x, _mm256_castpd_si256( _mm256_permute_pd( _mm256_castsi256_pd( x ), 5 ) ) );
318 196608 : }
319 :
320 : static inline wl_t
321 196608 : wl_max_all( wl_t x ) { /* Returns wl_bcast( max( x ) ) */
322 196608 : x = wl_max( x, _mm256_permute2f128_si256( x, x, 1 ) );
323 196608 : return wl_max( x, _mm256_castpd_si256( _mm256_permute_pd( _mm256_castsi256_pd( x ), 5 ) ) );
324 196608 : }
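
/* Illustrative sketch, not part of this header: reducing a vector long
   to a scalar by broadcasting the reduction and extracting lane 0; the
   function name is hypothetical. */

static inline long
example_wl_hsum( wl_t x ) {
  return wl_extract( wl_sum_all( x ), 0 ); /* x0+x1+x2+x3 */
}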
325 :
326 : /* Misc operations */
327 :
328 : /* wl_gather(b,i,imm_hi) returns
329 : [ b[i(0)] b[i(1)] b[i(2)] b[i(3)] ] if imm_hi is 0 and
330 : [ b[i(4)] b[i(5)] b[i(6)] b[i(7)] ] o.w.
331 : where b is a "long const*", i is wi_t and imm_hi is a compile time
332 : constant. We use a static inline here instead of a define to keep
333 : strict type checking while working around yet another Intel intrinsic
334 : type mismatch issue. */
335 :
336 269747718 : static inline wl_t wl_gather( long const * b, wi_t i, int imm_hi ) {
337 : /* A compile time branch, but older versions of GCC can't handle the
338 : ternary operator with -O0 */
339 269747718 : if( imm_hi ) return _mm256_i32gather_epi64( (long long const *)b, _mm256_extractf128_si256( i, 1 ), 8 );
340 134873859 : else return _mm256_i32gather_epi64( (long long const *)b, _mm256_extractf128_si256( i, 0 ), 8 );
341 269747718 : }
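
/* Illustrative sketch, not part of this header: gathering b[3], b[1],
   b[4], b[1] into a vector long. Assumes the wi constructor from
   fd_avx.h; the function name is hypothetical. */

static inline wl_t
example_wl_gather( long const * b ) {
  wi_t idx = wi( 3, 1, 4, 1, 0, 0, 0, 0 ); /* only lanes 0:3 are used when imm_hi is 0 */
  return wl_gather( b, idx, 0 );           /* [ b[3] b[1] b[4] b[1] ] */
}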
342 :
343 : /* wl_transpose_4x4 transposes the 4x4 matrix stored in wl_t r0,r1,r2,r3
344 : and stores the result in 4x4 matrix wl_t c0,c1,c2,c3. All
345 : c0,c1,c2,c3 should be different for a well defined result.
346 : Otherwise, in-place operation and/or using the same wl_t to specify
347 : multiple rows of r is fine. */
348 :
349 196608 : #define wl_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do { \
350 196608 : wl_t _wl_transpose_r0 = (r0); wl_t _wl_transpose_r1 = (r1); wl_t _wl_transpose_r2 = (r2); wl_t _wl_transpose_r3 = (r3); \
351 196608 : wl_t _wl_transpose_t; \
352 196608 : /* Transpose 2x2 blocks */ \
353 196608 : _wl_transpose_t = _wl_transpose_r0; _wl_transpose_r0 = _mm256_permute2f128_si256( _wl_transpose_t, _wl_transpose_r2, 0x20 ); \
354 196608 : /**/ _wl_transpose_r2 = _mm256_permute2f128_si256( _wl_transpose_t, _wl_transpose_r2, 0x31 ); \
355 196608 : _wl_transpose_t = _wl_transpose_r1; _wl_transpose_r1 = _mm256_permute2f128_si256( _wl_transpose_t, _wl_transpose_r3, 0x20 ); \
356 196608 : /**/ _wl_transpose_r3 = _mm256_permute2f128_si256( _wl_transpose_t, _wl_transpose_r3, 0x31 ); \
357 196608 : /* Transpose 1x1 blocks */ \
358 196608 : /**/ (c0) = _mm256_unpacklo_epi64( _wl_transpose_r0, _wl_transpose_r1 ); \
359 196608 : /**/ (c1) = _mm256_unpackhi_epi64( _wl_transpose_r0, _wl_transpose_r1 ); \
360 196608 : /**/ (c2) = _mm256_unpacklo_epi64( _wl_transpose_r2, _wl_transpose_r3 ); \
361 196608 : /**/ (c3) = _mm256_unpackhi_epi64( _wl_transpose_r2, _wl_transpose_r3 ); \
362 196608 : } while(0)
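
/* Illustrative sketch, not part of this header: transposing four rows
   in place, which is allowed per the note above as long as the four
   destinations are distinct; names are hypothetical. */

static inline void
example_wl_transpose_inplace( wl_t * r0, wl_t * r1, wl_t * r2, wl_t * r3 ) {
  wl_transpose_4x4( *r0,*r1,*r2,*r3, *r0,*r1,*r2,*r3 );
}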