Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx_h
2 : #error "Do not include this directly; use fd_avx.h"
3 : #endif
4 :
5 : /* Vector int API *****************************************************/
6 :
7 : /* A wi_t is a vector where each 32-bit wide lane holds a signed 32-bit
8 : twos-complement integer (an "int").
9 :
10 :
11 : These mirror the other APIs (e.g. wc and wf) as much as possible. Macros are
12 : preferred over static inlines when it is possible to do it robustly
13 : to reduce the risk of the compiler mucking it up. */
14 :
15 74908419 : #define wi_t __m256i
16 :
17 : /* Constructors */
18 :
19 : /* Given the int values, return ... */
20 :
21 : #define wi(i0,i1,i2,i3,i4,i5,i6,i7) /* [ i0 i1 i2 i3 i4 i5 i6 i7 ] */ \
22 490146828 : _mm256_setr_epi32( (i0), (i1), (i2), (i3), (i4), (i5), (i6), (i7) )
23 :
24 : #define wi_bcast(i0) _mm256_set1_epi32( (i0) ) /* [ i0 i0 i0 i0 i0 i0 i0 i0 ] */
25 :
26 : static inline wi_t /* [ i0 i1 i0 i1 i0 i1 i0 i1 ] */
27 196608 : wi_bcast_pair( int i0, int i1 ) {
28 196608 : return _mm256_setr_epi32( i0, i1, i0, i1, i0, i1, i0, i1 );
29 196608 : }
30 :
31 : static inline wi_t /* [ i0 i0 i0 i0 i1 i1 i1 i1 ] */
32 196608 : wi_bcast_lohi( int i0, int i1 ) {
33 196608 : return _mm256_setr_epi32( i0, i0, i0, i0, i1, i1, i1, i1 );
34 196608 : }
35 :
36 : static inline wi_t /* [ i0 i1 i2 i3 i0 i1 i2 i3 ] */
37 196608 : wi_bcast_quad( int i0, int i1, int i2, int i3 ) {
38 196608 : return _mm256_setr_epi32( i0, i1, i2, i3, i0, i1, i2, i3 );
39 196608 : }
40 :
41 : static inline wi_t /* [ i0 i0 i1 i1 i2 i2 i3 i3 ] */
42 196608 : wi_bcast_wide( int i0, int i1, int i2, int i3 ) {
43 196608 : return _mm256_setr_epi32( i0, i0, i1, i1, i2, i2, i3, i3 );
44 196608 : }
45 :
46 : /* No general wi_permute due to cross-128-bit lane limitations in AVX.
47 : Useful cases are provided below. Given [ i0 i1 i2 i3 i4 i5 i6 i7 ],
48 : return ... */
49 :
50 : #define wi_bcast_even(x) /* [ i0 i0 i2 i2 i4 i4 i6 i6 ] */ \
51 : _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (x) ), _MM_SHUFFLE(2,2,0,0) ) )
52 :
53 : #define wi_bcast_odd(x) /* [ i1 i1 i3 i3 i5 i5 i7 i7 ] */ \
54 : _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (x) ), _MM_SHUFFLE(3,3,1,1) ) )
55 :
56 : #define wi_exch_adj(x) /* [ i1 i0 i3 i2 i5 i4 i7 i6 ] */ \
57 : _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (x) ), _MM_SHUFFLE(2,3,0,1) ) )
58 :
59 : #define wi_exch_adj_pair(x) /* [ i2 i3 i0 i1 i6 i7 i4 i5 ] */ \
60 : _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (x) ), _MM_SHUFFLE(1,0,3,2) ) )
61 :
62 : static inline wi_t
63 196608 : wi_exch_adj_quad( wi_t x ) { /* [ i4 i5 i6 i7 i0 i1 i2 i3 ] */
64 196608 : return _mm256_permute2f128_si256( x, x, 1 );
65 196608 : }
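
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: lane-level effect of the above constructors and
   shuffles.

     wi_t x = wi( 0,1,2,3,4,5,6,7 );    // [ 0 1 2 3 4 5 6 7 ]
     wi_t w = wi_bcast_wide( 0,1,2,3 ); // [ 0 0 1 1 2 2 3 3 ]
     wi_t e = wi_bcast_even( x );       // [ 0 0 2 2 4 4 6 6 ]
     wi_t o = wi_bcast_odd ( x );       // [ 1 1 3 3 5 5 7 7 ]
     wi_t a = wi_exch_adj  ( x );       // [ 1 0 3 2 5 4 7 6 ]
     wi_t q = wi_exch_adj_quad( x );    // [ 4 5 6 7 0 1 2 3 ]
*/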
66 :
67 : /* Predefined constants */
68 :
69 : #define wi_zero() _mm256_setzero_si256() /* Return [ 0 0 0 0 0 0 0 0 ] */
70 70976259 : #define wi_one() _mm256_set1_epi32( 1 ) /* Return [ 1 1 1 1 1 1 1 1 ] */
71 :
72 : /* Memory operations */
73 :
74 : /* wi_ld returns the 8 ints at the 32-byte aligned / 32-byte sized
75 : location p as a vector int. wi_ldu is the same but p does not have
76 : to be aligned. wi_st writes the vector int to the 32-byte aligned /
77 : 32-byte sized location p as 8 ints. wi_stu is the same but p does
78 : not have to be aligned. In all these, lane l will be at p[l]. FIXME:
79 : USE ATTRIBUTES ON P PASSED TO THESE?
80 :
81 : Note: gcc knows a __m256i may alias. */
82 :
83 70976259 : static inline wi_t wi_ld( int const * p ) { return _mm256_load_si256( (__m256i const *)p ); }
84 70976259 : static inline void wi_st( int * p, wi_t i ) { _mm256_store_si256( (__m256i *)p, i ); }
85 :
86 567810072 : static inline wi_t wi_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
87 567810072 : static inline void wi_stu( void * p, wi_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
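
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: a round trip through the unaligned variants (wi_ld /
   wi_st would additionally require src / dst to be 32-byte aligned,
   e.g. via a compiler specific alignment attribute).

     int src[8] = { 0,1,2,3,4,5,6,7 };
     int dst[8];
     wi_t x = wi_ldu( src );            // lane l of x == src[l]
     wi_stu( dst, x );                  // dst[l]      == src[l]
*/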
88 :
89 : /* wi_ldif is an optimized equivalent to wi_notczero(c,wi_ldu(p)) (may
90 : have different behavior if c is not a proper vector conditional). It
91 : is provided for symmetry with the wi_stif operation. wi_stif stores
92 : x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
93 : Undefined behavior if c is not a proper vector conditional. */
94 :
95 : #define wi_ldif(c,p) _mm256_maskload_epi32( (p),(c))
96 : #define wi_stif(c,p,x) _mm256_maskstore_epi32((p),(c),(x))
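
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: update only the lanes selected by a proper vector
   conditional (wi_gt is defined under logical operations below).

     int d[8] = { 0,0,0,0,0,0,0,0 };
     wi_t x = wi( 1,2,3,4,5,6,7,8 );
     wi_t c = wi_gt( x, wi_bcast( 4 ) );  // [ 0 0 0 0 -1 -1 -1 -1 ]
     wi_stif( c, d, x );                  // d is now { 0,0,0,0,5,6,7,8 }
*/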
97 :
98 : /* Element operations */
99 :
100 : /* wi_extract extracts the int in lane imm from the vector int as an int.
101 : wi_insert returns the vector int formed by replacing the value in
102 : lane imm of a with the provided int. imm should be a compile time
103 : constant in 0:7. wi_extract_variable and wi_insert_variable are
104 : slower but the lane n does not have to be known at compile time
105 : (should still be in 0:7).
106 :
107 : Note: C99 TC3 allows type punning through a union. */
108 :
109 567810072 : #define wi_extract(a,imm) _mm256_extract_epi32( (a), (imm) )
110 567810072 : #define wi_insert(a,imm,v) _mm256_insert_epi32( (a), (v), (imm) )
111 :
112 : static inline int
113 567810072 : wi_extract_variable( wi_t a, int n ) {
114 567810072 : union { __m256i m[1]; int i[8]; } t[1];
115 567810072 : _mm256_store_si256( t->m, a );
116 567810072 : return t->i[n];
117 567810072 : }
118 :
119 : static inline wi_t
120 567810072 : wi_insert_variable( wi_t a, int n, int v ) {
121 567810072 : union { __m256i m[1]; int i[8]; } t[1];
122 567810072 : _mm256_store_si256( t->m, a );
123 567810072 : t->i[n] = v;
124 567810072 : return _mm256_load_si256( t->m );
125 567810072 : }
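
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: the imm forms need a compile time lane index, the
   _variable forms do not.

     wi_t x  = wi( 10,11,12,13,14,15,16,17 );
     int  v5 = wi_extract( x, 5 );                 // 15
     x       = wi_insert ( x, 2, -1 );             // [ 10 11 -1 13 14 15 16 17 ]
     int  n  = 7;                                  // lane known only at run time
     int  v7 = wi_extract_variable( x, n );        // 17
*/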
126 :
127 : /* Given [a0 a1 a2 a3 a4 a5 a6 a7] and/or [b0 b1 b2 b3 b4 b5 b6 b7],
128 : return ... */
129 :
130 : /* Arithmetic operations */
131 :
132 : #define wi_neg(a) _mm256_sub_epi32( _mm256_setzero_si256(), (a) ) /* [ -a0 -a1 ... -a7 ] (twos complement handling) */
133 : #define wi_abs(a) _mm256_abs_epi32( (a) ) /* [ |a0| |a1| ... |a7| ] (twos complement handling) */
134 :
135 : #define wi_min(a,b) _mm256_min_epi32( (a), (b) ) /* [ min(a0,b0) min(a1,b1) ... min(a7,b7) ] */
136 : #define wi_max(a,b) _mm256_max_epi32( (a), (b) ) /* [ max(a0,b0) max(a1,b1) ... max(a7,b7) ] */
137 : #define wi_add(a,b) _mm256_add_epi32( (a), (b) ) /* [ a0 +b0 a1 +b1 ... a7 +b7 ] */
138 : #define wi_sub(a,b) _mm256_sub_epi32( (a), (b) ) /* [ a0 -b0 a1 -b1 ... a7 -b7 ] */
139 : #define wi_mul(a,b) _mm256_mullo_epi32( (a), (b) ) /* [ a0 *b0 a1 *b1 ... a7 *b7 ] */
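
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: wi_mul keeps the low 32 bits of each lane's full
   64-bit product (i.e. lane results wrap mod 2^32).

     wi_t a = wi_bcast( 0x10001 );
     wi_t p = wi_mul( a, a );           // every lane 0x20001, the low 32 bits
                                        // of the full product 0x100020001
*/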
140 :
141 : /* Binary operations */
142 :
143 : /* Note: wi_shl/wi_shr/wi_shru is a left/signed right/unsigned right
144 : shift by imm bits; imm must be a compile time constant in 0:31. The
145 : variable variants are slower but do not require the shift amount to
146 : be known at compile time (should still be in 0:31). */
147 :
148 : #define wi_not(a) _mm256_xor_si256( _mm256_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a7 ] */
149 :
150 : #define wi_shl(a,imm) _mm256_slli_epi32( (a), (imm) ) /* [ a0<<imm a1<<imm ... a7<<imm ] */
151 : #define wi_shr(a,imm) _mm256_srai_epi32( (a), (imm) ) /* [ a0>>imm a1>>imm ... a7>>imm ] (treat a as signed)*/
152 : #define wi_shru(a,imm) _mm256_srli_epi32( (a), (imm) ) /* [ a0>>imm a1>>imm ... a7>>imm ] (treat a as unsigned) */
153 :
154 : #define wi_shl_variable(a,n) _mm256_sll_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
155 : #define wi_shr_variable(a,n) _mm256_sra_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
156 : #define wi_shru_variable(a,n) _mm256_srl_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
157 :
158 : #define wi_shl_vector(a,b) _mm256_sllv_epi32( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a7<<b7 ] */
159 : #define wi_shr_vector(a,b) _mm256_srav_epi32( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a7>>b7 ] (treat a as signed) */
160 : #define wi_shru_vector(a,b) _mm256_srlv_epi32( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a7>>b7 ] (treat a as unsigned) */
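
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: the signed and unsigned right shifts differ only for
   negative lanes.

     wi_t x = wi_bcast( -8 );           // 0xfffffff8 in every lane
     wi_t s = wi_shr ( x, 2 );          // every lane -2         (sign fill)
     wi_t u = wi_shru( x, 2 );          // every lane 0x3ffffffe (zero fill)
*/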
161 :
162 : #define wi_and(a,b) _mm256_and_si256( (a), (b) ) /* [ a0 &b0 a1 &b1 ... a7 &b7 ] */
163 : #define wi_andnot(a,b) _mm256_andnot_si256( (a), (b) ) /* [ (~a0)&b0 (~a1)&b1 ... (~a7)&b7 ] */
164 20971520 : #define wi_or(a,b) _mm256_or_si256( (a), (b) ) /* [ a0 |b0 a1 |b1 ... a7 |b7 ] */
165 : #define wi_xor(a,b) _mm256_xor_si256( (a), (b) ) /* [ a0 ^b0 a1 ^b1 ... a7 ^b7 ] */
166 :
167 : /* wi_rol(x,n) returns wi( rotate_left (x0,n), rotate_left (x1,n), ... )
168 : wi_ror(x,n) returns wi( rotate_right(x0,n), rotate_right(x1,n), ... ) */
169 :
170 : #if FD_HAS_AVX512
171 : #define wi_rol(a,imm) _mm256_rol_epi32( (a), (imm) )
172 : #define wi_ror(a,imm) _mm256_ror_epi32( (a), (imm) )
173 : #else
174 4194304 : static inline wi_t wi_rol( wi_t a, int imm ) { return wi_or( wi_shl( a, imm & 31 ), wi_shru( a, (-imm) & 31 ) ); }
175 4194304 : static inline wi_t wi_ror( wi_t a, int imm ) { return wi_or( wi_shru( a, imm & 31 ), wi_shl( a, (-imm) & 31 ) ); }
176 : #endif
177 :
178 6291456 : static inline wi_t wi_rol_variable( wi_t a, int n ) { return wi_or( wi_shl_variable( a, n&31 ), wi_shru_variable( a, (-n)&31 ) ); }
179 6291456 : static inline wi_t wi_ror_variable( wi_t a, int n ) { return wi_or( wi_shru_variable( a, n&31 ), wi_shl_variable( a, (-n)&31 ) ); }
180 :
181 0 : static inline wi_t wi_rol_vector( wi_t a, wi_t b ) {
182 0 : wi_t m = wi_bcast( 31 );
183 0 : return wi_or( wi_shl_vector( a, wi_and( b, m ) ), wi_shru_vector( a, wi_and( wi_neg( b ), m ) ) );
184 0 : }
185 :
186 0 : static inline wi_t wi_ror_vector( wi_t a, wi_t b ) {
187 0 : wi_t m = wi_bcast( 31 );
188 0 : return wi_or( wi_shru_vector( a, wi_and( b, m ) ), wi_shl_vector( a, wi_and( wi_neg( b ), m ) ) );
189 0 : }
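
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: lane-wise rotates as used in many hash and cipher
   kernels.

     wi_t x = wi_bcast( (int)0x80000001 );  // msb and lsb set in every lane
     wi_t l = wi_rol( x, 1 );               // every lane 0x00000003
     wi_t r = wi_ror( x, 1 );               // every lane 0xc0000000
*/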
190 :
191 : /* Logical operations */
192 :
193 : #define wi_lnot(a) _mm256_cmpeq_epi32( (a), _mm256_setzero_si256() ) /* [ !a0 !a1 ... !a7 ] */
194 : #define wi_lnotnot(a) /* [ !!a0 !!a1 ... !!a7 ] */ \
195 : _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( (a), _mm256_setzero_si256() ) )
196 :
197 : #define wi_eq(a,b) _mm256_cmpeq_epi32( (a), (b) ) /* [ a0==b0 a1==b1 ... a7==b7 ] */
198 : #define wi_gt(a,b) _mm256_cmpgt_epi32( (a), (b) ) /* [ a0> b0 a1> b1 ... a7> b7 ] */
199 : #define wi_lt(a,b) _mm256_cmpgt_epi32( (b), (a) ) /* [ a0< b0 a1< b1 ... a7< b7 ] */
200 : #define wi_ne(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a7!=b7 ] */
201 : #define wi_ge(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpgt_epi32( (b), (a) ) ) /* [ a0>=b0 a1>=b1 ... a7>=b7 ] */
202 : #define wi_le(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpgt_epi32( (a), (b) ) ) /* [ a0<=b0 a1<=b1 ... a7<=b7 ] */
203 :
204 : /* Conditional operations */
205 :
206 : #define wi_czero(c,f) _mm256_andnot_si256( (c), (f) ) /* [ c0? 0:f0 c1? 0:f1 ... c7? 0:f7 ] */
207 : #define wi_notczero(c,f) _mm256_and_si256( (c), (f) ) /* [ c0?f0: 0 c1?f1: 0 ... c7?f7: 0 ] */
208 :
209 : #define wi_if(c,t,f) _mm256_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c7?t7:f7 ] */
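
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: the comparisons above yield proper vector
   conditionals (0 / -1 per lane) that compose with the conditional
   operations, e.g. a lane-wise clamp to non-negative values.

     wi_t x  = wi( -3,-2,-1,0,1,2,3,4 );
     wi_t lo = wi_zero();
     wi_t y  = wi_if( wi_lt( x, lo ), lo, x );  // [ 0 0 0 0 1 2 3 4 ]
     // equivalently: y = wi_max( x, lo );
*/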
210 :
211 : /* Conversion operations */
212 :
213 : /* Summarizing:
214 :
215 : wi_to_wc(a) returns [ !!a0 !!a1 ... !!a7 ]
216 :
217 : wi_to_wu(a) returns [ (uint)a0 (uint)a1 ... (uint)a7 ]
218 :
219 : wi_to_wf(a) returns [ (float)a0 (float)a1 ... (float)a7 ]
220 :
221 : wi_to_wd(a,0) returns [ (double)a0 (double)a1 (double)a2 (double)a3 ]
222 : wi_to_wd(a,1) returns [ (double)a4 (double)a5 (double)a6 (double)a7 ]
223 :
224 : wi_to_wl(a,0) returns [ (long)a0 (long)a1 (long)a2 (long)a3 ]
225 : wi_to_wl(a,1) returns [ (long)a4 (long)a5 (long)a6 (long)a7 ]
226 :
227 : wi_to_wv(a,0) returns [ (ulong)a0 (ulong)a1 (ulong)a2 (ulong)a3 ]
228 : wi_to_wv(a,1) returns [ (ulong)a4 (ulong)a5 (ulong)a6 (ulong)a7 ]
229 :
230 : where imm_hi should be a compile time constant.
231 :
232 : For wi_to_{wd,wl,wv}, the permutation used for the conversion is less
233 : flexible due to cross 128-bit lane limitations in AVX. If imm_hi==0,
234 : the conversion is done to lanes 0:3. Otherwise, the conversion is
235 : done to lanes 4:7.
236 :
237 : The raw variants just treat the raw bits as the corresponding vector
238 : type. For wi_to_wc_raw, the user promises wi contains a proper
239 : vector conditional (e.g. 0 or -1 in each lane). wi_to_wf_raw is
240 : useful for doing advanced bit tricks on floating point values. The
241 : others are probably dubious but are provided for completeness. */
242 :
243 : #define wi_to_wc(a) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( (a), _mm256_setzero_si256() ) )
244 : #define wi_to_wf(a) _mm256_cvtepi32_ps( (a) )
245 : #define wi_to_wu(a) (a)
246 : #define wi_to_wd(a,imm_hi) _mm256_cvtepi32_pd( _mm256_extractf128_si256( (a), !!(imm_hi) ) )
247 : #define wi_to_wl(a,imm_hi) _mm256_cvtepi32_epi64( _mm256_extractf128_si256( (a), !!(imm_hi) ) )
248 : #define wi_to_wv(a,imm_hi) _mm256_cvtepi32_epi64( _mm256_extractf128_si256( (a), !!(imm_hi) ) )
249 :
250 : #define wi_to_wc_raw(a) (a)
251 : #define wi_to_wf_raw(a) _mm256_castsi256_ps( (a) )
252 : #define wi_to_wu_raw(a) (a)
253 : #define wi_to_wd_raw(a) _mm256_castsi256_pd( (a) )
254 : #define wi_to_wl_raw(a) (a)
255 : #define wi_to_wv_raw(a) (a)
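
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: the widening conversions operate on half of the
   lanes at a time, selected by imm_hi (results shown here with the raw
   Intel types for the wd / wf vectors).

     wi_t    x    = wi( 0,1,2,3,4,5,6,7 );
     __m256d d_lo = wi_to_wd( x, 0 );   // [ 0. 1. 2. 3. ]
     __m256d d_hi = wi_to_wd( x, 1 );   // [ 4. 5. 6. 7. ]
     __m256  f    = wi_to_wf( x );      // [ 0.f 1.f ... 7.f ]
*/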
256 :
257 : /* Reduction operations */
258 :
259 : static inline wi_t
260 196608 : wi_sum_all( wi_t x ) { /* Returns wi_bcast( sum( x ) ) */
261 196608 : x = _mm256_add_epi32( x, _mm256_permute2f128_si256( x, x, 1 ) ); /* x04 x15 x26 x37 ... */
262 196608 : x = _mm256_hadd_epi32( x, x ); /* x0145 x2367 ... */
263 196608 : return _mm256_hadd_epi32( x, x ); /* xsum ... */
264 196608 : }
265 :
266 : static inline wi_t
267 196608 : wi_min_all( wi_t x ) { /* Returns wi_bcast( min( x ) ) */
268 196608 : __m256i y = _mm256_permute2f128_si256( x, x, 1 ); /* x4 x5 x6 x7 x0 x1 x2 x3 */
269 196608 : x = _mm256_min_epi32( x, y ); /* x04 x15 x26 x37 ... */
270 196608 : y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x26 x37 x04 x15 ... */
271 196608 : x = _mm256_min_epi32( x, y ); /* x0246 x1357 ... */
272 196608 : y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x1357 x0246 ... */
273 196608 : x = _mm256_min_epi32( x, y ); /* xmin ... */
274 196608 : return x;
275 196608 : }
276 :
277 : static inline wi_t
278 196608 : wi_max_all( wi_t x ) { /* Returns wi_bcast( max( x ) ) */
279 196608 : __m256i y = _mm256_permute2f128_si256( x, x, 1 ); /* x4 x5 x6 x7 x0 x1 x2 x3 */
280 196608 : x = _mm256_max_epi32( x, y ); /* x04 x15 x26 x37 ... */
281 196608 : y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x26 x37 x04 x15 ... */
282 196608 : x = _mm256_max_epi32( x, y ); /* x0246 x1357 ... */
283 196608 : y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x1357 x0246 ... */
284 196608 : x = _mm256_max_epi32( x, y ); /* xmax ... */
285 196608 : return x;
286 196608 : }
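
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: the reductions broadcast their result to every lane;
   extract any lane to get the scalar.

     wi_t x   = wi( 1,2,3,4,5,6,7,8 );
     int  sum = wi_extract( wi_sum_all( x ), 0 );  // 36
     int  mn  = wi_extract( wi_min_all( x ), 0 );  //  1
     int  mx  = wi_extract( wi_max_all( x ), 0 );  //  8
*/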
287 :
288 : /* Misc operations */
289 :
290 : /* wi_gather(b,i) returns [ b[i(0)] b[i(1)] ... b[i(7)] ] where b is a
291 : "int const *" and i is a wi_t. */
292 :
293 70976259 : #define wi_gather(b,i) _mm256_i32gather_epi32( (b), (i), 4 )
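
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: gather 8 ints from arbitrary indices of an array in
   one operation (tbl and idx here are hypothetical example data).

     static int const tbl[16] = { 0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150 };
     wi_t idx = wi( 0,2,4,6,8,10,12,14 );
     wi_t g   = wi_gather( tbl, idx );  // [ 0 20 40 60 80 100 120 140 ]
*/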
294 :
295 : /* wi_transpose_8x8 transposes the 8x8 matrix stored in wi_t r0,r1,...r7
296 : and stores the result in 8x8 matrix wi_t c0,c1,...c7. All
297 : c0,c1,...c7 should be different for a well defined result. Aside
298 : from that, in-place operation and/or using the same wi_t to specify
299 : multiple rows of r is fine. */
300 :
301 196608 : #define wi_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 ) do { \
302 196608 : wi_t _wi_transpose_r0 = (r0); wi_t _wi_transpose_r1 = (r1); wi_t _wi_transpose_r2 = (r2); wi_t _wi_transpose_r3 = (r3); \
303 196608 : wi_t _wi_transpose_r4 = (r4); wi_t _wi_transpose_r5 = (r5); wi_t _wi_transpose_r6 = (r6); wi_t _wi_transpose_r7 = (r7); \
304 196608 : wi_t _wi_transpose_t; \
305 196608 : /* Transpose 4x4 blocks */ \
306 196608 : _wi_transpose_t = _wi_transpose_r0; _wi_transpose_r0 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r4, 0x20 ); \
307 196608 : /**/ _wi_transpose_r4 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r4, 0x31 ); \
308 196608 : _wi_transpose_t = _wi_transpose_r1; _wi_transpose_r1 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r5, 0x20 ); \
309 196608 : /**/ _wi_transpose_r5 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r5, 0x31 ); \
310 196608 : _wi_transpose_t = _wi_transpose_r2; _wi_transpose_r2 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r6, 0x20 ); \
311 196608 : /**/ _wi_transpose_r6 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r6, 0x31 ); \
312 196608 : _wi_transpose_t = _wi_transpose_r3; _wi_transpose_r3 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r7, 0x20 ); \
313 196608 : /**/ _wi_transpose_r7 = _mm256_permute2f128_si256( _wi_transpose_t, _wi_transpose_r7, 0x31 ); \
314 196608 : /* Transpose 2x2 blocks */ \
315 196608 : _wi_transpose_t = _wi_transpose_r0; _wi_transpose_r0 = _mm256_unpacklo_epi32( _wi_transpose_t, _wi_transpose_r2 ); \
316 196608 : /**/ _wi_transpose_r2 = _mm256_unpackhi_epi32( _wi_transpose_t, _wi_transpose_r2 ); \
317 196608 : _wi_transpose_t = _wi_transpose_r1; _wi_transpose_r1 = _mm256_unpacklo_epi32( _wi_transpose_t, _wi_transpose_r3 ); \
318 196608 : /**/ _wi_transpose_r3 = _mm256_unpackhi_epi32( _wi_transpose_t, _wi_transpose_r3 ); \
319 196608 : _wi_transpose_t = _wi_transpose_r4; _wi_transpose_r4 = _mm256_unpacklo_epi32( _wi_transpose_t, _wi_transpose_r6 ); \
320 196608 : /**/ _wi_transpose_r6 = _mm256_unpackhi_epi32( _wi_transpose_t, _wi_transpose_r6 ); \
321 196608 : _wi_transpose_t = _wi_transpose_r5; _wi_transpose_r5 = _mm256_unpacklo_epi32( _wi_transpose_t, _wi_transpose_r7 ); \
322 196608 : /**/ _wi_transpose_r7 = _mm256_unpackhi_epi32( _wi_transpose_t, _wi_transpose_r7 ); \
323 196608 : /* Transpose 1x1 blocks */ \
324 196608 : /**/ (c0) = _mm256_unpacklo_epi32( _wi_transpose_r0, _wi_transpose_r1 ); \
325 196608 : /**/ (c1) = _mm256_unpackhi_epi32( _wi_transpose_r0, _wi_transpose_r1 ); \
326 196608 : /**/ (c2) = _mm256_unpacklo_epi32( _wi_transpose_r2, _wi_transpose_r3 ); \
327 196608 : /**/ (c3) = _mm256_unpackhi_epi32( _wi_transpose_r2, _wi_transpose_r3 ); \
328 196608 : /**/ (c4) = _mm256_unpacklo_epi32( _wi_transpose_r4, _wi_transpose_r5 ); \
329 196608 : /**/ (c5) = _mm256_unpackhi_epi32( _wi_transpose_r4, _wi_transpose_r5 ); \
330 196608 : /**/ (c6) = _mm256_unpacklo_epi32( _wi_transpose_r6, _wi_transpose_r7 ); \
331 196608 : /**/ (c7) = _mm256_unpackhi_epi32( _wi_transpose_r6, _wi_transpose_r7 ); \
332 196608 : } while(0)
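
/* Quick usage sketch, assuming fd_avx.h has been included on an AVX2
   capable target: transposing a row-major 8x8 int matrix m (a
   hypothetical example array) so that column j of m lands in cj.

     int const * m = ...;               // points to 64 ints, row-major 8x8
     wi_t r0 = wi_ldu( m      ); wi_t r1 = wi_ldu( m +  8 );
     wi_t r2 = wi_ldu( m + 16 ); wi_t r3 = wi_ldu( m + 24 );
     wi_t r4 = wi_ldu( m + 32 ); wi_t r5 = wi_ldu( m + 40 );
     wi_t r6 = wi_ldu( m + 48 ); wi_t r7 = wi_ldu( m + 56 );
     wi_t c0,c1,c2,c3,c4,c5,c6,c7;
     wi_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 );
     // c0 now holds m[0*8+0], m[1*8+0], ..., m[7*8+0], etc.
*/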