LCOV - cov.lcov - util/simd/fd_avx

LCOV - code coverage report

Current view:	top level - util/simd - fd_avx_wc.h (source / functions)		Hit	Total	Coverage
Test:	cov.lcov	Lines:	95	95	100.0 %
Date:	2025-02-18 12:28:12	Functions:	15	8476	0.2 %

          Line data    Source code

       1             : #ifndef HEADER_fd_src_util_simd_fd_avx_h
       2             : #error "Do not include this directly; use fd_avx.h"
       3             : #endif
       4             : 
       5             : /* TODO: the below is very much designed for a 32-bit SIMD lane world
       6             :    (with 64-bit SIMD lane support hacked on afterward).  Revamp these to
       7             :    be more general for 8, 16, 32 and 64 bit lanes. */
       8             : 
       9             : /* Vector conditional API *********************************************/
      10             : 
      11             : /* A wc_t is a vector conditional.  That is, it is a vector of integers
      12             :    where each 32-bit wide lane is either 0 (all zero bits), indicating
      13             :    the condition is false for that lane or -1 (all one bits), indicating
      14             :    the condition is true for that lane.  This allows fast bit
      15             :    operations to mask other types of vectors.  If this API is used on
      16             :    vectors that aren't proper vector conditionals, results are
      17             :    undefined.  When vector conditionals are applied to vector doubles,
      18             :    longs and ulongs, adjacent lanes (0-1 / 2-3 / 4-5 / 6-7) should have
      19             :    identical values, otherwise results will be undefined.
      20             : 
      21             :    These mirror the other APIs as much as possible.  Macros are
      22             :    preferred over static inlines when it is possible to do it robustly
      23             :    to reduce the risk of the compiler mucking it up. */
      24             : 
      25 13301737023 : #define wc_t __m256i
      26             : 
      27             : /* Constructors */
      28             : 
      29             : /* wc returns a wc_t corresponding to the c-style logical values c0:c7.
      30             :    This will always create a proper vector conditional regardless of how
      31             :    the logical values were presented to it.  That is, the provided values
      32             :    will be treated as c-style logical values such that zero/false will
      33             :    become zero/false in the vector and non-zero/true will become -1/true
      34             :    in the vector conditional.  Similarly for wc_bcast*.  Summarizing:
      35             : 
      36             :      wc(c0,c1,c2,c3,c4,c5,c6,c7) return [c0 c1 c2 c3 c4 c5 c6 c7]
      37             :      wc_bcast(c0)                return [c0 c0 c0 c0 c0 c0 c0 c0]
      38             :      wc_bcast_pair(c0,c1)        return [c0 c1 c0 c1 c0 c1 c0 c1]
      39             :      wc_bcast_lohi(c0,c1)        return [c0 c0 c0 c0 c1 c1 c1 c1]
      40             :      wc_bcast_quad(c0,c1,c2,c3)  return [c0 c1 c2 c3 c0 c1 c2 c3]
      41             :      wc_bcast_wide(c0,c1,c2,c3)  return [c0 c0 c1 c1 c2 c2 c3 c3] */
      42             : 
      43      591354 : #define wc(c0,c1,c2,c3,c4,c5,c6,c7) _mm256_setr_epi32( -!!(c0), -!!(c1), -!!(c2), -!!(c3), -!!(c4), -!!(c5), -!!(c6), -!!(c7) )
      44             : 
      45             : #if 0 /* Compiler sometimes tries to turn this into branches ... sigh */
      46             : #define wc_bcast(c0) _mm256_set1_epi32( -!!(c0) )
      47             : #else
      48             : static inline __m256i
      49        6885 : wc_bcast( int c0 ) {
      50        6885 :   c0 = -!!c0; FD_COMPILER_FORGET( c0 );
      51        6885 :   return _mm256_set1_epi32( c0 );
      52        6885 : }
      53             : #endif
      54             : 
      55             : static inline wc_t
      56         765 : wc_bcast_pair( int c0, int c1 ) {
      57         765 :   c0 = -!!c0; c1 = -!!c1;
      58         765 :   return _mm256_setr_epi32( c0, c1, c0, c1, c0, c1, c0, c1 );
      59         765 : }
      60             : 
      61             : static inline wc_t
      62         765 : wc_bcast_lohi( int c0, int c1 ) {
      63         765 :   c0 = -!!c0; c1 = -!!c1;
      64         765 :   return _mm256_setr_epi32( c0, c0, c0, c0, c1, c1, c1, c1 );
      65         765 : }
      66             : 
      67             : static inline wc_t
      68         765 : wc_bcast_quad( int c0, int c1, int c2, int c3 ) {
      69         765 :   c0 = -!!c0; c1 = -!!c1; c2 = -!!c2; c3 = -!!c3;
      70         765 :   return _mm256_setr_epi32( c0, c1, c2, c3, c0, c1, c2, c3 );
      71         765 : }
      72             : 
      73             : static inline wc_t
      74      592884 : wc_bcast_wide( int c0, int c1, int c2, int c3 ) {
      75      592884 :   c0 = -!!c0; c1 = -!!c1; c2 = -!!c2; c3 = -!!c3;
      76      592884 :   return _mm256_setr_epi32( c0, c0, c1, c1, c2, c2, c3, c3 );
      77      592884 : }
      78             : 
      79             : /* No general wc_permute due to cross-128-bit lane limitations in AVX.
      80             :    Useful cases are provided below.  Given [ c0 c1 c2 c3 c4 c5 c6 c7 ],
      81             :    return ... */
      82             : 
      83             : #define wc_bcast_even(c)      /* [ c0 c0 c2 c2 c4 c4 c6 c6 ] */ \
      84             :   _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (c) ), _MM_SHUFFLE(2,2,0,0) ) )
      85             : 
      86             : #define wc_bcast_odd(c)       /* [ c1 c1 c3 c3 c5 c5 c7 c7 ] */ \
      87             :   _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (c) ), _MM_SHUFFLE(3,3,1,1) ) )
      88             : 
      89             : #define wc_exch_adj(c)        /* [ c1 c0 c3 c2 c5 c4 c7 c6 ] */ \
      90             :   _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (c) ), _MM_SHUFFLE(2,3,0,1) ) )
      91             : 
      92             : #define wc_exch_adj_pair(c)   /* [ c2 c3 c0 c1 c6 c7 c4 c5 ] */ \
      93             :   _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (c) ), _MM_SHUFFLE(1,0,3,2) ) )
      94             : 
      95             : static inline wc_t
      96         765 : wc_exch_adj_quad( wc_t c ) { /* [ c4 c5 c6 c7 c0 c1 c2 c3 ] */
      97         765 :   return _mm256_permute2f128_si256( c, c, 1 );
      98         765 : }
      99             : 
     100             : /* Predefined constants */
     101             : 
     102             : #define wc_false() _mm256_setzero_si256()  /* Return [ f f f f f f f f ] */
     103             : #define wc_true()  _mm256_set1_epi32( -1 ) /* Return [ t t t t t t t t ] */
     104             : 
     105             : /* Memory operations */
     106             : 
     107             : /* wc_ld returns the 8 integers at the 32-byte aligned / 32-byte sized
     108             :    location p as a proper vector conditional (see above note about
     109             :    c-style logicals).  wc_ldu is the same but p does not have to be
     110             :    aligned.  In the fast variants, the caller promises that p already
     111             :    holds a proper vector conditional (i.e. 0/-1 for false/true).  wc_st
     112             :    writes the vector conditional c at the 32-byte aligned / 32-byte size
     113             :    location p (0/-1 for false/true).  wc_stu is the same but p does not
     114             :    have to be aligned.  Lane l will be at p[l].  FIXME: USE ATTRIBUTES
     115             :    ON P PASSED TO THESE?
     116             : 
     117             :    Note: gcc knows that __m256i may alias. */
     118             : 
     119             : static inline wc_t
     120    11809491 : wc_ld( int const * p ) {
     121    11809491 :   return _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( _mm256_load_si256(  (__m256i const *)p ),
     122    11809491 :                                                                         _mm256_setzero_si256() ) );
     123    11809491 : }
     124    11809491 : static inline wc_t wc_ld_fast( int const * p ) { return _mm256_load_si256(  (__m256i const *)p ); }
     125    11809491 : static inline void wc_st( int * p, wc_t c ) { _mm256_store_si256(  (__m256i *)p, c ); }
     126             : 
     127             : static inline wc_t
     128    94475928 : wc_ldu( void const * p ) {
     129    94475928 :   return _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( _mm256_loadu_si256( (__m256i const *)p ),
     130    94475928 :                                                                         _mm256_setzero_si256() ) );
     131    94475928 : }
     132    94475928 : static inline wc_t wc_ldu_fast( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
     133    94475928 : static inline void wc_stu( void * p, wc_t c ) { _mm256_storeu_si256( (__m256i *)p, c ); }
     134             : 
     135             : /* wc_ldif is an optimized equivalent to wc_and(c,wc_ldu(p)).  Similarly
     136             :    for wc_ldif_fast (either may have different behavior if c is not a
     137             :    proper vector conditional).  wc_ldif_fast assumes p already holds a
     138             :    proper vector conditional.  These are provided for symmetry with the
     139             :    wc_stif operation.  wc_stif stores x(n) to p[n] if c(n) is true and
     140             :    leaves p[n] unchanged otherwise.  Undefined behavior if c is not a
     141             :    proper vector conditional. */
     142             : 
     143             : #define wc_ldif(c,p)      _mm256_xor_si256( _mm256_set1_epi32(-1), _mm256_cmpeq_epi32( _mm256_maskload_epi32( (p), (c) ), \
     144             :                                                                                        _mm256_setzero_si256()) )
     145             : #define wc_ldif_fast(c,p) _mm256_maskload_epi32((p),(c))
     146             : #define wc_stif(c,p,x)    _mm256_maskstore_epi32((p),(c),(x))
     147             : 
     148             : /* Element operations */
     149             : 
     150             : /* wc_extract extracts the value of lane imm from the vector conditional
     151             :    as an int 0 (false) or 1 (true).  wc_insert returns the vector
     152             :    conditional formed by replacing the value in lane imm of a with the
     153             :    provided c-style logical.  imm should be a compile time constant in
     154             :    0:7.  wc_extract_variable and wc_insert_variable are slower but the
     155             :    lane does not have to be a compile-time constant (it should
     156             :    still be in 0:7). */
     157             : 
     158    94475928 : #define wc_extract(c,imm)        ((_mm256_movemask_ps( _mm256_castsi256_ps( (c) ) ) >> (imm)) & 1)
     159    94475928 : #define wc_insert(a,imm,c)       _mm256_insert_epi32( (a), -!!(c), (imm) )
     160             : 
     161    94475928 : #define wc_extract_variable(c,n) ((_mm256_movemask_ps( _mm256_castsi256_ps( (c) ) ) >> (n)  ) & 1)
     162             : #define wc_insert_variable(a,n,c)                                                                                             \
     163    94475928 :   _mm256_cmpgt_epi32( _mm256_and_si256( _mm256_set1_epi32( (_mm256_movemask_ps( _mm256_castsi256_ps( (a) ) ) & (~(1<<(n)))) | \
     164    94475928 :                                                            ((!!(c))<<n) ),                                                    \
     165    94475928 :                                         _mm256_setr_epi32( 1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7 ) ),                \
     166    94475928 :                       _mm256_setzero_si256() )
     167             : 
     168             : /* Given [ a0 a1 a2 a3 a4 a5 a6 a7 ] and/or [ b0 b1 b2 b3 b4 b5 b6 b7 ],
     169             :    return ... */
     170             : 
     171             : /* Arithmetic operations */
     172             : 
     173             : /* Note: arithmetic and shift operations are not well defined for a wc_t
     174             :    as it isn't clear if the user would like to treat the vector
     175             :    conditional as 8 1-bit signed ints (0/-1), 8 1-bit unsigned ints (0/1) or
     176             :    8-GF2 elements (f/t but sign is meaningless) or do cross lane motion
     177             :    of the condition.  Instead, the user should use wc_to_{wi,wl}[_raw]
     178             :    as necessary and use the appropriate binary, arithmetic, permute
     179             :    and/or shift operations there. */
     180             : 
     181             : /* Binary operations */
     182             : 
     183             : #define wc_not(a)      _mm256_xor_si256( _mm256_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ...  ~a7 ] */
     184             : 
     185             : #define wc_and(a,b)    _mm256_and_si256(   (a),(b)) /* [   a0 &b0   a1 &b1 ...   a7 &b7 ] */
     186             : #define wc_or(a,b)     _mm256_or_si256(    (a),(b)) /* [   a0 |b0   a1 |b1 ...   a7 |b7 ] */
     187             : #define wc_xor(a,b)    _mm256_xor_si256(   (a),(b)) /* [   a0 ^b0   a1 ^b1 ...   a7 ^b7 ] */
     188 12492913960 : #define wc_andnot(a,b) _mm256_andnot_si256((a),(b)) /* [ (~a0)&b0 (~a1)&b1 ... (~a7)&b7 ] */
     189             : 
     190             : /* Logical operations */
     191             : 
     192             : /* Note: wc_{gt,lt,ge,le} are provided for completeness and treat
     193             :    true>false. */
     194             : 
     195             : #define wc_lnot(a)    _mm256_xor_si256( _mm256_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a7 ] */
     196             : #define wc_lnotnot(a) (a)                                              /* [  a0  a1 ...  a7 ] */
     197             : 
     198             : #define wc_eq(a,b) _mm256_cmpeq_epi32( (a),(b))                                /* [ a0==b0 a1==b1 ... a7==b7 ] */
     199             : #define wc_gt(a,b) _mm256_andnot_si256((b),(a))                                /* [ a0> b0 a1> b1 ... a7> b7 ] */
     200             : #define wc_lt(a,b) _mm256_andnot_si256((a),(b))                                /* [ a0< b0 a1< b1 ... a7< b7 ] */
     201             : #define wc_ne(a,b) _mm256_xor_si256(   (a),(b))                                /* [ a0!=b0 a1!=b1 ... a7!=b7 ] */
     202             : #define wc_ge(a,b)                                                             /* [ a0>=b0 a1>=b1 ... a7>=b7 ] */ \
     203             :   _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_andnot_si256( (a), (b) ) )
     204             : #define wc_le(a,b)                                                             /* [ a0<=b0 a1<=b1 ... a7<=b7 ] */ \
     205             :   _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_andnot_si256( (b), (a) ) )
     206             : 
     207             : /* Conditional operations */
     208             : 
     209             : /* FIXME: Define wc_czero / wc_notczero?  Equivalent to wc_andnot and
     210             :    wc_and but have arithmetic connotations.  */
     211             : 
     212             : #define wc_if(c,t,f) _mm256_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c7?t7:f7 ] */
     213             : 
     214             : /* Conversion operations */
     215             : 
     216             : /* wc_to_{wf,wi,wu,wd,wl,wv} convert a proper vector conditional into a
     217             :    vector float/int/uint/double/long/ulong with f mapping to 0 and t
     218             :    mapping to 1 in each lane.
     219             : 
     220             :    wc_to_{wf,wi,wu,wd,wl,wv}_raw just treat the raw bits in the vector
     221             :    conditional as the corresponding vector type.  wc_to_{wi,wu}_raw map
     222             :    false(true) to 0(-1) and similarly for wc_to_{wl,wv}_raw when c has
     223             :    paired lanes.  wc_to_{wf,wd}_raw probably are not useful in practice
     224             :    but are provided for completeness; wc_to_wf_raw maps false(true) to
     225             :    0(-nan) and similarly for wc_to_wd_raw when c has paired lanes. */
     226             : 
     227             : #define wc_to_wf(a) _mm256_and_ps( _mm256_castsi256_ps( (a) ), _mm256_set1_ps( 1.f ) )
     228             : #define wc_to_wi(a) _mm256_and_si256( (a), _mm256_set1_epi32( 1 ) )
     229             : #define wc_to_wu(a) _mm256_and_si256( (a), _mm256_set1_epi32( 1 ) )
     230             : #define wc_to_wd(a) _mm256_and_pd( _mm256_castsi256_pd( (a) ), _mm256_set1_pd( 1. ) ) /* wc should have paired lanes */
     231             : #define wc_to_wl(a) _mm256_and_si256( (a), _mm256_set1_epi64x( 1L ) )                 /* wc should have paired lanes */
     232             : #define wc_to_wv(a) _mm256_and_si256( (a), _mm256_set1_epi64x( 1L ) )                 /* wc should have paired lanes */
     233             : 
     234             : #define wc_to_wf_raw(a) _mm256_castsi256_ps( (a) )
     235             : #define wc_to_wi_raw(a) (a)
     236             : #define wc_to_wu_raw(a) (a)
     237             : #define wc_to_wd_raw(a) _mm256_castsi256_pd( (a) )
     238             : #define wc_to_wl_raw(a) (a)
     239             : #define wc_to_wv_raw(a) (a)
     240             : 
     241             : /* Reduction operations */
     242             : 
     243             : /* wc_any/wc_all returns logical true if any/all conditions in c are true */
     244             : 
     245   929728008 : #define wc_any(c) (_mm256_movemask_ps( _mm256_castsi256_ps( (c) ) )!=0x00)
     246  2488808815 : #define wc_all(c) (_mm256_movemask_ps( _mm256_castsi256_ps( (c) ) )==0xff)
     247             : 
     248             : /* Misc operations */
     249             : 
     250             : /* wc_pack returns an int where bit i equals 0(1) if lane i of c is
     251             :    false(true) for i in [0,8).  Vice versa for wc_unpack. */
     252             : 
     253  2548098084 : #define wc_pack(c)   _mm256_movemask_ps( _mm256_castsi256_ps( (c) ) )
     254    20530622 : #define wc_unpack(b) _mm256_cmpgt_epi32( _mm256_and_si256( _mm256_set1_epi32( (b) ),                                              \
     255    20530622 :                                                            _mm256_setr_epi32( 1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7 ) ), \
     256    20530622 :                                          _mm256_setzero_si256() )
     257             : 
     258             : /* wc_expand expands c0:c3 (imm_hi==0) or c4:c7 (imm_hi==1) into a
     259             :    paired lane conditional.  That is:
     260             : 
     261             :      wc_expand(c,0) returns [ c0 c0 c1 c1 c2 c2 c3 c3 ]
     262             :      wc_expand(c,1) returns [ c4 c4 c5 c5 c6 c6 c7 c7 ]
     263             : 
     264             :    Conversely:
     265             : 
     266             :      wc_narrow(a,b) returns [ a0 a2 a4 a6 b0 b2 b4 b6 ]
     267             : 
     268             :    which is useful for turning two paired lane conditionals into a
     269             :    single lane conditional.  U.B. if a, b, and/or c are not proper
     270             :    vector conditionals.  These are useful, for example, for vectorizing
     271             :    64-bit pointer arithmetic used in 32-bit lane SIMD. */
     272             : 
     273        1530 : #define wc_expand(c,imm_hi) _mm256_cvtepi32_epi64( _mm256_extractf128_si256( (c), (imm_hi) ) )
     274             : 
     275   229939051 : static inline wc_t wc_narrow( wc_t a, wc_t b ) {
     276   229939051 :   __m128 a01 = _mm_castsi128_ps( _mm256_extractf128_si256( a, 0 ) ); /* [ a0l a0h a1l a1h ] */
     277   229939051 :   __m128 a23 = _mm_castsi128_ps( _mm256_extractf128_si256( a, 1 ) ); /* [ a2l a2h a3l a3h ] */
     278   229939051 :   __m128 b01 = _mm_castsi128_ps( _mm256_extractf128_si256( b, 0 ) ); /* [ b0l b0h b1l b1h ] */
     279   229939051 :   __m128 b23 = _mm_castsi128_ps( _mm256_extractf128_si256( b, 1 ) ); /* [ b2l b2h b3l b3h ] */
     280   229939051 :   return _mm256_setr_m128i( _mm_castps_si128( _mm_shuffle_ps( a01, a23, _MM_SHUFFLE(2,0,2,0) ) ),
     281   229939051 :                             _mm_castps_si128( _mm_shuffle_ps( b01, b23, _MM_SHUFFLE(2,0,2,0) ) ) );
     282   229939051 : }
     283             : 
     284             : /* wc_gather(b,i) returns [ -!!b[i(0)] -!!b[i(1)] ... -!!b[i(7)] ] where
     285             :    b is an "int const *" (0/non-zero map to false/true) and i is a wi_t.
     286             : 
     287             :    wc_gather_fast(b,i) returns [ b[i(0)] b[i(1)] ... b[i(7)] ] where b is
     288             :    an "int const *".   User promises b[i(:)] values are already either 0
     289             :    or -1.  i here is a wi_t.  */
     290             : 
     291    11809491 : #define wc_gather(b,i)      _mm256_xor_si256( _mm256_set1_epi32( -1 ), \
     292    11809491 :                                               _mm256_cmpeq_epi32( _mm256_i32gather_epi32( (b), (i), 4 ), _mm256_setzero_si256() ) )
     293    11809491 : #define wc_gather_fast(b,i) _mm256_i32gather_epi32( (b), (i), 4 )
     294             : 
     295             : /* wc_transpose_8x8 transposes the 8x8 matrix stored in wc_t r0,r1,...r7
     296             :    and stores the result in 8x8 matrix wc_t c0,c1,...c7.  All
     297             :    c0,c1,...c7 should be different for a well defined result.
     298             :    Otherwise, in-place operation and/or using the same wc_t to specify
     299             :    multiple rows of r is fine. */
     300             : 
     301         765 : #define wc_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 ) do {                                                 \
     302         765 :     wc_t _wc_transpose_r0 = (r0); wc_t _wc_transpose_r1 = (r1); wc_t _wc_transpose_r2 = (r2); wc_t _wc_transpose_r3 = (r3);       \
     303         765 :     wc_t _wc_transpose_r4 = (r4); wc_t _wc_transpose_r5 = (r5); wc_t _wc_transpose_r6 = (r6); wc_t _wc_transpose_r7 = (r7);       \
     304         765 :     wc_t _wc_transpose_t;                                                                                                         \
     305         765 :     /* Transpose 4x4 blocks */                                                                                                    \
     306         765 :     _wc_transpose_t = _wc_transpose_r0; _wc_transpose_r0 = _mm256_permute2f128_si256( _wc_transpose_t,  _wc_transpose_r4, 0x20 ); \
     307         765 :     /**/                                _wc_transpose_r4 = _mm256_permute2f128_si256( _wc_transpose_t,  _wc_transpose_r4, 0x31 ); \
     308         765 :     _wc_transpose_t = _wc_transpose_r1; _wc_transpose_r1 = _mm256_permute2f128_si256( _wc_transpose_t,  _wc_transpose_r5, 0x20 ); \
     309         765 :     /**/                                _wc_transpose_r5 = _mm256_permute2f128_si256( _wc_transpose_t,  _wc_transpose_r5, 0x31 ); \
     310         765 :     _wc_transpose_t = _wc_transpose_r2; _wc_transpose_r2 = _mm256_permute2f128_si256( _wc_transpose_t,  _wc_transpose_r6, 0x20 ); \
     311         765 :     /**/                                _wc_transpose_r6 = _mm256_permute2f128_si256( _wc_transpose_t,  _wc_transpose_r6, 0x31 ); \
     312         765 :     _wc_transpose_t = _wc_transpose_r3; _wc_transpose_r3 = _mm256_permute2f128_si256( _wc_transpose_t,  _wc_transpose_r7, 0x20 ); \
     313         765 :     /**/                                _wc_transpose_r7 = _mm256_permute2f128_si256( _wc_transpose_t,  _wc_transpose_r7, 0x31 ); \
     314         765 :     /* Transpose 2x2 blocks */                                                                                                    \
     315         765 :     _wc_transpose_t = _wc_transpose_r0; _wc_transpose_r0 = _mm256_unpacklo_epi32(     _wc_transpose_t,  _wc_transpose_r2 );       \
     316         765 :     /**/                                _wc_transpose_r2 = _mm256_unpackhi_epi32(     _wc_transpose_t,  _wc_transpose_r2 );       \
     317         765 :     _wc_transpose_t = _wc_transpose_r1; _wc_transpose_r1 = _mm256_unpacklo_epi32(     _wc_transpose_t,  _wc_transpose_r3 );       \
     318         765 :     /**/                                _wc_transpose_r3 = _mm256_unpackhi_epi32(     _wc_transpose_t,  _wc_transpose_r3 );       \
     319         765 :     _wc_transpose_t = _wc_transpose_r4; _wc_transpose_r4 = _mm256_unpacklo_epi32(     _wc_transpose_t,  _wc_transpose_r6 );       \
     320         765 :     /**/                                _wc_transpose_r6 = _mm256_unpackhi_epi32(     _wc_transpose_t,  _wc_transpose_r6 );       \
     321         765 :     _wc_transpose_t = _wc_transpose_r5; _wc_transpose_r5 = _mm256_unpacklo_epi32(     _wc_transpose_t,  _wc_transpose_r7 );       \
     322         765 :     /**/                                _wc_transpose_r7 = _mm256_unpackhi_epi32(     _wc_transpose_t,  _wc_transpose_r7 );       \
     323         765 :     /* Transpose 1x1 blocks */                                                                                                    \
     324         765 :     /**/                                (c0)             = _mm256_unpacklo_epi32(     _wc_transpose_r0, _wc_transpose_r1 );       \
     325         765 :     /**/                                (c1)             = _mm256_unpackhi_epi32(     _wc_transpose_r0, _wc_transpose_r1 );       \
     326         765 :     /**/                                (c2)             = _mm256_unpacklo_epi32(     _wc_transpose_r2, _wc_transpose_r3 );       \
     327         765 :     /**/                                (c3)             = _mm256_unpackhi_epi32(     _wc_transpose_r2, _wc_transpose_r3 );       \
     328         765 :     /**/                                (c4)             = _mm256_unpacklo_epi32(     _wc_transpose_r4, _wc_transpose_r5 );       \
     329         765 :     /**/                                (c5)             = _mm256_unpackhi_epi32(     _wc_transpose_r4, _wc_transpose_r5 );       \
     330         765 :     /**/                                (c6)             = _mm256_unpacklo_epi32(     _wc_transpose_r6, _wc_transpose_r7 );       \
     331         765 :     /**/                                (c7)             = _mm256_unpackhi_epi32(     _wc_transpose_r6, _wc_transpose_r7 );       \
     332         765 :   } while(0)

Generated by: LCOV version 1.14