LCOV - code coverage report
Current view: top level - util/simd - fd_sse_vu.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 93 101 92.1 %
Date: 2025-01-08 12:08:44 Functions: 24 147 16.3 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_sse_h
       2             : #error "Do not include this directly; use fd_sse.h"
       3             : #endif
       4             : 
       5             : /* Vector uint API ****************************************************/
       6             : 
       7             : /* A vu_t is a vector where each 32-bit wide lane holds an unsigned
       8             :    32-bit integer (an "uint").  These mirror vc and vf as much as
       9             :    possible.
      10             : 
      11             :    These mirror the other APIs as much as possible.  Macros are
      12             :    preferred over static inlines when it is possible to do it robustly
      13             :    to reduce the risk of the compiler mucking it up. */
      14             : 
      15   330519115 : #define vu_t __m128i
      16             : 
      17             : /* Constructors */
      18             : 
      19             : /* Given the uint values, return ... */
      20             : 
      21    95128182 : #define vu(u0,u1,u2,u3) _mm_setr_epi32( (int)(u0), (int)(u1), (int)(u2), (int)(u3) ) /* [ u0 u1 u2 u3 ] */
      22             : 
      23      393216 : #define vu_bcast(u0) _mm_set1_epi32( (int)(u0) ) /* [ u0 u0 u0 u0 ] */
      24             : 
      25             : static inline vu_t /* [ u0 u1 u0 u1 ] */
      26      196608 : vu_bcast_pair( uint u0, uint u1 ) {
      27      196608 :   int i0 = (int)u0; int i1 = (int)u1;
      28      196608 :   return _mm_setr_epi32( i0, i1, i0, i1 );
      29      196608 : }
      30             : 
      31             : static inline vu_t /* [ u0 u0 u1 u1 ] */
      32      196608 : vu_bcast_wide( uint u0, uint u1 ) {
      33      196608 :   int i0 = (int)u0; int i1 = (int)u1;
      34      196608 :   return _mm_setr_epi32( i0, i0, i1, i1 );
      35      196608 : }
      36             : 
      37             : /* vu_permute returns [ x(imm_i0) x(imm_i1) x(imm_i2) x(imm_i3) ].
      38             :    imm_i* should be compile time constants in 0:3. */
      39             : 
      40   444660340 : #define vu_permute(x,imm_i0,imm_i1,imm_i2,imm_i3) _mm_shuffle_epi32( (x), _MM_SHUFFLE( (imm_i3), (imm_i2), (imm_i1), (imm_i0) ) )
      41             : 
      42             : /* vu_permute2 returns [ a(imm_i0) a(imm_i1) b(imm_i2) b(imm_i3) ].
      43             :    imm_i* should be compile time constants in 0:3. */
      44             : 
      45   889320680 : #define vu_permute2(a,b,imm_i0,imm_i1,imm_i2,imm_i3) ((vu_t)_mm_shuffle_ps( (vf_t)(a), (vf_t)(b), _MM_SHUFFLE( (imm_i3), (imm_i2), (imm_i1), (imm_i0) ) ))
      46             : 
      47             : /* Predefined constants */
      48             : 
      49    44466034 : #define vu_zero() _mm_setzero_si128() /* Return [ 0U 0U 0U 0U ] */
      50    61538355 : #define vu_one()  _mm_set1_epi32( 1 ) /* Return [ 1U 1U 1U 1U ] */
      51             : 
      52             : /* Memory operations */
      53             : 
      54             : /* vu_ld returns the 4 uints at the 16-byte aligned / 16-byte sized
      55             :    location p as a vector uint.  vu_ldu is the same but p does not have
      56             :    to be aligned.  vu_st writes the vector uint to the 16-byte aligned /
      57             :    16-byte sized location p as 4 uints.  vu_stu is the same but p does
      58             :    not have to be aligned.  In all these lane l will be at p[l].  FIXME:
      59             :    USE ATTRIBUTES ON P PASSED TO THESE?
      60             : 
      61             :    Note: gcc knows a __m128i may alias. */
      62             : 
      63   160538364 : static inline vu_t vu_ld( uint const * p ) { return _mm_load_si128(  (__m128i const *)p ); }
      64   193538367 : static inline void vu_st( uint * p, vu_t i ) { _mm_store_si128(  (__m128i *)p, i ); }
      65             : 
      66   246153420 : static inline vu_t vu_ldu( void const * p ) { return _mm_loadu_si128( (__m128i const *)p ); }
      67   246153420 : static inline void vu_stu( void * p, vu_t i ) { _mm_storeu_si128( (__m128i *)p, i ); }
      68             : 
      69             : /* vu_ldif is an optimized equivalent to vu_notczero(c,vu_ldu(p)) (may
      70             :    have different behavior if c is not a proper vector conditional).  It
      71             :    is provided for symmetry with the vu_stif operation.  vu_stif stores
      72             :    x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
      73             :    Undefined behavior if c is not a proper vector conditional. */
      74             : 
      75             : #define vu_ldif(c,p)   _mm_maskload_epi32( (p),(c))
      76             : #define vu_stif(c,p,x) _mm_maskstore_epi32((p),(c),(x))
      77             : 
      78             : /* Element operations */
      79             : 
      80             : /* vu_extract extracts the uint in lane imm from the vector uint as an
      81             :    uint.  vu_insert returns the vector uint formed by replacing the
      82             :    value in lane imm of a with the provided uint.  imm should be a
      83             :    compile time constant in 0:3.  vu_extract_variable and
      84             :    vu_insert_variable are slower but the lane n does not have to be
      85             :    known at compile time (should be in 0:3).
      86             : 
      87             :    Note: C99 TC3 allows type punning through a union. */
      88             : 
      89   246153420 : #define vu_extract(a,imm)  ((uint)_mm_extract_epi32( (a), (imm) ))
      90   246153420 : #define vu_insert(a,imm,v) _mm_insert_epi32( (a), (int)(v), (imm) )
      91             : 
      92             : static inline uint
      93   246153420 : vu_extract_variable( vu_t a, int n ) {
      94   246153420 :   union { __m128i m[1]; uint u[4]; } t[1];
      95   246153420 :   _mm_store_si128( t->m, a );
      96   246153420 :   return t->u[n];
      97   246153420 : }
      98             : 
      99             : static inline vu_t
     100   246153420 : vu_insert_variable( vu_t a, int n, uint v ) {
     101   246153420 :   union { __m128i m[1]; uint u[4]; } t[1];
     102   246153420 :   _mm_store_si128( t->m, a );
     103   246153420 :   t->u[n] = v;
     104   246153420 :   return _mm_load_si128( t->m );
     105   246153420 : }
     106             : 
     107             : /* Given [a0 a1 a2 a3] and/or [b0 b1 b2 b3], return ... */
     108             : 
     109             : /* Arithmetic operations */
     110             : 
     111             : #define vu_neg(a) _mm_sub_epi32( _mm_setzero_si128(), (a) ) /* [ -a0  -a1  ... -a3  ] (twos complement handling) */
     112             : #define vu_abs(a) (a)                                       /* [ |a0| |a1| ... |a3| ] (twos complement handling) */
     113             : 
     114             : #define vu_min(a,b) _mm_min_epu32(   (a), (b) ) /* [ min(a0,b0) min(a1,b1) ... min(a3,b3) ] */
     115             : #define vu_max(a,b) _mm_max_epu32(   (a), (b) ) /* [ max(a0,b0) max(a1,b1) ... max(a3,b3) ] */
     116  2772524540 : #define vu_add(a,b) _mm_add_epi32(   (a), (b) ) /* [ a0 +b0     a1 +b1     ... a3 +b3     ] */
     117             : #define vu_sub(a,b) _mm_sub_epi32(   (a), (b) ) /* [ a0 -b0     a1 -b1     ... a3 -b3     ] */
     118             : #define vu_mul(a,b) _mm_mullo_epi32( (a), (b) ) /* [ a0 *b0     a1 *b1     ... a3 *b3     ] */
     119             : 
     120             : /* Binary operations */
     121             : 
     122             : /* Note: vu_shl/vu_shr is a left/unsigned (logical) right
     123             :    shift by imm bits; imm should be a compile time constant in 0:31.
     124             :    The variable variants are slower but do not require the shift amount
     125             :    to be known at compile time (should still be in 0:31). */
     126             : 
     127             : #define vu_not(a) _mm_xor_si128( _mm_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a3 ] */
     128             : 
     129             : #define vu_shl(a,imm) _mm_slli_epi32( (a), (imm) ) /* [ a0<<imm a1<<imm ... a3<<imm ] */
     130             : #define vu_shr(a,imm) _mm_srli_epi32( (a), (imm) ) /* [ a0>>imm a1>>imm ... a3>>imm ] */
     131             : 
     132             : #define vu_shl_variable(a,n) _mm_sll_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
     133             : #define vu_shr_variable(a,n) _mm_srl_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
     134             : 
     135             : #define vu_shl_vector(a,b) _mm_sllv_epi32( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a3<<b3 ] */
     136             : #define vu_shr_vector(a,b) _mm_srlv_epi32( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a3>>b3 ] */
     137             : 
     138             : #define vu_and(a,b)    _mm_and_si128(    (a), (b) ) /* [   a0 &b0    a1& b1 ...   a3& b3 ] */
     139             : #define vu_andnot(a,b) _mm_andnot_si128( (a), (b) ) /* [ (~a0)&b0  (~a1)&b1 ... (~a3)&b3 ] */
     140   901626960 : #define vu_or(a,b)     _mm_or_si128(     (a), (b) ) /* [   a0 |b0    a1 |b1 ...   a3 |b3 ] */
     141  3973981260 : #define vu_xor(a,b)    _mm_xor_si128(    (a), (b) ) /* [   a0 ^b0    a1 ^b1 ...   a3 ^b3 ] */
     142             : 
     143             : /* vu_rol(x,n) returns vu( rotate_left (x0,n), rotate_left (x1,n), ... )
     144             :    vu_ror(x,n) returns vu( rotate_right(x0,n), rotate_right(x1,n), ... ) */
     145             : 
     146             : #if FD_HAS_AVX512
     147   440131112 : #define vu_rol(a,imm)  _mm_rol_epi32( (a), (imm) )
     148             : #define vu_ror(a,imm)  _mm_ror_epi32( (a), (imm) )
     149             : #else
     150   884456528 : static inline vu_t vu_rol( vu_t a, int imm ) { return vu_or( vu_shl( a, imm & 31 ), vu_shr( a, (-imm) & 31 ) ); }
     151     4194304 : static inline vu_t vu_ror( vu_t a, int imm ) { return vu_or( vu_shr( a, imm & 31 ), vu_shl( a, (-imm) & 31 ) ); }
     152             : #endif
     153             : 
     154     6291456 : static inline vu_t vu_rol_variable( vu_t a, int n ) { return vu_or( vu_shl_variable( a, n&31 ), vu_shr_variable( a, (-n)&31 ) ); }
     155     6291456 : static inline vu_t vu_ror_variable( vu_t a, int n ) { return vu_or( vu_shr_variable( a, n&31 ), vu_shl_variable( a, (-n)&31 ) ); }
     156             : 
     157           0 : static inline vu_t vu_rol_vector( vu_t a, vi_t b ) {
     158           0 :   vi_t m = vi_bcast( 31 );
     159           0 :   return vu_or( vu_shl_vector( a, vi_and( b, m ) ), vu_shr_vector( a, vi_and( vi_neg( b ), m ) ) );
     160           0 : }
     161             : 
     162           0 : static inline vu_t vu_ror_vector( vu_t a, vi_t b ) {
     163           0 :   vi_t m = vi_bcast( 31 );
     164           0 :   return vu_or( vu_shr_vector( a, vi_and( b, m ) ), vu_shl_vector( a, vi_and( vi_neg( b ), m ) ) );
     165           0 : }
     166             : 
     167      393216 : static inline vu_t vu_bswap( vu_t a ) {
     168      393216 :   vu_t m = vu_bcast( 0x00FF00FFU );                                            /* Probably hoisted */
     169      393216 :   vu_t t = vu_rol( a, 16 );                                                    /* Swap E/O 16-bit pairs */
     170      393216 :   return vu_or( vu_andnot( m, vu_shl( t, 8 ) ), vu_and( m, vu_shr( t, 8 ) ) ); /* Swap E/O  8-bit pairs */
     171      393216 : }
     172             : 
     173             : /* Logical operations */
     174             : 
     175             : /* Like noted below in the vu_to_{vf,vd} converters, Intel clearly has
     176             :    the hardware to do a _mm_cmpgt_epu32 given that _mm_cmpgt_epi32
     177             :    exists but doesn't expose it in the ISA pre AVX-512.  Sigh ... twos
     178             :    complement bit tricks to the rescue for vu_{gt,lt,ge,le}. */
     179             : 
     180             : #define vu_lnot(a)     _mm_cmpeq_epi32( (a), _mm_setzero_si128() ) /* [  !a0  !a1 ...  !a3 ] */
     181             : #define vu_lnotnot(a)                                              /* [ !!a0 !!a1 ... !!a3 ] */ \
     182             :   _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( (a), _mm_setzero_si128() ) )
     183             : 
     184             : #define vu_eq(a,b) _mm_cmpeq_epi32( (a), (b) )                                        /* [ a0==b0 a1==b1 ... a3==b3 ] */
     185             : #define vu_gt(a,b)                                                                    /* [ a0> b0 a1> b1 ... a3> b3 ] */ \
     186             :   _mm_cmpgt_epi32( _mm_sub_epi32( (a), _mm_set1_epi32( (int)(1U<<31) ) ),                                                \
     187             :                    _mm_sub_epi32( (b), _mm_set1_epi32( (int)(1U<<31) ) ) )
     188             : #define vu_lt(a,b) vu_gt( (b), (a) )                                                  /* [ a0< b0 a1< b1 ... a3< b3 ] */
     189             : #define vu_ne(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a3!=b3 ] */
     190             : #define vu_ge(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), vu_gt( (b), (a) ) )           /* [ a0>=b0 a1>=b1 ... a3>=b3 ] */
     191             : #define vu_le(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), vu_gt( (a), (b) ) )           /* [ a0<=b0 a1<=b1 ... a3<=b3 ] */
     192             : 
     193             : /* Conditional operations */
     194             : 
     195             : #define vu_czero(c,f)    _mm_andnot_si128( (c), (f) ) /* [ c0?0U:f0 c1?0U:f1 ... c3?0U:f3 ] */
     196             : #define vu_notczero(c,f) _mm_and_si128(    (c), (f) ) /* [ c0?f0:0U c1?f1:0U ... c3?f3:0U ] */
     197             : 
     198      524288 : #define vu_if(c,t,f) _mm_blendv_epi8(  (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c3?t3:f3 ] */
     199             : 
     200             : /* Conversion operations */
     201             : 
     202             : /* Summarizing:
     203             : 
     204             :    vu_to_vc(a)               returns [ !!a0 !!a1 ... !!a3 ]
     205             : 
     206             :    vu_to_vf(a)               returns [ (float)a0 (float)a1 ... (float)a3 ]
     207             : 
     208             :    vu_to_vi(a)               returns [ (int)a0 (int)a1 ... (int)a3 ]
     209             : 
     210             :    vu_to_vd(a,imm_i0,imm_i1) returns [ (double)a(imm_i0) (double)a(imm_i1) ]
     211             : 
     212             :    vu_to_vl(a,imm_i0,imm_i1) returns [ (long)a(imm_i0) (long)a(imm_i1) ]
     213             : 
     214             :    vu_to_vv(a,imm_i0,imm_i1) returns [ (ulong)a(imm_i0) (ulong)a(imm_i1) ]
     215             : 
     216             :    where imm_i* should be a compile time constant in 0:3.
     217             : 
     218             :    The raw variants just treat the raw bits as the corresponding vector
     219             :    type.  For vu_to_vc_raw, the user promises vu contains a proper
     220             :    vector conditional (i.e. 0 or -1 in each lane).  vu_to_vf_raw is
     221             :    useful for doing advanced bit tricks on floating point values.  The
     222             :    others are probably dubious but are provided for completeness. */
     223             : 
     224             : #define vu_to_vc(a)               _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( (a), _mm_setzero_si128() ) )
     225             : #define vu_to_vi(a)               (a)
     226             : 
     227     3538944 : static inline __m128d vu_to_vd_core( vu_t u ) { /* FIXME: workaround vd_t isn't declared at this point */
     228             : 
     229             :   /* Note: Given that _mm_cvtepi32_pd exists, Intel clearly has the
     230             :      hardware under the hood to support a _mm_cvtepu32_pd but didn't
     231             :      bother to expose it pre AVX-512 ... sigh (all too typical
     232             :      unfortunately).  We can do a mix of twos complement and floating
     233             :      point hacks to emulate it without spilling. */
     234             : 
     235     3538944 :   __m128i c  = _mm_cmpgt_epi32( _mm_setzero_si128(), u );         // 0      if u<2^31, -1     o.w
     236     3538944 :   __m128d d  = _mm_cvtepi32_pd( u );                              // u      if u<2^31, u-2^32 o.w, exact
     237     3538944 :   __m128d ds = _mm_add_pd( d, _mm_set1_pd( (double)(1UL<<32) ) ); // u+2^32 if u<2^31, u      o.w, exact
     238     3538944 :   __m128i cl = _mm_cvtepi32_epi64( c );                           // 0L     if u<2^31, -1L    o.w
     239     3538944 :   return _mm_blendv_pd( d, ds, _mm_castsi128_pd( cl ) );          // u
     240             : 
     241     3538944 : }
     242             : 
     243             : #define vu_to_vd(a,imm_i0,imm_i1) vu_to_vd_core( _mm_shuffle_epi32( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )
     244             : 
     245      196608 : static inline vf_t vu_to_vf( vu_t u ) {
     246             : 
     247             :   /* See note above re ISA dubiousness.  Note that we can't do the same
     248             :      trick as vu_to_vd due to single precision roundoff limitations (the
     249             :      _mm_cvtepi32_pd equivalent would not be exact such that add to
     250             :      correct the twos complement mangling would add a possible second
     251             :      roundoff error ... this would result in slightly different values
     252             :      occasionally when u is >~ 2^31).  We instead convert the two
     253             :      halves to double (exact), convert the double to float (single
     254             :      roundoff error) and then concat the two float halves to make a
     255             :      correctly rounded implementation. */
     256             : 
     257      196608 :   return _mm_shuffle_ps( _mm_cvtpd_ps( vu_to_vd_core(u) ), _mm_cvtpd_ps( vu_to_vd(u,2,3) ), _MM_SHUFFLE(1,0,1,0) );
     258      196608 : }
     259             : 
     260             : #define vu_to_vl(a,imm_i0,imm_i1) _mm_cvtepu32_epi64( _mm_shuffle_epi32( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )
     261             : #define vu_to_vv(a,imm_i0,imm_i1) _mm_cvtepu32_epi64( _mm_shuffle_epi32( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )
     262             : 
     263             : #define vu_to_vc_raw(a) (a)
     264             : #define vu_to_vf_raw(a) _mm_castsi128_ps( (a) )
     265             : #define vu_to_vi_raw(a) (a)
     266             : #define vu_to_vd_raw(a) _mm_castsi128_pd( (a) )
     267             : #define vu_to_vl_raw(a) (a)
     268             : #define vu_to_vv_raw(a) (a)
     269             : 
     270             : /* Reduction operations */
     271             : 
     272             : static inline vu_t
     273      196608 : vu_sum_all( vu_t x ) { /* Returns vu_bcast( sum( x ) ) */
     274      196608 :   x = _mm_hadd_epi32( x, x );    /* x01 x23 ... */
     275      196608 :   return _mm_hadd_epi32( x, x ); /* xsum ...    */
     276      196608 : }
     277             : 
     278             : static inline vu_t
     279      196608 : vu_min_all( vu_t x ) { /* Returns vu_bcast( min( x ) ) */
     280      196608 :   __m128i y;
     281      196608 :   y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x2  x3  x0  x1 */
     282      196608 :   x = _mm_min_epu32( x, y );                             /* x02 x13 ...    */
     283      196608 :   y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x13 x02 ...    */
     284      196608 :   x = _mm_min_epu32( x, y );                             /* xmin ...       */
     285      196608 :   return x;
     286      196608 : }
     287             : 
     288             : static inline vu_t
     289      196608 : vu_max_all( vu_t x ) { /* Returns vu_bcast( max( x ) ) */
     290      196608 :   __m128i y;
     291      196608 :   y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x2  x3  x0  x1 */
     292      196608 :   x = _mm_max_epu32( x, y );                             /* x02 x13 ...    */
     293      196608 :   y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x13 x02 ...    */
     294      196608 :   x = _mm_max_epu32( x, y );                             /* xmax ...       */
     295      196608 :   return x;
     296      196608 : }
     297             : 
     298             : /* Misc operations */
     299             : 
     300             : /* vu_gather(b,i) returns [ b[i(0)] b[i(1)] ... b[i(3)] ] where b is a
     301             :    "uint const *"  and i is a vi_t.  We use a static inline here instead
     302             :    of a define to keep strict type checking while working around yet
     303             :    another Intel intrinsic type mismatch issue. */
     304             : 
     305             : #if defined(__AVX2__)
     306    61538355 : static inline vu_t vu_gather( uint const * b, vi_t i ) {
     307    61538355 :   return _mm_i32gather_epi32( (int const *)b, (i), 4 );
     308    61538355 : }
     309             : #endif /* defined(__AVX2__) */
     310             : 
     311             : /* vu_transpose_4x4 transposes the 4x4 matrix stored in vu_t r0,r1,r2,r3
     312             :    and stores the result in 4x4 matrix vu_t c0,c1,c2,c3.  All
     313             :    c0,c1,c2,c3 should be different for a well defined result.
     314             :    Otherwise, in-place operation and/or using the same vu_t to specify
     315             :    multiple rows of r is fine. */
     316             : 
     317      196608 : #define vu_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do {                                                                   \
     318      196608 :     vu_t _vu_transpose_r0 = (r0); vu_t _vu_transpose_r1 = (r1); vu_t _vu_transpose_r2 = (r2); vu_t _vu_transpose_r3 = (r3); \
     319      196608 :     vu_t _vu_transpose_t;                                                                                                   \
     320      196608 :     /* Transpose 2x2 blocks */                                                                                              \
     321      196608 :     _vu_transpose_t = _vu_transpose_r0; _vu_transpose_r0 = _mm_unpacklo_epi32( _vu_transpose_t,  _vu_transpose_r2 );        \
     322      196608 :     /**/                                _vu_transpose_r2 = _mm_unpackhi_epi32( _vu_transpose_t,  _vu_transpose_r2 );        \
     323      196608 :     _vu_transpose_t = _vu_transpose_r1; _vu_transpose_r1 = _mm_unpacklo_epi32( _vu_transpose_t,  _vu_transpose_r3 );        \
     324      196608 :     /**/                                _vu_transpose_r3 = _mm_unpackhi_epi32( _vu_transpose_t,  _vu_transpose_r3 );        \
     325      196608 :     /* Transpose 1x1 blocks */                                                                                              \
     326      196608 :     /**/                                (c0)             = _mm_unpacklo_epi32( _vu_transpose_r0, _vu_transpose_r1 );        \
     327      196608 :     /**/                                (c1)             = _mm_unpackhi_epi32( _vu_transpose_r0, _vu_transpose_r1 );        \
     328      196608 :     /**/                                (c2)             = _mm_unpacklo_epi32( _vu_transpose_r2, _vu_transpose_r3 );        \
     329      196608 :     /**/                                (c3)             = _mm_unpackhi_epi32( _vu_transpose_r2, _vu_transpose_r3 );        \
     330      196608 :   } while(0)

Generated by: LCOV version 1.14