LCOV - code coverage report
Current view: top level - util/simd - fd_avx_wu.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 126 134 94.0 %
Date: 2025-01-08 12:08:44 Functions: 35 15370 0.2 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_avx_h
       2             : #error "Do not include this directly; use fd_avx.h"
       3             : #endif
       4             : 
       5             : /* Vector uint API ****************************************************/
       6             : 
       7             : /* A wu_t is a vector where each 32-bit wide lane holds an unsigned
       8             :    32-bit integer (a "uint").  These mirror wc and wf as much as
       9             :    possible.
      10             : 
      11             :    These mirror the other APIs as much as possible.  Macros are
      12             :    preferred over static inlines when it is possible to do it robustly
      13             :    to reduce the risk of the compiler mucking it up. */
      14             : 
      15 39131053619 : #define wu_t __m256i
      16             : 
      17             : /* Constructors */
      18             : 
      19             : /* Given the uint values, return ... */
      20             : 
      21             : #define wu(u0,u1,u2,u3,u4,u5,u6,u7) /* [ u0 u1 u2 u3 u4 u5 u6 u7 ] */ \
      22    59179779 :   _mm256_setr_epi32( (int)(u0), (int)(u1), (int)(u2), (int)(u3), (int)(u4), (int)(u5), (int)(u6), (int)(u7) )
      23             : 
      24  4650037972 : #define wu_bcast(u0) _mm256_set1_epi32( (int)(u0) ) /* [ u0 u0 u0 u0 u0 u0 u0 u0 ] */
      25             : 
      26             : static inline wu_t /* [ u0 u1 u0 u1 u0 u1 u0 u1 ] */
      27      196608 : wu_bcast_pair( uint u0, uint u1 ) {
      28      196608 :   int i0 = (int)u0; int i1 = (int)u1;
      29      196608 :   return _mm256_setr_epi32( i0, i1, i0, i1, i0, i1, i0, i1 );
      30      196608 : }
      31             : 
      32             : static inline wu_t /* [ u0 u0 u0 u0 u1 u1 u1 u1 ] */
      33      196608 : wu_bcast_lohi( uint u0, uint u1 ) {
      34      196608 :   int i0 = (int)u0; int i1 = (int)u1;
      35      196608 :   return _mm256_setr_epi32( i0, i0, i0, i0, i1, i1, i1, i1 );
      36      196608 : }
      37             : 
      38             : static inline wu_t /* [ u0 u1 u2 u3 u0 u1 u2 u3 ] */
      39      196608 : wu_bcast_quad( uint u0, uint u1, uint u2, uint u3 ) {
      40      196608 :   int i0 = (int)u0; int i1 = (int)u1; int i2 = (int)u2; int i3 = (int)u3;
      41      196608 :   return _mm256_setr_epi32( i0, i1, i2, i3, i0, i1, i2, i3 );
      42      196608 : }
      43             : 
      44             : static inline wu_t /* [ u0 u0 u1 u1 u2 u2 u3 u3 ] */
      45      196608 : wu_bcast_wide( uint u0, uint u1, uint u2, uint u3 ) {
      46      196608 :   int i0 = (int)u0; int i1 = (int)u1; int i2 = (int)u2; int i3 = (int)u3;
      47      196608 :   return _mm256_setr_epi32( i0, i0, i1, i1, i2, i2, i3, i3 );
      48      196608 : }
      49             : 
      50             : /* No general wu_permute due to cross-128-bit lane limitations in AVX.
      51             :    Useful cases are provided below.  Given [ u0 u1 u2 u3 u4 u5 u6 u7 ],
      52             :    return ... */
      53             : 
      54             : #define wu_bcast_even(x)      /* [ u0 u0 u2 u2 u4 u4 u6 u6 ] */ \
      55             :   _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (x) ), _MM_SHUFFLE(2,2,0,0) ) )
      56             : 
      57             : #define wu_bcast_odd(x)       /* [ u1 u1 u3 u3 u5 u5 u7 u7 ] */ \
      58             :   _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (x) ), _MM_SHUFFLE(3,3,1,1) ) )
      59             : 
      60             : #define wu_exch_adj(x)        /* [ u1 u0 u3 u2 u5 u4 u7 u6 ] */ \
      61             :   _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (x) ), _MM_SHUFFLE(2,3,0,1) ) )
      62             : 
      63             : #define wu_exch_adj_pair(x)   /* [ u2 u3 u0 u1 u6 u7 u4 u5 ] */ \
      64             :   _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( (x) ), _MM_SHUFFLE(1,0,3,2) ) )
      65             : 
      66             : static inline wu_t
      67      196608 : wu_exch_adj_quad( wu_t x ) { /* [ u4 u5 u6 u7 u0 u1 u2 u3 ] */
      68      196608 :   return _mm256_permute2f128_si256( x, x, 1 );
      69      196608 : }
      70             : 
      71             : /* Predefined constants */
      72             : 
      73   229430193 : #define wu_zero() _mm256_setzero_si256() /* Return [ 0U 0U 0U 0U 0U 0U 0U 0U ] */
      74    58589955 : #define wu_one()  _mm256_set1_epi32( 1 ) /* Return [ 1U 1U 1U 1U 1U 1U 1U 1U ] */
      75             : 
      76             : /* Memory operations */
      77             : 
      78             : /* wu_ld returns the 8 uints at the 32-byte aligned / 32-byte sized
      79             :    location p as a vector uint.  wu_ldu is the same but p does not have
      80             :    to be aligned.  wu_st writes the vector uint to the 32-byte aligned /
      81             :    32-byte sized location p as 8 uints.  wu_stu is the same but p does
      82             :    not have to be aligned.  In all these lane l will be at p[l].  FIXME:
      83             :    USE ATTRIBUTES ON P PASSED TO THESE?
      84             : 
      85             :    Note: gcc knows a __m256i may alias. */
      86             : 
      87    58589955 : static inline wu_t wu_ld( uint const * p ) { return _mm256_load_si256(  (__m256i const *)p ); }
      88   670286259 : static inline void wu_st( uint * p, wu_t i ) { _mm256_store_si256(  (__m256i *)p, i ); }
      89             : 
      90  5214795846 : static inline wu_t wu_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
      91   652572955 : static inline void wu_stu( void * p, wu_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
      92             : 
      93             : /* wu_ldif is an optimized equivalent to wu_notczero(c,wu_ldu(p)) (may
      94             :    have different behavior if c is not a proper vector conditional).  It
      95             :    is provided for symmetry with the wu_stif operation.  wu_stif stores
      96             :    x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
      97             :    Undefined behavior if c is not a proper vector conditional. */
      98             : 
      99             : #define wu_ldif(c,p)   _mm256_maskload_epi32( (p),(c))
     100             : #define wu_stif(c,p,x) _mm256_maskstore_epi32((p),(c),(x))
     101             : 
     102             : /* Element operations */
     103             : 
     104             : /* wu_extract extracts the uint in lane imm from the vector uint.
     105             :    wu_insert returns the vector uint formed by replacing the value in
     106             :    lane imm of a with the provided uint.  imm should be a compile time
     107             :    constant in 0:7.  wu_extract_variable and wu_insert_variable are
     108             :    slower but the lane n does not have to be known at compile time
     109             :    (should still be in 0:7).
     110             : 
     111             :    Note: C99 TC3 allows type punning through a union. */
     112             : 
     113   468719640 : #define wu_extract(a,imm)  ((uint)_mm256_extract_epi32( (a), (imm) ))
     114   468719640 : #define wu_insert(a,imm,v) _mm256_insert_epi32( (a), (int)(v), (imm) )
     115             : 
     116             : static inline uint
     117   468719640 : wu_extract_variable( wu_t a, int n ) {
     118   468719640 :   union { __m256i m[1]; uint u[8]; } t[1];
     119   468719640 :   _mm256_store_si256( t->m, a );
     120   468719640 :   return t->u[n];
     121   468719640 : }
     122             : 
     123             : static inline wu_t
     124   468719640 : wu_insert_variable( wu_t a, int n, uint v ) {
     125   468719640 :   union { __m256i m[1]; uint u[8]; } t[1];
     126   468719640 :   _mm256_store_si256( t->m, a );
     127   468719640 :   t->u[n] = v;
     128   468719640 :   return _mm256_load_si256( t->m );
     129   468719640 : }
     130             : 
     131             : /* Given [a0 a1 a2 a3 a4 a5 a6 a7] and/or [b0 b1 b2 b3 b4 b5 b6 b7],
     132             :    return ... */
     133             : 
     134             : /* Arithmetic operations */
     135             : 
     136             : #define wu_neg(a) _mm256_sub_epi32( _mm256_setzero_si256(), (a) ) /* [ -a0  -a1  ... -a7  ] (twos complement handling) */
     137             : #define wu_abs(a) (a)                                             /* [ |a0| |a1| ... |a7| ] (twos complement handling) */
     138             : 
     139             : #define wu_min(a,b) _mm256_min_epu32(   (a), (b) ) /* [ min(a0,b0) min(a1,b1) ... min(a7,b7) ] */
     140             : #define wu_max(a,b) _mm256_max_epu32(   (a), (b) ) /* [ max(a0,b0) max(a1,b1) ... max(a7,b7) ] */
     141 84509905578 : #define wu_add(a,b) _mm256_add_epi32(   (a), (b) ) /* [ a0 +b0     a1 +b1     ... a7 +b7     ] */
     142             : #define wu_sub(a,b) _mm256_sub_epi32(   (a), (b) ) /* [ a0 -b0     a1 -b1     ... a7 -b7     ] */
     143             : #define wu_mul(a,b) _mm256_mullo_epi32( (a), (b) ) /* [ a0 *b0     a1 *b1     ... a7 *b7     ] */
     144             : 
     145             : /* Binary operations */
     146             : 
     147             : /* Note: wu_shl/wu_shr is an unsigned left/right shift by imm bits; imm
     148             :    must be a compile time constant in [0,31].  The variable variants are
     149             :    slower but do not require the shift amount to be known at compile
     150             :    time (should still be in [0,31]). */
     151             : 
     152             : #define wu_not(a) _mm256_xor_si256( _mm256_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a7 ] */
     153             : 
     154             : #define wu_shl(a,imm)  _mm256_slli_epi32( (a), (imm) ) /* [ a0<<imm a1<<imm ... a7<<imm ] */
     155             : #define wu_shr(a,imm)  _mm256_srli_epi32( (a), (imm) ) /* [ a0>>imm a1>>imm ... a7>>imm ] */
     156             : 
     157             : #define wu_shl_variable(a,n) _mm256_sll_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
     158             : #define wu_shr_variable(a,n) _mm256_srl_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
     159             : 
     160  6546992260 : #define wu_shl_vector(a,b)   _mm256_sllv_epi32( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a7<<b7 ] */
     161             : #define wu_shr_vector(a,b)   _mm256_srlv_epi32( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a7>>b7 ] */
     162             : 
     163  6546992260 : #define wu_and(a,b)    _mm256_and_si256(    (a), (b) ) /* [   a0 &b0    a1& b1 ...   a7& b7 ] */
     164             : #define wu_andnot(a,b) _mm256_andnot_si256( (a), (b) ) /* [ (~a0)&b0  (~a1)&b1 ... (~a7)&b7 ] */
     165 >15294*10^7 : #define wu_or(a,b)     _mm256_or_si256(     (a), (b) ) /* [   a0 |b0    a1 |b1 ...   a7 |b7 ] */
     166 13704882404 : #define wu_xor(a,b)    _mm256_xor_si256(    (a), (b) ) /* [   a0 ^b0    a1 ^b1 ...   a7 ^b7 ] */
     167             : 
     168             : /* wu_rol(x,n) returns wu( rotate_left (x0,n), rotate_left (x1,n), ... )
     169             :    wu_ror(x,n) returns wu( rotate_right(x0,n), rotate_right(x1,n), ... ) */
     170             : 
     171             : #if FD_HAS_AVX512
     172  2061326256 : #define wu_rol(a,imm)  _mm256_rol_epi32( (a), (imm) )
     173             : #define wu_ror(a,imm)  _mm256_ror_epi32( (a), (imm) )
     174             : #else
     175 >14070*10^7 : static inline wu_t wu_rol( wu_t a, int imm ) { return wu_or( wu_shl( a, imm & 31 ), wu_shr( a, (-imm) & 31 ) ); }
     176     4194304 : static inline wu_t wu_ror( wu_t a, int imm ) { return wu_or( wu_shr( a, imm & 31 ), wu_shl( a, (-imm) & 31 ) ); }
     177             : #endif
     178             : 
     179     6291456 : static inline wu_t wu_rol_variable( wu_t a, int n ) { return wu_or( wu_shl_variable( a, n&31 ), wu_shr_variable( a, (-n)&31 ) ); }
     180     6291456 : static inline wu_t wu_ror_variable( wu_t a, int n ) { return wu_or( wu_shr_variable( a, n&31 ), wu_shl_variable( a, (-n)&31 ) ); }
     181             : 
     182           0 : static inline wu_t wu_rol_vector( wu_t a, wi_t b ) {
     183           0 :   wi_t m = wi_bcast( 31 );
     184           0 :   return wu_or( wu_shl_vector( a, wi_and( b, m ) ), wu_shr_vector( a, wi_and( wi_neg( b ), m ) ) );
     185           0 : }
     186             : 
     187           0 : static inline wu_t wu_ror_vector( wu_t a, wi_t b ) {
     188           0 :   wi_t m = wi_bcast( 31 );
     189           0 :   return wu_or( wu_shr_vector( a, wi_and( b, m ) ), wu_shl_vector( a, wi_and( wi_neg( b ), m ) ) );
     190           0 : }
     191             : 
     192  4200960748 : static inline wu_t wu_bswap( wu_t a ) {
     193  4200960748 :   wu_t m = wu_bcast( 0x00FF00FFU );                                            /* Probably hoisted */
     194  4200960748 :   wu_t t = wu_rol( a, 16 );                                                    /* Swap E/O 16-bit pairs */
     195  4200960748 :   return wu_or( wu_andnot( m, wu_shl( t, 8 ) ), wu_and( m, wu_shr( t, 8 ) ) ); /* Swap E/O  8-bit pairs */
     196  4200960748 : }
     197             : 
     198             : /* Logical operations */
     199             : 
     200             : /* As noted below in the wu_to_{wf,wd} converters, Intel clearly has
     201             :    the hardware to do a _mm256_cmpgt_epu32 given that _mm256_cmpgt_epi32
     202             :    exists but doesn't expose it in the ISA pre AVX-512.  Sigh ... twos
     203             :    complement bit tricks to the rescue for wu_{gt,lt,ge,le}. */
     204             : 
     205             : #define wu_lnot(a)    _mm256_cmpeq_epi32( (a), _mm256_setzero_si256() ) /* [  !a0  !a1 ...  !a7 ] */
     206             : #define wu_lnotnot(a)                                                   /* [ !!a0 !!a1 ... !!a7 ] */ \
     207             :   _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( (a), _mm256_setzero_si256() ) )
     208             : 
     209             : #define wu_eq(a,b) _mm256_cmpeq_epi32( (a), (b) )                                              /* [ a0==b0 a1==b1 ... a7==b7 ] */
     210             : #define wu_gt(a,b)                                                                             /* [ a0> b0 a1> b1 ... a7> b7 ] */ \
     211             :   _mm256_cmpgt_epi32( _mm256_sub_epi32( (a), _mm256_set1_epi32( (int)(1U<<31) ) ),                                                \
     212             :                       _mm256_sub_epi32( (b), _mm256_set1_epi32( (int)(1U<<31) ) ) )
     213             : #define wu_lt(a,b) wu_gt( (b), (a) )                                                           /* [ a0< b0 a1< b1 ... a7< b7 ] */
     214             : #define wu_ne(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a7!=b7 ] */
     215             : #define wu_ge(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), wu_gt( (b), (a) ) )              /* [ a0>=b0 a1>=b1 ... a7>=b7 ] */
     216             : #define wu_le(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), wu_gt( (a), (b) ) )              /* [ a0<=b0 a1<=b1 ... a7<=b7 ] */
     217             : 
     218             : /* Conditional operations */
     219             : 
     220             : #define wu_czero(c,f)    _mm256_andnot_si256( (c), (f) ) /* [ c0?0U:f0 c1?0U:f1 ... c7?0U:f7 ] */
     221             : #define wu_notczero(c,f) _mm256_and_si256(    (c), (f) ) /* [ c0?f0:0U c1?f1:0U ... c7?f7:0U ] */
     222             : 
     223             : #define wu_if(c,t,f) _mm256_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c7?t7:f7 ] */
     224             : 
     225             : /* Conversion operations */
     226             : 
     227             : /* Summarizing:
     228             : 
     229             :    wu_to_wc(a)   returns [ !!a0 !!a1 ... !!a7 ]
     230             : 
     231             :    wu_to_wf(a)   returns [ (float)a0 (float)a1 ... (float)a7 ]
     232             : 
     233             :    wu_to_wi(a)   returns [ (int)a0 (int)a1 ... (int)a7 ]
     234             : 
     235             :    wu_to_wd(a,0) returns [ (double)a0 (double)a1 (double)a2 (double)a3 ]
     236             :    wu_to_wd(a,1) returns [ (double)a4 (double)a5 (double)a6 (double)a7 ]
     237             : 
     238             :    wu_to_wl(a,0) returns [ (long)a0   (long)a1   (long)a2   (long)a3   ]
     239             :    wu_to_wl(a,1) returns [ (long)a4   (long)a5   (long)a6   (long)a7   ]
     240             : 
     241             :    wu_to_wv(a,0) returns [ (ulong)a0  (ulong)a1  (ulong)a2  (ulong)a3  ]
     242             :    wu_to_wv(a,1) returns [ (ulong)a4  (ulong)a5  (ulong)a6  (ulong)a7  ]
     243             : 
     244             :    where imm_hi (the second argument) should be a compile time constant.
     245             : 
     246             :    For wu_to_{wd,wl,wv}, the permutation used for the conversion is less
     247             :    flexible due to cross 128-bit lane limitations in AVX.  If imm_hi==0,
     248             :    the conversion is done to lanes 0:3.  Otherwise, the conversion is
     249             :    done to lanes 4:7.
     250             : 
     251             :    The raw variants just treat the raw bits as the corresponding vector
     252             :    type.  For wu_to_wc_raw, the user promises wu contains a proper
     253             :    vector conditional (e.g. 0 or -1 in each lane).  wu_to_wf_raw is
     254             :    useful for doing advanced bit tricks on floating point values.  The
     255             :    others are probably dubious but are provided for completeness. */
     256             : 
     257             : #define wu_to_wc(a) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( (a), _mm256_setzero_si256() ) )
     258             : 
     259             : #define wu_to_wi(a) (a)
     260             : 
     261      786432 : static inline __m256d wu_to_wd( wu_t u, int imm_hi ) { /* FIXME: workaround wd_t isn't declared at this point */
     262             : 
     263             :   /* Note: Given that _mm256_cvtepi32_pd exists, Intel clearly has the
     264             :      hardware under the hood to support a _mm256_cvtepu32_pd but didn't
     265             :      bother to expose it pre AVX-512 ... sigh (all too typical
     266             :      unfortunately).  We can do a mix of twos complement and floating
     267             :      point hacks to emulate it without spilling. */
     268             : 
     269      786432 :   __m128i i  = imm_hi ? _mm256_extractf128_si256( u, 1 ) : _mm256_extractf128_si256( u, 0 ); // u      if u<2^31, u-2^32 o.w
     270      786432 :   __m128i c  = _mm_cmpgt_epi32( _mm_setzero_si128(), i );                                    // 0      if u<2^31, -1     o.w
     271      786432 :   __m256d d  = _mm256_cvtepi32_pd( i );                                                      // u      if u<2^31, u-2^32 o.w, exact
     272      786432 :   __m256d ds = _mm256_add_pd( d, _mm256_set1_pd( (double)(1UL<<32) ) );                      // u+2^32 if u<2^31, u      o.w, exact
     273      786432 :   __m256i cl = _mm256_cvtepi32_epi64( c );                                                   // 0L     if u<2^31, -1L    o.w
     274      786432 :   return _mm256_blendv_pd( d, ds, _mm256_castsi256_pd( cl ) );                               // u
     275      786432 : }
     276             : 
     277      196608 : static inline wf_t wu_to_wf( wu_t a ) {
     278             : 
     279             :   /* See note above re ISA dubiousness.  Note that we can't do the same
     280             :      trick as wu_to_wd due to single precision roundoff limitations (the
     281             :      _mm256_cvtepi32_pd equivalent would not be exact such that add to
     282             :      correct the twos complement mangling would add a possible second
     283             :      roundoff error ... this would result in slightly different values
     284             :      occasionally when u is >~ 2^31).  We instead convert the two
     285             :      halves to double (exact), convert the double to float (single
     286             :      roundoff error) and then concat the two float halves to make a
     287             :      correctly rounded implementation. */
     288             : 
     289      196608 :   return _mm256_setr_m128( _mm256_cvtpd_ps( wu_to_wd( a, 0 ) ), _mm256_cvtpd_ps( wu_to_wd( a, 1 ) ) );
     290      196608 : }
     291             : 
     292             : #define wu_to_wl(a,imm_hi) _mm256_cvtepu32_epi64( _mm256_extractf128_si256( (a), !!(imm_hi) ) )
     293             : #define wu_to_wv(a,imm_hi) _mm256_cvtepu32_epi64( _mm256_extractf128_si256( (a), !!(imm_hi) ) )
     294             : 
     295             : #define wu_to_wc_raw(a) (a)
     296             : #define wu_to_wf_raw(a) _mm256_castsi256_ps( (a) )
     297             : #define wu_to_wi_raw(a) (a)
     298             : #define wu_to_wd_raw(a) _mm256_castsi256_pd( (a) )
     299             : #define wu_to_wl_raw(a) (a)
     300    28682184 : #define wu_to_wv_raw(a) (a)
     301             : 
     302             : /* Reduction operations */
     303             : 
     304             : static inline wu_t
     305      196608 : wu_sum_all( wu_t x ) { /* Returns wu_bcast( sum( x ) ) */
     306      196608 :   x = _mm256_add_epi32( x, _mm256_permute2f128_si256( x, x, 1 ) ); /* x04   x15   x26   x37   ... */
     307      196608 :   x = _mm256_hadd_epi32( x, x );                                   /* x0145 x2367 ... */
     308      196608 :   return _mm256_hadd_epi32( x, x );                                /* xsum  ... */
     309      196608 : }
     310             : 
     311             : static inline wu_t
     312      196608 : wu_min_all( wu_t x ) { /* Returns wu_bcast( min( x ) ) */
     313      196608 :   __m256i y = _mm256_permute2f128_si256( x, x, 1 );         /* x4    x5    x6   x7    x0    x1   x2    x3   */
     314      196608 :   x = _mm256_min_epu32( x, y );                             /* x04   x15   x26  x37   ... */
     315      196608 :   y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x26   x37   x04  x15   ... */
     316      196608 :   x = _mm256_min_epu32( x, y );                             /* x0246 x1357 ... */
     317      196608 :   y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x1357 x0246 ... */
     318      196608 :   x = _mm256_min_epu32( x, y );                             /* xmin  ... */
     319      196608 :   return x;
     320      196608 : }
     321             : 
     322             : static inline wu_t
     323      196608 : wu_max_all( wu_t x ) { /* Returns wu_bcast( max( x ) ) */
     324      196608 :   __m256i y = _mm256_permute2f128_si256( x, x, 1 );         /* x4    x5    x6   x7    x0    x1   x2    x3   */
     325      196608 :   x = _mm256_max_epu32( x, y );                             /* x04   x15   x26  x37   ... */
     326      196608 :   y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x26   x37   x04  x15   ... */
     327      196608 :   x = _mm256_max_epu32( x, y );                             /* x0246 x1357 ... */
     328      196608 :   y = _mm256_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x1357 x0246 ... */
     329      196608 :   x = _mm256_max_epu32( x, y );                             /* xmax  ... */
     330      196608 :   return x;
     331      196608 : }
     332             : 
     333             : /* Misc operations */
     334             : 
     335             : /* wu_gather(b,i) returns [ b[i(0)] b[i(1)] ... b[i(7)] ] where b is a
     336             :    "uint const *" and i is a wi_t.  We use a static inline here instead
     337             :    of a define to keep strict type checking while working around yet
     338             :    another Intel intrinsic type mismatch issue. */
     339             : 
     340    58589955 : static inline wu_t wu_gather( uint const * b, wi_t i ) {
     341    58589955 :   return _mm256_i32gather_epi32( (int const *)b, (i), 4 );
     342    58589955 : }
     343             : 
     344             : /* wu_transpose_8x8 transposes the 8x8 matrix stored in wu_t r0,r1,...r7
     345             :    and stores the result in 8x8 matrix wu_t c0,c1,...c7.  All
     346             :    c0,c1,...c7 should be different for a well defined result.
     347             :    Otherwise, in-place operation and/or using the same wu_t to specify
     348             :    multiple rows of r is fine. */
     349             : 
     350   552803150 : #define wu_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 ) do {                                                 \
     351   552803150 :     wu_t _wu_transpose_r0 = (r0); wu_t _wu_transpose_r1 = (r1); wu_t _wu_transpose_r2 = (r2); wu_t _wu_transpose_r3 = (r3);       \
     352   552803150 :     wu_t _wu_transpose_r4 = (r4); wu_t _wu_transpose_r5 = (r5); wu_t _wu_transpose_r6 = (r6); wu_t _wu_transpose_r7 = (r7);       \
     353   552803150 :     wu_t _wu_transpose_t;                                                                                                         \
     354   552803150 :     /* Transpose 4x4 blocks */                                                                                                    \
     355   552803150 :     _wu_transpose_t = _wu_transpose_r0; _wu_transpose_r0 = _mm256_permute2f128_si256( _wu_transpose_t,  _wu_transpose_r4, 0x20 ); \
     356   552803150 :     /**/                                _wu_transpose_r4 = _mm256_permute2f128_si256( _wu_transpose_t,  _wu_transpose_r4, 0x31 ); \
     357   552803150 :     _wu_transpose_t = _wu_transpose_r1; _wu_transpose_r1 = _mm256_permute2f128_si256( _wu_transpose_t,  _wu_transpose_r5, 0x20 ); \
     358   552803150 :     /**/                                _wu_transpose_r5 = _mm256_permute2f128_si256( _wu_transpose_t,  _wu_transpose_r5, 0x31 ); \
     359   552803150 :     _wu_transpose_t = _wu_transpose_r2; _wu_transpose_r2 = _mm256_permute2f128_si256( _wu_transpose_t,  _wu_transpose_r6, 0x20 ); \
     360   552803150 :     /**/                                _wu_transpose_r6 = _mm256_permute2f128_si256( _wu_transpose_t,  _wu_transpose_r6, 0x31 ); \
     361   552803150 :     _wu_transpose_t = _wu_transpose_r3; _wu_transpose_r3 = _mm256_permute2f128_si256( _wu_transpose_t,  _wu_transpose_r7, 0x20 ); \
     362   552803150 :     /**/                                _wu_transpose_r7 = _mm256_permute2f128_si256( _wu_transpose_t,  _wu_transpose_r7, 0x31 ); \
     363   552803150 :     /* Transpose 2x2 blocks */                                                                                                    \
     364   552803150 :     _wu_transpose_t = _wu_transpose_r0; _wu_transpose_r0 = _mm256_unpacklo_epi32(     _wu_transpose_t,  _wu_transpose_r2 );       \
     365   552803150 :     /**/                                _wu_transpose_r2 = _mm256_unpackhi_epi32(     _wu_transpose_t,  _wu_transpose_r2 );       \
     366   552803150 :     _wu_transpose_t = _wu_transpose_r1; _wu_transpose_r1 = _mm256_unpacklo_epi32(     _wu_transpose_t,  _wu_transpose_r3 );       \
     367   552803150 :     /**/                                _wu_transpose_r3 = _mm256_unpackhi_epi32(     _wu_transpose_t,  _wu_transpose_r3 );       \
     368   552803150 :     _wu_transpose_t = _wu_transpose_r4; _wu_transpose_r4 = _mm256_unpacklo_epi32(     _wu_transpose_t,  _wu_transpose_r6 );       \
     369   552803150 :     /**/                                _wu_transpose_r6 = _mm256_unpackhi_epi32(     _wu_transpose_t,  _wu_transpose_r6 );       \
     370   552803150 :     _wu_transpose_t = _wu_transpose_r5; _wu_transpose_r5 = _mm256_unpacklo_epi32(     _wu_transpose_t,  _wu_transpose_r7 );       \
     371   552803150 :     /**/                                _wu_transpose_r7 = _mm256_unpackhi_epi32(     _wu_transpose_t,  _wu_transpose_r7 );       \
     372   552803150 :     /* Transpose 1x1 blocks */                                                                                                    \
     373   552803150 :     /**/                                (c0)             = _mm256_unpacklo_epi32(     _wu_transpose_r0, _wu_transpose_r1 );       \
     374   552803150 :     /**/                                (c1)             = _mm256_unpackhi_epi32(     _wu_transpose_r0, _wu_transpose_r1 );       \
     375   552803150 :     /**/                                (c2)             = _mm256_unpacklo_epi32(     _wu_transpose_r2, _wu_transpose_r3 );       \
     376   552803150 :     /**/                                (c3)             = _mm256_unpackhi_epi32(     _wu_transpose_r2, _wu_transpose_r3 );       \
     377   552803150 :     /**/                                (c4)             = _mm256_unpacklo_epi32(     _wu_transpose_r4, _wu_transpose_r5 );       \
     378   552803150 :     /**/                                (c5)             = _mm256_unpackhi_epi32(     _wu_transpose_r4, _wu_transpose_r5 );       \
     379   552803150 :     /**/                                (c6)             = _mm256_unpacklo_epi32(     _wu_transpose_r6, _wu_transpose_r7 );       \
     380   552803150 :     /**/                                (c7)             = _mm256_unpackhi_epi32(     _wu_transpose_r6, _wu_transpose_r7 );       \
     381   552803150 :   } while(0)

Generated by: LCOV version 1.14