LCOV - code coverage report
Current view: top level - util/simd - fd_avx_wb.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 142 144 98.6 %
Date: 2025-01-08 12:08:44 Functions: 64 15384 0.4 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_avx_h
       2             : #error "Do not include this directly; use fd_avx.h"
       3             : #endif
       4             : 
       5             : /* Vector byte API *****************************************************/
       6             : 
       7             : /* A wb_t is a vector where each 8-bit wide lane holds an unsigned 8-bit
       8             :    integer (a "uchar").
       9             : 
      10             :    These mirror the other APIs as much as possible.  Macros are
      11             :    preferred over static inlines when it is possible to do it robustly
      12             :    to reduce the risk of the compiler mucking it up. */
      13             : 
      14  4925087643 : #define wb_t __m256i
      15             : 
      16             : /* Constructors */
      17             : 
      18             : /* TODO: update older SIMD modules to follow the more general convention
      19             :    below. */
      20             : 
      21             : /* Given the uchar values, return ... */
      22             : 
      23             : #define wb(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10,b11,b12,b13,b14,b15,                                                 \
      24             :            b16,b17,b18,b19,b20,b21,b22,b23,b24,b25,b26,b27,b28,b29,b30,b31) /* [ b0 b1 ... b31 ] */                         \
      25  6118340325 :   _mm256_setr_epi8( (char)( b0), (char)( b1), (char)( b2), (char)( b3), (char)( b4), (char)( b5), (char)( b6), (char)( b7), \
      26  6118340325 :                     (char)( b8), (char)( b9), (char)(b10), (char)(b11), (char)(b12), (char)(b13), (char)(b14), (char)(b15), \
      27  6118340325 :                     (char)(b16), (char)(b17), (char)(b18), (char)(b19), (char)(b20), (char)(b21), (char)(b22), (char)(b23), \
      28  6118340325 :                     (char)(b24), (char)(b25), (char)(b26), (char)(b27), (char)(b28), (char)(b29), (char)(b30), (char)(b31) )
      29             : 
      30      393216 : #define wb_bcast(b0) _mm256_set1_epi8( (char)(b0) ) /* [ b0 b0 ... b0 ] */
      31             : 
      32             : static inline wb_t /* [ b0 b1 b0 b1 ... b0 b1 ] */
      33     1376550 : wb_bcast_pair( uchar b0, uchar b1 ) {
      34     1376550 :   return _mm256_setr_epi8( (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1),
      35     1376550 :                            (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1),
      36     1376550 :                            (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1),
      37     1376550 :                            (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1) );
      38     1376550 : }
      39             : 
      40             : static inline wb_t /* [ b0 b1 b2 b3 b0 b1 b2 b3 ... b0 b1 b2 b3 ] */
      41      589824 : wb_bcast_quad( uchar b0, uchar b1, uchar b2, uchar b3 ) {
      42      589824 :   return _mm256_setr_epi8( (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3),
      43      589824 :                            (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3),
      44      589824 :                            (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3),
      45      589824 :                            (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3) );
      46      589824 : }
      47             : 
      48             : static inline wb_t /* [ b0 b1 ... b7 b0 b1 ... b7 b0 b1 ... b7 b0 b1 ... b7 ] */
      49      196608 : wb_bcast_oct( uchar b0, uchar b1, uchar b2, uchar b3, uchar b4, uchar b5, uchar b6, uchar b7 ) {
      50      196608 :   return _mm256_setr_epi8( (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7),
      51      196608 :                            (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7),
      52      196608 :                            (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7),
      53      196608 :                            (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7) );
      54      196608 : }
      55             : 
      56             : static inline wb_t /* [ b0 b1 ... b15 b0 b1 ... b15 ] */
      57             : wb_bcast_hex( uchar b0, uchar b1, uchar b2,  uchar b3,  uchar b4,  uchar b5,  uchar b6,  uchar b7,
      58      196608 :               uchar b8, uchar b9, uchar b10, uchar b11, uchar b12, uchar b13, uchar b14, uchar b15 ) {
      59      196608 :   return _mm256_setr_epi8( (char)(b0), (char)(b1), (char)(b2),  (char)(b3),  (char)(b4),  (char)(b5),  (char)(b6),  (char)(b7),
      60      196608 :                            (char)(b8), (char)(b9), (char)(b10), (char)(b11), (char)(b12), (char)(b13), (char)(b14), (char)(b15),
      61      196608 :                            (char)(b0), (char)(b1), (char)(b2),  (char)(b3),  (char)(b4),  (char)(b5),  (char)(b6),  (char)(b7),
      62      196608 :                            (char)(b8), (char)(b9), (char)(b10), (char)(b11), (char)(b12), (char)(b13), (char)(b14), (char)(b15) );
      63      196608 : }
      64             : 
      65             : static inline wb_t /* [ b0 b0 ... b0 b1 b1 ... b1 ] */
      66      196608 : wb_expand_pair( uchar b0, uchar b1 ) {
      67      196608 :   return _mm256_setr_epi8( (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0),
      68      196608 :                            (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0),
      69      196608 :                            (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1),
      70      196608 :                            (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1) );
      71      196608 : }
      72             : 
      73             : static inline wb_t /* [ b0 b0 ... b0 b1 b1 ... b1 b2 b2 ... b2 b3 b3 ... b3 ] */
      74      196608 : wb_expand_quad( uchar b0, uchar b1, uchar b2, uchar b3 ) {
      75      196608 :   return _mm256_setr_epi8( (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0),
      76      196608 :                            (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1),
      77      196608 :                            (char)(b2), (char)(b2), (char)(b2), (char)(b2), (char)(b2), (char)(b2), (char)(b2), (char)(b2),
      78      196608 :                            (char)(b3), (char)(b3), (char)(b3), (char)(b3), (char)(b3), (char)(b3), (char)(b3), (char)(b3) );
      79      196608 : }
      80             : 
      81             : static inline wb_t /* [ b0 b0 b0 b0 b1 b1 b1 b1 ... b7 b7 b7 b7 ] */
      82      196608 : wb_expand_oct( uchar b0, uchar b1, uchar b2, uchar b3, uchar b4, uchar b5, uchar b6, uchar b7 ) {
      83      196608 :   return _mm256_setr_epi8( (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b1), (char)(b1), (char)(b1), (char)(b1),
      84      196608 :                            (char)(b2), (char)(b2), (char)(b2), (char)(b2), (char)(b3), (char)(b3), (char)(b3), (char)(b3),
      85      196608 :                            (char)(b4), (char)(b4), (char)(b4), (char)(b4), (char)(b5), (char)(b5), (char)(b5), (char)(b5),
      86      196608 :                            (char)(b6), (char)(b6), (char)(b6), (char)(b6), (char)(b7), (char)(b7), (char)(b7), (char)(b7) );
      87      196608 : }
      88             : 
      89             : static inline wb_t /* [ b0 b0 b1 b1 ... b15 b15 ] */
      90             : wb_expand_hex( uchar b0, uchar b1, uchar  b2, uchar  b3, uchar  b4, uchar  b5, uchar  b6, uchar b7,
      91      196608 :                uchar b8, uchar b9, uchar b10, uchar b11, uchar b12, uchar b13, uchar b14, uchar b15 ) {
      92      196608 :   return _mm256_setr_epi8( (char)( b0), (char)( b0), (char)( b1), (char)( b1), (char)( b2), (char)( b2), (char)( b3), (char)( b3),
      93      196608 :                            (char)( b4), (char)( b4), (char)( b5), (char)( b5), (char)( b6), (char)( b6), (char)( b7), (char)( b7),
      94      196608 :                            (char)( b8), (char)( b8), (char)( b9), (char)( b9), (char)(b10), (char)(b10), (char)(b11), (char)(b11),
      95      196608 :                            (char)(b12), (char)(b12), (char)(b13), (char)(b13), (char)(b14), (char)(b14), (char)(b15), (char)(b15) );
      96      196608 : }
      97             : 
      98             : /* No general wb_permute due to cross-128-bit lane limitations in AVX.
      99             :    Useful cases are provided below.  Given [ b0 b1 ... b31 ], return ...  */
     100             : 
     101             : #define wb_exch_adj(x)      /* [ b1 b0 b3 b2 ... b31 b30 ] */ \
     102         294 :   _mm256_shuffle_epi8( (x), wb( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, \
     103         294 :                                 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 ) )
     104             : 
     105             : #define wb_exch_adj_pair(x) /* [ b2 b3 b0 b1 .. b30 b31 b28 b29 ] */ \
     106  3058481814 :   _mm256_shuffle_epi8( (x), wb( 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, \
     107  3058481814 :                                 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ) )
     108             : 
     109             : #define wb_exch_adj_quad(x) /* [ b4 b5 b6 b7 b0 b1 b2 b3 .. b28 b29 b30 b31 b24 b25 b26 b27 ] */ \
     110             :   _mm256_shuffle_epi8( (x), wb( 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, \
     111             :                                 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 ) )
     112             : 
     113             : #define wb_exch_adj_oct(x)  /* [ b8 b9 ... b15 b0 b1 ... b7 b24 b25 ... b31 b16 b17 ... b23 ] */ \
     114             :   _mm256_shuffle_epi8( (x), wb( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,            \
     115             :                                 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ) )
     116             : 
     117             : static inline wb_t          /* [ b16 b17 ... b31 b0 b1 ... b15 ] */
     118      196608 : wb_exch_adj_hex( wb_t x ) {
     119      196608 :   return _mm256_permute2f128_si256( x, x, 1 );
     120      196608 : }
     121             : 
     122             : #define wb_bcast_even(x)    /* [ b0 b0 b2 b2 ... b30 b30 ] */                            \
     123             :   _mm256_shuffle_epi8( (x), wb( 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,    \
     124             :                                 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 ) )
     125             : 
     126             : #define wb_bcast_odd(x)     /* [ b1 b1 b3 b3 ... b31 b31 ] */                            \
     127             :   _mm256_shuffle_epi8( (x), wb( 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,    \
     128             :                                 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 ) )
     129             : 
     130             : /* Predefined constants */
     131             : 
     132   870490330 : #define wb_zero() _mm256_setzero_si256() /* Return [ 0 0 ... 0 ] */
     133             : #define wb_one()  _mm256_set1_epi8( 1 )  /* Return [ 1 1 ... 1 ] */
     134             : 
     135             : /* Memory operations */
     136             : 
     137             : /* wb_ld returns the 32 uchars at the 32-byte aligned / 32-byte sized
     138             :    location p as a vector uchar.  wb_ldu is the same but p does not have
     139             :    to be aligned.  wb_st writes the vector uchar to the 32-byte aligned /
     140             :    32-byte sized location p as 32 uchars.  wb_stu is the same but p does not
     141             :    have to be aligned.  In all these lane l will be at p[l].  FIXME: USE
     142             :    ATTRIBUTES ON P PASSED TO THESE?
     143             : 
     144             :    Note: gcc knows a __m256i may alias. */
     145             : 
     146 15415434319 : static inline wb_t wb_ld( uchar const * p ) { return _mm256_load_si256(  (__m256i const *)p ); }
     147    20643981 : static inline void wb_st( uchar * p, wb_t i ) { _mm256_store_si256(  (__m256i *)p, i ); }
     148             : 
     149  3875194216 : static inline wb_t wb_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
     150  1408716134 : static inline void wb_stu( void * p, wb_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
     151             : 
     152             : /* Sadly, no maskload_epi8, so we can't provide a wb_ldif or wb_stif.
     153             :    TODO: consider emulating this? */
     154             : 
     155             : /* Element operations */
     156             : 
     157             : /* wb_extract extracts the uchar in lane imm from the vector uchar.
     158             :    wb_insert returns the vector uchar formed by replacing the value in
     159             :    lane imm of a wb_t with the provided uchar.  imm should be a compile
     160             :    time constant in 0:31.  wb_extract_variable and wb_insert_variable
     161             :    are slower but the lane n does not have to be known at compile
     162             :    time (should still be in 0:31).
     163             : 
     164             :    Note: C99 TC3 allows type punning through a union. */
     165             : 
     166   660603072 : #define wb_extract(a,imm)  ((uchar)_mm256_extract_epi8( (a), (imm) ))
     167   660603072 : #define wb_insert(a,imm,v) _mm256_insert_epi8( (a), (char)(v), (imm) )
     168             : 
     169             : static inline uchar
     170   660603072 : wb_extract_variable( wb_t a, int n ) {
     171   660603072 :   union { __m256i m[1]; uchar i[32]; } t[1];
     172   660603072 :   _mm256_store_si256( t->m, a );
     173   660603072 :   return t->i[n];
     174   660603072 : }
     175             : 
     176             : static inline wb_t
     177   660603072 : wb_insert_variable( wb_t a, int n, uchar v ) {
     178   660603072 :   union { __m256i m[1]; uchar i[32]; } t[1];
     179   660603072 :   _mm256_store_si256( t->m, a );
     180   660603072 :   t->i[n] = v;
     181   660603072 :   return _mm256_load_si256( t->m );
     182   660603072 : }
     183             : 
     184             : /* Given [a0 a1 ... a31] and/or [b0 b1 ... b31], return ... */
     185             : 
     186             : /* Arithmetic operations */
     187             : 
     188             : #define wb_neg(a) _mm256_sub_epi8( _mm256_setzero_si256(), (a) ) /* [ -a0  -a1  ... -a31  ] (twos complement handling) */
     189             : #define wb_abs(a) (a)                                            /* [ |a0| |a1| ... |a31| ] (unsigned type, so identity) */
     190             : 
     191             : #define wb_min(a,b) _mm256_min_epu8( (a), (b) ) /* [ min(a0,b0) min(a1,b1) ... min(a31,b31) ] */
     192             : #define wb_max(a,b) _mm256_max_epu8( (a), (b) ) /* [ max(a0,b0) max(a1,b1) ... max(a31,b31) ] */
     193        1698 : #define wb_add(a,b) _mm256_add_epi8( (a), (b) ) /* [ a0 +b0     a1 +b1     ... a31 +b31     ] */
     194        3270 : #define wb_sub(a,b) _mm256_sub_epi8( (a), (b) ) /* [ a0 -b0     a1 -b1     ... a31 -b31     ] */
     195             : 
     196             : /* No wb_mul because there's no instruction for multiplying uchars.  You
     197             :    can build one with two invocations to _mm_mullo_epi16, but it won't
     198             :    be particularly fast.  Multiplication by add and shift might be
     199             :    faster honestly.  TODO: consider emulating for completeness? */
     200             : 
     201             : /* Bit operations */
     202             : 
     203             : /* Note: wb_shl/wb_shr is an unsigned left/right shift by imm bits; imm
     204             :    must be a compile time constant in 0:7.  The variable variants are
     205             :    slower but do not require the shift amount to be known at compile
     206             :    time (should still be in 0:7).
     207             : 
     208             :    vector shift amount variants are omitted for the time being as these
     209             :    are rarely needed and there seems to be little support for it.
     210             :    Probably could be done via two 16-wide vector shifts for the even/odd
     211             :    lanes and some masking tricks. */
     212             : 
     213             : #define wb_not(a) _mm256_xor_si256( _mm256_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a31 ] */
     214             : 
     215             : #define wb_shl(a,imm) wb_and( _mm256_slli_epi16( (a), (imm) ), wb_bcast( (uchar)(0xFFUL << (imm)) ) ) /* [ a0<<imm a1<<imm ... a31<<imm ] */
     216   328587396 : #define wb_shr(a,imm) wb_and( _mm256_srli_epi16( (a), (imm) ), wb_bcast( (uchar)(0xFFUL >> (imm)) ) ) /* [ a0>>imm a1>>imm ... a31>>imm ] */
     217             : 
     218             : #define wb_shl_variable(a,n) wb_and( _mm256_sll_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ), \
     219             :                                      wb_bcast( (uchar)(0xFFUL << (n)) ) )
     220             : #define wb_shr_variable(a,n) wb_and( _mm256_srl_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ), \
     221             :                                      wb_bcast( (uchar)(0xFFUL >> (n)) ) )
     222             : 
     223   657174939 : #define wb_and(a,b)    _mm256_and_si256(    (a), (b) ) /* [   a0 &b0    a1& b1 ...   a31& b31 ] */
     224             : #define wb_andnot(a,b) _mm256_andnot_si256( (a), (b) ) /* [ (~a0)&b0  (~a1)&b1 ... (~a31)&b31 ] */
     225     6292977 : #define wb_or(a,b)     _mm256_or_si256(     (a), (b) ) /* [   a0 |b0    a1 |b1 ...   a31 |b31 ] */
     226 26838393374 : #define wb_xor(a,b)    _mm256_xor_si256(    (a), (b) ) /* [   a0 ^b0    a1 ^b1 ...   a31 ^b31 ] */
     227             : 
     228     1572864 : static inline wb_t wb_rol( wb_t a, int imm ) { return wb_or( wb_shl( a, imm & 7 ), wb_shr( a, (-imm) & 7 ) ); }
     229     1572864 : static inline wb_t wb_ror( wb_t a, int imm ) { return wb_or( wb_shr( a, imm & 7 ), wb_shl( a, (-imm) & 7 ) ); }
     230             : 
     231     1572864 : static inline wb_t wb_rol_variable( wb_t a, int n ) { return wb_or( wb_shl_variable( a, n&7 ), wb_shr_variable( a, (-n)&7 ) ); }
     232     1572864 : static inline wb_t wb_ror_variable( wb_t a, int n ) { return wb_or( wb_shr_variable( a, n&7 ), wb_shl_variable( a, (-n)&7 ) ); }
     233             : 
     234             : /* Logical operations */
     235             : 
     236             : #define wb_lnot(a)    _mm256_cmpeq_epi8( (a), _mm256_setzero_si256() ) /* [  !a0  !a1 ...  !a31 ] */
     237             : #define wb_lnotnot(a)                                                  /* [ !!a0 !!a1 ... !!a31 ] */ \
     238             :   _mm256_xor_si256( _mm256_set1_epi32( -1 ), wb_lnot( (a) ) )
     239             : 
     240             : #define wb_eq(a,b) _mm256_cmpeq_epi8( (a), (b) )                                              /* [ a0==b0 a1==b1 ... a31==b31 ] */
     241             : #define wb_gt(a,b)                                                                            /* [ a0> b0 a1> b1 ... a31> b31 ] */\
     242        1698 :   _mm256_cmpgt_epi8( _mm256_sub_epi8( (a), _mm256_set1_epi8( (char)(1U<<7) ) ),                                                   \
     243        1698 :                      _mm256_sub_epi8( (b), _mm256_set1_epi8( (char)(1U<<7) ) ) )
     244        1698 : #define wb_lt(a,b) wb_gt( (b), (a) )                                                          /* [ a0< b0 a1< b1 ... a31< b31 ] */
     245             : #define wb_ne(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi8( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a31!=b31 ] */
     246             : #define wb_ge(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), wb_gt( (b), (a) ) )             /* [ a0>=b0 a1>=b1 ... a31>=b31 ] */
     247             : #define wb_le(a,b) _mm256_xor_si256( _mm256_set1_epi32( -1 ), wb_gt( (a), (b) ) )             /* [ a0<=b0 a1<=b1 ... a31<=b31 ] */
     248             : 
     249             : /* Conditional operations */
     250             : 
     251             : #define wb_czero(c,f)    _mm256_andnot_si256( (c), (f) ) /* [ c0? 0:f0 c1? 0:f1 ... c31? 0:f31 ] */
     252             : #define wb_notczero(c,f) _mm256_and_si256(    (c), (f) ) /* [ c0?f0: 0 c1?f1: 0 ... c31?f31: 0 ] */
     253             : 
     254         147 : #define wb_if(c,t,f) _mm256_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c31?t31:f31 ] */
     255             : 
     256             : /* Conversion operations */
     257             : 
     258             : /* Summarizing:
     259             : 
     260             :    wb_to_wc(a, 0)   returns [ !!a0  !!a1  ... !!a7  ]
     261             :    wb_to_wc(a, 1)   returns [ !!a8  !!a9  ... !!a15 ]
     262             :    wb_to_wc(a, 2)   returns [ !!a16 !!a17 ... !!a23 ]
     263             :    wb_to_wc(a, 3)   returns [ !!a24 !!a25 ... !!a31 ]
     264             :    // TODO: wc variants for 8, 16, and 64 wide SIMD conditionals
     265             : 
     266             :    wb_to_wf(a, 0)   returns [ (float)a0  (float)a1  ... (float)a7  ]
     267             :    wb_to_wf(a, 1)   returns [ (float)a8  (float)a9  ... (float)a15 ]
     268             :    wb_to_wf(a, 2)   returns [ (float)a16 (float)a17 ... (float)a23 ]
     269             :    wb_to_wf(a, 3)   returns [ (float)a24 (float)a25 ... (float)a31 ]
     270             : 
     271             :    wb_to_wi(a, 0)   returns [ (int)a0  (int)a1  ... (int)a7  ]
     272             :    wb_to_wi(a, 1)   returns [ (int)a8  (int)a9  ... (int)a15 ]
     273             :    wb_to_wi(a, 2)   returns [ (int)a16 (int)a17 ... (int)a23 ]
     274             :    wb_to_wi(a, 3)   returns [ (int)a24 (int)a25 ... (int)a31 ]
     275             : 
     276             :    wb_to_wu(a, 0)   returns [ (uint)a0  (uint)a1  ... (uint)a7  ]
     277             :    wb_to_wu(a, 1)   returns [ (uint)a8  (uint)a9  ... (uint)a15 ]
     278             :    wb_to_wu(a, 2)   returns [ (uint)a16 (uint)a17 ... (uint)a23 ]
     279             :    wb_to_wu(a, 3)   returns [ (uint)a24 (uint)a25 ... (uint)a31 ]
     280             : 
     281             :    wb_to_wd(a,0) returns [ (double)a0  (double)a1  (double)a2  (double)a3  ]
     282             :    wb_to_wd(a,1) returns [ (double)a4  (double)a5  (double)a6  (double)a7  ]
     283             :    ...
     284             :    wb_to_wd(a,7) returns [ (double)a28 (double)a29 (double)a30 (double)a31 ]
     285             : 
     286             :    wb_to_wl(a,0) returns [ (long)a0  (long)a1  (long)a2  (long)a3  ]
     287             :    wb_to_wl(a,1) returns [ (long)a4  (long)a5  (long)a6  (long)a7  ]
     288             :    ...
     289             :    wb_to_wl(a,7) returns [ (long)a28 (long)a29 (long)a30 (long)a31 ]
     290             : 
     291             :    wb_to_wv(a,0) returns [ (ulong)a0  (ulong)a1  (ulong)a2  (ulong)a3  ]
     292             :    wb_to_wv(a,1) returns [ (ulong)a4  (ulong)a5  (ulong)a6  (ulong)a7  ]
     293             :    ...
     294             :    wb_to_wv(a,7) returns [ (ulong)a28 (ulong)a29 (ulong)a30 (ulong)a31 ]
     295             : 
     296             :    where the above values should be compile time constants. */
     297             : 
     298             : /* wb_expand_internal_{4,8} selects the imm-th group of {4,8} uchars
     299             :    from a and zero extends each of them to 32 bits */
     300             : 
     301             : static inline __m256i
     302     3145728 : wb_expand_internal_8( wb_t a, int imm ) {
     303     3145728 :   switch( imm ) {
     304      786432 :   case 0: return _mm256_cvtepu8_epi32( _mm256_extractf128_si256( a, 0 ) );
     305      786432 :   case 1: return _mm256_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 0 ), 8 ) );
     306      786432 :   case 2: return _mm256_cvtepu8_epi32( _mm256_extractf128_si256( a, 1 ) );
     307      786432 :   case 3: return _mm256_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 1 ), 8 ) );
     308     3145728 :   }
     309           0 :   return _mm256_setzero_si256(); /* Unreachable */
     310     3145728 : }
     311             : 
     312             : static inline __m128i
     313     4718592 : wb_expand_internal_4( wb_t a, int imm ) {
     314     4718592 :   switch( imm ) {
     315      589824 :   case 0: return _mm_cvtepu8_epi32( _mm256_extractf128_si256( a, 0 ) );
     316      589824 :   case 1: return _mm_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 0 ),  4 ) );
     317      589824 :   case 2: return _mm_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 0 ),  8 ) );
     318      589824 :   case 3: return _mm_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 0 ), 12 ) );
     319      589824 :   case 4: return _mm_cvtepu8_epi32( _mm256_extractf128_si256( a, 1 ) );
     320      589824 :   case 5: return _mm_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 1 ),  4 ) );
     321      589824 :   case 6: return _mm_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 1 ),  8 ) );
     322      589824 :   case 7: return _mm_cvtepu8_epi32( _mm_bsrli_si128( _mm256_extractf128_si256( a, 1 ), 12 ) );
     323     4718592 :   }
     324           0 :   return _mm_setzero_si128(); /* Unreachable */
     325     4718592 : }
     326             : 
     327             : #define wb_to_wc( a, imm ) _mm256_xor_si256( _mm256_set1_epi32( -1 ), _mm256_cmpeq_epi32( wb_expand_internal_8( (a), (imm) ), _mm256_setzero_si256() ) )
     328             : #define wb_to_wf( a, imm ) _mm256_cvtepi32_ps( wb_expand_internal_8( (a), (imm) ) )
     329             : #define wb_to_wi( a, imm ) wb_expand_internal_8( (a), (imm) )
     330             : #define wb_to_wu( a, imm ) wb_expand_internal_8( (a), (imm) )
     331             : #define wb_to_wd( a, imm ) _mm256_cvtepi32_pd   ( wb_expand_internal_4( (a), (imm) ) )
     332             : #define wb_to_wl( a, imm ) _mm256_cvtepu32_epi64( wb_expand_internal_4( (a), (imm) ) ) /* This could be slightly faster with _mm256_cvtepu8_epi64 */
     333             : #define wb_to_wv( a, imm ) _mm256_cvtepu32_epi64( wb_expand_internal_4( (a), (imm) ) ) /* This could be slightly faster with _mm256_cvtepu8_epi64 */
     334             : 
     335             : #define wb_to_wc_raw(a) (a)
     336             : #define wb_to_wf_raw(a) _mm256_castsi256_ps( (a) )
     337             : #define wb_to_wi_raw(a) (a)
     338             : #define wb_to_wu_raw(a) (a)
     339             : #define wb_to_wd_raw(a) _mm256_castsi256_pd( (a) )
     340             : #define wb_to_wv_raw(a) (a)
     341             : #define wb_to_wl_raw(a) (a)
     342             : 
     343             : /* Reduction operations */
     344             : 
     345             : static inline wb_t
     346      196608 : wb_sum_all( wb_t x ) { /* Returns wb_bcast( sum( x ) ) */
     347      196608 :   x = _mm256_sad_epu8( x, _mm256_setzero_si256() );                /* x[0-7]       x[8-15]       x[16-23]      x[24-31]      (each stored in 64 bits) */
     348      196608 :   x = _mm256_add_epi64( x, _mm256_permute2f128_si256( x, x, 1 ) ); /* x[0-7,16-23] x[8-15,24-31] x[0-7,16-23]  x[8-15,24-31] (each stored in 64 bits) */
     349      196608 :   return _mm256_add_epi8( _mm256_shuffle_epi8( x, wb_bcast( 0 ) ) , _mm256_shuffle_epi8( x, wb_bcast( 8 ) ) ); /* Grab the low byte of each sum, broadcast it, then sum */
     350      196608 : }
     351             : 
     352             : static inline wb_t
     353      196608 : wb_min_all( wb_t x ) { /* Returns wb_bcast( min( x ) ) */
     354      196608 :   x = _mm256_min_epu8( x, _mm256_permute2f128_si256( x, x, 1 ) );    /* x0,16    x1,17  .. x15,31 x0,16  x1,17  ... x15,31 */
     355      196608 :   x = _mm256_min_epu8( x, _mm256_shuffle_epi8( x, wb( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
     356      196608 :                                                       8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ) ) );    /* x0,8,16,24    x1,9,17,25  .. x7,15,23,31  (repeats 3 more times) */
     357      196608 :   x = _mm256_min_epu8( x, _mm256_shuffle_epi8( x, wb( 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3,
     358      196608 :                                                       4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 ) ) );          /* x0,4,8,12,16,20,24,28  .. x3,7,11,15,19,23,27,31 (repeats 7 more times)*/
     359      196608 :   x = _mm256_min_epu8( x, _mm256_shuffle_epi8( x, wb_bcast_quad( 2, 3, 0, 1 ) ) );
     360      196608 :   x = _mm256_min_epu8( x, _mm256_shuffle_epi8( x, wb_bcast_pair( 1, 0 ) ) );
     361      196608 :   return x;
     362      196608 : }
     363             : 
     364             : static inline wb_t
     365      196608 : wb_max_all( wb_t x ) { /* Returns wb_bcast( max( x ) ) */
     366      196608 :   x = _mm256_max_epu8( x, _mm256_permute2f128_si256( x, x, 1 ) );    /* x0,16    x1,17  .. x15,31 x0,16  x1,17  ... x15,31 */
     367      196608 :   x = _mm256_max_epu8( x, _mm256_shuffle_epi8( x, wb( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
     368      196608 :                                                       8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ) ) );    /* x0,8,16,24    x1,9,17,25  .. x7,15,23,31  (repeats 3 more times) */
     369      196608 :   x = _mm256_max_epu8( x, _mm256_shuffle_epi8( x, wb( 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3,
     370      196608 :                                                       4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 ) ) );          /* x0,4,8,12,16,20,24,28  .. x3,7,11,15,19,23,27,31 (repeats 7 more times)*/
     371      196608 :   x = _mm256_max_epu8( x, _mm256_shuffle_epi8( x, wb_bcast_quad( 2, 3, 0, 1 ) ) );
     372      196608 :   x = _mm256_max_epu8( x, _mm256_shuffle_epi8( x, wb_bcast_pair( 1, 0 ) ) );
     373      196608 :   return x;
     374      196608 : }
     375             : 
     376             : /* Misc operations */
     377             : 
     378             : /* TODO: These probably are actually part of the wc post generalization
     379             :    to different width SIMD types. */
     380             : 
     381             : /* wb_{any, all} return 1 if any/all of the elements are non-zero.  The
     382             :    _fast variants are suitable for use with the return value of any of
     383             :    the wb comparison functions (e.g. wb_gt ). */
     384             : 
     385    41287692 : #define wb_any_fast( x ) ( 0 != _mm256_movemask_epi8( x ) )
     386    41287692 : #define wb_any( x ) wb_any_fast( wb_ne( (x), wb_zero( ) ) )
     387   185794614 : #define wb_all_fast( x ) ( -1 == _mm256_movemask_epi8( x ) )
     388   185794614 : #define wb_all( x ) wb_all_fast( wb_ne( (x), wb_zero( ) ) )

Generated by: LCOV version 1.14