LCOV - code coverage report
Current view: top level - util/simd - fd_sse_vb.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 73 73 100.0 %
Date: 2025-01-08 12:08:44 Functions: 23 133 17.3 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_sse_h
       2             : #error "Do not include this directly; use fd_sse.h"
       3             : #endif
       4             : 
       5             : /* Vector byte API *****************************************************/
       6             : 
       7             : /* A vb_t is a vector where each 8-bit wide lane holds an unsigned 8-bit
       8             :    integer (a "uchar").
       9             : 
      10             :    These mirror the other APIs as much as possible.  Macros are
      11             :    preferred over static inlines when it is possible to do it robustly
      12             :    to reduce the risk of the compiler mucking it up. */
      13             : 
      14  2377343648 : #define vb_t __m128i
      15             : 
      16             : /* Constructors */
      17             : 
      18             : /* Given the uchar values, return ... */
      19             : 
      20             : #define vb(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10,b11,b12,b13,b14,b15 ) /* [ b0 b1 ... b15 ] */                     \
      21  1321376376 :   _mm_setr_epi8( (char)( b0), (char)( b1), (char)( b2), (char)( b3), (char)( b4), (char)( b5), (char)( b6), (char)( b7), \
      22  1321376376 :                  (char)( b8), (char)( b9), (char)(b10), (char)(b11), (char)(b12), (char)(b13), (char)(b14), (char)(b15) )
      23             : 
      24      393216 : #define vb_bcast(b0) _mm_set1_epi8( (char)(b0) ) /* [ b0 b0 ... b0 ] */
      25             : 
      26             : static inline vb_t /* [ b0 b1 b0 b1 ... b0 b1 ] */
      27     1376256 : vb_bcast_pair( uchar b0, uchar b1 ) {
      28     1376256 :   return _mm_setr_epi8( (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1),
      29     1376256 :                         (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1), (char)(b0), (char)(b1) );
      30     1376256 : }
      31             : 
      32             : static inline vb_t /* [ b0 b1 b2 b3 b0 b1 b2 b3 ... b0 b1 b2 b3 ] */
      33      589824 : vb_bcast_quad( uchar b0, uchar b1, uchar b2, uchar b3 ) {
      34      589824 :   return _mm_setr_epi8( (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3),
      35      589824 :                         (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b0), (char)(b1), (char)(b2), (char)(b3) );
      36      589824 : }
      37             : 
      38             : static inline vb_t /* [ b0 b1 ... b7 b0 b1 ... b7 ] */
      39      196608 : vb_bcast_oct( uchar b0, uchar b1, uchar b2, uchar b3, uchar b4, uchar b5, uchar b6, uchar b7 ) {
      40      196608 :   return _mm_setr_epi8( (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7),
      41      196608 :                         (char)(b0), (char)(b1), (char)(b2), (char)(b3), (char)(b4), (char)(b5), (char)(b6), (char)(b7) );
      42      196608 : }
      43             : 
      44             : static inline vb_t /* [ b0 b0 ... b0 b1 b1 ... b1 ] */
      45      196608 : vb_expand_pair( uchar b0, uchar b1 ) {
      46      196608 :   return _mm_setr_epi8( (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b0),
      47      196608 :                         (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1), (char)(b1) );
      48      196608 : }
      49             : 
      50             : static inline vb_t /* [ b0 b0 b0 b0 b1 b1 b1 b1 b2 b2 b2 b2 b3 b3 b3 b3 ] */
      51      196608 : vb_expand_quad( uchar b0, uchar b1, uchar b2, uchar b3 ) {
      52      196608 :   return _mm_setr_epi8( (char)(b0), (char)(b0), (char)(b0), (char)(b0), (char)(b1), (char)(b1), (char)(b1), (char)(b1),
      53      196608 :                         (char)(b2), (char)(b2), (char)(b2), (char)(b2), (char)(b3), (char)(b3), (char)(b3), (char)(b3) );
      54      196608 : }
      55             : 
      56             : static inline vb_t /* [ b0 b0 b1 b1 ... b7 b7 ] */
      57      196608 : vb_expand_oct( uchar b0, uchar b1, uchar b2, uchar b3, uchar b4, uchar b5, uchar b6, uchar b7 ) {
      58      196608 :   return _mm_setr_epi8( (char)(b0), (char)(b0), (char)(b1), (char)(b1), (char)(b2), (char)(b2), (char)(b3), (char)(b3),
      59      196608 :                         (char)(b4), (char)(b4), (char)(b5), (char)(b5), (char)(b6), (char)(b6), (char)(b7), (char)(b7) );
      60      196608 : }
      61             : 
      62             : #define vb_permute(x,i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15) /* [ x[i0] x[i1] ... x[i15] ] */ \
      63             :   _mm_shuffle_epi8( (x), vb( (i0), (i1), (i2),  (i3),  (i4),  (i5),  (i6),  (i7),                            \
      64             :                              (i8), (i9), (i10), (i11), (i12), (i13), (i14), (i15) ) )
      65             : 
      66             : /* Useful cases are provided below.  Given [ b0 b1 b2 b3 b4 ... b15 ], return ... */
      67             : 
      68             : #define vb_exch_adj(x)        /* [ b1 b0 b3 b2 ... b15 b14 ] */ \
      69             :   _mm_shuffle_epi8( (x), vb( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 ) )
      70             : 
      71             : #define vb_exch_adj_pair(x)   /* [ b2 b3 b0 b1 .. b14 b15 b12 b13 ] */ \
      72             :   _mm_shuffle_epi8( (x), vb( 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ) )
      73             : 
      74             : #define vb_exch_adj_quad(x)   /* [ b4 b5 b6 b7 b0 b1 b2 b3 b12 b13 b14 b15 b8 b9 b10 b11 ] */ \
      75             :   _mm_shuffle_epi8( (x), vb( 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 ) )
      76             : 
      77             : #define vb_exch_adj_oct(x)    /* [ b8 b9 ... b15 b0 b1 ... b7 ] */ \
      78             :   _mm_shuffle_epi8( (x), vb( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ) )
      79             : 
      80             : #define vb_bcast_even(x)      /* [ b0 b0 b2 b2 b4 b4 .. b12 b12 b14 b14 ] */ \
      81             :   _mm_shuffle_epi8( (x), vb( 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 ) )
      82             : 
      83             : #define vb_bcast_odd(x)       /* [ b1 b1 b3 b3 b5 b5 .. b13 b13 b15 b15 ] */ \
      84             :   _mm_shuffle_epi8( (x), vb( 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 ) )
      85             : 
      86             : /* Predefined constants */
      87             : 
      88    40108044 : #define vb_zero() _mm_setzero_si128() /* Return [ 0 0 ... 0 ] */
      89             : #define vb_one()  _mm_set1_epi8( 1 )  /* Return [ 1 1 ... 1 ] */
      90             : 
      91             : /* Memory operations */
      92             : 
      93             : /* vb_ld returns the 16 uchars at the 16-byte aligned / 16-byte sized
      94             :    location p as a vector uchar.  vb_ldu is the same but p does not have
      95             :    to be aligned.  vb_st writes the vector uchar to the 16-byte aligned /
      96             :    16-byte sized location p as 16 uchars.  vb_stu is the same but p does
      97             :    not have to be aligned.  In all these lane l will be at p[l].  FIXME:
      98             :    USE ATTRIBUTES ON P PASSED TO THESE?
      99             : 
     100             :    Note: gcc knows a __m128i may alias. */
     101             : 
     102    20054022 : static inline vb_t vb_ld( uchar const * p ) { return _mm_load_si128(  (__m128i const *)p ); }
     103   998306782 : static inline void vb_st( uchar * p, vb_t i ) { _mm_store_si128(  (__m128i *)p, i ); }
     104             : 
     105   204898222 : static inline vb_t vb_ldu( void const * p ) { return _mm_loadu_si128( (__m128i const *)p ); }
     106   160432176 : static inline void vb_stu( void * p, vb_t i ) { _mm_storeu_si128( (__m128i *)p, i ); }
     107             : 
     108             : /* Sadly, no maskload_epi8, so we can't provide a vb_ldif or vb_stif.
     109             :    TODO: consider emulating this? */
     110             : 
     111             : /* Element operations */
     112             : 
     113             : /* vb_extract extracts the uchar in lane imm from the vector uchar.
     114             :    vb_insert returns the vector uchar formed by replacing the value in
     115             :    lane imm of a with the provided uchar.  imm should be a compile time
     116             :    constant in 0:15.  vb_extract_variable and vb_insert_variable are
     117             :    slower but the lane n does not have to be known at compile time
     118             :    (should still be in 0:15).
     119             : 
     120             :    Note: C99 TC3 allows type punning through a union. */
     121             : 
     122   320864352 : #define vb_extract(a,imm)  ((uchar)_mm_extract_epi8( (a), (imm) ))
     123   320864352 : #define vb_insert(a,imm,v) _mm_insert_epi8( (a), (char)(v), (imm) )
     124             : 
     125             : static inline uchar
     126   320864352 : vb_extract_variable( vb_t a, int n ) {
     127   320864352 :   union { __m128i m[1]; uchar i[16]; } t[1];
     128   320864352 :   _mm_store_si128( t->m, a );
     129   320864352 :   return t->i[n];
     130   320864352 : }
     131             : 
     132             : static inline vb_t
     133   320864352 : vb_insert_variable( vb_t a, int n, uchar v ) {
     134   320864352 :   union { __m128i m[1]; uchar i[16]; } t[1];
     135   320864352 :   _mm_store_si128( t->m, a );
     136   320864352 :   t->i[n] = v;
     137   320864352 :   return _mm_load_si128( t->m );
     138   320864352 : }
     139             : 
     140             : /* Given [a0 a1 ... a15] and/or [b0 b1 ... b15], return ... */
     141             : 
     142             : /* Arithmetic operations */
     143             : 
     144             : #define vb_neg(a) _mm_sub_epi8( _mm_setzero_si128(), (a) ) /* [ -a0  -a1  ... -a15  ] (twos complement handling) */
     145             : #define vb_abs(a) (a)                                      /* [ |a0| |a1| ... |a15| ] (unsigned type, so identity) */
     146             : 
     147             : #define vb_min(a,b) _mm_min_epu8( (a), (b) ) /* [ min(a0,b0) min(a1,b1) ... min(a15,b15) ] */
     148             : #define vb_max(a,b) _mm_max_epu8( (a), (b) ) /* [ max(a0,b0) max(a1,b1) ... max(a15,b15) ] */
     149             : #define vb_add(a,b) _mm_add_epi8( (a), (b) ) /* [ a0 +b0     a1 +b1     ... a15 +b15     ] */
     150             : #define vb_sub(a,b) _mm_sub_epi8( (a), (b) ) /* [ a0 -b0     a1 -b1     ... a15 -b15     ] */
     151             : 
     152             : /* No vb_mul because there's no instruction for multiplying uchars.  You
     153             :    can build one with two invocations to _mm_mullo_epi16, but it won't
     154             :    be particularly fast.  Multiplication by add and shift might be
     155             :    faster honestly.  TODO: consider emulating for completeness? */
     156             : 
     157             : /* Bit operations */
     158             : 
     159             : /* Note: vb_shl/vb_shr is an unsigned left/right shift by imm bits; imm
     160             :    must be a compile time constant in 0:7.  The variable variants are
     161             :    slower but do not require the shift amount to be known at compile
     162             :    time (should still be in 0:7). */
     163             : 
     164             : #define vb_not(a) _mm_xor_si128( _mm_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a15 ] */
     165             : 
     166             : #define vb_shl(a,imm) vb_and( _mm_slli_epi16( (a), (imm) ), vb_bcast( (uchar)(0xFFUL << (imm)) ) ) /* [ a0<<imm a1<<imm ... a15<<imm ] */
     167             : #define vb_shr(a,imm) vb_and( _mm_srli_epi16( (a), (imm) ), vb_bcast( (uchar)(0xFFUL >> (imm)) ) ) /* [ a0>>imm a1>>imm ... a15>>imm ] (treat a as unsigned) */
     168             : 
     169             : #define vb_shl_variable(a,n) vb_and( _mm_sll_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ), \
     170             :                                      vb_bcast( (uchar)(0xFFUL << (n)) ) )
     171             : #define vb_shr_variable(a,n) vb_and( _mm_srl_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ), \
     172             :                                      vb_bcast( (uchar)(0xFFUL >> (n)) ) )
     173             : 
     174             : #define vb_and(a,b)    _mm_and_si128(    (a), (b) ) /* [   a0 &b0    a1& b1 ...   a15& b15 ] */
     175             : #define vb_andnot(a,b) _mm_andnot_si128( (a), (b) ) /* [ (~a0)&b0  (~a1)&b1 ... (~a15)&b15 ] */
     176     6291456 : #define vb_or(a,b)     _mm_or_si128(     (a), (b) ) /* [   a0 |b0    a1 |b1 ...   a15 |b15 ] */
     177             : #define vb_xor(a,b)    _mm_xor_si128(    (a), (b) ) /* [   a0 ^b0    a1 ^b1 ...   a15 ^b15 ] */
     178             : 
     179     1572864 : static inline vb_t vb_rol( vb_t a, int imm ) { return vb_or( vb_shl( a, imm & 7 ), vb_shr( a, (-imm) & 7 ) ); }
     180     1572864 : static inline vb_t vb_ror( vb_t a, int imm ) { return vb_or( vb_shr( a, imm & 7 ), vb_shl( a, (-imm) & 7 ) ); }
     181             : 
     182     1572864 : static inline vb_t vb_rol_variable( vb_t a, int n ) { return vb_or( vb_shl_variable( a, n&7 ), vb_shr_variable( a, (-n)&7 ) ); }
     183     1572864 : static inline vb_t vb_ror_variable( vb_t a, int n ) { return vb_or( vb_shr_variable( a, n&7 ), vb_shl_variable( a, (-n)&7 ) ); }
     184             : 
     185             : /* Logical operations */
     186             : 
     187             : #define vb_lnot(a)    _mm_cmpeq_epi8( (a), _mm_setzero_si128() ) /* [  !a0  !a1 ...  !a15 ] */
     188             : #define vb_lnotnot(a)                                            /* [ !!a0 !!a1 ... !!a15 ] */ \
     189             :   _mm_xor_si128( _mm_set1_epi32( -1 ), vb_lnot( (a) ) )
     190             : 
     191             : #define vb_eq(a,b) _mm_cmpeq_epi8( (a), (b) )                                            /* [ a0==b0 a1==b1 ... a15==b15 ] */
     192             : #define vb_gt(a,b) _mm_cmpgt_epi8( _mm_sub_epi8( (a), _mm_set1_epi8( (char)(1U<<7) ) ),  /* [ a0> b0 a1> b1 ... a15> b15 ] */ \
     193             :                                    _mm_sub_epi8( (b), _mm_set1_epi8( (char)(1U<<7) ) ) )
     194             : #define vb_lt(a,b) vb_gt( (b), (a) )                                                     /* [ a0< b0 a1< b1 ... a15< b15 ] */
     195             : #define vb_ne(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi8( (a), (b) ) )     /* [ a0!=b0 a1!=b1 ... a15!=b15 ] */
     196             : #define vb_ge(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), vb_gt( (b), (a) ) )              /* [ a0>=b0 a1>=b1 ... a15>=b15 ] */
     197             : #define vb_le(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), vb_gt( (a), (b) ) )              /* [ a0<=b0 a1<=b1 ... a15<=b15 ] */
     198             : 
     199             : /* Conditional operations */
     200             : 
     201             : #define vb_czero(c,f)    _mm_andnot_si128( (c), (f) ) /* [ c0? 0:f0 c1? 0:f1 ... c15? 0:f15 ] */
     202             : #define vb_notczero(c,f) _mm_and_si128(    (c), (f) ) /* [ c0?f0: 0 c1?f1: 0 ... c15?f15: 0 ] */
     203             : 
     204             : #define vb_if(c,t,f) _mm_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c15?t15:f15 ] */
     205             : 
     206             : /* Conversion operations */
     207             : 
     208             : /* Summarizing:
     209             : 
     210             :    vb_to_vc(a, 0)   returns [ !!a0  !!a1  !!a2  !!a3  ]
     211             :    vb_to_vc(a, 1)   returns [ !!a4  !!a5  !!a6  !!a7  ]
     212             :    vb_to_vc(a, 2)   returns [ !!a8  !!a9  !!a10 !!a11 ]
     213             :    vb_to_vc(a, 3)   returns [ !!a12 !!a13 !!a14 !!a15 ]
     214             : 
     215             :    vb_to_vf(a, 0)   returns [ (float)a0  (float)a1  (float)a2  (float)a3  ]
     216             :    vb_to_vf(a, 1)   returns [ (float)a4  (float)a5  (float)a6  (float)a7  ]
     217             :    vb_to_vf(a, 2)   returns [ (float)a8  (float)a9  (float)a10 (float)a11 ]
     218             :    vb_to_vf(a, 3)   returns [ (float)a12 (float)a13 (float)a14 (float)a15 ]
     219             : 
     220             :    vb_to_vi(a, 0)   returns [ (int)a0  (int)a1  (int)a2  (int)a3  ]
     221             :    vb_to_vi(a, 1)   returns [ (int)a4  (int)a5  (int)a6  (int)a7  ]
     222             :    vb_to_vi(a, 2)   returns [ (int)a8  (int)a9  (int)a10 (int)a11 ]
     223             :    vb_to_vi(a, 3)   returns [ (int)a12 (int)a13 (int)a14 (int)a15 ]
     224             : 
     225             :    vb_to_vu(a, 0)   returns [ (uint)a0  (uint)a1  (uint)a2  (uint)a3  ]
     226             :    vb_to_vu(a, 1)   returns [ (uint)a4  (uint)a5  (uint)a6  (uint)a7  ]
     227             :    vb_to_vu(a, 2)   returns [ (uint)a8  (uint)a9  (uint)a10 (uint)a11 ]
     228             :    vb_to_vu(a, 3)   returns [ (uint)a12 (uint)a13 (uint)a14 (uint)a15 ]
     229             : 
     230             :    vb_to_vd(a,0) returns [ (double)a0  (double)a1  ]
     231             :    vb_to_vd(a,1) returns [ (double)a2  (double)a3  ]
     232             :    ...
     233             :    vb_to_vd(a,7) returns [ (double)a14 (double)a15 ]
     234             : 
     235             :    vb_to_vl(a,0) returns [ (long)a0  (long)a1  ]
     236             :    vb_to_vl(a,1) returns [ (long)a2  (long)a3  ]
     237             :    ...
     238             :    vb_to_vl(a,7) returns [ (long)a14 (long)a15 ]
     239             : 
     240             :    vb_to_vv(a,0) returns [ (ulong)a0  (ulong)a1  ]
     241             :    vb_to_vv(a,1) returns [ (ulong)a2  (ulong)a3  ]
     242             :    ...
     243             :    vb_to_vv(a,7) returns [ (ulong)a14 (ulong)a15 ]
     244             : 
     245             :    where imm (the second argument) should be a compile time constant. */
     246             : 
     247             : #define vb_to_vc( a, imm ) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 4*(imm) ) ) , _mm_setzero_si128() ) )
     248             : #define vb_to_vf( a, imm ) _mm_cvtepi32_ps( _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 4*(imm) ) ) )
     249             : #define vb_to_vi( a, imm ) _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 4*(imm) ) )
     250             : #define vb_to_vu( a, imm ) _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 4*(imm) ) )
     251             : #define vb_to_vd( a, imm ) _mm_cvtepi32_pd( _mm_cvtepu8_epi32( _mm_bsrli_si128( (a), 2*(imm) ) ) )
     252             : #define vb_to_vl( a, imm ) _mm_cvtepu8_epi64( _mm_bsrli_si128( (a), 2*(imm) ) )
     253             : #define vb_to_vv( a, imm ) _mm_cvtepu8_epi64( _mm_bsrli_si128( (a), 2*(imm) ) )
     254             : 
     255             : #define vb_to_vc_raw(a) (a)
     256             : #define vb_to_vf_raw(a) _mm_castsi128_ps( (a) )
     257             : #define vb_to_vi_raw(a) (a)
     258             : #define vb_to_vu_raw(a) (a)
     259             : #define vb_to_vd_raw(a) _mm_castsi128_pd( (a) )
     260             : #define vb_to_vl_raw(a) (a)
     261             : #define vb_to_vv_raw(a) (a)
     262             : 
     263             : /* Reduction operations */
     264             : 
     265             : static inline vb_t
     266      196608 : vb_sum_all( vb_t x ) { /* Returns vb_bcast( sum( x ) ) */
     267      196608 :   x = _mm_sad_epu8( x, _mm_setzero_si128() );                /* x[0-7]       x[8-15]  (each stored in 64 bits) */
     268      196608 :   return _mm_add_epi8( _mm_shuffle_epi8( x, vb_bcast( 0 ) ) , _mm_shuffle_epi8( x, vb_bcast( 8 ) ) ); /* Grab the low byte of each sum, broadcast it, then sum */
     269      196608 : }
     270             : 
     271             : static inline vb_t
     272      196608 : vb_min_all( vb_t x ) { /* Returns vb_bcast( min( x ) ) */
     273      196608 :   x = _mm_min_epu8( x, _mm_shuffle_epi8( x, vb( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ) ) ); /* x0,8    x1,9  .. x7,15  (repeats 1 more time) */
     274      196608 :   x = _mm_min_epu8( x, _mm_shuffle_epi8( x, vb( 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 ) ) ); /* x0,4,8,12  .. x3,7,11,15 (repeats 3 more times)*/
     275      196608 :   x = _mm_min_epu8( x, _mm_shuffle_epi8( x, vb_bcast_quad( 2, 3, 0, 1 ) ) ); /* x_even x_odd (repeats 7 more times) */
     276      196608 :   x = _mm_min_epu8( x, _mm_shuffle_epi8( x, vb_bcast_pair( 1, 0 ) ) ); /* x_all (repeats 15 more times) */
     277      196608 :   return x;
     278      196608 : }
     279             : 
     280             : static inline vb_t
     281      196608 : vb_max_all( vb_t x ) { /* Returns vb_bcast( max( x ) ) */
     282      196608 :   x = _mm_max_epu8( x, _mm_shuffle_epi8( x, vb( 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 ) ) ); /* x0,8    x1,9  .. x7,15  (repeats 1 more time) */
     283      196608 :   x = _mm_max_epu8( x, _mm_shuffle_epi8( x, vb( 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 ) ) ); /* x0,4,8,12  .. x3,7,11,15 (repeats 3 more times)*/
     284      196608 :   x = _mm_max_epu8( x, _mm_shuffle_epi8( x, vb_bcast_quad( 2, 3, 0, 1 ) ) ); /* x_even x_odd (repeats 7 more times) */
     285      196608 :   x = _mm_max_epu8( x, _mm_shuffle_epi8( x, vb_bcast_pair( 1, 0 ) ) ); /* x_all (repeats 15 more times) */
     286      196608 :   return x;
     287      196608 : }
     288             : 
     289             : /* Misc operations */
     290             : 
     291             : /* TODO: These are probably actually part of the vc post
     292             :    generalization to different width SIMD types. */
     293             : 
     294             : /* vb_{any, all} return 1 if any/all of the elements are non-zero.  The
     295             :    _fast variants are suitable for use with the return value of any of
     296             :    the vb comparison functions (e.g. vb_gt ). */
     297             : 
     298    40108044 : #define vb_any_fast( x ) ( 0 != _mm_movemask_epi8( x ) )
     299    40108044 : #define vb_any( x ) vb_any_fast( vb_ne( (x), vb_zero( ) ) )
     300   180486198 : #define vb_all_fast( x ) ( 0xFFFF == _mm_movemask_epi8( x ) )
     301   180486198 : #define vb_all( x ) vb_all_fast( vb_ne( (x), vb_zero( ) ) )

Generated by: LCOV version 1.14