LCOV - code coverage report
Current view: top level - util/simd - fd_avx_wh.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 0 26 0.0 %
Date: 2025-01-08 12:08:44 Functions: 0 6410 0.0 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_avx_h
       2             : #error "Do not include this directly; use fd_avx.h"
       3             : #endif
       4             : 
       5             : /* Vector ushort API **************************************************/
       6             : 
       7             : /* A wh_t is a vector where each 16-bit wide lane holds an unsigned
       8             :    16-bit integer (a "ushort").
       9             : 
      10             :    These mirror the other APIs as much as possible.  Macros are
      11             :    preferred over static inlines when it is possible to do it robustly
      12             :    to reduce the risk of the compiler mucking it up. */
      13             : 
      14           0 : #define wh_t __m256i /* One 256-bit vector treated as 16 ushort lanes */
      15             : 
      16             : /* Constructors */
      17             : 
      18             : /* Given the ushort values, return ... */
      19             : 
      20             : #define wh(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9,h10,h11,h12,h13,h14,h15) /* [ h0 h1 ... h15 ] */ \
      21             :   _mm256_setr_epi16( (short)( h0), (short)( h1), (short)( h2), (short)( h3),                       \
      22             :                      (short)( h4), (short)( h5), (short)( h6), (short)( h7),                       \
      23             :                      (short)( h8), (short)( h9), (short)(h10), (short)(h11),                       \
      24             :                      (short)(h12), (short)(h13), (short)(h14), (short)(h15) )
      25             : 
      26             : #define wh_bcast(h0) _mm256_set1_epi16( (short)(h0) ) /* [ h0 h0 ... h0 ] */
      27             : 
      28             : /* Predefined constants */
      29             : 
      30           0 : #define wh_zero() _mm256_setzero_si256() /* Return [ 0 0 ... 0 ] */
      31             : #define wh_one()  _mm256_set1_epi16( 1 ) /* Return [ 1 1 ... 1 ] */
      32             : 
      33             : /* Memory operations */
      34             : 
      35             : /* wh_ld returns the 16 ushorts at the 32-byte aligned / 32-byte sized
      36             :    location p as a vector ushort.  wh_ldu is the same but p does not
      37             :    have to be aligned.  wh_st writes the vector ushort to the 32-byte
      38             :    aligned / 32-byte sized location p as 16 ushorts.  wh_stu is the same
      39             :    but p does not have to be aligned.  In all these lane l will be at
      40             :    p[l].
      41             : 
      42             :    Note: gcc knows a __m256i may alias. */
      43             : 
      44           0 : static inline wh_t wh_ld( ushort const * p ) { return _mm256_load_si256( (__m256i const *)p ); } /* aligned load */
      45           0 : static inline void wh_st( ushort * p, wh_t i ) { _mm256_store_si256( (__m256i *)p, i ); } /* aligned store */
      46             : 
      47           0 : static inline wh_t wh_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); } /* unaligned load */
      48           0 : static inline void wh_stu( void * p, wh_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); } /* unaligned store */
      49             : 
      50             : /* Element operations */
      51             : 
      52             : /* wh_extract extracts the ushort in lane imm from the vector ushort.
      53             :    wh_insert returns the vector ushort formed by replacing the value in
      54             :    lane imm of a with the provided ushort.  imm should be a compile time
      55             :    constant in 0:15.  wh_extract_variable and wh_insert_variable are
      56             :    slower but the lane n does not have to be known at compile time
      57             :    (should still be in 0:15).
      58             : 
      59             :    Note: C99 TC3 allows type punning through a union. */
      60             : 
      61           0 : #define wh_extract(a,imm)  ((ushort)_mm256_extract_epi16( (a), (imm) ))   /* Return lane imm of a as a ushort */
      62           0 : #define wh_insert(a,imm,v) _mm256_insert_epi16( (a), (short)(v), (imm) ) /* Return a with lane imm replaced by v */
      63             : 
      64             : static inline ushort
      65           0 : wh_extract_variable( wh_t a, int n ) {
      66           0 :   union { __m256i m[1]; ushort h[16]; } t[1]; /* scratch viewable as one vector or as 16 ushorts */
      67           0 :   _mm256_store_si256( t->m, a );              /* spill all 16 lanes of a into the scratch */
      68           0 :   return (ushort)t->h[n];                     /* read lane n back; n is not range checked here */
      69           0 : }
      70             : 
      71             : static inline wh_t
      72           0 : wh_insert_variable( wh_t a, int n, ushort v ) {
      73           0 :   union { __m256i m[1]; ushort h[16]; } t[1]; /* scratch viewable as one vector or as 16 ushorts */
      74           0 :   _mm256_store_si256( t->m, a );              /* spill all 16 lanes of a into the scratch */
      75           0 :   t->h[n] = v;                                /* overwrite lane n; n is not range checked here */
      76           0 :   return _mm256_load_si256( t->m );           /* reload the modified lanes as a vector */
      77           0 : }
      78             : 
      79             : /* Arithmetic operations */
      80             : 
      81             : #define wh_neg(a) _mm256_sub_epi16( _mm256_setzero_si256(), (a) ) /* [ -a0  -a1  ... -a15 ]  (twos complement handling) */
      82             : #define wh_abs(a) (a)                                             /* [ |a0| |a1| ... |a15| ] (identity as lanes are unsigned) */
      83             : 
      84             : #define wh_min(a,b)   _mm256_min_epu16(   (a), (b) ) /* [ min(a0,b0)  min(a1,b1)  ... min(a15,b15)  ] */
      85             : #define wh_max(a,b)   _mm256_max_epu16(   (a), (b) ) /* [ max(a0,b0)  max(a1,b1)  ... max(a15,b15)  ] */
      86             : #define wh_add(a,b)   _mm256_add_epi16(   (a), (b) ) /* [ a0+b0       a1+b1       ... a15+b15       ] */
      87             : #define wh_sub(a,b)   _mm256_sub_epi16(   (a), (b) ) /* [ a0-b0       a1-b1       ... a15-b15       ] */
      88             : #define wh_mullo(a,b) _mm256_mullo_epi16( (a), (b) ) /* [ a0*b0       a1*b1       ... a15*b15       ] */
      89             : #define wh_mulhi(a,b) _mm256_mulhi_epu16( (a), (b) ) /* [ (a0*b0)>>16 (a1*b1)>>16 ... (a15*b15)>>16 ] */
      90             : #define wh_mul(a,b)   wh_mullo((a),(b))              /* Alias for wh_mullo (low 16 bits of each product) */
      91             : 
      92             : /* Binary operations */
      93             : 
      94             : /* Note: wh_shl/wh_shr is an unsigned left/right shift by imm bits; imm
      95             :    must be a compile time constant in [0,15].  The variable variants are
      96             :    slower but do not require the shift amount to be known at compile
      97             :    time (should still be in [0,15]). */
      98             : 
      99             : #define wh_not(a) _mm256_xor_si256( _mm256_set1_epi16( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a15 ] */
     100             : 
     101             : #define wh_shl(a,imm)  _mm256_slli_epi16( (a), (imm) ) /* [ a0<<imm a1<<imm ... a15<<imm ] */
     102             : #define wh_shr(a,imm)  _mm256_srli_epi16( (a), (imm) ) /* [ a0>>imm a1>>imm ... a15>>imm ] */
     103             : 
     104             : #define wh_shl_variable(a,n) _mm256_sll_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
     105             : #define wh_shr_variable(a,n) _mm256_srl_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
     106             : 
     107             : #define wh_shl_vector(a,b)   _mm256_sllv_epi16( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a15<<b15 ] NOTE(review): sllv_epi16 is an AVX-512BW/VL intrinsic, not AVX2; confirm target support */
     108             : #define wh_shr_vector(a,b)   _mm256_srlv_epi16( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a15>>b15 ] NOTE(review): srlv_epi16 is an AVX-512BW/VL intrinsic, not AVX2; confirm target support */
     109             : 
     110             : #define wh_and(a,b)    _mm256_and_si256(    (a), (b) ) /* [   a0 &b0    a1& b1 ...   a15& b15 ] */
     111             : #define wh_andnot(a,b) _mm256_andnot_si256( (a), (b) ) /* [ (~a0)&b0  (~a1)&b1 ... (~a15)&b15 ] */
     112           0 : #define wh_or(a,b)     _mm256_or_si256(     (a), (b) ) /* [   a0 |b0    a1 |b1 ...   a15 |b15 ] */
     113             : #define wh_xor(a,b)    _mm256_xor_si256(    (a), (b) ) /* [   a0 ^b0    a1 ^b1 ...   a15 ^b15 ] */
     114             : 
     115             : /* wh_rol(x,n) returns wh( rotate_left (x0,n), rotate_left (x1,n), ... )
     116             :    wh_ror(x,n) returns wh( rotate_right(x0,n), rotate_right(x1,n), ... ) */
     117             : 
     118           0 : static inline wh_t wh_rol( wh_t a, int imm ) { return wh_or( wh_shl( a, imm & 15 ), wh_shr( a, (-imm) & 15 ) ); } /* imm is reduced mod 16; (-imm)&15 is the complementary shift */
     119           0 : static inline wh_t wh_ror( wh_t a, int imm ) { return wh_or( wh_shr( a, imm & 15 ), wh_shl( a, (-imm) & 15 ) ); } /* imm is reduced mod 16; (-imm)&15 is the complementary shift */
     120             : 
     121           0 : static inline wh_t wh_rol_variable( wh_t a, int n ) { return wh_or( wh_shl_variable( a, n&15 ), wh_shr_variable( a, (-n)&15 ) ); } /* as wh_rol but built on the _variable shifts */
     122           0 : static inline wh_t wh_ror_variable( wh_t a, int n ) { return wh_or( wh_shr_variable( a, n&15 ), wh_shl_variable( a, (-n)&15 ) ); } /* as wh_ror but built on the _variable shifts */
     123             : 
     124             : /* Logical operations */
     125             : 
     126           0 : #define wh_eq(a,b) _mm256_cmpeq_epi16( (a), (b) )                                              /* [ a0==b0 a1==b1 ... a15==b15 ] (lane is all ones if true, 0 if false) */
     127           0 : #define wh_ne(a,b) _mm256_xor_si256( _mm256_set1_epi16( -1 ), _mm256_cmpeq_epi16( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a15!=b15 ] (bitwise complement of wh_eq) */

Generated by: LCOV version 1.14