LCOV - code coverage report
Current view: top level - util/simd - fd_avx_ws.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 5 28 17.9 %
Date: 2025-01-08 12:08:44 Functions: 1 6410 0.1 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_avx_h
       2             : #error "Do not include this directly; use fd_avx.h"
       3             : #endif
       4             : 
       5             : /* Vector short API ***************************************************/
       6             : 
       7             : /* A ws_t is a vector where each 16-bit wide lane holds a signed 16-bit
       8             :    integer (a "short").
       9             : 
      10             :    These mirror the other APIs as much as possible.  Macros are
      11             :    preferred over static inlines when it is possible to do it robustly
      12             :    to reduce the risk of the compiler mucking it up. */
      13             : 
      14         564 : #define ws_t __m256i /* vector short: one __m256i viewed as sixteen 16-bit signed lanes */
      15             : 
      16             : /* Constructors */
      17             : 
      18             : /* Given the 16 short values h0 ... h15 (each is cast to short), return the vector short [ h0 h1 ... h15 ]. */
      19             : 
      20             : #define ws(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9,h10,h11,h12,h13,h14,h15) /* [ h0 h1 ... h15 ] */ \
      21             :   _mm256_setr_epi16( (short)( h0), (short)( h1), (short)( h2), (short)( h3),                       \
      22             :                      (short)( h4), (short)( h5), (short)( h6), (short)( h7),                       \
      23             :                      (short)( h8), (short)( h9), (short)(h10), (short)(h11),                       \
      24             :                      (short)(h12), (short)(h13), (short)(h14), (short)(h15) )
      25             : 
      26             : #define ws_bcast(h0) _mm256_set1_epi16( (short)(h0) ) /* [ h0 h0 ... h0 ] (h0 is cast to short and replicated into every lane) */
      27             : 
      28             : /* Predefined constants */
      29             : 
      30          12 : #define ws_zero() _mm256_setzero_si256() /* Return [ 0 0 ... 0 ] (every lane zero) */
      31             : #define ws_one()  _mm256_set1_epi16( 1 ) /* Return [ 1 1 ... 1 ] (every lane one) */
      32             : 
      33             : /* Memory operations */
      34             : 
      35             : /* ws_ld returns the 16 shorts at the 32-byte aligned / 32-byte sized
      36             :    location p as a vector short.  ws_ldu is the same but p does not
      37             :    have to be aligned.  ws_st writes the vector short to the 32-byte
      38             :    aligned / 32-byte sized location p as 16 shorts.  ws_stu is the same
      39             :    but p does not have to be aligned.  In all these lane l will be at
      40             :    p[l].
      41             : 
      42             :    Note: gcc knows a __m256i may alias. */
      43             : 
      44         282 : static inline ws_t ws_ld( short const * p ) { return _mm256_load_si256( (__m256i const *)p ); } /* aligned load: p must be 32-byte aligned */
      45           0 : static inline void ws_st( short * p, ws_t i ) { _mm256_store_si256( (__m256i *)p, i ); } /* aligned store: p must be 32-byte aligned */
      46             : 
      47           0 : static inline ws_t ws_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); } /* unaligned load: p may have any alignment */
      48           0 : static inline void ws_stu( void * p, ws_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); } /* unaligned store: p may have any alignment */
      49             : 
      50             : /* Element operations */
      51             : 
      52             : /* ws_extract extracts the short in lane imm from the vector short.
      53             :    ws_insert returns the vector short formed by replacing the value in
      54             :    lane imm of a with the provided short.  imm should be a compile time
      55             :    constant in 0:15.  ws_extract_variable and ws_insert_variable are
      56             :    slower but the lane n does not have to be known at compile time
      57             :    (should still be in 0:15).
      58             : 
      59             :    Note: C99 TC3 allows type punning through a union. */
      60             : 
      61           0 : #define ws_extract(a,imm)  ((short)_mm256_extract_epi16( (a), (imm) )) /* value of lane imm of a, cast to short */
      62           0 : #define ws_insert(a,imm,v) _mm256_insert_epi16( (a), (v), (imm) ) /* copy of a with lane imm replaced by v */
      63             : 
      64             : static inline short
      65           0 : ws_extract_variable( ws_t a, int n ) { /* return lane n of a; n is used as an array index without a range check */
      66           0 :   union { __m256i m[1]; short h[16]; } t[1]; /* scratch viewable either as one vector or as 16 shorts */
      67           0 :   _mm256_store_si256( t->m, a ); /* spill a into the scratch through the vector view */
      68           0 :   return (short)t->h[n]; /* read lane n back through the short view */
      69           0 : }
      70             : 
      71             : static inline ws_t
      72           0 : ws_insert_variable( ws_t a, int n, short v ) { /* return a copy of a with lane n set to v; n is used as an array index without a range check */
      73           0 :   union { __m256i m[1]; short h[16]; } t[1]; /* scratch viewable either as one vector or as 16 shorts */
      74           0 :   _mm256_store_si256( t->m, a ); /* spill a into the scratch through the vector view */
      75           0 :   t->h[n] = v; /* overwrite lane n through the short view */
      76           0 :   return _mm256_load_si256( t->m ); /* reload the modified scratch as a vector */
      77           0 : }
      78             : 
      79             : /* Arithmetic operations */
      80             : 
      81             : #define ws_neg(a) _mm256_sub_epi16( _mm256_setzero_si256(), (a) ) /* [ -a0  -a1  ... -a15  ] (twos complement handling) */
      82             : #define ws_abs(a) _mm256_abs_epi16( (a) )                         /* [ |a0| |a1| ... |a15| ] (twos complement handling) */
      83             : 
      84             : #define ws_min(a,b)   _mm256_min_epi16(   (a), (b) ) /* [ min(a0,b0)  min(a1,b1)  ... min(a15,b15)  ] */
      85             : #define ws_max(a,b)   _mm256_max_epi16(   (a), (b) ) /* [ max(a0,b0)  max(a1,b1)  ... max(a15,b15)  ] */
      86             : #define ws_add(a,b)   _mm256_add_epi16(   (a), (b) ) /* [ a0 +b0      a1 +b1      ... a15 +b15      ] */
      87             : #define ws_sub(a,b)   _mm256_sub_epi16(   (a), (b) ) /* [ a0 -b0      a1 -b1      ... a15 -b15      ] */
      88         282 : #define ws_mullo(a,b) _mm256_mullo_epi16( (a), (b) ) /* [ a0*b0       a1*b1       ... a15*b15       ] */
      89             : #define ws_mulhi(a,b) _mm256_mulhi_epi16( (a), (b) ) /* [ (a0*b0)>>16 (a1*b1)>>16 ... (a15*b15)>>16 ] */
      90             : #define ws_mul(a,b)   ws_mullo((a),(b))              /* alias for ws_mullo */
      91             : 
      92             : /* Binary operations */
      93             : 
      94             : /* Note: ws_shl/ws_shr/ws_shru is a left/signed right/unsigned right
      95             :    shift by imm bits; imm must be a compile time constant in [0,15].
      96             :    The variable variants are slower but do not require the shift amount
      97             :    to be known at compile time (should still be in [0,15]). */
      98             : 
      99             : #define ws_not(a) _mm256_xor_si256( _mm256_set1_epi16( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a15 ] */
     100             : 
     101             : #define ws_shl(a,imm)  _mm256_slli_epi16( (a), (imm) ) /* [ a0<<imm a1<<imm ... a15<<imm ] */
     102             : #define ws_shr(a,imm)  _mm256_srai_epi16( (a), (imm) ) /* [ a0>>imm a1>>imm ... a15>>imm ] (treat a as signed)*/
     103             : #define ws_shru(a,imm) _mm256_srli_epi16( (a), (imm) ) /* [ a0>>imm a1>>imm ... a15>>imm ] (treat a as unsigned) */
     104             : 
     105             : #define ws_shl_variable(a,n)  _mm256_sll_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ) /* [ a0<<n a1<<n ... a15<<n ] (n placed in the low 64 bits of a zeroed 128-bit count) */
     106             : #define ws_shr_variable(a,n)  _mm256_sra_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ) /* [ a0>>n a1>>n ... a15>>n ] (treat a as signed) */
     107             : #define ws_shru_variable(a,n) _mm256_srl_epi16( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ) /* [ a0>>n a1>>n ... a15>>n ] (treat a as unsigned) */
     108             : 
     109             : #define ws_shl_vector(a,b)  _mm256_sllv_epi16( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a15<<b15 ] NOTE(review): Intel's intrinsics guide lists the 16-bit sllv/srav/srlv forms under AVX-512BW+VL rather than AVX2 -- confirm target support before use */
     110             : #define ws_shr_vector(a,b)  _mm256_srav_epi16( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a15>>b15 ] (treat a as signed) */
     111             : #define ws_shru_vector(a,b) _mm256_srlv_epi16( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a15>>b15 ] (treat a as unsigned) */
     112             : 
     113         282 : #define ws_and(a,b)    _mm256_and_si256(    (a), (b) ) /* [   a0 &b0    a1& b1 ...   a15& b15 ] */
     114             : #define ws_andnot(a,b) _mm256_andnot_si256( (a), (b) ) /* [ (~a0)&b0  (~a1)&b1 ... (~a15)&b15 ] */
     115           0 : #define ws_or(a,b)     _mm256_or_si256(     (a), (b) ) /* [   a0 |b0    a1 |b1 ...   a15 |b15 ] */
     116             : #define ws_xor(a,b)    _mm256_xor_si256(    (a), (b) ) /* [   a0 ^b0    a1 ^b1 ...   a15 ^b15 ] */
     117             : 
     118             : /* ws_rol(x,n) returns ws( rotate_left (x0,n), rotate_left (x1,n), ... )
     119             :    ws_ror(x,n) returns ws( rotate_right(x0,n), rotate_right(x1,n), ... ) */
     120             : 
     121           0 : static inline ws_t ws_rol( ws_t a, int imm ) { return ws_or( ws_shl(  a, imm & 15 ), ws_shru( a, (-imm) & 15 ) ); } /* per lane: (a<<(imm&15)) | (a>>((-imm)&15)) with an unsigned right shift; only the low 4 bits of imm matter */
     122           0 : static inline ws_t ws_ror( ws_t a, int imm ) { return ws_or( ws_shru( a, imm & 15 ), ws_shl(  a, (-imm) & 15 ) ); } /* per lane: (a>>(imm&15)) | (a<<((-imm)&15)) with an unsigned right shift; only the low 4 bits of imm matter */
     123             : 
     124           0 : static inline ws_t ws_rol_variable( ws_t a, int n ) { return ws_or( ws_shl_variable(  a, n&15 ), ws_shru_variable( a, (-n)&15 ) ); } /* same combination as ws_rol but built from the _variable shifts; only the low 4 bits of n matter */
     125           0 : static inline ws_t ws_ror_variable( ws_t a, int n ) { return ws_or( ws_shru_variable( a, n&15 ), ws_shl_variable(  a, (-n)&15 ) ); } /* same combination as ws_ror but built from the _variable shifts; only the low 4 bits of n matter */
     126             : 
     127             : /* Logical operations */
     128             : 
     129           0 : #define ws_eq(a,b) _mm256_cmpeq_epi16( (a), (b) )                                              /* [ a0==b0 a1==b1 ... a15==b15 ] (lane is all 1 bits where true, all 0 bits where false) */
     130           0 : #define ws_ne(a,b) _mm256_xor_si256( _mm256_set1_epi16( -1 ), _mm256_cmpeq_epi16( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a15!=b15 ] (bitwise complement of the ws_eq result) */

Generated by: LCOV version 1.14