LCOV - code coverage report
Current view: top level - util/simd - fd_sse_vi.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 66 74 89.2 %
Date: 2025-01-08 12:08:44 Functions: 15 119 12.6 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_sse_h
       2             : #error "Do not include this directly; use fd_sse.h"
       3             : #endif
       4             : 
       5             : /* Vector int API *****************************************************/
       6             : 
       7             : /* A vi_t is a vector where each 32-bit wide lane holds a signed 32-bit
       8             :    twos-complement integer (an "int").  These mirror vc and vf as much
       9             :    as possible.
      10             : 
      11             :    These mirror the other APIs as much as possible.  Macros are
      12             :    preferred over static inlines when it is possible to do it robustly
      13             :    to reduce the risk of the compiler mucking it up. */
      14             : 
      15    74121267 : #define vi_t __m128i
      16             : 
      17             : /* Constructors */
      18             : 
      19             : /* Given the int values, return ... */
      20             : 
      21      589824 : #define vi(i0,i1,i2,i3) _mm_setr_epi32( (i0), (i1), (i2), (i3) ) /* [ i0 i1 i2 i3 ] */
      22             : 
      23             : #define vi_bcast(i0) _mm_set1_epi32( (i0) ) /* [ i0 i0 i0 i0 ] */
      24             : 
      25             : static inline vi_t /* [ i0 i1 i0 i1 ] */
      26      196608 : vi_bcast_pair( int i0, int i1 ) {
      27      196608 :   return _mm_setr_epi32( i0, i1, i0, i1 );
      28      196608 : }
      29             : 
      30             : static inline vi_t /* [ i0 i0 i1 i1 ] */
      31      196608 : vi_bcast_wide( int i0, int i1 ) {
      32      196608 :   return _mm_setr_epi32( i0, i0, i1, i1 );
      33      196608 : }
      34             : 
      35             : /* vi_permute returns [ i(imm_i0) i(imm_i1) i(imm_i2) i(imm_i3) ].
      36             :    imm_i* should be compile time constants in 0:3. */
      37             : 
      38             : #define vi_permute(x,imm_i0,imm_i1,imm_i2,imm_i3) _mm_shuffle_epi32( (x), _MM_SHUFFLE( (imm_i3), (imm_i2), (imm_i1), (imm_i0) ) )
      39             : 
      40             : /* Predefined constants */
      41             : 
      42             : #define vi_zero() _mm_setzero_si128() /* Return [ 0 0 0 0 ] */
      43    71761971 : #define vi_one()  _mm_set1_epi32( 1 ) /* Return [ 1 1 1 1 ] */
      44             : 
      45             : /* Memory operations */
      46             : 
      47             : /* vi_ld returns the 4 ints at the 16-byte aligned / 16-byte sized
      48             :    location p as a vector int.  vi_ldu is the same but p does not have
      49             :    to be aligned.  vi_st writes the vector int to the 16-byte aligned /
      50             :    16-byte sized location p as 4 ints.  vi_stu is the same but p does
      51             :    not have to be aligned.  In all these lane l will be at p[l].  FIXME:
      52             :    USE ATTRIBUTES ON P PASSED TO THESE?
      53             : 
      54             :    Note: gcc knows a __m128i may alias. */
      55             : 
      56    71761971 : static inline vi_t vi_ld( int const * p ) { return _mm_load_si128(  (__m128i const *)p ); }
      57    71761971 : static inline void vi_st( int * p, vi_t i ) { _mm_store_si128(  (__m128i *)p, i ); }
      58             : 
      59   287047884 : static inline vi_t vi_ldu( void const * p ) { return _mm_loadu_si128( (__m128i const *)p ); }
      60   287047884 : static inline void vi_stu( void * p, vi_t i ) { _mm_storeu_si128( (__m128i *)p, i ); }
      61             : 
      62             : /* vi_ldif is an optimized equivalent to vi_notczero(c,vi_ldu(p)) (may
      63             :    have different behavior if c is not a proper vector conditional).  It
      64             :    is provided for symmetry with the vi_stif operation.  vi_stif stores
      65             :    x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
      66             :    Undefined behavior if c is not a proper vector conditional. */
      67             : 
      68             : #define vi_ldif(c,p)   _mm_maskload_epi32( (p),(c))
      69             : #define vi_stif(c,p,x) _mm_maskstore_epi32((p),(c),(x))
      70             : 
      71             : /* Element operations */
      72             : 
      73             : /* vi_extract extracts the int in lane imm from the vector int as an int.
      74             :    vi_insert returns the vector int formed by replacing the value in
      75             :    lane imm of a with the provided int.  imm should be a compile time
      76             :    constant in 0:3.  vi_extract_variable and vi_insert_variable are
      77             :    slower but the lane n does not have to be known at compile time
      78             :    (should be in 0:3).
      79             : 
      80             :    Note: C99 TC3 allows type punning through a union. */
      81             : 
      82   287047884 : #define vi_extract(a,imm)  _mm_extract_epi32( (a), (imm) )
      83   287047884 : #define vi_insert(a,imm,v) _mm_insert_epi32( (a), (v), (imm) )
      84             : 
      85             : static inline int /* Returns the int in lane n of a */
      86   287047884 : vi_extract_variable( vi_t a, int n ) {
      87   287047884 :   union { __m128i m[1]; int i[4]; } t[1]; /* Scratch overlaying one __m128i with 4 ints */
      88   287047884 :   _mm_store_si128( t->m, a );             /* Spill a to the scratch */
      89   287047884 :   return t->i[n];                         /* Read lane n back (n is not range checked here) */
      90   287047884 : }
      91             : 
      92             : static inline vi_t /* Returns a with lane n replaced by v */
      93   287047884 : vi_insert_variable( vi_t a, int n, int v ) {
      94   287047884 :   union { __m128i m[1]; int i[4]; } t[1]; /* Scratch overlaying one __m128i with 4 ints */
      95   287047884 :   _mm_store_si128( t->m, a );             /* Spill a to the scratch */
      96   287047884 :   t->i[n] = v;                            /* Overwrite lane n (n is not range checked here) */
      97   287047884 :   return _mm_load_si128( t->m );          /* Reload the modified lanes as a vector */
      98   287047884 : }
      99             : 
     100             : /* Given [a0 a1 a2 a3] and/or [b0 b1 b2 b3], return ... */
     101             : 
     102             : /* Arithmetic operations */
     103             : 
     104             : #define vi_neg(a) _mm_sub_epi32( _mm_setzero_si128(), (a) ) /* [ -a0  -a1  ... -a3  ] (twos complement handling) */
     105             : #define vi_abs(a) _mm_abs_epi32( (a) )                      /* [ |a0| |a1| ... |a3| ] (twos complement handling) */
     106             : 
     107             : #define vi_min(a,b) _mm_min_epi32(   (a), (b) ) /* [ min(a0,b0) min(a1,b1) ... min(a3,b3) ] */
     108             : #define vi_max(a,b) _mm_max_epi32(   (a), (b) ) /* [ max(a0,b0) max(a1,b1) ... max(a3,b3) ] */
     109             : #define vi_add(a,b) _mm_add_epi32(   (a), (b) ) /* [ a0 +b0     a1 +b1     ... a3 +b3     ] */
     110             : #define vi_sub(a,b) _mm_sub_epi32(   (a), (b) ) /* [ a0 -b0     a1 -b1     ... a3 -b3     ] */
     111             : #define vi_mul(a,b) _mm_mullo_epi32( (a), (b) ) /* [ a0 *b0     a1 *b1     ... a3 *b3     ] */
     112             : 
     113             : /* Binary operations */
     114             : 
     115             : /* Note: vi_shl/vi_shr/vi_shru is a left/signed right/unsigned right
     116             :    shift by imm bits; imm should be a compile time constant in 0:31.
     117             :    The variable variants are slower but do not require the shift amount
     118             :    to be known at compile time (should still be in 0:31). */
     119             : 
     120             : #define vi_not(a) _mm_xor_si128( _mm_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a3 ] */
     121             : 
     122             : #define vi_shl(a,imm)  _mm_slli_epi32( (a), (imm) ) /* [ a0<<imm a1<<imm ... a3<<imm ] */
     123             : #define vi_shr(a,imm)  _mm_srai_epi32( (a), (imm) ) /* [ a0>>imm a1>>imm ... a3>>imm ] (treat a as signed) */
     124             : #define vi_shru(a,imm) _mm_srli_epi32( (a), (imm) ) /* [ a0>>imm a1>>imm ... a3>>imm ] (treat a as unsigned) */
     125             : 
     126             : #define vi_shl_variable(a,n)  _mm_sll_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ) /* [ a0<<n a1<<n ... a3<<n ] */
     127             : #define vi_shr_variable(a,n)  _mm_sra_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ) /* [ a0>>n a1>>n ... a3>>n ] (treat a as signed) */
     128             : #define vi_shru_variable(a,n) _mm_srl_epi32( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ) /* [ a0>>n a1>>n ... a3>>n ] (treat a as unsigned) */
     129             : 
     130             : #define vi_shl_vector(a,b)  _mm_sllv_epi32( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a3<<b3 ] */
     131             : #define vi_shr_vector(a,b)  _mm_srav_epi32( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a3>>b3 ] (treat a as signed) */
     132             : #define vi_shru_vector(a,b) _mm_srlv_epi32( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a3>>b3 ] (treat a as unsigned) */
     133             : 
     134             : #define vi_and(a,b)    _mm_and_si128(    (a), (b) ) /* [   a0 &b0    a1& b1 ...   a3& b3 ] */
     135             : #define vi_andnot(a,b) _mm_andnot_si128( (a), (b) ) /* [ (~a0)&b0  (~a1)&b1 ... (~a3)&b3 ] */
     136    20971520 : #define vi_or(a,b)     _mm_or_si128(     (a), (b) ) /* [   a0 |b0    a1 |b1 ...   a3 |b3 ] */
     137             : #define vi_xor(a,b)    _mm_xor_si128(    (a), (b) ) /* [   a0 ^b0    a1 ^b1 ...   a3 ^b3 ] */
     138             : 
     139             : /* vi_rol(x,n) returns vi( rotate_left (x0,n), rotate_left (x1,n), ... )
     140             :    vi_ror(x,n) returns vi( rotate_right(x0,n), rotate_right(x1,n), ... ) */
     141             : 
     142             : #if FD_HAS_AVX512 /* Use the rotate intrinsics directly */
     143             : #define vi_rol(a,imm)  _mm_rol_epi32( (a), (imm) )
     144             : #define vi_ror(a,imm)  _mm_ror_epi32( (a), (imm) )
     145             : #else /* Build the rotate from a left shift and an unsigned right shift, counts masked to 0:31 */
     146     4194304 : static inline vi_t vi_rol( vi_t a, int imm ) { return vi_or( vi_shl(  a, imm & 31 ), vi_shru( a, (-imm) & 31 ) ); }
     147     4194304 : static inline vi_t vi_ror( vi_t a, int imm ) { return vi_or( vi_shru( a, imm & 31 ), vi_shl(  a, (-imm) & 31 ) ); }
     148             : #endif /* FD_HAS_AVX512 */
     149             : 
     150     6291456 : static inline vi_t vi_rol_variable( vi_t a, int n ) { return vi_or( vi_shl_variable(  a, n&31 ), vi_shru_variable( a, (-n)&31 ) ); }
     151     6291456 : static inline vi_t vi_ror_variable( vi_t a, int n ) { return vi_or( vi_shru_variable( a, n&31 ), vi_shl_variable(  a, (-n)&31 ) ); }
     152             : 
     153           0 : static inline vi_t vi_rol_vector( vi_t a, vi_t b ) { /* [ rotate_left(a0,b0) rotate_left(a1,b1) ... rotate_left(a3,b3) ] */
     154           0 :   vi_t m = vi_bcast( 31 ); /* [ 31 31 31 31 ], keeps the low 5 bits of each per-lane count */
     155           0 :   return vi_or( vi_shl_vector(  a, vi_and( b, m ) ), vi_shru_vector( a, vi_and( vi_neg( b ), m ) ) );
     156           0 : }
     157             : 
     158           0 : static inline vi_t vi_ror_vector( vi_t a, vi_t b ) { /* [ rotate_right(a0,b0) rotate_right(a1,b1) ... rotate_right(a3,b3) ] */
     159           0 :   vi_t m = vi_bcast( 31 ); /* [ 31 31 31 31 ], keeps the low 5 bits of each per-lane count */
     160           0 :   return vi_or( vi_shru_vector( a, vi_and( b, m ) ), vi_shl_vector(  a, vi_and( vi_neg( b ), m ) ) );
     161           0 : }
     162             : 
     163             : /* Logical operations */
     164             : 
     165             : #define vi_lnot(a)     _mm_cmpeq_epi32( (a), _mm_setzero_si128() ) /* [  !a0  !a1 ...  !a3 ] */
     166             : #define vi_lnotnot(a)                                              /* [ !!a0 !!a1 ... !!a3 ] */ \
     167             :   _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( (a), _mm_setzero_si128() ) )
     168             : 
     169             : #define vi_eq(a,b) _mm_cmpeq_epi32( (a), (b) )                                        /* [ a0==b0 a1==b1 ... a3==b3 ] */
     170             : #define vi_gt(a,b) _mm_cmpgt_epi32( (a), (b) )                                        /* [ a0> b0 a1> b1 ... a3> b3 ] */
     171             : #define vi_lt(a,b) _mm_cmpgt_epi32( (b), (a) )                                        /* [ a0< b0 a1< b1 ... a3< b3 ] */
     172             : #define vi_ne(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a3!=b3 ] */
     173             : #define vi_ge(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpgt_epi32( (b), (a) ) ) /* [ a0>=b0 a1>=b1 ... a3>=b3 ] */
     174             : #define vi_le(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpgt_epi32( (a), (b) ) ) /* [ a0<=b0 a1<=b1 ... a3<=b3 ] */
     175             : 
     176             : /* Conditional operations */
     177             : 
     178             : #define vi_czero(c,f)    _mm_andnot_si128( (c), (f) ) /* [ c0? 0:f0 c1? 0:f1 ... c3? 0:f3 ] */
     179             : #define vi_notczero(c,f) _mm_and_si128(    (c), (f) ) /* [ c0?f0: 0 c1?f1: 0 ... c3?f3: 0 ] */
     180             : 
     181             : #define vi_if(c,t,f) _mm_blendv_epi8(  (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c3?t3:f3 ] */
     182             : 
     183             : /* Conversion operations */
     184             : 
     185             : /* Summarizing:
     186             : 
     187             :    vi_to_vc(a)               returns [ !!a0 !!a1 ... !!a3 ]
     188             : 
     189             :    vi_to_vu(a)               returns [ (uint)a0 (uint)a1 ... (uint)a3 ]
     190             : 
     191             :    vi_to_vf(a)               returns [ (float)a0 (float)a1 ... (float)a3 ]
     192             : 
     193             :    vi_to_vd(a,imm_i0,imm_i1) returns [ (double)a(imm_i0) (double)a(imm_i1) ]
     194             : 
     195             :    vi_to_vl(a,imm_i0,imm_i1) returns [ (long)a(imm_i0) (long)a(imm_i1) ]
     196             : 
     197             :    vi_to_vv(a,imm_i0,imm_i1) returns [ (ulong)a(imm_i0) (ulong)a(imm_i1) ]
     198             : 
     199             :    where imm_i* should be a compile time constant in 0:3.
     200             : 
     201             :    The raw variants just treat the raw bits as the corresponding vector
     202             :    type.  For vi_to_vc_raw, the user promises vi contains a proper
     203             :    vector conditional (i.e. 0 or -1 in each lane).  vi_to_vf_raw is
     204             :    useful for doing advanced bit tricks on floating point values.  The
     205             :    others are probably dubious but are provided for completeness. */
     206             : 
     207             : #define vi_to_vc(a)               _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( (a), _mm_setzero_si128() ) )
     208             : #define vi_to_vf(a)               _mm_cvtepi32_ps( (a) )
     209             : #define vi_to_vu(a)               (a)
     210             : #define vi_to_vd(a,imm_i0,imm_i1) _mm_cvtepi32_pd   ( _mm_shuffle_epi32( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )
     211             : #define vi_to_vl(a,imm_i0,imm_i1) _mm_cvtepi32_epi64( _mm_shuffle_epi32( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )
     212             : #define vi_to_vv(a,imm_i0,imm_i1) _mm_cvtepi32_epi64( _mm_shuffle_epi32( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )
     213             : 
     214             : #define vi_to_vc_raw(a) (a)
     215             : #define vi_to_vf_raw(a) _mm_castsi128_ps( (a) )
     216             : #define vi_to_vu_raw(a) (a)
     217             : #define vi_to_vd_raw(a) _mm_castsi128_pd( (a) )
     218             : #define vi_to_vl_raw(a) (a)
     219             : #define vi_to_vv_raw(a) (a)
     220             : 
     221             : /* Reduction operations */
     222             : 
     223             : static inline vi_t
     224      196608 : vi_sum_all( vi_t x ) { /* Returns vi_bcast( sum( x ) ) */
     225      196608 :   x = _mm_hadd_epi32( x, x );    /* x01 x23 ... */
     226      196608 :   return _mm_hadd_epi32( x, x ); /* xsum ...    */
     227      196608 : }
     228             : 
     229             : static inline vi_t
     230      196608 : vi_min_all( vi_t x ) { /* Returns vi_bcast( min( x ) ) */
     231      196608 :   __m128i y;
     232      196608 :   y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x2  x3  x0  x1 */
     233      196608 :   x = _mm_min_epi32( x, y );                             /* x02 x13 ...    */
     234      196608 :   y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x13 x02 ...    */
     235      196608 :   x = _mm_min_epi32( x, y );                             /* xmin ...       */
     236      196608 :   return x;
     237      196608 : }
     238             : 
     239             : static inline vi_t
     240      196608 : vi_max_all( vi_t x ) { /* Returns vi_bcast( max( x ) ) */
     241      196608 :   __m128i y;
     242      196608 :   y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x2  x3  x0  x1 */
     243      196608 :   x = _mm_max_epi32( x, y );                             /* x02 x13 ...    */
     244      196608 :   y = _mm_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x13 x02 ...    */
     245      196608 :   x = _mm_max_epi32( x, y );                             /* xmax ...       */
     246      196608 :   return x;
     247      196608 : }
     248             : 
     249             : /* Misc operations */
     250             : 
     251             : /* vi_gather(b,i) returns [ b[i(0)] b[i(1)] ... b[i(3)] ] where b is a
     252             :    "int const *"  and i is a vi_t. */
     253             : 
     254    71761971 : #define vi_gather(b,i) _mm_i32gather_epi32( (b), (i), 4 )
     255             : 
     256             : /* vi_transpose_4x4 transposes the 4x4 matrix stored in vi_t r0,r1,r2,r3
     257             :    and stores the result in 4x4 matrix vi_t c0,c1,c2,c3.  All
     258             :    c0,c1,c2,c3 should be different for a well defined result.
     259             :    Otherwise, in-place operation and/or using the same vi_t to specify
     260             :    multiple rows of r is fine. */
     261             : 
     262      196608 : #define vi_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do {                                                                   \
     263      196608 :     vi_t _vi_transpose_r0 = (r0); vi_t _vi_transpose_r1 = (r1); vi_t _vi_transpose_r2 = (r2); vi_t _vi_transpose_r3 = (r3); \
     264      196608 :     vi_t _vi_transpose_t;                                                                                                   \
     265      196608 :     /* Transpose 2x2 blocks */                                                                                              \
     266      196608 :     _vi_transpose_t = _vi_transpose_r0; _vi_transpose_r0 = _mm_unpacklo_epi32( _vi_transpose_t,  _vi_transpose_r2 );        \
     267      196608 :     /**/                                _vi_transpose_r2 = _mm_unpackhi_epi32( _vi_transpose_t,  _vi_transpose_r2 );        \
     268      196608 :     _vi_transpose_t = _vi_transpose_r1; _vi_transpose_r1 = _mm_unpacklo_epi32( _vi_transpose_t,  _vi_transpose_r3 );        \
     269      196608 :     /**/                                _vi_transpose_r3 = _mm_unpackhi_epi32( _vi_transpose_t,  _vi_transpose_r3 );        \
     270      196608 :     /* Transpose 1x1 blocks */                                                                                              \
     271      196608 :     /**/                                (c0)             = _mm_unpacklo_epi32( _vi_transpose_r0, _vi_transpose_r1 );        \
     272      196608 :     /**/                                (c1)             = _mm_unpackhi_epi32( _vi_transpose_r0, _vi_transpose_r1 );        \
     273      196608 :     /**/                                (c2)             = _mm_unpacklo_epi32( _vi_transpose_r2, _vi_transpose_r3 );        \
     274      196608 :     /**/                                (c3)             = _mm_unpackhi_epi32( _vi_transpose_r2, _vi_transpose_r3 );        \
     275      196608 :   } while(0)

Generated by: LCOV version 1.14