LCOV - code coverage report
Current view: top level - util/simd - fd_sse_vv.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 72 80 90.0 %
Date: 2025-01-08 12:08:44 Functions: 20 154 13.0 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_sse_h
       2             : #error "Do not include this directly; use fd_sse.h"
       3             : #endif
       4             : 
       5             : /* Vector ulong API ***************************************************/
       6             : 
       7             : /* A vv_t is a vector where each adjacent pair of 32-bit wide lanes
       8             :    (e.g. 0-1 / 2-3) holds an unsigned 64-bit integer (a "ulong").
       9             : 
      10             :    These mirror the other APIs as much as possible.  Macros are
      11             :    preferred over static inlines when it is possible to do it robustly
      12             :    to reduce the risk of the compiler mucking it up. */
      13             : 
      14   117768243 : #define vv_t __m128i
      15             : 
      16             : /* Constructors */
      17             : 
      18             : /* Given the long values, return ... */
      19             : 
      20      589824 : #define vv(v0,v1) _mm_set_epi64x( (long)(v1), (long)(v0) ) /* [ v0 v1 ] ... sigh ... backwards intel */
      21             : 
      22             : #define vv_bcast(v0) _mm_set1_epi64x( (long)(v0) ) /* [ v0 v0 ] */
      23             : 
      24             : /* vv_permute returns [ l(imm_i0) l(imm_i1) ].  imm_i* should be compile
      25             :    time constants in 0:1. */
      26             : 
      27      262144 : #define vv_permute( v, imm_i0, imm_i1 ) _mm_castpd_si128( _mm_permute_pd( _mm_castsi128_pd( (v) ), (imm_i0) + 2*(imm_i1) ) )
      28             : 
      29             : /* Predefined constants */
      30             : 
      31             : #define vv_zero() _mm_setzero_si128()   /* Return [ 0UL 0UL ] */
      32   116391987 : #define vv_one()  _mm_set1_epi64x( 1L ) /* Return [ 1UL 1UL ] */
      33             : 
      34             : /* Memory operations */
      35             : 
      36             : /* vv_ld returns the 2 ulongs at the 16-byte aligned / 16-byte sized
      37             :    location p as a vector ulong.  vv_ldu is the same but p does not have
      38             :    to be aligned.  vv_st writes the vector ulong to the 16-byte aligned /
      39             :    16-byte sized location p as 2 ulongs.  vv_stu is the same but p does
      40             :    not have to be aligned.  In all these 64-bit lane l will be at p[l].
      41             :    FIXME: USE ATTRIBUTES ON P PASSED TO THESE?
      42             : 
      43             :    Note: gcc knows a __m128i may alias. */
      44             : 
      45   116391987 : static inline vv_t vv_ld( ulong const * p ) { return _mm_load_si128(  (__m128i const *)p ); }
      46   116391987 : static inline void vv_st( ulong * p, vv_t i ) { _mm_store_si128(  (__m128i *)p, i ); }
      47             : 
      48   232783974 : static inline vv_t vv_ldu( void const * p ) { return _mm_loadu_si128( (__m128i const *)p ); }
      49   232783974 : static inline void vv_stu( void * p, vv_t i ) { _mm_storeu_si128( (__m128i *)p, i ); }
      50             : 
      51             : /* vv_ldif is an optimized equivalent to vv_notczero(c,vv_ldu(p)) (may
      52             :    have different behavior if c is not a proper vector conditional).  It
      53             :    is provided for symmetry with the vv_stif operation.  vv_stif stores
      54             :    x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
      55             :    Undefined behavior if c is not a proper vector conditional. */
      56             : 
      57             : #define vv_ldif(c,p)   _mm_maskload_epi64( (p),(c))
      58             : #define vv_stif(c,p,x) _mm_maskstore_epi64((p),(c),(x))
      59             : 
      60             : /* Element operations */
      61             : 
      62             : /* vv_extract extracts the ulong in lane imm from the vector ulong as a
      63             :    ulong.  vv_insert returns the vector ulong formed by replacing the
      64             :    value in lane imm of a with the provided ulong.  imm should be a
      65             :    compile time constant in 0:1.  vv_extract_variable and
      66             :    vv_insert_variable are slower but the lane n does not have to be
      67             :    known at compile time (should still be in 0:1).
      68             : 
      69             :    Note: C99 TC3 allows type punning through a union. */
      70             : 
      71   233570406 : #define vv_extract(a,imm)  ((ulong)_mm_extract_epi64( (a), (imm) ))
      72             : 
      73   232783974 : #define vv_insert(a,imm,v) _mm_insert_epi64( (a), (long)(v), (imm) )
      74             : 
      75             : static inline ulong
      76   232783974 : vv_extract_variable( vv_t a, int n ) {
      77   232783974 :   union { __m128i m[1]; ulong u[2]; } t[1];
      78   232783974 :   _mm_store_si128( t->m, a );
      79   232783974 :   return t->u[n];
      80   232783974 : }
      81             : 
      82             : static inline vv_t
      83   232783974 : vv_insert_variable( vv_t a, int n, ulong v ) {
      84   232783974 :   union { __m128i m[1]; ulong u[2]; } t[1];
      85   232783974 :   _mm_store_si128( t->m, a );
      86   232783974 :   t->u[n] = v;
      87   232783974 :   return _mm_load_si128( t->m );
      88   232783974 : }
      89             : 
      90             : /* Given [a0 a1] and/or [b0 b1], return ... */
      91             : 
      92             : /* Arithmetic operations */
      93             : 
      94             : #define vv_neg(a) _mm_sub_epi64( _mm_setzero_si128(), (a) ) /* [ -a0  -a1  ] */
      95             : #define vv_abs(a) (a)                                       /* [ |a0| |a1| ] */
      96             : 
      97             : /* Note: _mm_{min,max}_epu64 are missing pre AVX-512.  We emulate these
      98             :    on pre AVX-512 targets below (and use the AVX-512 versions if
      99             :    possible).  Likewise, there is no _mm_mullo_epi64 pre AVX-512.  Since
     100             :    this is not cheap to emulate, we do not provide a vv_mul for the time
     101             :    being (we could consider exposing it on AVX-512 targets though).
     102             :    There is a 64L*64L->64 multiply (where the lower 32-bits will be zero
     103             :    extended to 64-bits beforehand) though and that is very useful.  So
     104             :    we do provide that. */
     105             : 
     106      196608 : #define vv_add(a,b)    _mm_add_epi64(   (a), (b) ) /* [ a0 +b0     a1 +b1     ] */
     107             : #define vv_sub(a,b)    _mm_sub_epi64(   (a), (b) ) /* [ a0 -b0     a1 -b1     ] */
     108             : //#define vv_mul(a,b)  _mm_mullo_epi64( (a), (b) ) /* [ a0 *b0     a1 *b1     ] */
     109             : #define vv_mul_ll(a,b) _mm_mul_epu32(   (a), (b) ) /* [ a0l*b0l    a1l*b1l    ] */
     110             : 
     111             : /* Binary operations */
     112             : 
     113             : /* Note: vv_shl/vv_shr is a left/right shift by imm bits; imm
     114             :    should be a compile time constant in 0:63.  The variable variants are
     115             :    slower but do not require the shift amount to be known at compile
     116             :    time (should still be in 0:63). */
     117             : 
     118             : #define vv_not(a) _mm_xor_si128( _mm_set1_epi64x( -1L ), (a) ) /* [ ~a0 ~a1 ] */
     119             : 
     120             : #define vv_shl(a,imm) _mm_slli_epi64( (a), (imm) ) /* [ a0<<imm a1<<imm ] */
     121             : #define vv_shr(a,imm) _mm_srli_epi64( (a), (imm) ) /* [ a0>>imm a1>>imm ] */
     122             : 
     123             : #define vv_shl_variable(a,n) _mm_sll_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
     124             : #define vv_shr_variable(a,n) _mm_srl_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
     125             : 
     126             : #define vv_shl_vector(a,b) _mm_sllv_epi64( (a), (b) ) /* [ a0<<b0 a1<<b1 ] */
     127             : #define vv_shr_vector(a,b) _mm_srlv_epi64( (a), (b) ) /* [ a0>>b0 a1>>b1 ] */
     128             : 
     129             : #define vv_and(a,b)    _mm_and_si128(    (a), (b) ) /* [   a0 &b0    a1& b1 ] */
     130             : #define vv_andnot(a,b) _mm_andnot_si128( (a), (b) ) /* [ (~a0)&b0  (~a1)&b1 ] */
     131    42074112 : #define vv_or(a,b)     _mm_or_si128(     (a), (b) ) /* [   a0 |b0    a1 |b1 ] */
     132             : #define vv_xor(a,b)    _mm_xor_si128(    (a), (b) ) /* [   a0 ^b0    a1 ^b1 ] */
     133             : 
     134             : /* vv_rol(x,n) returns vv( rotate_left (x0,n), rotate_left (x1,n), ... )
     135             :    vv_ror(x,n) returns vv( rotate_right(x0,n), rotate_right(x1,n), ... ) */
     136             : 
     137             : #if FD_HAS_AVX512
     138             : #define vv_rol(a,imm)  _mm_rol_epi64( (a), (imm) )
     139             : #define vv_ror(a,imm)  _mm_ror_epi64( (a), (imm) )
     140             : #else
     141     8519680 : static inline vv_t vv_rol( vv_t a, int imm ) { return vv_or( vv_shl( a, imm & 63 ), vv_shr( a, (-imm) & 63 ) ); }
     142     8388608 : static inline vv_t vv_ror( vv_t a, int imm ) { return vv_or( vv_shr( a, imm & 63 ), vv_shl( a, (-imm) & 63 ) ); }
     143             : #endif
     144             : 
     145    12582912 : static inline vv_t vv_rol_variable( vv_t a, int n ) { return vv_or( vv_shl_variable( a, n&63 ), vv_shr_variable( a, (-n)&63 ) ); }
     146    12582912 : static inline vv_t vv_ror_variable( vv_t a, int n ) { return vv_or( vv_shr_variable( a, n&63 ), vv_shl_variable( a, (-n)&63 ) ); }
     147             : 
     148           0 : static inline vv_t vv_rol_vector( vv_t a, vl_t b ) {
     149           0 :   vl_t m = vl_bcast( 63L );
     150           0 :   return vv_or( vv_shl_vector( a, vl_and( b, m ) ), vv_shr_vector( a, vl_and( vl_neg( b ), m ) ) );
     151           0 : }
     152             : 
     153           0 : static inline vv_t vv_ror_vector( vv_t a, vl_t b ) {
     154           0 :   vl_t m = vl_bcast( 63L );
     155           0 :   return vv_or( vv_shr_vector( a, vl_and( b, m ) ), vv_shl_vector( a, vl_and( vl_neg( b ), m ) ) );
     156           0 : }
     157             : 
     158             : #define vv_bswap(a) vu_to_vv_raw( vu_bswap( vv_to_vu_raw( vv_rol( (a), 32 ) ) ) )
     159             : 
     160             : /* Logical operations */
     161             : 
     162             : /* Like noted below in the converters, Intel clearly has the hardware to
     163             :    do a _mm_cmpgt_epu64 given that _mm_cmpgt_epi64 exists but doesn't
     164             :    expose it in the ISA pre AVX-512.  Sigh ... twos complement bit
     165             :    tricks to the rescue for vv_{gt,lt,ge,le}. */
     166             : 
     167             : #define vv_lnot(a)    _mm_cmpeq_epi64( (a), _mm_setzero_si128() )                                          /* [  !a0  !a1 ] */
     168             : #define vv_lnotnot(a) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpeq_epi64( (a), _mm_setzero_si128() ) ) /* [ !!a0 !!a1 ] */
     169             : 
     170             : #define vv_eq(a,b) _mm_cmpeq_epi64( (a), (b) )                                                 /* [ a0==b0 a1==b1 ] */
     171             : #define vv_gt(a,b) _mm_cmpgt_epi64( _mm_sub_epi64( (a), _mm_set1_epi64x( (long)(1UL<<63) ) ),  /* [ a0> b0 a1> b1 ] */ \
     172             :                                     _mm_sub_epi64( (b), _mm_set1_epi64x( (long)(1UL<<63) ) ) )
     173             : #define vv_lt(a,b) vv_gt( (b), (a) )                                                           /* [ a0< b0 a1< b1 ] */
     174             : #define vv_ne(a,b) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpeq_epi64( (a), (b) ) )        /* [ a0!=b0 a1!=b1 ] */
     175             : #define vv_ge(a,b) _mm_xor_si128( _mm_set1_epi64x( -1L ), vv_gt( (b), (a) ) )                  /* [ a0>=b0 a1>=b1 ] */
     176             : #define vv_le(a,b) _mm_xor_si128( _mm_set1_epi64x( -1L ), vv_gt( (a), (b) ) )                  /* [ a0<=b0 a1<=b1 ] */
     177             : 
     178             : /* Conditional operations */
     179             : 
     180             : #define vv_czero(c,f)    _mm_andnot_si128( (c), (f) )  /* [ c0?0UL:f0 c1?0UL:f1 ] */
     181             : #define vv_notczero(c,f) _mm_and_si128(    (c), (f) )  /* [ c0?f0:0UL c1?f1:0UL ] */
     182             : 
     183      524288 : #define vv_if(c,t,f) _mm_blendv_epi8(  (f), (t), (c) ) /* [ c0?t0:f0  c1?t1:f1  ] */
     184             : 
     185             : #if defined(__AVX512F__) && defined(__AVX512VL__) /* See note above */
     186       65536 : #define vv_min(a,b) _mm_min_epu64( (a), (b) )
     187       65536 : #define vv_max(a,b) _mm_max_epu64( (a), (b) )
     188             : #else
     189      262144 : static inline vv_t vv_min( vv_t a, vv_t b ) { return vv_if( vv_lt( a, b ), a, b ); }
     190      262144 : static inline vv_t vv_max( vv_t a, vv_t b ) { return vv_if( vv_gt( a, b ), a, b ); }
     191             : #endif
     192             : 
     193             : /* Conversion operations */
     194             : 
     195             : /* Summarizing:
     196             : 
     197             :    vv_to_vc(v)     returns [ !!v0 !!v0 !!v1 !!v1 ]
     198             : 
     199             :    vv_to_vf(v,f,0) returns [ (float)v0 (float)v1 f2 f3 ]
     200             :    vv_to_vf(v,f,1) returns [ f0 f1 (float)v0 (float)v1 ]
     201             : 
     202             :    vv_to_vi(v,i,0) returns [ (int)v0 (int)v1 i2 i3 ]
     203             :    vv_to_vi(v,i,1) returns [ i0 i1 (int)v0 (int)v1 ]
     204             : 
     205             :    vv_to_vu(v,u,0) returns [ (uint)v0 (uint)v1 u2 u3 ]
     206             :    vv_to_vu(v,u,1) returns [ u0 u1 (uint)v0 (uint)v1 ]
     207             : 
     208             :    vv_to_vd(v)     returns [ (double)v0 (double)v1 ]
     209             : 
     210             :    vv_to_vl(v)     returns [ (long)v0 (long)v1 ]
     211             : 
     212             :    The raw variants just treat the raw bits as the corresponding vector
     213             :    type.  For vv_to_vc_raw, the user promises vv contains a proper
     214             :    vector conditional (e.g. 0 or -1 in each lane).  The others are
     215             :    provided to facilitate doing advanced bit tricks on floating point
     216             :    values. */
     217             : 
     218             : #define vv_to_vc(a) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpeq_epi64( (a), _mm_setzero_si128() ) )
     219             : 
     220      393216 : static inline vf_t vv_to_vf( vv_t v, vf_t f, int imm_hi ) {
     221      393216 :   float f0 = (float)vv_extract( v, 0 );
     222      393216 :   float f1 = (float)vv_extract( v, 1 );
     223      393216 :   return imm_hi ? vf_insert( vf_insert( f, 2, f0 ), 3, f1 ) : vf_insert( vf_insert( f, 0, f0 ), 1, f1 ); /* Compile time */
     224      393216 : }
     225             : 
     226      393216 : static inline vv_t vv_to_vi( vv_t v, vi_t i, int imm_hi ) {
     227      393216 :   vf_t _v = _mm_castsi128_ps( v ); /* [ x0l x0h x1l x1h ] */
     228      393216 :   vf_t _i = _mm_castsi128_ps( i );
     229      393216 :   if( imm_hi ) _v = _mm_shuffle_ps( _i, _v, _MM_SHUFFLE(2,0,1,0) ); /* Compile time */
     230      196608 :   else         _v = _mm_shuffle_ps( _v, _i, _MM_SHUFFLE(3,2,2,0) );
     231      393216 :   return _mm_castps_si128( _v );
     232      393216 : }
     233             : 
     234      393216 : static inline vv_t vv_to_vu( vv_t v, vu_t u, int imm_hi ) {
     235      393216 :   vf_t _v = _mm_castsi128_ps( v ); /* [ x0l x0h x1l x1h ] */
     236      393216 :   vf_t _u = _mm_castsi128_ps( u );
     237      393216 :   if( imm_hi ) _v = _mm_shuffle_ps( _u, _v, _MM_SHUFFLE(2,0,1,0) ); /* Compile time */
     238      196608 :   else         _v = _mm_shuffle_ps( _v, _u, _MM_SHUFFLE(3,2,2,0) );
     239      393216 :   return _mm_castps_si128( _v );
     240      393216 : }
     241             : 
     242      196608 : static inline vd_t vv_to_vd( vv_t v ) {
     243      196608 :   return _mm_setr_pd( (double)(ulong)_mm_extract_epi64( v, 0 ), (double)(ulong)_mm_extract_epi64( v, 1 ) );
     244      196608 : }
     245             : 
     246             : #define vv_to_vl(a) (a)
     247             : 
     248             : #define vv_to_vc_raw(a) (a)
     249             : #define vv_to_vf_raw(a) _mm_castsi128_ps( (a) )
     250             : #define vv_to_vi_raw(a) (a)
     251             : #define vv_to_vu_raw(a) (a)
     252             : #define vv_to_vd_raw(a) _mm_castsi128_pd( (a) )
     253             : #define vv_to_vl_raw(a) (a)
     254             : 
     255             : /* Reduction operations */
     256             : 
     257             : static inline vv_t
     258      196608 : vv_sum_all( vv_t x ) { /* Returns vv_bcast( sum( x ) ) */
     259      196608 :   return vv_add( x, vv_permute( x, 1, 0 ) );
     260      196608 : }
     261             : 
     262             : static inline vv_t
     263      196608 : vv_min_all( vv_t x ) { /* Returns vv_bcast( min( x ) ) */
     264      196608 :   return vv_min( x, vv_permute( x, 1, 0 ) );
     265      196608 : }
     266             : 
     267             : static inline vv_t
     268      196608 : vv_max_all( vv_t x ) { /* Returns vv_bcast( max( x ) ) */
     269      196608 :   return vv_max( x, vv_permute( x, 1, 0 ) );
     270      196608 : }
     271             : 
     272             : /* Misc operations */
     273             : 
     274             : /* vv_gather(b,i,imm_i0,imm_i1) returns [ b[i(imm_i0)] b[i(imm_i1)] ]
     275             :    where b is a "ulong const *" and i is a vi_t and imm_i0,imm_i1 are
     276             :    compile time constants in 0:3.  We use a static inline here instead
     277             :    of a define to keep strict type checking while working around yet
     278             :    another Intel intrinsic type mismatch issue.  And we use a define to
     279             :    work around clang sadness with passing a compile time constant into a
     280             :    static inline. */
     281             : 
     282   465567948 : static inline vv_t _vv_gather( ulong const * b, vi_t i ) {
     283   465567948 :   return _mm_i32gather_epi64( (long long const *)b, i, 8 );
     284   465567948 : }
     285             : 
     286   465567948 : #define vv_gather(b,i,imm_i0,imm_i1) _vv_gather( (b), _mm_shuffle_epi32( (i), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )
     287             : 
     288             : /* vv_transpose_2x2 transposes the 2x2 matrix stored in vv_t r0,r1
     289             :    and stores the result in 2x2 matrix vv_t c0,c1.  All c0,c1 should be
     290             :    different for a well defined result.  Otherwise, in-place operation
     291             :    and/or using the same vv_t to specify multiple rows of r is fine. */
     292             : 
     293      196608 : #define vv_transpose_2x2( r0,r1, c0,c1 ) do {                        \
     294      196608 :     vv_t _vv_transpose_r0 = (r0); vv_t _vv_transpose_r1 = (r1);      \
     295      196608 :     (c0) = _mm_unpacklo_epi64( _vv_transpose_r0, _vv_transpose_r1 ); \
     296      196608 :     (c1) = _mm_unpackhi_epi64( _vv_transpose_r0, _vv_transpose_r1 ); \
     297      196608 :   } while(0)

Generated by: LCOV version 1.14