LCOV - code coverage report
Current view: top level - util/simd - fd_sse_vl.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 82 94 87.2 %
Date: 2025-01-08 12:08:44 Functions: 23 182 12.6 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_sse_h
       2             : #error "Do not include this directly; use fd_sse.h"
       3             : #endif
       4             : 
       5             : /* Vector long API ****************************************************/
       6             : 
       7             : /* A vl_t is a vector where each adjacent pair of 32-bit wide lanes
       8             :    (e.g. 0-1 / 2-3) holds a signed 64-bit twos-complement integer (a
       9             :    "long").
      10             : 
      11             :    These mirror the other APIs as much as possible.  Macros are
      12             :    preferred over static inlines when it is possible to do it robustly
      13             :    to reduce the risk of the compiler mucking it up. */
      14             : 
      15   142737459 : #define vl_t __m128i
      16             : 
      17             : /* Constructors */
      18             : 
      19             : /* Given the long values, return ... */
      20             : 
      21      589824 : #define vl(l0,l1) _mm_set_epi64x( (l1), (l0) ) /* [ l0 l1 ] ... sigh ... backwards intel */
      22             : 
      23             : #define vl_bcast(l0) _mm_set1_epi64x( (l0) ) /* [ l0 l0 ] */
      24             : 
      25             : /* vl_permute returns [ l(imm_i0) l(imm_i1) ].  imm_i* should be compile
      26             :    time constants in 0:1. */
      27             : 
      28      262144 : #define vl_permute( v, imm_i0, imm_i1 ) _mm_castpd_si128( _mm_permute_pd( _mm_castsi128_pd( (v) ), (imm_i0) + 2*(imm_i1) ) )
      29             : 
      30             : /* Predefined constants */
      31             : 
      32             : #define vl_zero() _mm_setzero_si128()   /* Return [ 0L 0L ] */
      33   141361203 : #define vl_one()  _mm_set1_epi64x( 1L ) /* Return [ 1L 1L ] */
      34             : 
      35             : /* Memory operations */
      36             : 
      37             : /* vl_ld returns the 2 longs at the 16-byte aligned / 16-byte sized
      38             :    location p as a vector long.  vl_ldu is the same but p does not have
      39             :    to be aligned.  vl_st writes the vector long to the 16-byte aligned /
      40             :    16-byte sized location p as 2 longs.  vl_stu is the same but p does
      41             :    not have to be aligned.  In all these 64-bit lane l will be at p[l].
      42             :    FIXME: USE ATTRIBUTES ON P PASSED TO THESE?
      43             : 
      44             :    Note: gcc knows a __m128i may alias. */
      45             : 
      46   141361203 : static inline vl_t vl_ld( long const * p ) { return _mm_load_si128(  (__m128i const *)p ); } /* aligned load of p[0],p[1] */
      47   141361203 : static inline void vl_st( long * p, vl_t i ) { _mm_store_si128(  (__m128i *)p, i ); } /* aligned store to p[0],p[1] */
      48             : 
      49   282722406 : static inline vl_t vl_ldu( void const * p ) { return _mm_loadu_si128( (__m128i const *)p ); } /* unaligned load of 16 bytes at p */
      50   282722406 : static inline void vl_stu( void * p, vl_t i ) { _mm_storeu_si128( (__m128i *)p, i ); } /* unaligned store of 16 bytes at p */
      51             : 
      52             : /* vl_ldif is an optimized equivalent to vl_notczero(c,vl_ldu(p)) (may
      53             :    have different behavior if c is not a proper vector conditional).  It
      54             :    is provided for symmetry with the vl_stif operation.  vl_stif stores
      55             :    x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
      56             :    Undefined behavior if c is not a proper vector conditional. */
      57             : 
      58             : #define vl_ldif(c,p)   _mm_maskload_epi64( (p),(c))
      59             : #define vl_stif(c,p,x) _mm_maskstore_epi64((p),(c),(x))
      60             : 
      61             : /* Element operations */
      62             : 
      63             : /* vl_extract extracts the long in lane imm from the vector long as a
      64             :    long.  vl_insert returns the vector long formed by replacing the
      65             :    value in lane imm of a with the provided long.  imm should be a
      66             :    compile time known in 0:1.  vl_extract_variable and
      67             :    vl_insert_variable are slower but the lane n does not have to be
      68             :    known at compile time (should still be in 0:1).
      69             : 
      70             :    Note: C99 TC3 allows type punning through a union. */
      71             : 
      72   282722406 : #define vl_extract(a,imm)  _mm_extract_epi64( (a), (imm) )
      73             : 
      74   282722406 : #define vl_insert(a,imm,v) _mm_insert_epi64( (a), (v), (imm) )
      75             : 
      76             : static inline long
      77   282722406 : vl_extract_variable( vl_t a, int n ) { /* Returns lane n of a; n need not be a compile time constant */
      78   282722406 :   union { __m128i m[1]; long l[2]; } t[1]; /* scratch viewable as one vector or as two longs */
      79   282722406 :   _mm_store_si128( t->m, a ); /* spill a into the scratch */
      80   282722406 :   return t->l[n]; /* read the lane back through the union (n is not range checked here) */
      81   282722406 : }
      82             : 
      83             : static inline vl_t
      84   282722406 : vl_insert_variable( vl_t a, int n, long v ) { /* Returns a with lane n replaced by v; n need not be a compile time constant */
      85   282722406 :   union { __m128i m[1]; long l[2]; } t[1]; /* scratch viewable as one vector or as two longs */
      86   282722406 :   _mm_store_si128( t->m, a ); /* spill a into the scratch */
      87   282722406 :   t->l[n] = v; /* overwrite the selected lane (n is not range checked here) */
      88   282722406 :   return _mm_load_si128( t->m ); /* reload the modified vector */
      89   282722406 : }
      90             : 
      91             : /* Given [a0 a1] and/or [b0 b1], return ... */
      92             : 
      93             : /* Arithmetic operations */
      94             : 
      95             : #define vl_neg(a)   _mm_sub_epi64( _mm_setzero_si128(), (a) ) /* [ -a0  -a1  ] (twos complement handling) */
      96             : 
      97             : /* Note: _mm_{abs,min,max}_epi64 are missing pre AVX-512.  We emulate
      98             :    these below (and use the AVX-512 versions if possible).  Likewise,
      99             :    there is no _mm_mullo_epi64 pre AVX-512.  Since this is not cheap to
     100             :    emulate, we do not provide a vl_mul for the time being (we could
     101             :    consider exposing it on AVX-512 targets though).  There is a
     102             :    64L*64L->64 multiply (where the lower 32-bits will be sign extended
     103             :    to 64-bits beforehand) though and that is very useful.  So we do
     104             :    provide that. */
     105             : 
     106      196608 : #define vl_add(a,b)    _mm_add_epi64(   (a), (b) ) /* [ a0 +b0     a1 +b1     ] */
     107             : #define vl_sub(a,b)    _mm_sub_epi64(   (a), (b) ) /* [ a0 -b0     a1 -b1     ] */
     108             : //#define vl_mul(a,b)  _mm_mullo_epi64( (a), (b) ) /* [ a0 *b0     a1 *b1     ] */
     109             : #define vl_mul_ll(a,b) _mm_mul_epi32(   (a), (b) ) /* [ a0l*b0l    a1l*b1l    ] */
     110             : 
     111             : /* Binary operations */
     112             : 
     113             : /* Note: vl_shl/vl_shr/vl_shru is a left/signed right/unsigned right
     114             :    shift by imm bits; imm should be a compile time constant in 0:63.
     115             :    The variable variants are slower but do not require the shift amount
     116             :    to be known at compile time (should still be in 0:63).  Also, AVX is
     117             :    missing _mm_sra*_epi64 intrinsics.  We emulate these below. */
     118             : 
     119             : #define vl_not(a) _mm_xor_si128( _mm_set1_epi64x( -1L ), (a) ) /* [ ~a0 ~a1 ] */
     120             : 
     121             : #define vl_shl(a,imm)   _mm_slli_epi64( (a), (imm) ) /* [ a0<<imm a1<<imm ] */
     122             : //#define vl_shr(a,imm) _mm_srai_epi64( (a), (imm) ) /* [ a0>>imm a1>>imm ] (treat a as signed)*/
     123             : #define vl_shru(a,imm)  _mm_srli_epi64( (a), (imm) ) /* [ a0>>imm a1>>imm ] (treat a as unsigned) */
     124             : 
     125             : #define vl_shl_variable(a,n)   _mm_sll_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ) /* [ a0<<n a1<<n ] */
     126             : //#define vl_shr_variable(a,n) _mm_sra_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
     127             : #define vl_shru_variable(a,n)  _mm_srl_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) ) /* [ a0>>n a1>>n ] (treat a as unsigned) */
     128             : 
     129             : #define vl_shl_vector(a,b)   _mm_sllv_epi64( (a), (b) ) /* [ a0<<b0 a1<<b1 ] */
     130             : //#define vl_shr_vector(a,b) _mm_srav_epi64( (a), (b) ) /* [ a0>>b0 a1>>b1 ] (treat a as signed) */
     131             : #define vl_shru_vector(a,b)  _mm_srlv_epi64( (a), (b) ) /* [ a0>>b0 a1>>b1 ] (treat a as unsigned) */
     132             : 
     133             : #define vl_and(a,b)    _mm_and_si128(    (a), (b) ) /* [   a0 &b0    a1 &b1 ] */
     134             : #define vl_andnot(a,b) _mm_andnot_si128( (a), (b) ) /* [ (~a0)&b0  (~a1)&b1 ] */
     135    41943040 : #define vl_or(a,b)     _mm_or_si128(     (a), (b) ) /* [   a0 |b0    a1 |b1 ] */
     136             : #define vl_xor(a,b)    _mm_xor_si128(    (a), (b) ) /* [   a0 ^b0    a1 ^b1 ] */
     137             : 
     138             : /* vl_rol(x,n) returns vl( rotate_left (x0,n), rotate_left (x1,n), ... )
     139             :    vl_ror(x,n) returns vl( rotate_right(x0,n), rotate_right(x1,n), ... ) */
     140             : 
     141             : #if FD_HAS_AVX512 /* use the native 64-bit rotate intrinsics */
     142             : #define vl_rol(a,imm)  _mm_rol_epi64( (a), (imm) )
     143             : #define vl_ror(a,imm)  _mm_ror_epi64( (a), (imm) )
     144             : #else /* emulate the rotate with two shifts and an or */
     145     8388608 : static inline vl_t vl_rol( vl_t a, int imm ) { return vl_or( vl_shl(  a, imm & 63 ), vl_shru( a, (-imm) & 63 ) ); } /* (-imm)&63 is the complementary shift, (64-imm) mod 64 */
     146     8388608 : static inline vl_t vl_ror( vl_t a, int imm ) { return vl_or( vl_shru( a, imm & 63 ), vl_shl(  a, (-imm) & 63 ) ); } /* same trick with the shift directions swapped */
     147             : #endif
     148             : 
     149    12582912 : static inline vl_t vl_rol_variable( vl_t a, int n ) { return vl_or( vl_shl_variable(  a, n&63 ), vl_shru_variable( a, (-n)&63 ) ); } /* rotate both lanes left by n mod 64 */
     150    12582912 : static inline vl_t vl_ror_variable( vl_t a, int n ) { return vl_or( vl_shru_variable( a, n&63 ), vl_shl_variable(  a, (-n)&63 ) ); } /* rotate both lanes right by n mod 64 */
     151             : 
     152           0 : static inline vl_t vl_rol_vector( vl_t a, vl_t b ) { /* Rotates lane n of a left by lane n of b (mod 64) */
     153           0 :   vl_t m = vl_bcast( 63L ); /* per lane mask reducing shift counts mod 64 */
     154           0 :   return vl_or( vl_shl_vector(  a, vl_and( b, m ) ), vl_shru_vector( a, vl_and( vl_neg( b ), m ) ) ); /* (a<<b) | (a>>((-b)&63)) per lane */
     155           0 : }
     156             : 
     157           0 : static inline vl_t vl_ror_vector( vl_t a, vl_t b ) { /* Rotates lane n of a right by lane n of b (mod 64) */
     158           0 :   vl_t m = vl_bcast( 63L ); /* per lane mask reducing shift counts mod 64 */
     159           0 :   return vl_or( vl_shru_vector( a, vl_and( b, m ) ), vl_shl_vector(  a, vl_and( vl_neg( b ), m ) ) ); /* (a>>b) | (a<<((-b)&63)) per lane */
     160           0 : }
     161             : 
     162             : /* Logical operations */
     163             : 
     164             : #define vl_lnot(a)    _mm_cmpeq_epi64( (a), _mm_setzero_si128() )                                          /* [  !a0  !a1 ] */
     165             : #define vl_lnotnot(a) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpeq_epi64( (a), _mm_setzero_si128() ) ) /* [ !!a0 !!a1 ] */
     166             : 
     167             : #define vl_eq(a,b) _mm_cmpeq_epi64( (a), (b) )                                          /* [ a0==b0 a1==b1 ] */
     168             : #define vl_gt(a,b) _mm_cmpgt_epi64( (a), (b) )                                          /* [ a0> b0 a1> b1 ] */
     169    25165824 : #define vl_lt(a,b) _mm_cmpgt_epi64( (b), (a) )                                          /* [ a0< b0 a1< b1 ] */
     170             : #define vl_ne(a,b) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpeq_epi64( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ] */
     171             : #define vl_ge(a,b) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpgt_epi64( (b), (a) ) ) /* [ a0>=b0 a1>=b1 ] */
     172             : #define vl_le(a,b) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpgt_epi64( (a), (b) ) ) /* [ a0<=b0 a1<=b1 ] */
     173             : 
     174             : /* Conditional operations */
     175             : 
     176             : #define vl_czero(c,f)    _mm_andnot_si128( (c), (f) ) /* [ c0?0L:f0 c1?0L:f1 ] */
     177             : #define vl_notczero(c,f) _mm_and_si128(    (c), (f) ) /* [ c0?f0:0L c1?f1:0L ] */
     178             : 
     179      655360 : #define vl_if(c,t,f) _mm_blendv_epi8(  (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ] (blend is per byte, so c should be a proper vector conditional) */
     180             : 
     181             : #if defined(__AVX512F__) && defined(__AVX512VL__) /* See note above */
     182             : #define vl_abs(a)   _mm_abs_epi64( (a) )        /* [ |a0|        |a1|        ] */
     183       65536 : #define vl_min(a,b) _mm_min_epi64( (a), (b) )   /* [ min(a0,b0)  min(a1,b1)  ] */
     184       65536 : #define vl_max(a,b) _mm_max_epi64( (a), (b) )   /* [ max(a0,b0)  max(a1,b1)  ] */
     185             : #else /* emulate with a signed compare followed by a blend */
     186      131072 : static inline vl_t vl_abs( vl_t a )         { return vl_if( vl_lt( a, vl_zero() ), vl_neg( a ), a ); } /* negate only the lanes that are < 0 */
     187      262144 : static inline vl_t vl_min( vl_t a, vl_t b ) { return vl_if( vl_lt( a, b ), a, b ); } /* pick a where a<b, else b */
     188      262144 : static inline vl_t vl_max( vl_t a, vl_t b ) { return vl_if( vl_gt( a, b ), a, b ); } /* pick a where a>b, else b */
     189             : #endif
     190             : 
     191    12582912 : static inline vl_t vl_shr( vl_t a, int imm ) { /* [ a0>>imm a1>>imm ] (treat a as signed) */
     192    12582912 :   vc_t c = vl_lt( a, vl_zero() ); /* All ones in lanes where a<0.  Note that vc_t is binary compat with vl_t */
     193    12582912 :   return _mm_xor_si128( _mm_srli_epi64( _mm_xor_si128( a, c ), imm ), c ); /* complement negative lanes, shift in zeros, complement back so the vacated bits become ones */
     194    12582912 : }
     195    12582912 : static inline vl_t vl_shr_variable( vl_t a, int n ) { /* [ a0>>n a1>>n ] (treat a as signed), n need not be a compile time constant */
     196    12582912 :   vc_t c = vl_lt( a, vl_zero() ); /* All ones in lanes where a<0.  Note that vc_t is binary compat with vl_t */
     197    12582912 :   return _mm_xor_si128( _mm_srl_epi64( _mm_xor_si128( a, c ), _mm_insert_epi64( _mm_setzero_si128(), n, 0 ) ), c ); /* same complement / logical shift / complement trick as vl_shr */
     198    12582912 : }
     199           0 : static inline vl_t vl_shr_vector( vl_t a, vl_t n ) { /* [ a0>>n0 a1>>n1 ] (treat a as signed) */
     200           0 :   vc_t c = vl_lt( a, vl_zero() ); /* All ones in lanes where a<0.  Note that vc_t is binary compat with vl_t */
     201           0 :   return _mm_xor_si128( _mm_srlv_epi64( _mm_xor_si128( a, c ), n ), c ); /* complement negative lanes, per lane logical shift, complement back */
     202           0 : }
     203             : 
     204             : /* Conversion operations */
     205             : 
     206             : /* Summarizing:
     207             : 
     208             :    vl_to_vc(l)     returns [ !!l0 !!l0 !!l1 !!l1 ]
     209             : 
     210             :    vl_to_vf(l,f,0) returns [ (float)l0 (float)l1 f2 f3 ]
     211             :    vl_to_vf(l,f,1) returns [ f0 f1 (float)l0 (float)l1 ]
     212             : 
     213             :    vl_to_vi(l,i,0) returns [ (int)l0 (int)l1 i2 i3 ]
     214             :    vl_to_vi(l,i,1) returns [ i0 i1 (int)l0 (int)l1 ]
     215             : 
     216             :    vl_to_vu(l,u,0) returns [ (uint)l0 (uint)l1 u2 u3 ]
     217             :    vl_to_vu(l,u,1) returns [ u0 u1 (uint)l0 (uint)l1 ]
     218             : 
     219             :    vl_to_vd(l)     returns [ (double)l0 (double)l1 ]
     220             : 
     221             :    vl_to_vv(l)     returns [ (ulong)l0 (ulong)l1 ]
     222             : 
     223             :    The raw variants just treat the raw bits as the corresponding vector
     224             :    type.  For vl_to_vc_raw, the user promises vl contains a proper
     225             :    vector conditional (e.g. 0 or -1 in each lane).  The others are
     226             :    provided to facilitate doing advanced bit tricks on floating point
     227             :    values. */
     228             : 
     229             : #define vl_to_vc(a) _mm_xor_si128( _mm_set1_epi64x( -1L ), _mm_cmpeq_epi64( (a), _mm_setzero_si128() ) ) /* all ones in each 64-bit lane where a!=0, zero elsewhere */
     230             : 
     231      393216 : static inline vf_t vl_to_vf( vl_t l, vf_t f, int imm_hi ) { /* Returns f with lanes 0:1 (imm_hi==0) or 2:3 (imm_hi!=0) replaced by (float)l0,(float)l1 */
     232      393216 :   float f0 = (float)_mm_extract_epi64( l, 0 ); /* scalar conversion of lane 0 */
     233      393216 :   float f1 = (float)_mm_extract_epi64( l, 1 ); /* scalar conversion of lane 1 */
     234      393216 :   return imm_hi ? vf_insert( vf_insert( f, 2, f0 ), 3, f1 ) : vf_insert( vf_insert( f, 0, f0 ), 1, f1 ); /* Compile time (imm_hi is meant to be a compile time constant) */
     235      393216 : }
     236             : 
     237      393216 : static inline vi_t vl_to_vi( vl_t l, vi_t i, int imm_hi ) { /* Returns i with lanes 0:1 (imm_hi==0) or 2:3 (imm_hi!=0) replaced by (int)l0,(int)l1.  The result holds 4 int lanes, hence vi_t */
     238      393216 :   vf_t _l = _mm_castsi128_ps( l ); /* [ x0l x0h x1l x1h ] */
     239      393216 :   vf_t _i = _mm_castsi128_ps( i ); /* [ i0 i1 i2 i3 ] */
     240      393216 :   if( imm_hi ) _l = _mm_shuffle_ps( _i, _l, _MM_SHUFFLE(2,0,1,0) ); /* [ i0 i1 x0l x1l ] (Compile time) */
     241      196608 :   else         _l = _mm_shuffle_ps( _l, _i, _MM_SHUFFLE(3,2,2,0) ); /* [ x0l x1l i2 i3 ] */
     242      393216 :   return _mm_castps_si128( _l ); /* reinterpret the shuffled bits back as integers */
     243      393216 : }
     244             : 
     245      393216 : static inline vu_t vl_to_vu( vl_t l, vu_t u, int imm_hi ) { /* Returns u with lanes 0:1 (imm_hi==0) or 2:3 (imm_hi!=0) replaced by (uint)l0,(uint)l1.  The result holds 4 uint lanes, hence vu_t */
     246      393216 :   vf_t _l = _mm_castsi128_ps( l ); /* [ x0l x0h x1l x1h ] */
     247      393216 :   vf_t _u = _mm_castsi128_ps( u ); /* [ u0 u1 u2 u3 ] */
     248      393216 :   if( imm_hi ) _l = _mm_shuffle_ps( _u, _l, _MM_SHUFFLE(2,0,1,0) ); /* [ u0 u1 x0l x1l ] (Compile time) */
     249      196608 :   else         _l = _mm_shuffle_ps( _l, _u, _MM_SHUFFLE(3,2,2,0) ); /* [ x0l x1l u2 u3 ] */
     250      393216 :   return _mm_castps_si128( _l ); /* reinterpret the shuffled bits back as integers */
     251      393216 : }
     252             : 
     253      196608 : static inline vd_t vl_to_vd( vl_t l ) { /* Returns [ (double)l0 (double)l1 ] */
     254      196608 :   return _mm_setr_pd( (double)_mm_extract_epi64( l, 0 ), (double)_mm_extract_epi64( l, 1 ) ); /* each lane is extracted and converted as a scalar */
     255      196608 : }
     256             : 
     257             : #define vl_to_vv(a) (a)
     258             : 
     259             : #define vl_to_vc_raw(a) (a)
     260             : #define vl_to_vf_raw(a) _mm_castsi128_ps( (a) )
     261             : #define vl_to_vi_raw(a) (a)
     262             : #define vl_to_vu_raw(a) (a)
     263             : #define vl_to_vd_raw(a) _mm_castsi128_pd( (a) )
     264             : #define vl_to_vv_raw(a) (a)
     265             : 
     266             : /* Reduction operations */
     267             : 
     268             : static inline vl_t
     269      196608 : vl_sum_all( vl_t x ) { /* Returns vl_bcast( sum( x ) ) */
     270      196608 :   return vl_add( x, vl_permute( x, 1, 0 ) ); /* [ x0+x1 x1+x0 ] */
     271      196608 : }
     272             : 
     273             : static inline vl_t
     274      196608 : vl_min_all( vl_t x ) { /* Returns vl_bcast( min( x ) ) */
     275      196608 :   return vl_min( x, vl_permute( x, 1, 0 ) ); /* [ min(x0,x1) min(x1,x0) ] */
     276      196608 : }
     277             : 
     278             : static inline vl_t
     279      196608 : vl_max_all( vl_t x ) { /* Returns vl_bcast( max( x ) ) */
     280      196608 :   return vl_max( x, vl_permute( x, 1, 0 ) ); /* [ max(x0,x1) max(x1,x0) ] */
     281      196608 : }
     282             : 
     283             : /* Misc operations */
     284             : 
     285             : /* vl_gather(b,i,imm_i0,imm_i1) returns [ b[i(imm_i0)] b[i(imm_i1)] ]
     286             :    where b is a  "long const *" and i is a vi_t and imm_i0,imm_i1 are
     287             :    compile time constants in 0:3.  We use a static inline here instead
     288             :    of a define to keep strict type checking while working around yet
     289             :    another Intel intrinsic type mismatch issue.  And we use a define to
     290             :    workaround clang sadness with passing a compile time constant into a
     291             :    static inline. */
     292             : 
     293             : #if defined(__AVX2__)
     294   565444812 : static inline vl_t _vl_gather( long const * b, vi_t i ) { /* Returns [ b[i0] b[i1] ] using the two low int lanes of i */
     295   565444812 :   return _mm_i32gather_epi64( (long long const *)b, i, 8 ); /* scale 8 is the byte stride per index; the cast satisfies the intrinsic's long long pointer type */
     296   565444812 : }
     297             : #endif /* defined(__AVX2__) */
     298             : 
     299   565444812 : #define vl_gather(b,i,imm_i0,imm_i1) _vl_gather( (b), _mm_shuffle_epi32( (i), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) ) /* moves index lanes imm_i0,imm_i1 into lanes 0,1 first.  NOTE(review): _vl_gather is only defined when __AVX2__ is set but this macro is not guarded */
     300             : 
     301             : /* vl_transpose_2x2 transposes the 2x2 matrix stored in vl_t r0,r1
     302             :    and stores the result in 2x2 matrix vl_t c0,c1.  All c0,c1 should be
     303             :    different for a well defined result.  Otherwise, in-place operation
     304             :    and/or using the same vl_t to specify multiple rows of r is fine. */
     305             : 
     306      196608 : #define vl_transpose_2x2( r0,r1, c0,c1 ) do {                        \
     307      196608 :     vl_t _vl_transpose_r0 = (r0); vl_t _vl_transpose_r1 = (r1);      /* snapshot the rows before any output is written */ \
     308      196608 :     (c0) = _mm_unpacklo_epi64( _vl_transpose_r0, _vl_transpose_r1 ); /* [ r0(0) r1(0) ] */ \
     309      196608 :     (c1) = _mm_unpackhi_epi64( _vl_transpose_r0, _vl_transpose_r1 ); /* [ r0(1) r1(1) ] */ \
     310      196608 :   } while(0)

Generated by: LCOV version 1.14