LCOV - code coverage report
Current view: top level - util/simd - fd_sse_vc.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 54 54 100.0 %
Date: 2025-01-08 12:08:44 Functions: 11 70 15.7 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_sse_h
       2             : #error "Do not include this directly; use fd_sse.h"
       3             : #endif
       4             : 
       5             : /* TODO: the below is very much designed for a 32-bit SIMD lane world
       6             :    (with 64-bit SIMD lane support hacked on afterward).  Revamp these to
       7             :    be more general for 8, 16, 32 and 64 bit lanes. */
       8             : 
       9             : /* Vector conditional API *********************************************/
      10             : 
      11             : /* A vc_t is a vector conditional.  That is, it is a vector of integers
      12             :    where each 32-bit wide lane is either 0 (all zero bits), indicating
      13             :    the condition is false for that lane or -1 (all one bits), indicating
      14             :    the condition is true for that lane.  This allows fast bit
      15             :    operations to mask other types of vectors.  If this API is used on
      16             :    vectors that aren't proper vector conditionals, results are
      17             :    undefined.  When vector conditionals are applied to vector doubles,
      18             :    longs and ulongs, adjacent lanes (0-1 / 2-3) should have identical
      19             :    values, otherwise results will be undefined.
      20             : 
      21             :    These mirror the other APIs as much as possible.  Macros are
      22             :    preferred over static inlines when it is possible to do it robustly
      23             :    to reduce the risk of the compiler mucking it up. */
      24             : 
      25    38931456 : #define vc_t __m128i
      26             : 
      27             : /* Constructors */
      28             : 
      29             : /* vc returns a vc_t corresponding to the c-style logical values c0:c3.
      30             :    This will always create a proper vector conditional regardless of how
      31             :    the logical values were presented.  That is, the provided values
      32             :    will be treated as c-style logical values such that zero/false will
      33             :    become zero/false in the vector and non-zero/true will become -1/true
      34             :    in the vector conditional.  Similarly for vc_bcast*.  Summarizing:
      35             : 
      36             :      vc(c0,c1,c2,c3)      return [c0 c1 c2 c3]
      37             :      vc_bcast(c0)         return [c0 c0 c0 c0]
      38             :      vc_bcast_pair(c0,c1) return [c0 c1 c0 c1]
      39             :      vc_bcast_wide(c0,c1) return [c0 c0 c1 c1] */
      40             : 
      41      589914 : #define vc(c0,c1,c2,c3) _mm_setr_epi32( -!!(c0), -!!(c1), -!!(c2), -!!(c3) )
      42             : 
      43             : #if 0 /* Compiler sometimes tries to turn this into branches ... sigh */
      44             : #define vc_bcast(c0) _mm_set1_epi32( -!!(c0) )
      45             : #else
      46             : static inline __m128i
      47         225 : vc_bcast( int c0 ) {
      48         225 :   c0 = -!!c0; FD_COMPILER_FORGET( c0 ); /* normalize the C logical to 0 (false) / -1 (true); FD_COMPILER_FORGET is defined elsewhere (presumably hides c0's value from the optimizer to avoid the branches noted above -- confirm) */
      49         225 :   return _mm_set1_epi32( c0 ); /* replicate the normalized value into all 4 lanes: [ c0 c0 c0 c0 ] */
      50         225 : }
      51             : #endif
      52             : 
      53             : static inline vc_t
      54          45 : vc_bcast_pair( int c0, int c1 ) {
      55          45 :   c0 = -!!c0; c1 = -!!c1; /* normalize each C logical to 0 (false) / -1 (true) */
      56          45 :   return _mm_setr_epi32( c0, c1, c0, c1 ); /* lanes 0..3 = [ c0 c1 c0 c1 ] */
      57          45 : }
      58             : 
      59             : static inline vc_t
      60      590004 : vc_bcast_wide( int c0, int c1 ) {
      61      590004 :   c0 = -!!c0; c1 = -!!c1; /* normalize each C logical to 0 (false) / -1 (true) */
      62      590004 :   return _mm_setr_epi32( c0, c0, c1, c1 ); /* lanes 0..3 = [ c0 c0 c1 c1 ], i.e. each condition fills one adjacent lane pair */
      63      590004 : }
      64             : 
      65             : /* vc_permute(c,imm_i0,imm_i1,imm_i2,imm_i3) returns
      66             :    [ c(imm_i0) c(imm_i1) c(imm_i2) c(imm_i3) ].  imm_i* should be
      67             :    compile time constants in 0:3. */
      68             : 
      69             : #define vc_permute(c,imm_i0,imm_i1,imm_i2,imm_i3) _mm_shuffle_epi32( (c), _MM_SHUFFLE( (imm_i3), (imm_i2), (imm_i1), (imm_i0) ) )
      70             : 
      71             : /* Predefined constants. */
      72             : 
      73             : #define vc_false() _mm_setzero_si128()  /* vc_false() returns [ f f f f ] */
      74             : #define vc_true()  _mm_set1_epi32( -1 ) /* vc_true()  returns [ t t t t ] */
      75             : 
      76             : /* Memory operations */
      77             : 
      78             : /* vc_ld returns the 4 integers at the 16-byte aligned / 16-byte sized
      79             :    location p as a proper vector conditional (see above note about
      80             :    c-style logicals).  vc_ldu is the same but p does not have to be
      81             :    aligned.  In the fast variants, the caller promises that p already
      82             :    holds a proper vector conditional (i.e. 0/-1 for false/true).  vc_st
      83             :    writes the vector conditional c at the 16-byte aligned / 16-byte sized
      84             :    location p (0/-1 for false/true).  vc_stu is the same but p does not
      85             :    have to be aligned.  Lane l will be at p[l].  FIXME: USE ATTRIBUTES
      86             :    ON P PASSED TO THESE?
      87             : 
      88             :    Note: gcc knows that __m128i may alias. */
      89             : 
      90             : static inline vc_t
      91    11797431 : vc_ld( int const * p ) {
      92    11797431 :   return _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( _mm_load_si128(  (__m128i const *)p ), _mm_setzero_si128() ) ); /* cmpeq against zero yields -1 where p[l]==0; xor with all ones inverts that, so non-zero -> -1 (true) and zero -> 0 (false) */
      93    11797431 : }
      94    11797431 : static inline vc_t vc_ld_fast( int const * p ) { return _mm_load_si128(  (__m128i const *)p ); } /* aligned load of the raw bits, no normalization */
      95    11797431 : static inline void vc_st( int * p, vc_t c ) { _mm_store_si128(  (__m128i *)p, c ); } /* aligned store of the raw bits of c */
      96             : 
      97             : static inline vc_t
      98    47189724 : vc_ldu( void const * p ) {
      99    47189724 :   return _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_cmpeq_epi32( _mm_loadu_si128( (__m128i const *)p ), _mm_setzero_si128() ) ); /* same normalization as vc_ld (non-zero -> -1, zero -> 0) but via an unaligned load */
     100    47189724 : }
     101    47189724 : static inline vc_t vc_ldu_fast( void const * p ) { return _mm_loadu_si128( (__m128i const *)p ); } /* unaligned load of the raw bits, no normalization */
     102    47189724 : static inline void vc_stu( void * p, vc_t c ) { _mm_storeu_si128( (__m128i *)p, c ); } /* unaligned store of the raw bits of c */
     103             : 
     104             : /* vc_ldif is an optimized equivalent to vc_and(c,vc_ldu(p)).  Similarly
     105             :    for vc_ldif_fast (either may have different behavior if c is not a
     106             :    proper vector conditional).  vc_ldif_fast assumes p already holds a
     107             :    proper vector conditional.  These are provided for symmetry with the
     108             :    vc_stif operation.  vc_stif stores x(n) to p[n] if c(n) is true and
     109             :    leaves p[n] unchanged otherwise.  Undefined behavior if c is not a
     110             :    proper vector conditional. */
     111             : 
     112             : #define vc_ldif(c,p)      _mm_xor_si128(_mm_set1_epi32(-1),_mm_cmpeq_epi32( _mm_maskload_epi32((p),(c)),_mm_setzero_si128()))
     113             : #define vc_ldif_fast(c,p) _mm_maskload_epi32((p),(c))
     114             : #define vc_stif(c,p,x)    _mm_maskstore_epi32((p),(c),(x))
     115             : 
     116             : /* Element operations */
     117             : 
     118             : /* vc_extract extracts the value of lane imm from the vector conditional
     119             :    as an int 0 (false) or 1 (true).  vc_insert returns the vector
     120             :    conditional formed by replacing the value in lane imm of a with the
     121             :    provided c-style logical.  imm should be a compile time constant in
     122             :    0:3.  vc_extract_variable and vc_insert_variable are slower but
     123             :    the lane does not have to be known at compile time (should still be
     124             :    in 0:3). */
     125             : 
     126    47189724 : #define vc_extract(c,imm)  ((_mm_movemask_ps( _mm_castsi128_ps( (c) ) ) >> (imm)) & 1)
     127    47189724 : #define vc_insert(a,imm,c) _mm_insert_epi32( (a), -!!(c), (imm) )
     128             : 
     129    47189724 : #define vc_extract_variable(c,n) ((_mm_movemask_ps( _mm_castsi128_ps( (c) ) ) >> (n)  ) & 1)
     130             : #define vc_insert_variable(a,n,c)                                                                                                \
     131    47189724 :   _mm_cmpgt_epi32( _mm_and_si128( _mm_set1_epi32( (_mm_movemask_ps( _mm_castsi128_ps( (a) ) ) & (~(1<<(n)))) | ((!!(c))<<(n)) ), \
     132    47189724 :                                   _mm_setr_epi32( 1<<0, 1<<1, 1<<2, 1<<3 ) ), _mm_setzero_si128() ) /* packs a into a 4-bit mask, clears bit n, ors in !!c at bit n, then re-expands each bit to a 0/-1 lane; n is parenthesized at both uses so expression arguments (e.g. i&3) shift correctly; n is evaluated twice */
     133             : 
     134             : /* Given [ a0 a1 a2 a3 ] and/or [ b0 b1 b2 b3 ], return ... */
     135             : 
     136             : /* Arithmetic operations */
     137             : 
     138             : /* Note: arithmetic and shift operations are not well defined for a vc_t
     139             :    as it isn't clear if the user would like to treat the vector conditional
     140             :    as 4 1-bit signed ints (0/-1), 4 1-bit unsigned ints (0/1) or
     141             :    4-GF2 elements (f/t but sign is meaningless) or do cross lane motion
     142             :    of the condition.  Instead, the user should use vc_to_{vi,vl}[_raw]
     143             :    as necessary and use the appropriate binary, arithmetic, permute
     144             :    and/or shift operations there. */
     145             : 
     146             : /* Binary operations */
     147             : 
     148             : #define vc_not(a)      _mm_xor_si128( _mm_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a3 ] */
     149             : 
     150             : #define vc_and(a,b)    _mm_and_si128(   (a),(b)) /* [   a0 &b0   a1 &b1 ...   a3 &b3 ] */
     151             : #define vc_or(a,b)     _mm_or_si128(    (a),(b)) /* [   a0 |b0   a1 |b1 ...   a3 |b3 ] */
     152             : #define vc_xor(a,b)    _mm_xor_si128(   (a),(b)) /* [   a0 ^b0   a1 ^b1 ...   a3 ^b3 ] */
     153             : #define vc_andnot(a,b) _mm_andnot_si128((a),(b)) /* [ (~a0)&b0 (~a1)&b1 ... (~a3)&b3 ] */
     154             : 
     155             : /* Logical operations */
     156             : 
     157             : /* Note: vc_{gt,lt,ge,le} are provided for completeness and treat
     158             :    true>false. */
     159             : 
     160             : #define vc_lnot(a)    _mm_xor_si128( _mm_set1_epi32( -1 ), (a) ) /* [ ~a0 ~a1 ... ~a3 ] */
     161             : #define vc_lnotnot(a) (a)                                        /* [  a0  a1 ...  a3 ] */
     162             : 
     163             : #define vc_eq(a,b) _mm_cmpeq_epi32( (a),(b))                                           /* [ a0==b0  a1==b1 ... a3==b3 ] */
     164             : #define vc_gt(a,b) _mm_andnot_si128((b),(a))                                           /* [ a0> b0  a1> b1 ... a3> b3 ] */
     165             : #define vc_lt(a,b) _mm_andnot_si128((a),(b))                                           /* [ a0< b0  a1< b1 ... a3< b3 ] */
     166             : #define vc_ne(a,b) _mm_xor_si128(   (a),(b))                                           /* [ a0!=b0  a1!=b1 ... a3!=b3 ] */
     167             : #define vc_ge(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_andnot_si128( (a), (b) ) ) /* [ a0>=b0  a1>=b1 ... a3>=b3 ] */
     168             : #define vc_le(a,b) _mm_xor_si128( _mm_set1_epi32( -1 ), _mm_andnot_si128( (b), (a) ) ) /* [ a0<=b0  a1<=b1 ... a3<=b3 ] */
     169             : 
     170             : /* Conditional operations */
     171             : 
     172             : /* FIXME: Define vc_czero / vc_notczero?  Equivalent TO vc_andnot and
     173             :    vc_and but have arithmetic connotations.  */
     174             : 
     175             : #define vc_if(c,t,f) _mm_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c3?t3:f3 ] */
     176             : 
     177             : /* Conversion operations */
     178             : 
     179             : /* vc_to_{vf,vi,vu,vd,vl,vv} convert a proper vector conditional into a
     180             :    vector float/int/uint/double/long/ulong with f mapping to 0 and t mapping
     181             :    to 1 in each lane.
     182             : 
     183             :    vc_to_{vf,vi,vu,vd,vl,vv}_raw just treat the raw bits in the vector
     184             :    conditional as the corresponding vector type.  vc_to_{vi,vu}_raw map
     185             :    false(true) to 0(-1) and similarly for vc_to_{vl,vv}_raw when c has
     186             :    paired lanes.  vc_to_{vf,vd}_raw probably are not useful in practice
     187             :    but are provided for completeness; vc_to_vf_raw maps false(true) to
     188             :    0(-nan) and similarly for vc_to_vd_raw when c has paired lanes. */
     189             : 
     190             : #define vc_to_vf(a) _mm_and_ps( _mm_castsi128_ps( (a) ), _mm_set1_ps( 1.f ) )
     191             : #define vc_to_vi(a) _mm_and_si128( (a), _mm_set1_epi32( 1 ) )
     192             : #define vc_to_vu(a) _mm_and_si128( (a), _mm_set1_epi32( 1 ) )
     193             : #define vc_to_vd(a) _mm_and_pd( _mm_castsi128_pd( (a) ), _mm_set1_pd( 1. ) ) /* vc should have paired lanes */
     194             : #define vc_to_vl(a) _mm_and_si128( (a), _mm_set1_epi64x( 1L ) )              /* vc should have paired lanes */
     195             : #define vc_to_vv(a) _mm_and_si128( (a), _mm_set1_epi64x( 1L ) )              /* vc should have paired lanes */
     196             : 
     197             : #define vc_to_vf_raw(a) _mm_castsi128_ps( (a) )
     198             : #define vc_to_vi_raw(a) (a)
     199             : #define vc_to_vu_raw(a) (a)
     200             : #define vc_to_vd_raw(a) _mm_castsi128_pd( (a) )
     201             : #define vc_to_vl_raw(a) (a)
     202             : #define vc_to_vv_raw(a) (a)
     203             : 
     204             : /* Reduction operations */
     205             : 
     206             : /* vc_any/vc_all return logical true if any/all conditions in c are true */
     207             : 
     208   930357276 : #define vc_any(c) (_mm_movemask_ps( _mm_castsi128_ps( (c) ) )!=0x0)
     209  1313347911 : #define vc_all(c) (_mm_movemask_ps( _mm_castsi128_ps( (c) ) )==0xf)
     210             : 
     211             : /* Misc operations */
     212             : 
     213             : /* vc_pack returns an int where bit i equals 0(1) if lane i of c is
     214             :    false(true) for i in [0,4).  Vice versa for vc_unpack. */
     215             : 
     216  1563823158 : #define vc_pack(c)   _mm_movemask_ps( _mm_castsi128_ps( (c) ) )
     217        1536 : #define vc_unpack(b) _mm_cmpgt_epi32( _mm_and_si128( _mm_set1_epi32( (b) ), _mm_setr_epi32( 1<<0, 1<<1, 1<<2, 1<<3 ) ), \
     218        1536 :                                       _mm_setzero_si128() )
     219             : 
     220             : /* vc_expand expands c0:c1 (imm_hi==0) or c2:c3 (imm_hi==1) into a
     221             :    paired lane conditional.  That is:
     222             : 
     223             :      vc_expand(c,0) returns [ c0 c0 c1 c1 ]
     224             :      vc_expand(c,1) returns [ c2 c2 c3 c3 ]
     225             : 
     226             :    Conversely:
     227             : 
     228             :      vc_narrow(a,b) returns [ a0 a2 b0 b2 ]
     229             : 
     230             :    which is useful for turning two paired lane conditionals into a
     231             :    single lane conditional.  U.B. if a, b, and/or c are not proper
     232             :    vector conditionals.  These are useful, for example, for vectorizing
     233             :    64-bit pointer arithmetic used in 32-bit lane SIMD. */
     234             : 
     235          90 : static inline vc_t vc_expand( vc_t c, int imm_hi ) {
     236          90 :   return _mm_cvtepi32_epi64( imm_hi ? _mm_shuffle_epi32( c, _MM_SHUFFLE(3,2,3,2) ) : c ); /* compile time (presumably: imm_hi is expected to be a constant so the select folds away -- confirm); when imm_hi is set the shuffle moves lanes 2:3 into lanes 0:1, then cvtepi32_epi64 sign extends lanes 0:1 to 64 bits, so 0 / -1 fill a whole lane pair */
     237          90 : }
     238             : 
     239             : #define vc_narrow(a,b) _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( (a) ), _mm_castsi128_ps( (b) ), _MM_SHUFFLE(2,0,2,0) ) )
     240             : 
     241             : /* vc_gather(b,i) returns [ -!!b[i(0)] -!!b[i(1)] ... -!!b[i(3)] ] where
     242             :    b is an "int const *" (0/non-zero map to false/true) and i is a vi_t.
     243             : 
     244             :    vc_gather_fast(b,i) returns [ b[i(0)] b[i(1)] ... b[i(3)] ] where b is
     245             :    an "int const *".   User promises b[i(:)] values are already either 0
     246             :    or -1.  i here is a vi_t.  */
     247             : 
     248             : #if defined(__AVX2__)
     249    11797431 : #define vc_gather(b,i)      _mm_xor_si128( _mm_set1_epi32( -1 ), \
     250    11797431 :                                            _mm_cmpeq_epi32( _mm_i32gather_epi32( (b), (i), 4 ), _mm_setzero_si128() ) )
     251    11797431 : #define vc_gather_fast(b,i) _mm_i32gather_epi32( (b), (i), 4 )
     252             : #endif
     253             : 
     254             : /* vc_transpose_4x4 transposes the 4x4 matrix stored in vc_t r0,r1,r2,r3
     255             :    and stores the result in 4x4 matrix vc_t c0,c1,c2,c3.  All
     256             :    c0,c1,c2,c3 should be different for a well defined result.
     257             :    Otherwise, in-place operation and/or using the same vc_t to specify
     258             :    multiple rows of r is fine. */
     259             : 
     260          45 : #define vc_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do {                                                                   \
     261          45 :     vc_t _vc_transpose_r0 = (r0); vc_t _vc_transpose_r1 = (r1); vc_t _vc_transpose_r2 = (r2); vc_t _vc_transpose_r3 = (r3); \
     262          45 :     vc_t _vc_transpose_t;                                                                                                   \
     263          45 :     /* Transpose 2x2 blocks */                                                                                              \
     264          45 :     _vc_transpose_t = _vc_transpose_r0; _vc_transpose_r0 = _mm_unpacklo_epi32( _vc_transpose_t,  _vc_transpose_r2 );        \
     265          45 :     /**/                                _vc_transpose_r2 = _mm_unpackhi_epi32( _vc_transpose_t,  _vc_transpose_r2 );        \
     266          45 :     _vc_transpose_t = _vc_transpose_r1; _vc_transpose_r1 = _mm_unpacklo_epi32( _vc_transpose_t,  _vc_transpose_r3 );        \
     267          45 :     /**/                                _vc_transpose_r3 = _mm_unpackhi_epi32( _vc_transpose_t,  _vc_transpose_r3 );        \
     268          45 :     /* Transpose 1x1 blocks */                                                                                              \
     269          45 :     /**/                                (c0)             = _mm_unpacklo_epi32( _vc_transpose_r0, _vc_transpose_r1 );        \
     270          45 :     /**/                                (c1)             = _mm_unpackhi_epi32( _vc_transpose_r0, _vc_transpose_r1 );        \
     271          45 :     /**/                                (c2)             = _mm_unpacklo_epi32( _vc_transpose_r2, _vc_transpose_r3 );        \
     272          45 :     /**/                                (c3)             = _mm_unpackhi_epi32( _vc_transpose_r2, _vc_transpose_r3 );        \
     273          45 :   } while(0)

Generated by: LCOV version 1.14