#ifndef HEADER_fd_src_util_simd_fd_avx512_h
#error "Do not include this directly; use fd_avx512.h"
#endif

/* TODO: REDUCE, EXTRACT, ADDITIONAL LANE OPS, ... */

/* Vector long API ****************************************************/

/* A wwl_t is a vector where each adjacent pair of 32-bit wide lanes
   (e.g. 0-1 / 2-3 / 4-5 / 6-7) holds a signed 64-bit twos-complement
   integer (a "long").

   These mirror the other APIs as much as possible.  Macros are
   preferred over static inlines when it is possible to do so robustly
   to reduce the risk of the compiler mucking it up. */

#define wwl_t __m512i

/* wwl(x0,x1,x2,x3,x4,x5,x6,x7) returns the wwl_t [x0 x1 ... x7] where
   x* are longs */

#define wwl(x0,x1,x2,x3,x4,x5,x6,x7) _mm512_setr_epi64( (x0), (x1), (x2), (x3), (x4), (x5), (x6), (x7) )

#define wwl_bcast(x)         _mm512_set1_epi64( (x) ) /* wwl(x, x, ... x) */
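
/* A minimal usage sketch of the constructors (variable names are
   illustrative, not part of the API):

     wwl_t seq  = wwl( 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L ); // [0 1 ... 7]
     wwl_t tens = wwl_bcast( 10L );                       // [10 10 ... 10]
*/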

/* wwl_permute(p,x) returns:
     wwl( x(p(0)), x(p(1)), ... x(p(7)) ).
   As such p(*) should be longs in [0,7]. */

#define wwl_permute(p,x)     _mm512_permutexvar_epi64( (p), (x) )
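
/* For example, reversing the lane order (names are illustrative):

     wwl_t x   = wwl( 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L );
     wwl_t rev = wwl_permute( wwl( 7L, 6L, 5L, 4L, 3L, 2L, 1L, 0L ), x );
     // rev is [7 6 5 4 3 2 1 0]
*/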

/* wwl_select(p,x,y) concatenates the wwl_t's x and y into
     z = [ x0 x1 ... x7 y0 y1 ... y7 ]
   and then returns:
     wwl( z(p(0)), z(p(1)), ... z(p(7)) ).
   As such p(*) should be longs in [0,15]. */

#define wwl_select(p,x,y)    _mm512_permutex2var_epi64( (x), (p), (y) )
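
/* For example, gathering the even lanes of x and y (illustrative):

     wwl_t even = wwl_select( wwl( 0L, 2L, 4L, 6L, 8L, 10L, 12L, 14L ), x, y );
     // even is [ x0 x2 x4 x6 y0 y2 y4 y6 ]
*/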

/* Predefined constants */

#define wwl_zero()           _mm512_setzero_si512()  /* wwl(0, 0, ... 0) */
#define wwl_one()            _mm512_set1_epi64( 1L ) /* wwl(1, 1, ... 1) */

/* Memory operations */
/* Note: wwl_{ld,st} assume m is 64-byte aligned while wwl_{ldu,stu}
   allow m to have arbitrary alignment */

static inline wwl_t wwl_ld( long const * m ) { return _mm512_load_epi64( m ); }  /* wwl( m[0], m[1], ... m[7] ) */
static inline void  wwl_st( long * m, wwl_t x ) { _mm512_store_epi64( m, x ); }  /* does m[0] = x0, m[1] = x1, ... m[7] = x7 */

static inline wwl_t wwl_ldu( void const * m ) { return _mm512_loadu_epi64( m ); } /* wwl( m[0], m[1], ... m[7] ) */
static inline void  wwl_stu( void * m, wwl_t x ) { _mm512_storeu_epi64( m, x ); } /* does m[0] = x0, m[1] = x1, ... m[7] = x7 */
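
/* A load/store roundtrip sketch (buffer name is illustrative; note the
   64-byte alignment wwl_ld/wwl_st require, obtained here via a
   GCC/Clang attribute):

     long buf[8] __attribute__((aligned(64))) = { 0L,1L,2L,3L,4L,5L,6L,7L };
     wwl_t v = wwl_ld( buf );         // aligned load
     wwl_st( buf, wwl_add( v, v ) );  // buf is now 0,2,4,...,14
*/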

/* Arithmetic operations */

#define wwl_neg(x)           _mm512_sub_epi64( _mm512_setzero_si512(), (x) ) /* wwl(-x0, -x1, ...-x7 ), twos complement */
#define wwl_abs(x)           _mm512_abs_epi64( (x) )                         /* wwl(|x0|,|x1|,...|x7|), twos complement */

#define wwl_min(x,y)         _mm512_min_epi64  ( (x), (y) ) /* wwl( min(x0,y0), min(x1,y1), ... min(x7,y7) ) */
#define wwl_max(x,y)         _mm512_max_epi64  ( (x), (y) ) /* wwl( max(x0,y0), max(x1,y1), ... max(x7,y7) ) */
#define wwl_add(x,y)         _mm512_add_epi64  ( (x), (y) ) /* wwl( x0+y0,      x1+y1,      ... x7+y7      ) */
#define wwl_sub(x,y)         _mm512_sub_epi64  ( (x), (y) ) /* wwl( x0-y0,      x1-y1,      ... x7-y7      ) */
#define wwl_mul(x,y)         _mm512_mullo_epi64( (x), (y) ) /* wwl( x0*y0,      x1*y1,      ... x7*y7      ) */
#define wwl_mul_ll(x,y)      _mm512_mul_epi32  ( (x), (y) ) /* wwl( x0l*y0l,    x1l*y1l,    ... x7l*y7l    ) */
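
/* Note wwl_mul_ll multiplies just the low 32 bits of each lane (x0l is
   the low 32 bits of x0) but yields the full 64-bit product, which is
   typically cheaper than the full 64x64 wwl_mul.  A sketch (values are
   illustrative):

     wwl_t p = wwl_mul_ll( wwl_bcast( (1L<<40)+3L ), wwl_bcast( (1L<<40)+5L ) );
     // each lane is 3*5 = 15; the bits above the low 32 are ignored
*/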

/* Binary operations */
/* Note: shifts assume n and y* are in [0,63].  Rotates work for
   arbitrary values. */

#define wwl_not(x)           _mm512_xor_epi64( _mm512_set1_epi64( -1L ), (x) )

#define wwl_shl(x,n)         _mm512_slli_epi64  ( (x), (uint)(n) ) /* wwl( x0<<n,  x1<<n,  ... x7<<n  ) */
#define wwl_shr(x,n)         _mm512_srai_epi64  ( (x), (uint)(n) ) /* wwl( x0>>n,  x1>>n,  ... x7>>n  ) (arithmetic right shift) */
#define wwl_shru(x,n)        _mm512_srli_epi64  ( (x), (uint)(n) ) /* wwl( x0>>n,  x1>>n,  ... x7>>n  ) (unsigned right shift) */
#define wwl_shl_vector(x,y)  _mm512_sllv_epi64  ( (x), (y)       ) /* wwl( x0<<y0, x1<<y1, ... x7<<y7 ) */
#define wwl_shr_vector(x,y)  _mm512_srav_epi64  ( (x), (y)       ) /* wwl( x0>>y0, x1>>y1, ... x7>>y7 ) (arithmetic right shift) */
#define wwl_shru_vector(x,y) _mm512_srlv_epi64  ( (x), (y)       ) /* wwl( x0>>y0, x1>>y1, ... x7>>y7 ) (unsigned right shift) */
#define wwl_and(x,y)         _mm512_and_epi64   ( (x), (y)       ) /* wwl( x0&y0,  x1&y1,  ... x7&y7  ) */
#define wwl_andnot(x,y)      _mm512_andnot_epi64( (x), (y)       ) /* wwl( ~x0&y0, ~x1&y1, ... ~x7&y7 ) */
#define wwl_or(x,y)          _mm512_or_epi64    ( (x), (y)       ) /* wwl( x0|y0,  x1|y1,  ... x7|y7  ) */
#define wwl_xor(x,y)         _mm512_xor_epi64   ( (x), (y)       ) /* wwl( x0^y0,  x1^y1,  ... x7^y7  ) */

/* wwl_rol(x,n)          returns wwl( rotate_left (x0,n ), rotate_left (x1,n ), ... )
   wwl_ror(x,n)          returns wwl( rotate_right(x0,n ), rotate_right(x1,n ), ... )
   wwl_rol_variable(x,n) returns wwl( rotate_left (x0,n ), rotate_left (x1,n ), ... )
   wwl_ror_variable(x,n) returns wwl( rotate_right(x0,n ), rotate_right(x1,n ), ... )
   wwl_rol_vector(x,y)   returns wwl( rotate_left (x0,y0), rotate_left (x1,y1), ... )
   wwl_ror_vector(x,y)   returns wwl( rotate_right(x0,y0), rotate_right(x1,y1), ... )

   The variable variants are slower but do not require the shift amount
   to be known at compile time. */

#define wwl_rol(a,imm)       _mm512_rol_epi64( (a), (imm)&63L )
#define wwl_ror(a,imm)       _mm512_ror_epi64( (a), (imm)&63L )

static inline wwl_t wwl_rol_variable( wwl_t a, long n ) { return wwl_or( wwl_shl ( a, n & 63L ), wwl_shru( a, (-n) & 63L ) ); }
static inline wwl_t wwl_ror_variable( wwl_t a, long n ) { return wwl_or( wwl_shru( a, n & 63L ), wwl_shl ( a, (-n) & 63L ) ); }

static inline wwl_t wwl_rol_vector( wwl_t a, wwl_t b ) {
  wwl_t m = wwl_bcast( 63L );
  return wwl_or( wwl_shl_vector ( a, wwl_and( b, m ) ), wwl_shru_vector( a, wwl_and( wwl_neg( b ), m ) ) );
}

static inline wwl_t wwl_ror_vector( wwl_t a, wwl_t b ) {
  wwl_t m = wwl_bcast( 63L );
  return wwl_or( wwl_shru_vector( a, wwl_and( b, m ) ), wwl_shl_vector ( a, wwl_and( wwl_neg( b ), m ) ) );
}
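
/* For example, rotating each lane left by its lane index (illustrative):

     wwl_t r = wwl_rol_vector( wwl_bcast( 1L ), wwl( 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L ) );
     // r is [1 2 4 8 16 32 64 128]
*/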

/* Comparison operations */
/* mask(c0,c1,...) means (((int)c0)<<0) | (((int)c1)<<1) | ... */

#define wwl_eq(x,y) ((int)_mm512_cmpeq_epi64_mask(  (x), (y) )) /* mask( x0==y0, x1==y1, ... ) */
#define wwl_gt(x,y) ((int)_mm512_cmpgt_epi64_mask(  (x), (y) )) /* mask( x0> y0, x1> y1, ... ) */
#define wwl_lt(x,y) ((int)_mm512_cmplt_epi64_mask(  (x), (y) )) /* mask( x0< y0, x1< y1, ... ) */
#define wwl_ne(x,y) ((int)_mm512_cmpneq_epi64_mask( (x), (y) )) /* mask( x0!=y0, x1!=y1, ... ) */
#define wwl_ge(x,y) ((int)_mm512_cmpge_epi64_mask(  (x), (y) )) /* mask( x0>=y0, x1>=y1, ... ) */
#define wwl_le(x,y) ((int)_mm512_cmple_epi64_mask(  (x), (y) )) /* mask( x0<=y0, x1<=y1, ... ) */

#define wwl_lnot(x)    wwl_eq( (x), wwl_zero() )                /* mask(  !x0,  !x1, ... ) */
#define wwl_lnotnot(x) wwl_ne( (x), wwl_zero() )                /* mask( !!x0, !!x1, ... ) */
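
/* Comparisons return an 8-bit lane mask in an int; e.g. counting the
   zero lanes with a compiler builtin (GCC/Clang assumed):

     int m = wwl_lnot( x );                  // bit i set if xi==0
     int n = __builtin_popcount( (uint)m );  // number of zero lanes
*/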

/* Conditional operations */
/* cn means bit n of c */

#define wwl_if(c,x,y)          _mm512_mask_blend_epi64 ( (__mmask8)(c), (y), (x) )    /* wwl( c0? x0    :y0, ... ) */

#define wwl_add_if(c,x,y,z)    _mm512_mask_add_epi64   ( (z), (__mmask8)(c), (x), (y) ) /* wwl( c0?(x0+y0):z0, ... ) */
#define wwl_sub_if(c,x,y,z)    _mm512_mask_sub_epi64   ( (z), (__mmask8)(c), (x), (y) ) /* wwl( c0?(x0-y0):z0, ... ) */

#define wwl_and_if(c,x,y,z)    _mm512_mask_and_epi64   ( (z), (__mmask8)(c), (x), (y) ) /* wwl( c0?(x0&y0):z0, ... ) */
#define wwl_andnot_if(c,x,y,z) _mm512_mask_andnot_epi64( (z), (__mmask8)(c), (x), (y) ) /* wwl( c0?(~x0&y0):z0, ... ) */
#define wwl_or_if(c,x,y,z)     _mm512_mask_or_epi64    ( (z), (__mmask8)(c), (x), (y) ) /* wwl( c0?(x0|y0):z0, ... ) */
#define wwl_xor_if(c,x,y,z)    _mm512_mask_xor_epi64   ( (z), (__mmask8)(c), (x), (y) ) /* wwl( c0?(x0^y0):z0, ... ) */
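
/* For example, clamping negative lanes to zero by combining a compare
   mask with wwl_if (illustrative):

     wwl_t clamped = wwl_if( wwl_lt( x, wwl_zero() ), wwl_zero(), x );
*/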

/* Conversions */

/* wwl_to_wwi(x) returns [  (int)x0,0,  (int)x1,0, ...  (int)x7,0 ]
   wwl_to_wwu(x) returns [ (uint)x0,0, (uint)x1,0, ... (uint)x7,0 ]
   wwl_to_wwv(x) returns [ (ulong)x0,  (ulong)x1,  ... (ulong)x7  ] */

#define wwl_to_wwi(x) wwl_and( (x), wwl_bcast( (long)UINT_MAX ) )
#define wwl_to_wwu(x) wwl_and( (x), wwl_bcast( (long)UINT_MAX ) )
#define wwl_to_wwv(x) (x)

#define wwl_to_wwi_raw(x) (x)
#define wwl_to_wwu_raw(x) (x)
#define wwl_to_wwv_raw(x) (x)

/* Misc operations */

/* wwl_pack_halves(x,imm0,y,imm1) packs half of x and half of y into a
   wwl.  imm0/imm1 select which half of x and y to pack.  imm0 / imm1
   should be in [0,1].  That is, this returns:

     [ if( imm0, x(4:7), x(0:3) ) if( imm1, y(4:7), y(0:3) ) ]

   wwl_pack_h0_h1(x,y) does the wwl_pack_halves(x,0,y,1) case faster.
   Hat tip to Philip Taffet for pointing this out. */

#define wwl_pack_halves(x,imm0,y,imm1) _mm512_shuffle_i64x2( (x), (y), 68+10*(imm0)+160*(imm1) )
#define wwl_pack_h0_h1(x,y) _mm512_mask_blend_epi64( (__mmask8)0xF0, (x), (y) )
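
/* For example (illustrative):

     wwl_t a = wwl_pack_halves( x,0, y,1 ); // [ x0 x1 x2 x3 y4 y5 y6 y7 ]
     wwl_t b = wwl_pack_h0_h1 ( x,   y   ); // same result, faster
*/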

/* wwl_madd52lo(a,b,c) returns LO64( a + LO52( LO52(b)*LO52(c) ) )
   wwl_madd52hi(a,b,c) returns LO64( a + HI52( LO52(b)*LO52(c) ) ) */

#define wwl_madd52lo(a,b,c) _mm512_madd52lo_epu64( (a), (b), (c) )
#define wwl_madd52hi(a,b,c) _mm512_madd52hi_epu64( (a), (b), (c) )
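
/* These wrap the AVX512-IFMA fused multiply-adds.  A sketch of a
   52x52 -> 104 bit widening multiply of 52-bit limbs (illustrative;
   assumes the b and c lanes are already in [0,2^52)):

     wwl_t lo = wwl_madd52lo( wwl_zero(), b, c ); // bits  0- 51 of b*c
     wwl_t hi = wwl_madd52hi( wwl_zero(), b, c ); // bits 52-103 of b*c
*/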

/* wwl_slide(x,y,imm) treats x as a FIFO with the oldest / newest
   element at lane 0 / 7.  Returns the result of dequeuing x imm times
   and enqueuing the values y0 ... y{imm-1} in that order.  imm should
   be in [0,7].  For example, the imm==5 case returns:
     [ x5 x6 x7 y0 y1 y2 y3 y4 ]. */

#define wwl_slide(x,y,imm) _mm512_alignr_epi64( (y), (x), (imm) )
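
/* For example, shifting lanes down by one while enqueuing a new value
   v at lane 7 (illustrative):

     wwl_t next = wwl_slide( x, wwl_bcast( v ), 1 ); // [ x1 ... x7 v ]
*/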

/* wwl_unpack unpacks the wwl x into its long components x0,x1,...x7. */

#define wwl_unpack( x, x0,x1,x2,x3,x4,x5,x6,x7 ) do {                       \
    __m512i _wwl_unpack_x  = (x);                                           \
    __m256i _wwl_unpack_xl = _mm512_extracti64x4_epi64( _wwl_unpack_x, 0 ); \
    __m256i _wwl_unpack_xh = _mm512_extracti64x4_epi64( _wwl_unpack_x, 1 ); \
    (x0) = _mm256_extract_epi64( _wwl_unpack_xl, 0 );                       \
    (x1) = _mm256_extract_epi64( _wwl_unpack_xl, 1 );                       \
    (x2) = _mm256_extract_epi64( _wwl_unpack_xl, 2 );                       \
    (x3) = _mm256_extract_epi64( _wwl_unpack_xl, 3 );                       \
    (x4) = _mm256_extract_epi64( _wwl_unpack_xh, 0 );                       \
    (x5) = _mm256_extract_epi64( _wwl_unpack_xh, 1 );                       \
    (x6) = _mm256_extract_epi64( _wwl_unpack_xh, 2 );                       \
    (x7) = _mm256_extract_epi64( _wwl_unpack_xh, 3 );                       \
  } while(0)
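
/* For example (illustrative):

     long l0,l1,l2,l3,l4,l5,l6,l7;
     wwl_unpack( wwl_bcast( 42L ), l0,l1,l2,l3,l4,l5,l6,l7 ); // each li is 42
*/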

/* wwl_transpose_8x8 sets wwl_t's c0,c1,...c7 to the columns of an 8x8
   long matrix given the rows of the matrix in wwl_t's r0,r1,...r7.
   In-place operation fine. */

#define wwl_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 ) do {                \
    wwl_t _wwl_transpose_r0 = (r0); wwl_t _wwl_transpose_r1 = (r1);                               \
    wwl_t _wwl_transpose_r2 = (r2); wwl_t _wwl_transpose_r3 = (r3);                               \
    wwl_t _wwl_transpose_r4 = (r4); wwl_t _wwl_transpose_r5 = (r5);                               \
    wwl_t _wwl_transpose_r6 = (r6); wwl_t _wwl_transpose_r7 = (r7);                               \
                                                                                                  \
    /* Outer 4x4 transpose of 2x2 blocks */                                                       \
    wwl_t _wwl_transpose_t0 = _mm512_shuffle_i64x2( _wwl_transpose_r0, _wwl_transpose_r2, 0x88 ); \
    wwl_t _wwl_transpose_t1 = _mm512_shuffle_i64x2( _wwl_transpose_r1, _wwl_transpose_r3, 0x88 ); \
    wwl_t _wwl_transpose_t2 = _mm512_shuffle_i64x2( _wwl_transpose_r0, _wwl_transpose_r2, 0xdd ); \
    wwl_t _wwl_transpose_t3 = _mm512_shuffle_i64x2( _wwl_transpose_r1, _wwl_transpose_r3, 0xdd ); \
    wwl_t _wwl_transpose_t4 = _mm512_shuffle_i64x2( _wwl_transpose_r4, _wwl_transpose_r6, 0x88 ); \
    wwl_t _wwl_transpose_t5 = _mm512_shuffle_i64x2( _wwl_transpose_r5, _wwl_transpose_r7, 0x88 ); \
    wwl_t _wwl_transpose_t6 = _mm512_shuffle_i64x2( _wwl_transpose_r4, _wwl_transpose_r6, 0xdd ); \
    wwl_t _wwl_transpose_t7 = _mm512_shuffle_i64x2( _wwl_transpose_r5, _wwl_transpose_r7, 0xdd ); \
                                                                                                  \
    /**/  _wwl_transpose_r0 = _mm512_shuffle_i64x2( _wwl_transpose_t0, _wwl_transpose_t4, 0x88 ); \
    /**/  _wwl_transpose_r1 = _mm512_shuffle_i64x2( _wwl_transpose_t1, _wwl_transpose_t5, 0x88 ); \
    /**/  _wwl_transpose_r2 = _mm512_shuffle_i64x2( _wwl_transpose_t2, _wwl_transpose_t6, 0x88 ); \
    /**/  _wwl_transpose_r3 = _mm512_shuffle_i64x2( _wwl_transpose_t3, _wwl_transpose_t7, 0x88 ); \
    /**/  _wwl_transpose_r4 = _mm512_shuffle_i64x2( _wwl_transpose_t0, _wwl_transpose_t4, 0xdd ); \
    /**/  _wwl_transpose_r5 = _mm512_shuffle_i64x2( _wwl_transpose_t1, _wwl_transpose_t5, 0xdd ); \
    /**/  _wwl_transpose_r6 = _mm512_shuffle_i64x2( _wwl_transpose_t2, _wwl_transpose_t6, 0xdd ); \
    /**/  _wwl_transpose_r7 = _mm512_shuffle_i64x2( _wwl_transpose_t3, _wwl_transpose_t7, 0xdd ); \
                                                                                                  \
    /* Inner 2x2 transpose of 1x1 blocks */                                                       \
    /**/  (c0)              = _mm512_unpacklo_epi64( _wwl_transpose_r0, _wwl_transpose_r1 );      \
    /**/  (c1)              = _mm512_unpackhi_epi64( _wwl_transpose_r0, _wwl_transpose_r1 );      \
    /**/  (c2)              = _mm512_unpacklo_epi64( _wwl_transpose_r2, _wwl_transpose_r3 );      \
    /**/  (c3)              = _mm512_unpackhi_epi64( _wwl_transpose_r2, _wwl_transpose_r3 );      \
    /**/  (c4)              = _mm512_unpacklo_epi64( _wwl_transpose_r4, _wwl_transpose_r5 );      \
    /**/  (c5)              = _mm512_unpackhi_epi64( _wwl_transpose_r4, _wwl_transpose_r5 );      \
    /**/  (c6)              = _mm512_unpacklo_epi64( _wwl_transpose_r6, _wwl_transpose_r7 );      \
    /**/  (c7)              = _mm512_unpackhi_epi64( _wwl_transpose_r6, _wwl_transpose_r7 );      \
  } while(0)
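
/* A transpose usage sketch (illustrative; note the in-place form):

     wwl_t r0,r1,r2,r3,r4,r5,r6,r7;  // rows of an 8x8 long matrix
     wwl_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, r0,r1,r2,r3,r4,r5,r6,r7 );
     // r0 ... r7 now hold the columns
*/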
