LCOV - code coverage report
Current view: top level - util/simd - fd_avx512_wwv.h (source / functions)
Test: cov.lcov
Date: 2024-11-13 11:58:15

                  Hit   Total   Coverage
Lines:             80      80    100.0 %
Functions:         17     312      5.4 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_avx512_h
       2             : #error "Do not include this directly; use fd_avx512.h"
       3             : #endif
       4             : 
       5             : /* TODO: REDUCE, EXTRACT, ADDITIONAL LANE OPS, ... */
       6             : 
       7             : /* Vector ulong API ***************************************************/
       8             : 
       9             : /* A wwv_t is a vector where each adjacent pair of 32-bit wide lanes
      10             :    (lanes 0-1 / 2-3 / 4-5 / ... / 14-15) holds an unsigned 64-bit
      11             :    integer (a "ulong"), for eight ulongs total.
      12             : 
      13             :    These mirror the other APIs as much as possible.  Macros are
      14             :    preferred over static inlines when it is possible to do it robustly
      15             :    to reduce the risk of the compiler mucking it up. */
      16             : 
      17  2622101291 : #define wwv_t __m512i
      18             : 
      19             : /* Constructors */
      20             : 
      21             : /* wwv(x0,x1,x2,x3,x4,x5,x6,x7) returns the wwv_t [x0 x1 ... x7] where
      22             :    x* are ulongs */
      23             : 
      24             : #define wwv(x0,x1,x2,x3,x4,x5,x6,x7) \
      25    30185296 :   _mm512_setr_epi64( (long)(x0), (long)(x1), (long)(x2), (long)(x3), (long)(x4), (long)(x5), (long)(x6), (long)(x7) )
      26             : 
      27  1133207727 : #define wwv_bcast(x)         _mm512_set1_epi64( (long)(x) ) /* wwv(x, x, ... x) */
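
   For instance, a minimal usage sketch of these constructors (illustrative
   only, not part of the header; the example_* names are hypothetical, and it
   is assumed fd_avx512.h has been included per the note at the top and that
   ulong is the project's unsigned 64-bit typedef):

     #include "fd_avx512.h"

     /* Build [0 1 2 3 4 5 6 7] explicitly and [7 7 7 7 7 7 7 7] by broadcast */
     static inline void
     example_wwv_ctor( wwv_t * seq, wwv_t * all7 ) {
       *seq  = wwv( 0UL, 1UL, 2UL, 3UL, 4UL, 5UL, 6UL, 7UL );
       *all7 = wwv_bcast( 7UL );
     }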
      28             : 
      29             : /* wwv_permute(p,x) returns:
      30             :      wwv( x(p(0)), x(p(1)), ... x(p(7)) ).
      31             :    As such p(*) should be ulongs in [0,7]. */
      32             : 
      33             : #define wwv_permute(p,x)     _mm512_permutexvar_epi64( (p), (x) )
      34             : 
      35             : /* wwv_select(p,x,y) concatenates the wwv_t's x and y into
      36             :      z = [ x0 x1 ... x7 y0 y1 ... y7 ]
      37             :    and then returns:
      38             :      wwv( z(p(0)), z(p(1)), ... z(p(7)) ).
      39             :    As such p(*) should be ulongs in [0,15]. */
      40             : 
      41             : #define wwv_select(p,x,y)    _mm512_permutex2var_epi64( (x), (p), (y) )
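
   As an illustrative sketch of the lane-index convention above (hypothetical
   example_* helpers, assuming fd_avx512.h is included):

     #include "fd_avx512.h"

     /* Reverse the 8 ulong lanes of x: returns [ x7 x6 x5 x4 x3 x2 x1 x0 ] */
     static inline wwv_t
     example_reverse( wwv_t x ) {
       return wwv_permute( wwv( 7UL,6UL,5UL,4UL,3UL,2UL,1UL,0UL ), x );
     }

     /* Interleave the low halves of x and y: returns [ x0 y0 x1 y1 x2 y2 x3 y3 ]
        (indices 8-15 select lanes of y, per the wwv_select convention above) */
     static inline wwv_t
     example_interleave_lo( wwv_t x, wwv_t y ) {
       return wwv_select( wwv( 0UL,8UL,1UL,9UL,2UL,10UL,3UL,11UL ), x, y );
     }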
      42             : 
      43             : /* Predefined constants */
      44             : 
      45    18865414 : #define wwv_zero()           _mm512_setzero_si512()  /* wwv(0, 0, ... 0) */
      46     4989383 : #define wwv_one()            _mm512_set1_epi64( 1L ) /* wwv(1, 1, ... 1) */
      47             : 
      48             : /* Memory operations */
      49             : /* Note: wwv_{ld,st} assume m is 64-byte aligned while wwv_{ldu,stu}
      50             :    allow m to have arbitrary alignment */
      51             : 
      52  2996518423 : static inline wwv_t wwv_ld( ulong const * m ) { return _mm512_load_epi64( m ); }  /* wwv( m[0], m[1], ... m[7] ) */
      53  1423719239 : static inline void  wwv_st( ulong * m, wwv_t x ) { _mm512_store_epi64( m, x ); }  /* does m[0] = x0, m[1] = x1, ... m[7] = x7 */
      54             : 
      55    85067392 : static inline wwv_t wwv_ldu( void const * m ) { return _mm512_loadu_epi64( m ); } /* wwv( m[0], m[1], ... m[7] ) */
      56     7213102 : static inline void  wwv_stu( void * m, wwv_t x ) { _mm512_storeu_epi64( m, x ); } /* does m[0] = x0, m[1] = x1, ... m[7] = x7 */
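
   For example, a round trip through memory might look like this sketch
   (example_copy8 is hypothetical; the aligned variants require 64-byte
   alignment as noted above):

     #include "fd_avx512.h"

     static inline void
     example_copy8( ulong       * dst,    /* assumed 64-byte aligned    */
                    ulong const * src ) { /* arbitrary alignment is ok  */
       wwv_t x = wwv_ldu( src ); /* unaligned load of src[0] ... src[7] */
       wwv_st( dst, x );         /* aligned store to dst[0] ... dst[7]  */
     }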
      57             : 
      58             : /* Arithmetic operations */
      59             : 
      60             : #define wwv_neg(x)           _mm512_sub_epi64( _mm512_setzero_si512(), (x) ) /* wwv( -x0, -x1, ... -x7 ) */
      61             : #define wwv_abs(x)           (x)                                             /* wwv(  x0,  x1, ...  x7 ) */
      62             : 
      63             : #define wwv_min(x,y)         _mm512_min_epu64  ( (x), (y) ) /* wwv( min(x0,y0), min(x1,y1), ... min(x7,y7) ) */
      64             : #define wwv_max(x,y)         _mm512_max_epu64  ( (x), (y) ) /* wwv( max(x0,y0), max(x1,y1), ... max(x7,y7) ) */
      65  2066189691 : #define wwv_add(x,y)         _mm512_add_epi64  ( (x), (y) ) /* wwv( x0+y0,      x1+y1,      ... x7+y7      ) */
      66   971281677 : #define wwv_sub(x,y)         _mm512_sub_epi64  ( (x), (y) ) /* wwv( x0-y0,      x1-y1,      ... x7-y7      ) */
      67             : #define wwv_mul(x,y)         _mm512_mullo_epi64( (x), (y) ) /* wwv( x0*y0,      x1*y1,      ... x7*y7      ) */
      68             : #define wwv_mul_ll(x,y)      _mm512_mul_epu32  ( (x), (y) ) /* wwv( x0l*y0l,    x1l*y1l,    ... x7l*y7l    ) */
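
   The distinction between wwv_mul and wwv_mul_ll matters in practice: wwv_mul
   keeps the low 64 bits of the full 64x64 product while wwv_mul_ll multiplies
   only the low 32 bits of each lane, so its 64-bit result is exact.  A
   hypothetical sketch (example_* names are not part of the API):

     #include "fd_avx512.h"

     /* Exact 64-bit squares of the low 32 bits of each lane */
     static inline wwv_t
     example_sq_lo32( wwv_t x ) {
       return wwv_mul_ll( x, x );
     }

     /* Lane-wise a*x + y, keeping the low 64 bits of each product */
     static inline wwv_t
     example_axpy( wwv_t a, wwv_t x, wwv_t y ) {
       return wwv_add( wwv_mul( a, x ), y );
     }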
      69             : 
      70             : /* Binary operations */
      71             : /* Note: the shifts assume n and/or y* are in [0,63].  The rotates
      72             :    work for arbitrary values. */
      73             : 
      74     4000000 : #define wwv_not(x)           _mm512_xor_epi64( _mm512_set1_epi64( -1L ), (x) )
      75             : 
      76             : #define wwv_shl(x,n)         _mm512_slli_epi64  ( (x), (uint)(n) ) /* wwv( x0<<n,  x1<<n,  ... x7<<n  ) */
      77             : #define wwv_shr(x,n)         _mm512_srli_epi64  ( (x), (uint)(n) ) /* wwv( x0>>n,  x1>>n,  ... x7>>n  ) */
      78             : #define wwv_shl_vector(x,y)  _mm512_sllv_epi64  ( (x), (y)       ) /* wwv( x0<<y0, x1<<y1, ... x7<<y7 ) */
      79             : #define wwv_shr_vector(x,y)  _mm512_srlv_epi64  ( (x), (y)       ) /* wwv( x0>>y0, x1>>y1, ... x7>>y7 ) */
      80             : #define wwv_and(x,y)         _mm512_and_epi64   ( (x), (y)       ) /* wwv( x0&y0,  x1&y1,  ... x7&y7  ) */
      81    13749702 : #define wwv_andnot(x,y)      _mm512_andnot_epi64( (x), (y)       ) /* wwv( ~x0&y0, ~x1&y1, ... ~x7&y7 ) */
      82    23154348 : #define wwv_or(x,y)          _mm512_or_epi64    ( (x), (y)       ) /* wwv( x0|y0,  x1|y1,  ... x7|y7  ) */
      83             : #define wwv_xor(x,y)         _mm512_xor_epi64   ( (x), (y)       ) /* wwv( x0^y0,  x1^y1,  ... x7^y7  ) */
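
   For instance, extracting a bit field from every lane with the shift and
   mask operations above (a sketch; example_bits is hypothetical and assumes
   lo and cnt are in [0,63] with lo+cnt<=64, per the range note above):

     #include "fd_avx512.h"

     /* Returns bits [lo,lo+cnt) of each lane, right justified */
     static inline wwv_t
     example_bits( wwv_t x, int lo, int cnt ) {
       return wwv_and( wwv_shr( x, lo ), wwv_bcast( (1UL<<cnt)-1UL ) );
     }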
      84             : 
      85             : /* wwv_rol(x,n)          returns wwv( rotate_left (x0,n ), rotate_left (x1,n ), ... )
      86             :    wwv_ror(x,n)          returns wwv( rotate_right(x0,n ), rotate_right(x1,n ), ... )
      87             :    wwv_rol_variable(x,n) returns wwv( rotate_left (x0,n ), rotate_left (x1,n ), ... )
      88             :    wwv_ror_variable(x,n) returns wwv( rotate_right(x0,n ), rotate_right(x1,n ), ... )
      89             :    wwv_rol_vector(x,y)   returns wwv( rotate_left (x0,y0), rotate_left (x1,y1), ... )
      90             :    wwv_ror_vector(x,y)   returns wwv( rotate_right(x0,y0), rotate_right(x1,y1), ... )
      91             : 
      92             :    The variable variants are slower but do not require the rotate
      93             :    amount to be known at compile time. */
      94             : 
      95             : #define wwv_rol(a,imm)  _mm512_rol_epi64( (a), (imm)&63 )
      96             : #define wwv_ror(a,imm)  _mm512_ror_epi64( (a), (imm)&63 )
      97             : 
      98     1000000 : static inline wwv_t wwv_rol_variable( wwv_t a, ulong n ) { return wwv_or( wwv_shl( a, n & 63UL ), wwv_shr( a, (-n) & 63UL ) ); }
      99     1000000 : static inline wwv_t wwv_ror_variable( wwv_t a, ulong n ) { return wwv_or( wwv_shr( a, n & 63UL ), wwv_shl( a, (-n) & 63UL ) ); }
     100             : 
     101     1000000 : static inline wwv_t wwv_rol_vector( wwv_t a, wwv_t b ) {
     102     1000000 :   wwv_t m = wwv_bcast( 63UL );
     103     1000000 :   return wwv_or( wwv_shl_vector( a, wwv_and( b, m ) ), wwv_shr_vector( a, wwv_and( wwv_neg( b ), m ) ) );
     104     1000000 : }
     105             : 
     106     1000000 : static inline wwv_t wwv_ror_vector( wwv_t a, wwv_t b ) {
     107     1000000 :   wwv_t m = wwv_bcast( 63UL );
     108     1000000 :   return wwv_or( wwv_shr_vector( a, wwv_and( b, m ) ), wwv_shl_vector( a, wwv_and( wwv_neg( b ), m ) ) );
     109     1000000 : }
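
   A sketch of which rotate variant applies where (hypothetical example_*
   wrappers): wwv_rol/wwv_ror for compile-time constants, the _variable forms
   for a runtime scalar, and the _vector forms for per-lane amounts.

     #include "fd_avx512.h"

     static inline wwv_t example_rot_const ( wwv_t x )          { return wwv_rol( x, 17 );         } /* amount known at compile time */
     static inline wwv_t example_rot_scalar( wwv_t x, ulong n ) { return wwv_rol_variable( x, n ); } /* amount known only at runtime */
     static inline wwv_t example_rot_lanes ( wwv_t x, wwv_t n ) { return wwv_rol_vector( x, n );   } /* per-lane rotate amounts      */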
     110             : 
     111             : /* wwv_bswap(x) returns wwv( bswap(x0), bswap(x1), ... ) */
     112             : 
     113     6213102 : #define wwv_bswap( x ) _mm512_shuffle_epi8( (x), _mm512_set_epi8(  8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7, \
     114     6213102 :                                                                    8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7, \
     115     6213102 :                                                                    8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7, \
     116     6213102 :                                                                    8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7 ) )
     117             : 
     118             : /* Comparison operations */
     119             : /* mask(c0,c1,...) means (((int)c0)<<0) | (((int)c1)<<1) | ... */
     120             : 
     121             : #define wwv_eq(x,y) ((int)_mm512_cmpeq_epu64_mask(  (x), (y) )) /* mask( x0==y0, x1==y1, ... ) */
     122             : #define wwv_gt(x,y) ((int)_mm512_cmpgt_epu64_mask(  (x), (y) )) /* mask( x0> y0, x1> y1, ... ) */
     123             : #define wwv_lt(x,y) ((int)_mm512_cmplt_epu64_mask(  (x), (y) )) /* mask( x0< y0, x1< y1, ... ) */
     124   131077985 : #define wwv_ne(x,y) ((int)_mm512_cmpneq_epu64_mask( (x), (y) )) /* mask( x0!=y0, x1!=y1, ... ) */
     125             : #define wwv_ge(x,y) ((int)_mm512_cmpge_epu64_mask(  (x), (y) )) /* mask( x0>=y0, x1>=y1, ... ) */
     126             : #define wwv_le(x,y) ((int)_mm512_cmple_epu64_mask(  (x), (y) )) /* mask( x0<=y0, x1<=y1, ... ) */
     127             : 
     128             : #define wwv_lnot(x)    wwv_eq( (x), wwv_zero() )                /* mask(  !x0,  !x1, ... ) */
     129             : #define wwv_lnotnot(x) wwv_ne( (x), wwv_zero() )                /* mask( !!x0, !!x1, ... ) */
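
   Since the comparison results are 8-bit masks rather than vectors, they
   compose directly with scalar bit operations.  For example (a sketch;
   example_count_eq is hypothetical and __builtin_popcount assumes a
   GCC/Clang style compiler):

     #include "fd_avx512.h"

     /* Number of lanes of x equal to the scalar v (result in [0,8]) */
     static inline int
     example_count_eq( wwv_t x, ulong v ) {
       int m = wwv_eq( x, wwv_bcast( v ) );   /* bit i is set if x_i==v */
       return __builtin_popcount( (uint)m );
     }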
     130             : 
     131             : /* Conditional operations */
     132             : /* cn means bit n of c */
     133             : 
     134   133077985 : #define wwv_if(c,x,y)          _mm512_mask_blend_epi64 ( (__mmask8)(c), (y), (x) )      /* wwv( c0? x0    :y0, ... ) */
     135  4136308263 : #define wwv_add_if(c,x,y,z)    _mm512_mask_add_epi64   ( (z), (__mmask8)(c), (x), (y) ) /* wwv( c0?(x0+y0):z0, ... ) */
     136   238337007 : #define wwv_sub_if(c,x,y,z)    _mm512_mask_sub_epi64   ( (z), (__mmask8)(c), (x), (y) ) /* wwv( c0?(x0-y0):z0, ... ) */
     137             : 
     138             : #define wwv_and_if(c,x,y,z)    _mm512_mask_and_epi64   ( (z), (__mmask8)(c), (x), (y) ) /* wwv( c0?(x0&y0):z0, ... ) */
     139             : #define wwv_andnot_if(c,x,y,z) _mm512_mask_andnot_epi64( (z), (__mmask8)(c), (x), (y) ) /* wwv( c0?(~x0&y0):z0, ... ) */
     140             : #define wwv_or_if(c,x,y,z)     _mm512_mask_or_epi64    ( (z), (__mmask8)(c), (x), (y) ) /* wwv( c0?(x0|y0):z0, ... ) */
     141             : #define wwv_xor_if(c,x,y,z)    _mm512_mask_xor_epi64   ( (z), (__mmask8)(c), (x), (y) ) /* wwv( c0?(x0^y0):z0, ... ) */
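
   A sketch of how the masked forms fuse a comparison mask with an operation,
   here a lane-wise saturating subtract (x-y where x>=y, 0 elsewhere;
   example_sub_sat is hypothetical):

     #include "fd_avx512.h"

     static inline wwv_t
     example_sub_sat( wwv_t x, wwv_t y ) {
       return wwv_sub_if( wwv_ge( x, y ), x, y, wwv_zero() );
     }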
     142             : 
     143             : /* Conversions */
     144             : 
     145             : /* wwv_to_wwi(x) returns [  (int)x0,0,  (int)x1,0, ...  (int)x7,0 ]
     146             :    wwv_to_wwu(x) returns [ (uint)x0,0, (uint)x1,0, ... (uint)x7,0 ]
     147             :    wwv_to_wwl(x) returns [ (long)x0,   (long)x1,   ... (long)x7   ] */
     148             : 
     149             : #define wwv_to_wwi(x) wwv_and( (x), wwv_bcast( (ulong)UINT_MAX ) )
     150             : #define wwv_to_wwu(x) wwv_and( (x), wwv_bcast( (ulong)UINT_MAX ) )
     151             : #define wwv_to_wwl(x) (x)
     152             : 
     153             : #define wwv_to_wwi_raw(x) (x)
     154             : #define wwv_to_wwu_raw(x) (x)
     155             : #define wwv_to_wwl_raw(x) (x)
     156             : 
     157             : /* Misc operations */
     158             : 
     159             : /* wwv_pack_halves(x,imm0,y,imm1) packs half of x and half of y into a
     160             :    wwv.  imm0/imm1 select which half of x and y to pack.  imm0 / imm1
     161             :    should be in [0,1].  That is, this returns:
     162             : 
     163             :      [ if( imm0, x(4:7), x(0:3) ) if( imm1, y(4:7), y(0:3) ) ]
     164             : 
     165             :    wwv_pack_h0_h1(x,y) does the wwv_pack_halves(x,0,y,1) case faster.
     166             :    Hat tip to Philip Taffet for pointing this out. */
     167             : 
     168             : #define wwv_pack_halves(x,imm0,y,imm1) _mm512_shuffle_i64x2( (x), (y), 68+10*(imm0)+160*(imm1) )
     169             : #define wwv_pack_h0_h1(x,y)            _mm512_mask_blend_epi64( (__mmask8)0xF0, (x), (y) )
     170             : 
     171             : /* wwv_madd52lo(a,b,c) returns LO64( a + LO52( LO52(b)*LO52(c) ) )
     172             :    wwv_madd52hi(a,b,c) returns LO64( a + HI52( LO52(b)*LO52(c) ) ) */
     173             : 
     174             : #define wwv_madd52lo(a,b,c) _mm512_madd52lo_epu64( (a), (b), (c) )
     175             : #define wwv_madd52hi(a,b,c) _mm512_madd52hi_epu64( (a), (b), (c) )
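
   These map to the AVX-512 IFMA instructions and are typically used to
   accumulate products of radix-2^52 limbs in wide integer arithmetic.  A
   sketch of one limb-product step (example_limb_mul_acc is hypothetical;
   b and c are assumed already reduced below 2^52):

     #include "fd_avx512.h"

     /* acc_lo += low 52 bits of b*c, acc_hi += remaining high bits of b*c */
     static inline void
     example_limb_mul_acc( wwv_t * acc_lo, wwv_t * acc_hi, wwv_t b, wwv_t c ) {
       *acc_lo = wwv_madd52lo( *acc_lo, b, c );
       *acc_hi = wwv_madd52hi( *acc_hi, b, c );
     }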
     176             : 
     177             : /* wwv_slide(x,y,imm) treats x as a FIFO with the oldest / newest
     178             :    element at lane 0 / 7.  Returns the result of dequeuing x imm times
     179             :    and enqueuing the values y0 ... y{imm-1} in that order.  imm should
     180             :    be in [0,7].  For example, the imm==5 case returns:
     181             :      [ x5 x6 x7 y0 y1 y2 y3 y4 ]. */
     182             : 
     183             : #define wwv_slide(x,y,imm) _mm512_alignr_epi64( (y), (x), (imm) )
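
   For example, advancing such a FIFO by one element while feeding in the
   next vector (a sketch; example_advance1 is hypothetical):

     #include "fd_avx512.h"

     /* Returns [ x1 x2 x3 x4 x5 x6 x7 y0 ]: drop the oldest element of x,
        append the first element of y */
     static inline wwv_t
     example_advance1( wwv_t x, wwv_t y ) {
       return wwv_slide( x, y, 1 );
     }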
     184             : 
     185             : /* wwv_unpack unpacks the wwv x into its ulong components x0,x1,...x7. */
     186             : 
     187   122932795 : #define wwv_unpack( x, x0,x1,x2,x3,x4,x5,x6,x7 ) do {                       \
     188   122932795 :     __m512i _wwv_unpack_x  = (x);                                           \
     189   122932795 :     __m256i _wwv_unpack_xl = _mm512_extracti64x4_epi64( _wwv_unpack_x, 0 ); \
     190   122932795 :     __m256i _wwv_unpack_xh = _mm512_extracti64x4_epi64( _wwv_unpack_x, 1 ); \
     191   122932795 :     (x0) = (ulong)_mm256_extract_epi64( _wwv_unpack_xl, 0 );                \
     192   122932795 :     (x1) = (ulong)_mm256_extract_epi64( _wwv_unpack_xl, 1 );                \
     193   122932795 :     (x2) = (ulong)_mm256_extract_epi64( _wwv_unpack_xl, 2 );                \
     194   122932795 :     (x3) = (ulong)_mm256_extract_epi64( _wwv_unpack_xl, 3 );                \
     195   122932795 :     (x4) = (ulong)_mm256_extract_epi64( _wwv_unpack_xh, 0 );                \
     196   122932795 :     (x5) = (ulong)_mm256_extract_epi64( _wwv_unpack_xh, 1 );                \
     197   122932795 :     (x6) = (ulong)_mm256_extract_epi64( _wwv_unpack_xh, 2 );                \
     198   122932795 :     (x7) = (ulong)_mm256_extract_epi64( _wwv_unpack_xh, 3 );                \
     199   122932795 :   } while(0)
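
   A common use of wwv_unpack is a horizontal reduction, e.g. summing all 8
   lanes (a sketch; example_hsum is hypothetical, and see the TODO above
   about dedicated REDUCE ops):

     #include "fd_avx512.h"

     static inline ulong
     example_hsum( wwv_t x ) {
       ulong x0,x1,x2,x3,x4,x5,x6,x7;
       wwv_unpack( x, x0,x1,x2,x3,x4,x5,x6,x7 );
       return x0+x1+x2+x3+x4+x5+x6+x7;
     }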
     200             : 
     201             : /* wwv_transpose_8x8 sets wwv_t's c0,c1,...c7 to the columns of an 8x8
     202             :    ulong matrix given the rows of the matrix in wwv_t's r0,r1,...r7.
     203             :    In-place operation fine. */
     204             : 
     205    11959914 : #define wwv_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 ) do {                \
     206    11959914 :     wwv_t _wwv_transpose_r0 = (r0); wwv_t _wwv_transpose_r1 = (r1);                               \
     207    11959914 :     wwv_t _wwv_transpose_r2 = (r2); wwv_t _wwv_transpose_r3 = (r3);                               \
     208    11959914 :     wwv_t _wwv_transpose_r4 = (r4); wwv_t _wwv_transpose_r5 = (r5);                               \
     209    11959914 :     wwv_t _wwv_transpose_r6 = (r6); wwv_t _wwv_transpose_r7 = (r7);                               \
     210    11959914 :                                                                                                   \
     211    11959914 :     /* Outer 4x4 transpose of 2x2 blocks */                                                       \
     212    11959914 :     wwv_t _wwv_transpose_t0 = _mm512_shuffle_i64x2( _wwv_transpose_r0, _wwv_transpose_r2, 0x88 ); \
     213    11959914 :     wwv_t _wwv_transpose_t1 = _mm512_shuffle_i64x2( _wwv_transpose_r1, _wwv_transpose_r3, 0x88 ); \
     214    11959914 :     wwv_t _wwv_transpose_t2 = _mm512_shuffle_i64x2( _wwv_transpose_r0, _wwv_transpose_r2, 0xdd ); \
     215    11959914 :     wwv_t _wwv_transpose_t3 = _mm512_shuffle_i64x2( _wwv_transpose_r1, _wwv_transpose_r3, 0xdd ); \
     216    11959914 :     wwv_t _wwv_transpose_t4 = _mm512_shuffle_i64x2( _wwv_transpose_r4, _wwv_transpose_r6, 0x88 ); \
     217    11959914 :     wwv_t _wwv_transpose_t5 = _mm512_shuffle_i64x2( _wwv_transpose_r5, _wwv_transpose_r7, 0x88 ); \
     218    11959914 :     wwv_t _wwv_transpose_t6 = _mm512_shuffle_i64x2( _wwv_transpose_r4, _wwv_transpose_r6, 0xdd ); \
     219    11959914 :     wwv_t _wwv_transpose_t7 = _mm512_shuffle_i64x2( _wwv_transpose_r5, _wwv_transpose_r7, 0xdd ); \
     220    11959914 :                                                                                                   \
     221    11959914 :     /**/  _wwv_transpose_r0 = _mm512_shuffle_i64x2( _wwv_transpose_t0, _wwv_transpose_t4, 0x88 ); \
     222    11959914 :     /**/  _wwv_transpose_r1 = _mm512_shuffle_i64x2( _wwv_transpose_t1, _wwv_transpose_t5, 0x88 ); \
     223    11959914 :     /**/  _wwv_transpose_r2 = _mm512_shuffle_i64x2( _wwv_transpose_t2, _wwv_transpose_t6, 0x88 ); \
     224    11959914 :     /**/  _wwv_transpose_r3 = _mm512_shuffle_i64x2( _wwv_transpose_t3, _wwv_transpose_t7, 0x88 ); \
     225    11959914 :     /**/  _wwv_transpose_r4 = _mm512_shuffle_i64x2( _wwv_transpose_t0, _wwv_transpose_t4, 0xdd ); \
     226    11959914 :     /**/  _wwv_transpose_r5 = _mm512_shuffle_i64x2( _wwv_transpose_t1, _wwv_transpose_t5, 0xdd ); \
     227    11959914 :     /**/  _wwv_transpose_r6 = _mm512_shuffle_i64x2( _wwv_transpose_t2, _wwv_transpose_t6, 0xdd ); \
     228    11959914 :     /**/  _wwv_transpose_r7 = _mm512_shuffle_i64x2( _wwv_transpose_t3, _wwv_transpose_t7, 0xdd ); \
     229    11959914 :                                                                                                   \
     230    11959914 :     /* Inner 2x2 transpose of 1x1 blocks */                                                       \
     231    11959914 :     /**/  (c0)              = _mm512_unpacklo_epi64( _wwv_transpose_r0, _wwv_transpose_r1 );      \
     232    11959914 :     /**/  (c1)              = _mm512_unpackhi_epi64( _wwv_transpose_r0, _wwv_transpose_r1 );      \
     233    11959914 :     /**/  (c2)              = _mm512_unpacklo_epi64( _wwv_transpose_r2, _wwv_transpose_r3 );      \
     234    11959914 :     /**/  (c3)              = _mm512_unpackhi_epi64( _wwv_transpose_r2, _wwv_transpose_r3 );      \
     235    11959914 :     /**/  (c4)              = _mm512_unpacklo_epi64( _wwv_transpose_r4, _wwv_transpose_r5 );      \
     236    11959914 :     /**/  (c5)              = _mm512_unpackhi_epi64( _wwv_transpose_r4, _wwv_transpose_r5 );      \
     237    11959914 :     /**/  (c6)              = _mm512_unpacklo_epi64( _wwv_transpose_r6, _wwv_transpose_r7 );      \
     238    11959914 :     /**/  (c7)              = _mm512_unpackhi_epi64( _wwv_transpose_r6, _wwv_transpose_r7 );      \
     239    11959914 :   } while(0)
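
   For example, transposing an 8x8 ulong block stored row major in memory
   (a sketch; example_transpose8x8 is hypothetical and mat is assumed
   64-byte aligned):

     #include "fd_avx512.h"

     static inline void
     example_transpose8x8( ulong * mat ) { /* 8x8 ulongs, row major */
       wwv_t r0 = wwv_ld( mat      ); wwv_t r1 = wwv_ld( mat +  8 );
       wwv_t r2 = wwv_ld( mat + 16 ); wwv_t r3 = wwv_ld( mat + 24 );
       wwv_t r4 = wwv_ld( mat + 32 ); wwv_t r5 = wwv_ld( mat + 40 );
       wwv_t r6 = wwv_ld( mat + 48 ); wwv_t r7 = wwv_ld( mat + 56 );
       /* In-place use is fine per the note above */
       wwv_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, r0,r1,r2,r3,r4,r5,r6,r7 );
       wwv_st( mat,      r0 ); wwv_st( mat +  8, r1 );
       wwv_st( mat + 16, r2 ); wwv_st( mat + 24, r3 );
       wwv_st( mat + 32, r4 ); wwv_st( mat + 40, r5 );
       wwv_st( mat + 48, r6 ); wwv_st( mat + 56, r7 );
     }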

Generated by: LCOV version 1.14