#ifndef HEADER_fd_src_util_simd_fd_avx_h
#error "Do not include this directly; use fd_avx.h"
#endif

/* Vector double API **************************************************/

/* A wd_t is a vector where each adjacent pair of 32-bit wide lanes
   (e.g. 0-1 / 2-3 / 4-5 / 6-7) holds a double precision IEEE 754
   floating point value (a "double").

   Inputs to all operations assume that the values aren't exotic (no
   NaNs, no +/-Infs, no denorms) and, if the output of an operation
   would produce an exotic value in the IEEE 754 standard, the results
   of that operation are undefined.  Additionally, correct handling of
   signed zero is not guaranteed.  Lastly, these will not raise floating
   point exceptions or set math errno's.

   Basically, handling of exotics and signed zero will generally be
   reasonable but most of that relies on the underlying compiler and
   hardware having conformant behavior and this is flaky at the best of
   times.  So it is best for developers not to assume conformant
   behavior.

   Note that, when doing conditional operations on a vector double, the
   vector conditional should have identical values in adjacent pairs of
   lanes.  Results are undefined otherwise.

   These mirror the other APIs as much as possible.  Macros are
   preferred over static inlines when it is possible to do it robustly
   to reduce the risk of the compiler mucking it up. */

#define wd_t __m256d

/* Constructors */

/* Given the double values, return ... */

#define wd(d0,d1,d2,d3) _mm256_setr_pd( (d0), (d1), (d2), (d3) ) /* [ d0 d1 d2 d3 ] */

#define wd_bcast(d0) _mm256_set1_pd( (d0) ) /* [ d0 d0 d0 d0 ] */

static inline wd_t /* [ d0 d1 d0 d1 ] */
wd_bcast_pair( double d0, double d1 ) {
  return _mm256_setr_pd( d0, d1, d0, d1 );
}

static inline wd_t /* [ d0 d0 d1 d1 ] */
wd_bcast_wide( double d0, double d1 ) {
  return _mm256_setr_pd( d0, d0, d1, d1 );
}

/* wd_permute returns [ d(imm_i0) d(imm_i1) d(imm_i2) d(imm_i3) ].
   imm_i* should be compile time constants in 0:3. */

#define wd_permute(x,imm_i0,imm_i1,imm_i2,imm_i3) _mm256_permute4x64_pd( (x), (imm_i0)+4*(imm_i1)+16*(imm_i2)+64*(imm_i3) )
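
/* For example (an illustrative sketch, assuming fd_avx.h has been
   included on an AVX2 target):

     wd_t a = wd( 1., 2., 3., 4. );     // [ 1. 2. 3. 4. ]
     wd_t b = wd_bcast( 5. );           // [ 5. 5. 5. 5. ]
     wd_t c = wd_bcast_pair( 6., 7. );  // [ 6. 7. 6. 7. ]
     wd_t d = wd_permute( a, 3,2,1,0 ); // [ 4. 3. 2. 1. ] */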

/* Predefined constants */

#define wd_zero() _mm256_setzero_pd() /* Return [ 0. 0. 0. 0. ] */
#define wd_one()  _mm256_set1_pd(1.)  /* Return [ 1. 1. 1. 1. ] */

/* Memory operations */

/* wd_ld returns the 4 doubles at the 32-byte aligned / 32-byte sized
   location p as a vector double.  wd_ldu is the same but p does not
   have to be aligned.  wd_st writes the vector double to the 32-byte
   aligned / 32-byte sized location p as 4 doubles.  wd_stu is the same
   but p does not have to be aligned.  In all of these, 64-bit lane l
   will be at p[l].  FIXME: USE ATTRIBUTES ON P PASSED TO THESE? */

#define wd_ld(p)    _mm256_load_pd( (p) )
#define wd_ldu(p)   _mm256_loadu_pd( (p) )
#define wd_st(p,d)  _mm256_store_pd( (p), (d) )
#define wd_stu(p,d) _mm256_storeu_pd( (p), (d) )
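
/* For example (an illustrative sketch; buf is a hypothetical scratch
   array, aligned via the same W_ATTR attribute used below):

     double buf[4] W_ATTR;
     wd_st( buf, wd( 1., 2., 3., 4. ) ); // buf[l] holds 64-bit lane l
     wd_t x = wd_ld ( buf );             // [ 1. 2. 3. 4. ]
     wd_t y = wd_ldu( buf );             // same, no alignment required */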

/* wd_ldif is an optimized equivalent to wd_notczero(c,wd_ldu(p)) (may
   have different behavior if c is not a proper vector conditional).  It
   is provided for symmetry with the wd_stif operation.  wd_stif stores
   x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
   Undefined behavior if c(n) is not a proper paired lane vector
   conditional. */

#define wd_ldif(c,p)   _mm256_maskload_pd(  (p), (c) )
#define wd_stif(c,p,x) _mm256_maskstore_pd( (p), (c), (x) )
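
/* For example (an illustrative sketch; x and p are hypothetical), this
   stores only the non-negative lanes of x, leaving the other elements
   of p untouched:

     wc_t c = wd_ge( x, wd_zero() ); // paired lane conditional
     wd_stif( c, p, x );             // p[n] = x(n) where x(n)>=0. */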

/* Element operations */

/* wd_extract extracts the double in 64-bit lane imm (e.g. indexed
   0,1,2,3, corresponding to 32-bit lane pairs 0-1, 2-3, 4-5, 6-7
   respectively) from the vector double as a double.  wd_insert returns
   the vector double formed by replacing the value in 64-bit lane imm of
   a with the provided double.  imm should be a compile time constant in
   0:3.  wd_extract_variable and wd_insert_variable are the same but
   slower and the 64-bit lane n does not have to be known at compile
   time (should still be in 0:3). */

static inline double
wd_extract( wd_t a, int imm ) { /* FIXME: USE EPI64 HACKS? */
  double d[4] W_ATTR;
  _mm256_store_pd( d, a );
  return d[imm];
}

#if FD_USING_CLANG || !FD_HAS_OPTIMIZATION

/* Sigh ... clang is sad and can't handle passing compile time const
   expressions through a static inline */
#define wd_insert( a, imm, v ) (__extension__({                                                      \
    union { double v; long i; } _wd_insert_t;                                                        \
    _wd_insert_t.v = v;                                                                              \
    _mm256_castsi256_pd( _mm256_insert_epi64( _mm256_castpd_si256( (a) ), _wd_insert_t.i, (imm) ) ); \
  }))

#else

static inline wd_t
wd_insert( wd_t a, int imm, double v ) {
  union { double v; long i; } t;
  t.v = v;
  return _mm256_castsi256_pd( _mm256_insert_epi64( _mm256_castpd_si256( a ), t.i, imm ) );
}

#endif

static inline double
wd_extract_variable( wd_t a, int n ) {
  double d[4] W_ATTR;
  _mm256_store_pd( d, a );
  return d[n];
}

static inline wd_t
wd_insert_variable( wd_t a, int n, double v ) {
  double d[4] W_ATTR;
  _mm256_store_pd( d, a );
  d[n] = v;
  return _mm256_load_pd( d );
}
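
/* For example (an illustrative sketch; n is a run time lane index):

     wd_t   a = wd( 1., 2., 3., 4. );
     double e = wd_extract( a, 2 );             // 3. (imm is compile time)
     wd_t   b = wd_insert( a, 0, 9. );          // [ 9. 2. 3. 4. ]
     wd_t   c = wd_insert_variable( b, n, 0. ); // lane n need not be compile time */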

/* Arithmetic operations */

/* wd_neg(a)        returns [       -a0        -a1  ...       -a3  ] (i.e.       -a )
   wd_sign(a)       returns [   sign(a0)   sign(a1) ...   sign(a3) ]
   wd_abs(a)        returns [   fabs(a0)   fabs(a1) ...   fabs(a3) ] (i.e.    abs(a))
   wd_negabs(a)     returns [  -fabs(a0)  -fabs(a1) ...  -fabs(a3) ] (i.e.   -abs(a))
   wd_ceil(a)       returns [   ceil(a0)   ceil(a1) ...   ceil(a3) ] (i.e.   ceil(a))
   wd_floor(a)      returns [  floor(a0)  floor(a1) ...  floor(a3) ] (i.e.  floor(a))
   wd_rint(a)       returns [   rint(a0)   rint(a1) ...   rint(a3) ] (i.e. roundb(a))
   wd_trunc(a)      returns [  trunc(a0)  trunc(a1) ...  trunc(a3) ] (i.e.    fix(a))
   wd_sqrt(a)       returns [   sqrt(a0)   sqrt(a1) ...   sqrt(a3) ] (i.e.   sqrt(a))
   wd_rcp_fast(a)   returns [   ~rcp(a0)   ~rcp(a1) ...   ~rcp(a3) ]
   wd_rsqrt_fast(a) returns [ ~rsqrt(a0) ~rsqrt(a1) ... ~rsqrt(a3) ]

   wd_add(     a,b) returns [          a0+b0           a1+b1  ...          a3+b3  ] (i.e. a +b)
   wd_sub(     a,b) returns [          a0-b0           a1-b1  ...          a3-b3  ] (i.e. a -b)
   wd_mul(     a,b) returns [          a0*b0           a1*b1  ...          a3*b3  ] (i.e. a.*b)
   wd_div(     a,b) returns [          a0/b0           a1/b1  ...          a3/b3  ] (i.e. a./b)
   wd_min(     a,b) returns [     fmin(a0,b0)     fmin(a1,b1) ...     fmin(a3,b3) ] (i.e. min([a;b]) (a and b are 1x4)
   wd_max(     a,b) returns [     fmax(a0,b0)     fmax(a1,b1) ...     fmax(a3,b3) ] (i.e. max([a;b]) (a and b are 1x4)
   wd_copysign(a,b) returns [ copysign(a0,b0) copysign(a1,b1) ... copysign(a3,b3) ]
   wd_flipsign(a,b) returns [ flipsign(a0,b0) flipsign(a1,b1) ... flipsign(a3,b3) ]

   wd_fma( a,b,c)   returns [  fma(a0,b0, c0)  fma(a1,b1, c1) ...  fma(a3,b3, c3) ] (i.e.  a.*b+c)
   wd_fms( a,b,c)   returns [  fma(a0,b0,-c0)  fma(a1,b1,-c1) ...  fma(a3,b3,-c3) ] (i.e.  a.*b-c)
   wd_fnma(a,b,c)   returns [ -fma(a0,b0,-c0) -fma(a1,b1,-c1) ... -fma(a3,b3,-c3) ] (i.e. -a.*b+c)

   where sign(a) is -1. if a's sign bit is set and +1. otherwise, rcp(a)
   is 1./a, rsqrt(a) is 1./sqrt(a), and flipsign(a,b) returns -a if b's
   sign bit is set and a otherwise.

   rint is in round-to-nearest-even rounding mode (note rint and
   nearbyint are identical once floating point exceptions are ignored).

   sqrt should typically be full accuracy.

   rcp_fast and rsqrt_fast should typically be ~12 or more bits accurate
   (~3 or more decimal digits) such that (nearly) full accuracy can be
   achieved with two to three rounds of Newton-Raphson polishing.  Bit
   level replicable code should avoid rcp_fast and rsqrt_fast though as
   the approximations used can vary between various generations /
   steppings / microcode updates of x86 processors (including Intel and
   AMD). */

#define wd_neg(a)        _mm256_xor_pd(    _mm256_set1_pd( -0. ), (a) )
#define wd_sign(a)       _mm256_xor_pd(    _mm256_set1_pd(  1. ), _mm256_and_pd( _mm256_set1_pd( -0. ), (a) ) )
#define wd_abs(a)        _mm256_andnot_pd( _mm256_set1_pd( -0. ), (a) )
#define wd_negabs(a)     _mm256_or_pd(     _mm256_set1_pd( -0. ), (a) )
#define wd_ceil(a)       _mm256_ceil_pd(  (a) )
#define wd_floor(a)      _mm256_floor_pd( (a) )
#define wd_rint(a)       _mm256_round_pd( (a), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC )
#define wd_trunc(a)      _mm256_round_pd( (a), _MM_FROUND_TO_ZERO        | _MM_FROUND_NO_EXC )
#define wd_sqrt(a)       _mm256_sqrt_pd(  (a) )
#define wd_rcp_fast(a)   _mm256_cvtps_pd( _mm_rcp_ps(   _mm256_cvtpd_ps( (a) ) ) )
#define wd_rsqrt_fast(a) _mm256_cvtps_pd( _mm_rsqrt_ps( _mm256_cvtpd_ps( (a) ) ) )

#define wd_add(a,b)      _mm256_add_pd( (a), (b) )
#define wd_sub(a,b)      _mm256_sub_pd( (a), (b) )
#define wd_mul(a,b)      _mm256_mul_pd( (a), (b) )
#define wd_div(a,b)      _mm256_div_pd( (a), (b) )
#define wd_min(a,b)      _mm256_min_pd( (a), (b) )
#define wd_max(a,b)      _mm256_max_pd( (a), (b) )
#define wd_copysign(a,b) _mm256_or_pd( _mm256_andnot_pd( _mm256_set1_pd( -0. ), (a) ), \
                                       _mm256_and_pd(    _mm256_set1_pd( -0. ), (b) ) )
#define wd_flipsign(a,b) _mm256_xor_pd( (a), _mm256_and_pd( _mm256_set1_pd( -0. ), (b) ) )

#define wd_fma(a,b,c)  _mm256_fmadd_pd(  (a), (b), (c) )
#define wd_fms(a,b,c)  _mm256_fmsub_pd(  (a), (b), (c) )
#define wd_fnma(a,b,c) _mm256_fnmadd_pd( (a), (b), (c) )
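
/* For example, wd_rcp_fast can be polished toward full precision with
   Newton-Raphson iterations (an illustrative sketch, not part of the
   API; each step r' = r*(2-a*r) roughly doubles the number of correct
   bits):

     wd_t r = wd_rcp_fast( a );                        // ~12 bit estimate of 1./a
     r = wd_mul( r, wd_fnma( a, r, wd_bcast( 2. ) ) ); // ~24 bits
     r = wd_mul( r, wd_fnma( a, r, wd_bcast( 2. ) ) ); // ~48 bits
     r = wd_mul( r, wd_fnma( a, r, wd_bcast( 2. ) ) ); // ~full precision */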

/* Binary operations */

/* Note: binary operations are not well defined on vector doubles.
   If doing tricks with floating point binary representations, the user
   should use wd_to_wl_raw as necessary. */

/* Logical operations */

/* These all return proper paired lane vector conditionals */

#define wd_lnot(a)    /* [  !a0  !a0  !a1  !a1 ...  !a3  !a3 ] */ \
  _mm256_castpd_si256( _mm256_cmp_pd( (a), _mm256_setzero_pd(), _CMP_EQ_OQ  ) )
#define wd_lnotnot(a) /* [ !!a0 !!a0 !!a1 !!a1 ... !!a3 !!a3 ] */ \
  _mm256_castpd_si256( _mm256_cmp_pd( (a), _mm256_setzero_pd(), _CMP_NEQ_OQ ) )
#define wd_signbit(a) /* [ signbit(a0) signbit(a0) signbit(a1) signbit(a1) ... signbit(a3) signbit(a3) ] */ \
  _mm256_castps_si256( _mm256_permute_ps( _mm256_castsi256_ps( _mm256_srai_epi32( _mm256_castpd_si256( (a) ), 31 ) ), \
                                          _MM_SHUFFLE(3,3,1,1) ) )

#define wd_eq(a,b) _mm256_castpd_si256( _mm256_cmp_pd( (a), (b), _CMP_EQ_OQ  ) ) /* [ a0==b0 a0==b0 a1==b1 a1==b1 ... a3==b3 a3==b3 ] */
#define wd_gt(a,b) _mm256_castpd_si256( _mm256_cmp_pd( (a), (b), _CMP_GT_OQ  ) ) /* [ a0> b0 a0> b0 a1> b1 a1> b1 ... a3> b3 a3> b3 ] */
#define wd_lt(a,b) _mm256_castpd_si256( _mm256_cmp_pd( (a), (b), _CMP_LT_OQ  ) ) /* [ a0< b0 a0< b0 a1< b1 a1< b1 ... a3< b3 a3< b3 ] */
#define wd_ne(a,b) _mm256_castpd_si256( _mm256_cmp_pd( (a), (b), _CMP_NEQ_OQ ) ) /* [ a0!=b0 a0!=b0 a1!=b1 a1!=b1 ... a3!=b3 a3!=b3 ] */
#define wd_ge(a,b) _mm256_castpd_si256( _mm256_cmp_pd( (a), (b), _CMP_GE_OQ  ) ) /* [ a0>=b0 a0>=b0 a1>=b1 a1>=b1 ... a3>=b3 a3>=b3 ] */
#define wd_le(a,b) _mm256_castpd_si256( _mm256_cmp_pd( (a), (b), _CMP_LE_OQ  ) ) /* [ a0<=b0 a0<=b0 a1<=b1 a1<=b1 ... a3<=b3 a3<=b3 ] */

/* Conditional operations */

/* c should be a proper paired lane vector conditional for these */

#define wd_czero(c,a)    _mm256_andnot_pd( _mm256_castsi256_pd( (c) ), (a) )  /* [ c01?0.:a0 c23?0.:a1 c45?0.:a2 c67?0.:a3 ] */
#define wd_notczero(c,a) _mm256_and_pd(    _mm256_castsi256_pd( (c) ), (a) )  /* [ c01?a0:0. c23?a1:0. c45?a2:0. c67?a3:0. ] */

#define wd_if(c,t,f) _mm256_blendv_pd( (f), (t), _mm256_castsi256_pd( (c) ) ) /* [ c01?t0:f0 c23?t1:f1 c45?t2:f2 c67?t3:f3 ] */
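
/* For example (an illustrative sketch), clamping each lane of x to
   [lo,hi] without branches (wd_min / wd_max would also work here; this
   just demonstrates wd_if):

     wd_t y = wd_if( wd_lt( x, lo ), lo,
              wd_if( wd_gt( x, hi ), hi, x ) ); */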

/* Conversion operations */

/* Summarizing:

   wd_to_wc(d)          returns [ !!d0 !!d0 !!d1 !!d1 ... !!d3 !!d3 ] ... proper paired lane

   wd_to_wf(d,f,0)      returns [ (float)d0 (float)d1 (float)d2 (float)d3 f4 f5 f6 f7 ]
   wd_to_wf(d,f,1)      returns [ f0 f1 f2 f3 (float)d0 (float)d1 (float)d2 (float)d3 ]

   wd_to_wi(d,i,0)      returns [ (int)d0 (int)d1 (int)d2 (int)d3 i4 i5 i6 i7 ]
   wd_to_wi(d,i,1)      returns [ i0 i1 i2 i3 (int)d0 (int)d1 (int)d2 (int)d3 ]

   wd_to_wi_fast(d,i,0) returns [ (int)rint(d0) (int)rint(d1) (int)rint(d2) (int)rint(d3) i4 i5 i6 i7 ]
   wd_to_wi_fast(d,i,1) returns [ i0 i1 i2 i3 (int)rint(d0) (int)rint(d1) (int)rint(d2) (int)rint(d3) ]

   wd_to_wu(d,u,0)      returns [ (uint)d0 (uint)d1 (uint)d2 (uint)d3 u4 u5 u6 u7 ]
   wd_to_wu(d,u,1)      returns [ u0 u1 u2 u3 (uint)d0 (uint)d1 (uint)d2 (uint)d3 ]

   wd_to_wu_fast(d,u,0) returns [ (uint)rint(d0) (uint)rint(d1) (uint)rint(d2) (uint)rint(d3) u4 u5 u6 u7 ]
   wd_to_wu_fast(d,u,1) returns [ u0 u1 u2 u3 (uint)rint(d0) (uint)rint(d1) (uint)rint(d2) (uint)rint(d3) ]

   wd_to_wl(d)          returns [ (long)d0 (long)d1 (long)d2 (long)d3 ]

   wd_to_wv(d)          returns [ (ulong)d0 (ulong)d1 (ulong)d2 (ulong)d3 ]

   where rint is configured for round-to-nearest-even rounding (the
   Intel architecture defaults to round-nearest-even here ... sigh, they
   still don't fully get it) and imm_hi should be a compile time
   constant.  That is, the fast variants assume that the floating point
   inputs are already integral values in the appropriate range for the
   output type.

   Note that wd_to_{wf,wi,wu,wi_fast,wu_fast} insert the converted
   values into lanes 0:3 (imm_hi==0) or 4:7 (imm_hi!=0) of the provided
   vector.

   The raw variants return just the raw bits as the corresponding vector
   type.  wd_to_wl_raw allows doing advanced bit tricks on a vector
   double.  The others are probably dubious but are provided for
   completeness. */

#define wd_to_wc(d)          _mm256_castpd_si256( _mm256_cmp_pd( (d), _mm256_setzero_pd(), _CMP_NEQ_OQ ) )

#define wd_to_wf(d,f,imm_hi) _mm256_insertf128_ps( (f), _mm256_cvtpd_ps( (d) ), !!(imm_hi) )

#define wd_to_wi(d,i,imm_hi) wd_to_wi_fast( _mm256_round_pd( (d), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ), (i), (imm_hi) )
#define wd_to_wu(d,u,imm_hi) wd_to_wu_fast( _mm256_round_pd( (d), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ), (u), (imm_hi) )

/* FIXME: IS IT FASTER TO USE INSERT / EXTRACT FOR THESE? */

static inline __m256i wd_to_wl( wd_t d ) { /* FIXME: workaround wl_t isn't declared at this point */
  union { double d[4]; __m256d v[1]; } t[1];
  union { long   l[4]; __m256i v[1]; } u[1];
  _mm256_store_pd( t->d, d );
  u->l[0] = (long)t->d[0];
  u->l[1] = (long)t->d[1];
  u->l[2] = (long)t->d[2];
  u->l[3] = (long)t->d[3];
  return _mm256_load_si256( u->v );
}

static inline __m256i wd_to_wv( wd_t d ) { /* FIXME: workaround wv_t isn't declared at this point */
  union { double d[4]; __m256d v[1]; } t[1];
  union { ulong  u[4]; __m256i v[1]; } u[1];
  _mm256_store_pd( t->d, d );
  u->u[0] = (ulong)t->d[0];
  u->u[1] = (ulong)t->d[1];
  u->u[2] = (ulong)t->d[2];
  u->u[3] = (ulong)t->d[3];
  return _mm256_load_si256( u->v );
}

#define wd_to_wi_fast(d,i,imm_hi) _mm256_insertf128_si256( (i), _mm256_cvtpd_epi32( (d) ), !!(imm_hi) )

static inline wu_t wd_to_wu_fast( wd_t d, wu_t u, int imm_hi ) {

/* Note: Given that _mm256_cvtpd_epi32 existed for a long time, Intel
   clearly had the hardware under the hood for _mm256_cvtpd_epu32 but
   didn't bother to expose it pre-Skylake-X ... sigh (all too typical
   unfortunately).  We use _mm256_cvtpd_epu32 where supported because
   it is faster and it replicates the same IB behaviors as the compiler
   generated scalar ASM for float to uint casts on these targets.

   Pre-Skylake-X, we emulate it by noting that subtracting 2^31
   from a double holding an integer in [0,2^32) is exact and the
   result can be exactly converted to a signed integer by
   _mm256_cvtpd_epi32.  We then use twos complement hacks to add back
   any shift.  This also replicates the compiler's IB behaviors on
   these ISAs for float to int casts. */

# if defined(__AVX512F__) && defined(__AVX512VL__)
  __m128i v = _mm256_cvtpd_epu32( d );
# else
  /**/                                                                                     // Assumes d is integer in [0,2^32)
  wd_t    s  = wd_bcast( (double)(1UL<<31) );                                              // (double)2^31
  wc_t    c  = wd_lt ( d, s );                                                             // -1L if d<2^31, 0L o.w.
  wd_t    ds = wd_sub( d, s );                                                             // (double)(d-2^31)
  __m128  b  = _mm_shuffle_ps( _mm_castsi128_ps( _mm256_extractf128_si256( c, 0 ) ),
                               _mm_castsi128_ps( _mm256_extractf128_si256( c, 1 ) ),
                               _MM_SHUFFLE(2,0,2,0) );                                     // -1 if d<2^31, 0 if o.w.
  __m128i v0 = _mm256_cvtpd_epi32( wd_if( c, d, ds ) );                                    // (uint)(d      if d<2^31, d-2^31 o.w.)
  __m128i v1 = _mm_add_epi32( v0, _mm_set1_epi32( (int)(1U<<31) ) );                       // (uint)(d+2^31 if d<2^31, d      o.w.)
  __m128i v  = _mm_castps_si128( _mm_blendv_ps( _mm_castsi128_ps( v1 ),
                                                _mm_castsi128_ps( v0 ), b ) );             // (uint)d
# endif
  return imm_hi ? _mm256_insertf128_si256( u, v, 1 ) : _mm256_insertf128_si256( u, v, 0 ); // compile time

}
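
/* For example (an illustrative sketch), wd_to_wl truncates toward zero
   per the usual C cast while the fast variants assume already integral
   inputs:

     __m256i l = wd_to_wl( wd( 1.7, -2.7, 3., -4. ) ); // [ 1L -2L 3L -4L ] */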

#define wd_to_wc_raw(a) _mm256_castpd_si256( (a) )
#define wd_to_wf_raw(a) _mm256_castpd_ps(    (a) )
#define wd_to_wi_raw(a) _mm256_castpd_si256( (a) )
#define wd_to_wu_raw(a) _mm256_castpd_si256( (a) )
#define wd_to_wl_raw(a) _mm256_castpd_si256( (a) )
#define wd_to_wv_raw(a) _mm256_castpd_si256( (a) )
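
/* For example (an illustrative sketch, assuming no exotics), the raw
   casts support bit tricks like extracting each lane's biased exponent
   field:

     __m256i b = wd_to_wl_raw( d );                           // raw IEEE 754 bits
     __m256i e = _mm256_srli_epi64( b, 52 );                  // drop the mantissa
     e = _mm256_and_si256( e, _mm256_set1_epi64x( 0x7ffL ) ); // drop the sign */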

/* Reduction operations */

static inline wd_t
wd_sum_all( wd_t x ) { /* Returns wd_bcast( sum( x ) ) */
  x = _mm256_add_pd( x, _mm256_permute2f128_pd( x, x, 1 ) ); /* x02   x13   ... */
  return _mm256_hadd_pd( x, x );                             /* xsum  ... */
}

static inline wd_t
wd_min_all( wd_t a ) { /* Returns wd_bcast( min( a ) ) */
  a = _mm256_min_pd( a, _mm256_permute2f128_pd( a, a, 1 ) );
  return _mm256_min_pd( a, _mm256_permute_pd( a, 5 ) );
}

static inline wd_t
wd_max_all( wd_t a ) { /* Returns wd_bcast( max( a ) ) */
  a = _mm256_max_pd( a, _mm256_permute2f128_pd( a, a, 1 ) );
  return _mm256_max_pd( a, _mm256_permute_pd( a, 5 ) );
}
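
/* For example (an illustrative sketch), a horizontal reduction of a
   4-wide dot product:

     wd_t   acc = wd_mul( x, y );                     // [ x0*y0 x1*y1 x2*y2 x3*y3 ]
     double dot = wd_extract( wd_sum_all( acc ), 0 ); // x0*y0 + ... + x3*y3 */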

/* Misc operations */

/* wd_gather(b,i,imm_hi) returns
     [ b[i(0)] b[i(1)] b[i(2)] b[i(3)] ] if imm_hi is 0 and
     [ b[i(4)] b[i(5)] b[i(6)] b[i(7)] ] o.w.
   where b is a "double const*", i is wi_t and imm_hi is a compile time
   constant. */

#define wd_gather(b,i,imm_hi) _mm256_i32gather_pd( (b), _mm256_extractf128_si256( (i), !!(imm_hi) ), 8 )
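
/* For example (an illustrative sketch; tbl is a hypothetical table of
   doubles and the 8 indices are packed in a wi_t):

     wi_t idx = wi( 3,1,4,1, 5,9,2,6 );
     wd_t lo  = wd_gather( tbl, idx, 0 ); // [ tbl[3] tbl[1] tbl[4] tbl[1] ]
     wd_t hi  = wd_gather( tbl, idx, 1 ); // [ tbl[5] tbl[9] tbl[2] tbl[6] ] */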

/* wd_transpose_4x4 transposes the 4x4 matrix stored in wd_t r0,r1,r2,r3
   and stores the result in 4x4 matrix wd_t c0,c1,c2,c3.  All
   c0,c1,c2,c3 should be different for a well defined result.
   Otherwise, in-place operation and/or using the same wd_t to specify
   multiple rows of r is fine. */

#define wd_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do {                                                                      \
    wd_t _wd_transpose_r0 = (r0); wd_t _wd_transpose_r1 = (r1); wd_t _wd_transpose_r2 = (r2); wd_t _wd_transpose_r3 = (r3);    \
    wd_t _wd_transpose_t;                                                                                                      \
    /* Transpose 2x2 blocks */                                                                                                 \
    _wd_transpose_t = _wd_transpose_r0; _wd_transpose_r0 = _mm256_permute2f128_pd( _wd_transpose_t,  _wd_transpose_r2, 0x20 ); \
    /**/                                _wd_transpose_r2 = _mm256_permute2f128_pd( _wd_transpose_t,  _wd_transpose_r2, 0x31 ); \
    _wd_transpose_t = _wd_transpose_r1; _wd_transpose_r1 = _mm256_permute2f128_pd( _wd_transpose_t,  _wd_transpose_r3, 0x20 ); \
    /**/                                _wd_transpose_r3 = _mm256_permute2f128_pd( _wd_transpose_t,  _wd_transpose_r3, 0x31 ); \
    /* Transpose 1x1 blocks */                                                                                                 \
    /**/                                (c0)             = _mm256_unpacklo_pd(     _wd_transpose_r0, _wd_transpose_r1 );       \
    /**/                                (c1)             = _mm256_unpackhi_pd(     _wd_transpose_r0, _wd_transpose_r1 );       \
    /**/                                (c2)             = _mm256_unpacklo_pd(     _wd_transpose_r2, _wd_transpose_r3 );       \
    /**/                                (c3)             = _mm256_unpackhi_pd(     _wd_transpose_r2, _wd_transpose_r3 );       \
  } while(0)
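
/* For example (an illustrative sketch):

     wd_t r0 = wd(  1.,  2.,  3.,  4. );
     wd_t r1 = wd(  5.,  6.,  7.,  8. );
     wd_t r2 = wd(  9., 10., 11., 12. );
     wd_t r3 = wd( 13., 14., 15., 16. );
     wd_t c0; wd_t c1; wd_t c2; wd_t c3;
     wd_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 );
     // c0 = [ 1. 5. 9. 13. ], c1 = [ 2. 6. 10. 14. ], etc. */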
