LCOV - code coverage report
Current view: top level - util/simd - fd_sse_vd.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 91 91 100.0 %
Date: 2025-01-08 12:08:44 Functions: 12 84 14.3 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_sse_h
       2             : #error "Do not include this directly; use fd_sse.h"
       3             : #endif
       4             : 
       5             : /* Vector double API **************************************************/
       6             : 
       7             : /* A vd_t is a vector where each adjacent pair of 32-bit wide lanes
       8             :    (e.g. 0-1 / 2-3) hold a double precision IEEE 754 floating point
       9             :    value (a "double").
      10             : 
      11             :    Inputs to all operations assume that the values aren't exotic (no
      12             :    NaNs, no +/-Infs, no denorms) and, if the output of an operation
      13             :    would produce an exotic value in the IEEE 754 standard, the results
      14             :    of that operation are undefined.  Additionally, correct handling of
      15             :    signed zero is not guaranteed.  Lastly, these will not raise floating
      16             :    point exceptions or set math errno's.
      17             : 
      18             :    Basically, handling of exotics and signed zero will generally be
      19             :    reasonable but most of that relies on the underlying compiler and
      20             :    hardware having conformant behavior and this is flaky at the best of
      21             :    times.  So it is best for developers not to assume conformant
      22             :    behavior.
      23             : 
      24             :    Note that, when doing conditional operations on a vector double, the
      25             :    vector conditional should have identical values in adjacent pairs of
      26             :    lanes.  Results are undefined otherwise.
      27             : 
      28             :    These mirror the other APIs as much as possible.  Macros are
      29             :    preferred over static inlines when it is possible to do it robustly
      30             :    to reduce the risk of the compiler mucking it up. */
      31             : 
      32    19922995 : #define vd_t __m128d
      33             : 
      34             : /* Constructors */
      35             : 
      36             : /* Given the double values, return ... */
      37             : 
      38     1179648 : #define vd(d0,d1) _mm_setr_pd( (d0), (d1) ) /* [ d0 d1 ] */
      39             : 
      40      524288 : #define vd_bcast(d0) _mm_set1_pd( (d0) ) /* [ d0 d0 ] */
      41             : 
      42             : /* vd_permute returns [ d(imm_i0) d(imm_i1) ].  imm_i* should be compile
      43             :    time constants in 0:1. */
      44             : 
      45             : #define vd_permute( d, imm_i0, imm_i1 ) _mm_permute_pd( (d), (imm_i0) + 2*(imm_i1) )
      46             : 
      47             : /* Predefined constants */
      48             : 
      49    17104947 : #define vd_zero() _mm_setzero_pd() /* Return [ 0. 0. ] */
      50    17104947 : #define vd_one()  _mm_set1_pd(1.)  /* Return [ 1. 1. ] */
      51             : 
      52             : /* Memory operations */
      53             : 
      54             : /* vd_ld return the 2 doubles at the 16-byte aligned / 16-byte sized
      55             :    location p as a vector double.  vd_ldu is the same but p does not have
      56             :    to be aligned.  vd_st writes the vector double to the 16-byte aligned
      57             :    / 16-byte sized location p as 2 doubles.  vd_stu is the same but p
      58             :    does not have to be aligned.  In all these 64-bit lane l will be at
      59             :    p[l].  FIXME: USE ATTRIBUTES ON P PASSED TO THESE? */
      60             : 
      61    17104947 : #define vd_ld(p)    _mm_load_pd( (p) )
      62    34209894 : #define vd_ldu(p)   _mm_loadu_pd( (p) )
      63    17104947 : #define vd_st(p,d)  _mm_store_pd( (p), (d) )
      64    34209894 : #define vd_stu(p,d) _mm_storeu_pd( (p), (d) )
      65             : 
      66             : /* vd_ldif is an optimized equivalent to vd_notczero(c,vd_ldu(p)) (may
      67             :    have different behavior if c is not a proper vector conditional).  It
      68             :    is provided for symmetry with the vd_stif operation.  vd_stif stores
      69             :    x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
      70             :    Undefined behavior if c(n) is not a proper paired lane vector
      71             :    conditional. */
      72             : 
      73             : #define vd_ldif(c,p)   _mm_maskload_pd( (p),(c))
      74             : #define vd_stif(c,p,x) _mm_maskstore_pd((p),(c),(x))
      75             : 
      76             : /* Element operations */
      77             : 
      78             : /* vd_extract extracts the double in 64-bit lane imm (e.g. indexed
      79             :    0 and 1 corresponding to 32-bit pairs of lanes 0-1 and 2-3
      80             :    respectively) from the vector double as a double.  vd_insert returns
      81             :    the vector double formed by replacing the value in 64-bit lane imm of
      82             :    a with the provided double.  imm should be a compile time constant in
      83             :    0:1.  vd_extract_variable and vd_insert_variable are the slower but
      84             :    the 64-bit lane n does not have to be known at compile time (should
      85             :    still be in 0:1). */
      86             : 
      87             : static inline double
      88    34209894 : vd_extract( vd_t a, int imm ) { /* FIXME: USE EPI64 HACKS? */
      89    34209894 :   double d[2] V_ATTR;
      90    34209894 :   _mm_store_pd( d, a );
      91    34209894 :   return d[imm];
      92    34209894 : }
      93             : 
      94             : static inline vd_t
      95    34209894 : vd_insert( vd_t a, int imm, double v ) { /* FIXME: USE EPI64 HACKS? */
      96    34209894 :   double d[2] V_ATTR;
      97    34209894 :   _mm_store_pd( d, a );
      98    34209894 :   d[imm] = v;
      99    34209894 :   return _mm_load_pd( d );
     100    34209894 : }
     101             : 
     102             : static inline double
     103    34209894 : vd_extract_variable( vd_t a, int n ) {
     104    34209894 :   double d[2] V_ATTR;
     105    34209894 :   _mm_store_pd( d, a );
     106    34209894 :   return d[n];
     107    34209894 : }
     108             : 
     109             : static inline vd_t
     110    34209894 : vd_insert_variable( vd_t a, int n, double v ) {
     111    34209894 :   double d[2] V_ATTR;
     112    34209894 :   _mm_store_pd( d, a );
     113    34209894 :   d[n] = v;
     114    34209894 :   return _mm_load_pd( d );
     115    34209894 : }
     116             : 
     117             : /* Arithmetic operations */
     118             : 
     119             : /* vd_neg(a)        returns [       -a0        -a1  ] (i.e.       -a )
     120             :    vd_sign(a)       returns [   sign(a0)   sign(a1) ]
     121             :    vd_abs(a)        returns [   fabs(a0)   fabs(a1) ] (i.e.    abs(a))
     122             :    vd_negabs(a)     returns [  -fabs(a0)  -fabs(a1) ] (i.e.   -abs(a))
     123             :    vd_ceil(a)       returns [   ceil(a0)   ceil(a1) ] (i.e.   ceil(a))
     124             :    vd_floor(a)      returns [  floor(a0)  floor(a1) ] (i.e.  floor(a))
     125             :    vd_rint(a)       returns [   rint(a0)   rint(a1) ] (i.e. roundb(a))
     126             :    vd_trunc(a)      returns [  trunc(a0)  trunc(a1) ] (i.e.    fix(a))
     127             :    vd_sqrt(a)       returns [   sqrt(a0)   sqrt(a1) ] (i.e.   sqrt(a))
     128             :    vd_rcp_fast(a)   returns [   ~rcp(a0)   ~rcp(a1) ]
     129             :    vd_rsqrt_fast(a) returns [ ~rsqrt(a0) ~rsqrt(a1) ]
     130             : 
     131             :    vd_add(     a,b) returns [          a0+b0           a1+b1  ] (i.e. a +b)
     132             :    vd_sub(     a,b) returns [          a0-b0           a1-b1  ] (i.e. a -b)
     133             :    vd_mul(     a,b) returns [          a0*b0           a1*b1  ] (i.e. a.*b)
     134             :    vd_div(     a,b) returns [          a0/b0           a1/b1  ] (i.e. a./b)
     135             :    vd_min(     a,b) returns [     fmin(a0,b0)     fmin(a1,b1) ] (i.e. min([a;b]) (a and b are 1x2)
     136             :    vd_max(     a,b) returns [     fmax(a0,b0)     fmax(a1,b1) ] (i.e. max([a;b]) (a and b are 1x2)
     137             :    vd_copysign(a,b) returns [ copysign(a0,b0) copysign(a1,b1) ]
     138             :    vd_flipsign(a,b) returns [ flipsign(a0,b0) flipsign(a1,b1) ]
     139             : 
     140             :    vd_fma( a,b,c)   returns [  fma(a0,b0, c0)  fma(a1,b1, c1) ] (i.e.  a.*b+c)
     141             :    vd_fms( a,b,c)   returns [  fma(a0,b0,-c0)  fma(a1,b1,-c1) ] (i.e.  a.*b-c)
     142             :    vd_fnma(a,b,c)   returns [ -fma(a0,b0,-c0) -fma(a1,b1,-c1) ] (i.e. -a.*b+c)
     143             : 
     144             :    where sign(a) is -1. if a's sign bit is set and +1. otherwise, rcp(a)
     145             :    is 1./a, rsqrt(a) is 1./sqrt(a), and flipsign(a,b) returns -a if b
     146             :    signbit is set and a otherwise.
     147             : 
     148             :    rint is in round-to-nearest-even rounding mode (note rint and
     149             :    nearbyint are identical once floating point exceptions are ignored).
     150             : 
     151             :    sqrt should typically be full accuracy.
     152             : 
     153             :    rcp_fast and rsqrt_fast should typically be ~12 bits or more bits
     154             :    accurate (~3 or more decimal digits) such that (nearly) full accuracy
     155             :    can be achieved with two to three rounds of Newton-Raphson polishing.
     156             :    Bit level replicable code should avoid rcp_fast and rsqrt_fast though
     157             :    as the approximations used can vary between various generations /
     158             :    steppings / microcode updates of x86 processors (including Intel and
     159             :    AMD). */
     160             : 
     161             : #define vd_neg(a)        _mm_xor_pd(    _mm_set1_pd( -0. ), (a) )
     162             : #define vd_sign(a)       _mm_xor_pd(    _mm_set1_pd(  1. ), _mm_and_pd( _mm_set1_pd( -0. ), (a) ) )
     163             : #define vd_abs(a)        _mm_andnot_pd( _mm_set1_pd( -0. ), (a) )
     164             : #define vd_negabs(a)     _mm_or_pd(     _mm_set1_pd( -0. ), (a) )
     165             : #define vd_ceil(a)       _mm_ceil_pd(  (a) )
     166             : #define vd_floor(a)      _mm_floor_pd( (a) )
     167             : #define vd_rint(a)       _mm_round_pd( (a), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC )
     168             : #define vd_trunc(a)      _mm_round_pd( (a), _MM_FROUND_TO_ZERO        | _MM_FROUND_NO_EXC )
     169             : #define vd_sqrt(a)       _mm_sqrt_pd(  (a) )
     170             : #define vd_rcp_fast(a)   _mm_cvtps_pd( _mm_rcp_ps(   _mm_cvtpd_ps( (a) ) ) )
     171             : #define vd_rsqrt_fast(a) _mm_cvtps_pd( _mm_rsqrt_ps( _mm_cvtpd_ps( (a) ) ) )
     172             : 
     173             : #define vd_add(a,b)      _mm_add_pd( (a), (b) )
     174      524288 : #define vd_sub(a,b)      _mm_sub_pd( (a), (b) )
     175             : #define vd_mul(a,b)      _mm_mul_pd( (a), (b) )
     176             : #define vd_div(a,b)      _mm_div_pd( (a), (b) )
     177             : #define vd_min(a,b)      _mm_min_pd( (a), (b) )
     178             : #define vd_max(a,b)      _mm_max_pd( (a), (b) )
     179             : #define vd_copysign(a,b) _mm_or_pd( _mm_andnot_pd( _mm_set1_pd( -0. ), (a) ), _mm_and_pd( _mm_set1_pd( -0. ), (b) ) )
     180             : #define vd_flipsign(a,b) _mm_xor_pd( (a), _mm_and_pd( _mm_set1_pd( -0. ), (b) ) )
     181             : 
     182             : #define vd_fma(a,b,c)  _mm_fmadd_pd(  (a), (b), (c) )
     183             : #define vd_fms(a,b,c)  _mm_fmsub_pd(  (a), (b), (c) )
     184             : #define vd_fnma(a,b,c) _mm_fnmadd_pd( (a), (b), (c) )
     185             : 
     186             : /* Binary operations */
     187             : 
     188             : /* Note: binary operations are not well defined on vector doubles.
     189             :    If doing tricks with floating point binary representations, the user
     190             :    should use vd_to_vi_raw as necessary. */
     191             : 
     192             : /* Logical operations */
     193             : 
     194             : /* These all return proper paired lane vector conditionals */
     195             : 
     196             : #define vd_lnot(a)    /* [  !a0  !a0  !a1  !a1 ] */ \
     197             :   _mm_castpd_si128( _mm_cmp_pd( (a), _mm_setzero_pd(), _CMP_EQ_OQ  ) )
     198             : #define vd_lnotnot(a) /* [ !!a0 !!a0 !!a1 !!a1 ] */ \
     199             :   _mm_castpd_si128( _mm_cmp_pd( (a), _mm_setzero_pd(), _CMP_NEQ_OQ ) )
     200             : #define vd_signbit(a) /* [ signbit(a0) signbit(a0) signbit(a1) signbit(a1) ] */ \
     201             :   _mm_castps_si128( _mm_permute_ps( _mm_castsi128_ps( _mm_srai_epi32( _mm_castpd_si128( (a) ), 31 ) ), _MM_SHUFFLE(3,3,1,1) ) )
     202             : 
     203             : #define vd_eq(a,b) _mm_castpd_si128( _mm_cmp_pd( (a), (b), _CMP_EQ_OQ  ) ) /* [ a0==b0 a0==b0 a1==b1 a1==b1 ] */
     204             : #define vd_gt(a,b) _mm_castpd_si128( _mm_cmp_pd( (a), (b), _CMP_GT_OQ  ) ) /* [ a0> b0 a0> b0 a1> b1 a1> b1 ] */
     205      524288 : #define vd_lt(a,b) _mm_castpd_si128( _mm_cmp_pd( (a), (b), _CMP_LT_OQ  ) ) /* [ a0< b0 a0< b0 a1< b1 a1< b1 ] */
     206             : #define vd_ne(a,b) _mm_castpd_si128( _mm_cmp_pd( (a), (b), _CMP_NEQ_OQ ) ) /* [ a0!=b0 a0!=b0 a1!=b1 a1!=b1 ] */
     207             : #define vd_ge(a,b) _mm_castpd_si128( _mm_cmp_pd( (a), (b), _CMP_GE_OQ  ) ) /* [ a0>=b0 a0>=b0 a1>=b1 a1>=b1 ] */
     208             : #define vd_le(a,b) _mm_castpd_si128( _mm_cmp_pd( (a), (b), _CMP_LE_OQ  ) ) /* [ a0<=b0 a0<=b0 a1<=b1 a1<=b1 ] */
     209             : 
     210             : /* Conditional operations */
     211             : 
     212             : /* c should be a proper paired lane vector conditional for these */
     213             : 
     214             : #define vd_czero(c,a)    _mm_andnot_pd( _mm_castsi128_pd( (c) ), (a) )  /* [ c01?0.:a0 c23?0.:a1 ] */
     215             : #define vd_notczero(c,a) _mm_and_pd(    _mm_castsi128_pd( (c) ), (a) )  /* [ c01?a0:0. c23?a1:0. ] */
     216             : 
     217      524288 : #define vd_if(c,t,f) _mm_blendv_pd( (f), (t), _mm_castsi128_pd( (c) ) ) /* [ c01?t0:f0 c23?t1:f1 ] */
     218             : 
     219             : /* Conversion operations */
     220             : 
     221             : /* Summarizing:
     222             : 
     223             :    vd_to_vc(d)          returns [ !!d0 !!d0 !!d1 !!d1 ] ... proper paired lane
     224             : 
     225             :    vd_to_vf(d,f,0)      returns [ (float)d0 (float)d1 f2 f3 ]
     226             :    vd_to_vf(d,f,1)      returns [ f0 f1 (float)d0 (float)d1 ]
     227             : 
     228             :    vd_to_vi(d,i,0)      returns [ (int)d0 (int)d1 i2 i3 ]
     229             :    vd_to_vi(d,i,1)      returns [ i0 i1 (int)d0 (int)d1 ]
     230             : 
     231             :    vd_to_vi_fast(d,i,0) returns [ (int)rint(d0) (int)rint(d1) i2 i3 ]
     232             :    vd_to_vi_fast(d,i,1) returns [ i0 i1 (int)rint(d0) (int)rint(d1) ]
     233             : 
     234             :    vd_to_vu(d,u,0)      returns [ (uint)d0 (uint)d1 u2 u3 ]
     235             :    vd_to_vu(d,u,1)      returns [ u0 u1 (uint)d0 (uint)d1 ]
     236             : 
     237             :    vd_to_vu_fast(d,u,0) returns [ (uint)rint(d0) (uint)rint(d1) u2 u3 ]
     238             :    vd_to_vu_fast(d,u,1) returns [ u0 u1 (uint)rint(d0) (uint)rint(d1) ]
     239             : 
     240             :    vd_to_vl(d)          returns [ (long)d0 (long)d1 ]
     241             : 
     242             :    vd_to_vv(v)          returns [ (ulong)d0 (ulong)d1 ]
     243             : 
     244             :    where rint is configured for round-to-nearest-even rounding (Intel
     245             :    architecture defaults to round-nearest-even here ... sigh, they still
     246             :    don't fully get it) and imm_hi should be a compile time constant.
     247             :    That is, the fast variants assume that float point inputs are already
     248             :    integral value in the appropriate range for the output type.
     249             : 
     250             :    Note that vd_to_{vf,vi,vi_fast} insert the converted values into
     251             :    lanes 0:1 (imm_hi==0) or 2:3 (imm_hi!=0) of the provided vector.
     252             : 
     253             :    The raw variants return just the raw bits as the corresponding vector
     254             :    type.  vd_to_vl_raw allows doing advanced bit tricks on a vector
     255             :    double.  The others are probably dubious but are provided for
     256             :    completeness. */
     257             : 
     258             : #define vd_to_vc(d) _mm_castpd_si128( _mm_cmp_pd( (d), _mm_setzero_pd(), _CMP_NEQ_OQ ) )
     259             : 
     260      393216 : static inline vf_t vd_to_vf( vd_t d, vf_t f, int imm_hi ) {
     261      393216 :   vf_t _d = _mm_cvtpd_ps( d ); /* [ d0 d1  0  0 ] */
     262      393216 :   if( imm_hi ) _d = _mm_shuffle_ps( f, _d, _MM_SHUFFLE(1,0,1,0) ); /* Compile time */ /* mm_movelh_ps? */
     263      196608 :   else         _d = _mm_shuffle_ps( _d, f, _MM_SHUFFLE(3,2,1,0) );
     264      393216 :   return _d;
     265      393216 : }
     266             : 
     267             : #define vd_to_vi(d,i,imm_hi) vd_to_vi_fast( _mm_round_pd( (d), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ), (i), (imm_hi) )
     268             : #define vd_to_vu(d,i,imm_hi) vd_to_vu_fast( _mm_round_pd( (d), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ), (i), (imm_hi) )
     269             : 
     270      786432 : static inline vi_t vd_to_vi_fast( vd_t d, vi_t i, int imm_hi ) {
     271      786432 :   vf_t _d = _mm_castsi128_ps( _mm_cvtpd_epi32( d ) ); /* [ d0 d1  0  0 ] */
     272      786432 :   vf_t _i = _mm_castsi128_ps( i );
     273      786432 :   if( imm_hi ) _d = _mm_shuffle_ps( _i, _d, _MM_SHUFFLE(1,0,1,0) ); /* Compile time */
     274      393216 :   else         _d = _mm_shuffle_ps( _d, _i, _MM_SHUFFLE(3,2,1,0) );
     275      786432 :   return _mm_castps_si128( _d );
     276      786432 : }
     277             : 
     278      786432 : static inline vu_t vd_to_vu_fast( vd_t d, vu_t u, int imm_hi ) {
     279             : 
     280             : /* Note: Given that _mm_cvtpd_epi32 existed for a long time, Intel
     281             :    clearly had the hardware under the hood for _mm_cvtpd_epu32 but
     282             :    didn't bother to expose it pre-Skylake-X ... sigh (all too typical
     283             :    unfortunately).  We use _mm_cvtpd_epu32 where supported because it
     284             :    is faster and it replicates the same IB behaviors as the compiler
     285             :    generated scalar ASM for float to uint casts on these targets.
     286             : 
     287             :    Pre-Skylake-X, we emulate it by noting that subtracting 2^31
     288             :    from a double holding an integer in [0,2^32) is exact and the
     289             :    result can be exactly converted to a signed integer by
     290             :    _mm_cvtpd_epi32.  We then use twos complement hacks to add back
     291             :    any shift.  This also replicates the compiler's IB behaviors on
     292             :    these ISAs for float to int casts. */
     293             : 
     294      262144 : # if defined(__AVX512F__) && defined(__AVX512VL__)
     295      262144 :   vu_t v = _mm_cvtpd_epu32( d );
     296             : # else
     297             :   /**/                                                 // Assumes d is integer in [0,2^32)
     298      524288 :   vd_t s  = vd_bcast( (double)(1UL<<31) );             // (double)2^31
     299      524288 :   vc_t c  = vd_lt ( d, s );                            // -1L if d<2^31, 0L o.w.
     300      524288 :   vd_t ds = vd_sub( d, s );                            // (double)(d-2^31)
     301      524288 :   vu_t v0 = _mm_cvtpd_epi32( vd_if( c, d, ds ) );      // (uint)(d      if d<2^31, d-2^31 o.w.), d/c lanes 2,3
     302      524288 :   vu_t v1 = vu_add( v0, vu_bcast( 1U<<31) );           // (uint)(d+2^31 if d<2^31, d      o.w.), d/c lanes 2,3
     303      524288 :   vu_t v  = vu_if( vc_permute( c, 0,2,0,2 ), v0, v1 ); // (uint)d, d/c lanes 2,3
     304      524288 : # endif
     305             :   /* Compile time */
     306      786432 :   return imm_hi ? _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( u ), _mm_castsi128_ps( v ), _MM_SHUFFLE(1,0,1,0) ) )
     307      786432 :                 : _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v ), _mm_castsi128_ps( u ), _MM_SHUFFLE(3,2,1,0) ) );
     308      786432 : }
     309             : 
     310             : /* FIXME: IS IT FASTER TO USE INSERT / EXTRACT FOR THESE? */
     311             : 
     312      196608 : static inline __m128i vd_to_vl( vd_t d ) { /* FIXME: workaround vl_t isn't declared at this point */
     313      196608 :   union { double d[2]; __m128d v[1]; } t[1];
     314      196608 :   union { long   l[2]; __m128i v[1]; } u[1];
     315      196608 :   _mm_store_pd( t->d, d );
     316      196608 :   u->l[0] = (long)t->d[0];
     317      196608 :   u->l[1] = (long)t->d[1];
     318      196608 :   return _mm_load_si128( u->v );
     319      196608 : }
     320             : 
     321      196608 : static inline __m128i vd_to_vv( vd_t d ) { /* FIXME: workaround vv_t isn't declared at this point */
     322      196608 :   union { double d[2]; __m128d v[1]; } t[1];
     323      196608 :   union { ulong  u[2]; __m128i v[1]; } u[1];
     324      196608 :   _mm_store_pd( t->d, d );
     325      196608 :   u->u[0] = (ulong)t->d[0];
     326      196608 :   u->u[1] = (ulong)t->d[1];
     327      196608 :   return _mm_load_si128( u->v );
     328      196608 : }
     329             : 
     330             : #define vd_to_vc_raw(a) _mm_castpd_si128( (a) )
     331             : #define vd_to_vf_raw(a) _mm_castpd_ps(    (a) )
     332             : #define vd_to_vi_raw(a) _mm_castpd_si128( (a) )
     333             : #define vd_to_vu_raw(a) _mm_castpd_si128( (a) )
     334             : #define vd_to_vl_raw(a) _mm_castpd_si128( (a) )
     335             : #define vd_to_vv_raw(a) _mm_castpd_si128( (a) )
     336             : 
     337             : /* Reduction operations */
     338             : 
     339             : static inline vd_t
     340      196608 : vd_sum_all( vd_t x ) { /* Returns vd_bcast( sum( x ) ) */
     341      196608 :   return _mm_hadd_pd( x, x ); /* xsum  ... */
     342      196608 : }
     343             : 
     344             : static inline vd_t
     345      196608 : vd_min_all( vd_t a ) { /* Returns vd_bcast( min( x ) ) */
     346      196608 :   return _mm_min_pd( a, _mm_permute_pd( a, 1 ) );
     347      196608 : }
     348             : 
     349             : static inline vd_t
     350      196608 : vd_max_all( vd_t a ) { /* Returns vd_bcast( max( x ) ) */
     351      196608 :   return _mm_max_pd( a, _mm_permute_pd( a, 1 ) );
     352      196608 : }
     353             : 
     354             : /* Misc operations */
     355             : 
     356             : /* vd_gather(b,i,imm_i0,imm_i1) returns [ b[i(imm_i0)] b[i(imm_i1)] ]
     357             :    where b is a  "double const *" and i is a vi_t and imm_i0,imm_i1 are
     358             :    compile time constants in 0:3. */
     359             : 
     360    68419788 : #define vd_gather(b,i,imm_i0,imm_i1) _mm_i32gather_pd( (b), _mm_shuffle_epi32( (i), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0))), 8 )
     361             : 
     362             : /* vd_transpose_2x2 transposes the 2x2 matrix stored in vd_t r0,r1
     363             :    and stores the result in 2x2 matrix vd_t c0,c1.  All c0,c1 should be
     364             :    different for a well defined result.  Otherwise, in-place operation
     365             :    and/or using the same vd_t to specify multiple rows of r is fine. */
     366             : 
     367      196608 : #define vd_transpose_2x2( r0,r1, c0,c1 ) do {                     \
     368      196608 :     vd_t _vd_transpose_r0 = (r0); vd_t _vd_transpose_r1 = (r1);   \
     369      196608 :     (c0) = _mm_unpacklo_pd( _vd_transpose_r0, _vd_transpose_r1 ); \
     370      196608 :     (c1) = _mm_unpackhi_pd( _vd_transpose_r0, _vd_transpose_r1 ); \
     371      196608 :   } while(0)

Generated by: LCOV version 1.14