#ifndef HEADER_fd_src_util_simd_fd_sse_h
#error "Do not include this directly; use fd_sse.h"
#endif

/* Vector float API ***************************************************/

/* A vf_t is a vector where each 32-bit wide lane holds a single
   precision IEEE 754 floating point value (a "float").

   Inputs to all operations assume that the values aren't exotic (no
   NaNs, no +/-Infs, no denorms) and, if the output of an operation
   would produce an exotic value in the IEEE 754 standard, the results
   of that operation are undefined.  Additionally, correct handling of
   signed zero is not guaranteed.  Lastly, these will not raise floating
   point exceptions or set math errno's.

   Basically, handling of exotics and signed zero will generally be
   reasonable but most of that relies on the underlying compiler and
   hardware having conformant behavior and this is flaky at the best of
   times.  So it is best for developers not to assume conformant
   behavior.

   These mirror the other APIs as much as possible.  Macros are
   preferred over static inlines when it is possible to do it robustly
   to reduce the risk of the compiler mucking it up. */

#define vf_t __m128

/* Constructors */

/* Given the float values, return ... */

#define vf(f0,f1,f2,f3) _mm_setr_ps( (f0), (f1), (f2), (f3) ) /* [ f0 f1 f2 f3 ] */

#define vf_bcast(f0) _mm_set1_ps( (f0) ) /* [ f0 f0 f0 f0 ] */

static inline vf_t /* [ f0 f1 f0 f1 ] */
vf_bcast_pair( float f0, float f1 ) {
  return _mm_setr_ps( f0, f1, f0, f1 );
}

static inline vf_t /* [ f0 f0 f1 f1 ] */
vf_bcast_wide( float f0, float f1 ) {
  return _mm_setr_ps( f0, f0, f1, f1 );
}

/* vf_permute returns [ f(imm_i0) f(imm_i1) f(imm_i2) f(imm_i3) ].
   imm_i* should be compile time constants in 0:3. */

#define vf_permute(f,imm_i0,imm_i1,imm_i2,imm_i3) _mm_permute_ps( (f), _MM_SHUFFLE( (imm_i3), (imm_i2), (imm_i1), (imm_i0) ) )
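
/* For example, the following illustrative sketch (not part of the API;
   the name is hypothetical) reverses the lane order of a vector. */

static inline vf_t
vf_rev_example( vf_t a ) { /* [ a3 a2 a1 a0 ] */
  return vf_permute( a, 3, 2, 1, 0 );
}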

/* Predefined constants */

#define vf_zero() _mm_setzero_ps()   /* Return [ 0.f 0.f 0.f 0.f ] */
#define vf_one()  _mm_set1_ps( 1.f ) /* Return [ 1.f 1.f 1.f 1.f ] */

/* Memory operations */

/* vf_ld returns the 4 floats at the 16-byte aligned / 16-byte sized
   location p as a vector float.  vf_ldu is the same but p does not have
   to be aligned.  vf_st writes the vector float to the 16-byte aligned
   / 16-byte sized location p as 4 floats.  vf_stu is the same but p
   does not have to be aligned.  In all these, lane l will be at p[l].
   FIXME: USE ATTRIBUTES ON P PASSED TO THESE? */

#define vf_ld(p)    _mm_load_ps( (p) )
#define vf_ldu(p)   _mm_loadu_ps( (p) )
#define vf_st(p,x)  _mm_store_ps( (p), (x) )
#define vf_stu(p,x) _mm_storeu_ps( (p), (x) )
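
/* Illustrative sketch (not part of the API; the name is hypothetical):
   scale n floats at f in place by s, 4 lanes at a time.  Assumes n is a
   multiple of 4; f need not be aligned so the unaligned variants are
   used.  (_mm_mul_ps is used directly as vf_mul is defined further
   below.) */

static inline void
vf_scale_example( float * f, float s, ulong n ) {
  vf_t vs = vf_bcast( s );
  for( ulong i=0UL; i<n; i+=4UL ) vf_stu( f+i, _mm_mul_ps( vf_ldu( f+i ), vs ) );
}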

/* vf_ldif is an optimized equivalent to vf_notczero(c,vf_ldu(p)) (may
   have different behavior if c is not a proper vector conditional).  It
   is provided for symmetry with the vf_stif operation.  vf_stif stores
   x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
   Undefined behavior if c is not a proper vector conditional. */

#define vf_ldif(c,p)   _mm_maskload_ps(  (p), (c) )
#define vf_stif(c,p,x) _mm_maskstore_ps( (p), (c), (x) )
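
/* Illustrative sketch (not part of the API; the name is hypothetical):
   store only the first n lanes of x to f, 0<=n<=4, e.g. to handle the
   ragged tail of an array without writing past the end. */

static inline void
vf_st_tail_example( float * f, vf_t x, int n ) {
  __m128i c = _mm_cmpgt_epi32( _mm_set1_epi32( n ), _mm_setr_epi32( 0, 1, 2, 3 ) ); /* lane l true iff l<n */
  vf_stif( c, f, x ); /* f[l] = x[l] for l<n, f[l] unchanged otherwise */
}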

/* Element operations */

/* vf_extract extracts the float in lane imm from the vector float
   as a float.  vf_insert returns the vector float formed by replacing
   the value in lane imm of a with the provided float.  imm should be a
   compile time constant in 0:3.  vf_extract_variable and
   vf_insert_variable are slower variants where the lane n does not have
   to be known at compile time (it should still be in 0:3). */

/* FIXME: ARE THESE BETTER IMPLEMENTED VIA BOUNCING OFF THE STACK?  (IT
   SEEMS PRETTY CLEAR THAT INTEL DIDN'T INTEND THIS TO BE POSSIBLE) */

#define vf_extract(a,imm)  _mm_cvtss_f32( _mm_permute_ps( (a), _MM_SHUFFLE(3,2,1,(imm)) ) )

#define vf_insert(a,imm,v)                                     \
  _mm_castsi128_ps( _mm_insert_epi32( _mm_castps_si128( (a) ), \
                    _mm_extract_epi32( _mm_castps_si128( _mm_set_ss( (v) ) ), 0 ), (imm) ) )

static inline float
vf_extract_variable( vf_t a, int n ) {
  float f[4] V_ATTR;
  _mm_store_ps( f, a );
  return f[n];
}

static inline vf_t
vf_insert_variable( vf_t a, int n, float v ) {
  float f[4] V_ATTR;
  _mm_store_ps( f, a );
  f[n] = v;
  return _mm_load_ps( f );
}
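
/* Illustrative sketch (not part of the API; the name is hypothetical):
   add v to lane 2 of a via an extract / insert round trip. */

static inline vf_t
vf_bump_lane2_example( vf_t a, float v ) {
  return vf_insert( a, 2, vf_extract( a, 2 ) + v ); /* [ a0 a1 a2+v a3 ] */
}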

/* Given [a0 a1 a2 a3], [b0 b1 b2 b3] and/or [c0 c1 c2 c3], return ... */

/* Arithmetic operations */

/* vf_neg(a)        returns [        -a0         -a1  ...        -a3  ] (i.e.       -a )
   vf_sign(a)       returns [   signf(a0)   signf(a1) ...   signf(a3) ]
   vf_abs(a)        returns [   fabsf(a0)   fabsf(a1) ...   fabsf(a3) ] (i.e.    abs(a))
   vf_negabs(a)     returns [  -fabsf(a0)  -fabsf(a1) ...  -fabsf(a3) ] (i.e.   -abs(a))
   vf_ceil(a)       returns [   ceilf(a0)   ceilf(a1) ...   ceilf(a3) ] (i.e.   ceil(a))
   vf_floor(a)      returns [  floorf(a0)  floorf(a1) ...  floorf(a3) ] (i.e.  floor(a))
   vf_rint(a)       returns [   rintf(a0)   rintf(a1) ...   rintf(a3) ] (i.e. roundb(a))
   vf_trunc(a)      returns [  truncf(a0)  truncf(a1) ...  truncf(a3) ] (i.e.    fix(a))
   vf_sqrt(a)       returns [   sqrtf(a0)   sqrtf(a1) ...   sqrtf(a3) ] (i.e.   sqrt(a))
   vf_rcp_fast(a)   returns [   ~rcpf(a0)   ~rcpf(a1) ...   ~rcpf(a3) ]
   vf_rsqrt_fast(a) returns [ ~rsqrtf(a0) ~rsqrtf(a1) ... ~rsqrtf(a3) ]

   vf_add(a,b)      returns [           a0+b0            a1+b1  ...           a3+b3  ] (i.e. a +b)
   vf_sub(a,b)      returns [           a0-b0            a1-b1  ...           a3-b3  ] (i.e. a -b)
   vf_mul(a,b)      returns [           a0*b0            a1*b1  ...           a3*b3  ] (i.e. a.*b)
   vf_div(a,b)      returns [           a0/b0            a1/b1  ...           a3/b3  ] (i.e. a./b)
   vf_min(a,b)      returns [     fminf(a0,b0)     fminf(a1,b1) ...     fminf(a3,b3) ] (i.e. min([a;b])) (a and b are 1x4)
   vf_max(a,b)      returns [     fmaxf(a0,b0)     fmaxf(a1,b1) ...     fmaxf(a3,b3) ] (i.e. max([a;b])) (a and b are 1x4)
   vf_copysign(a,b) returns [ copysignf(a0,b0) copysignf(a1,b1) ... copysignf(a3,b3) ]
   vf_flipsign(a,b) returns [ flipsignf(a0,b0) flipsignf(a1,b1) ... flipsignf(a3,b3) ]

   vf_fma(a,b,c)    returns [  fmaf(a0,b0, c0)  fmaf(a1,b1, c1) ...  fmaf(a3,b3, c3) ] (i.e.  a.*b+c)
   vf_fms(a,b,c)    returns [  fmaf(a0,b0,-c0)  fmaf(a1,b1,-c1) ...  fmaf(a3,b3,-c3) ] (i.e.  a.*b-c)
   vf_fnma(a,b,c)   returns [ -fmaf(a0,b0,-c0) -fmaf(a1,b1,-c1) ... -fmaf(a3,b3,-c3) ] (i.e. -a.*b+c)

   where signf(a) is -1.f if a's sign bit is set and +1.f otherwise,
   rcpf(a) is 1.f/a, rsqrtf(a) is 1.f/sqrtf(a), and flipsignf(a,b)
   returns -a if b's sign bit is set and a otherwise.

   rint is in round-to-nearest-even rounding mode (note rint and
   nearbyint are identical once floating point exceptions are ignored).

   sqrt should typically be full accuracy.

   rcp_fast and rsqrt_fast should typically be ~12 or more bits accurate
   (~3 or more decimal digits) such that (nearly) full accuracy can be
   achieved with two to three rounds of Newton-Raphson polishing.  Bit
   level replicable code should avoid rcp_fast and rsqrt_fast though as
   the approximations used can vary between various generations /
   steppings / microcode updates of x86 processors (including Intel and
   AMD). */

#define vf_neg(a)        _mm_xor_ps(    _mm_set1_ps( -0.f ), (a) )
#define vf_sign(a)       _mm_xor_ps(    _mm_set1_ps(  1.f ), _mm_and_ps( _mm_set1_ps( -0.f ), (a) ) )
#define vf_abs(a)        _mm_andnot_ps( _mm_set1_ps( -0.f ), (a) )
#define vf_negabs(a)     _mm_or_ps(     _mm_set1_ps( -0.f ), (a) )
#define vf_ceil(a)       _mm_ceil_ps(  (a) )
#define vf_floor(a)      _mm_floor_ps( (a) )
#define vf_rint(a)       _mm_round_ps( (a), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC )
#define vf_trunc(a)      _mm_round_ps( (a), _MM_FROUND_TO_ZERO        | _MM_FROUND_NO_EXC )
#define vf_sqrt(a)       _mm_sqrt_ps(  (a) )
#define vf_rcp_fast(a)   _mm_rcp_ps(   (a) )
#define vf_rsqrt_fast(a) _mm_rsqrt_ps( (a) )

#define vf_add(a,b)      _mm_add_ps( (a), (b) )
#define vf_sub(a,b)      _mm_sub_ps( (a), (b) )
#define vf_mul(a,b)      _mm_mul_ps( (a), (b) )
#define vf_div(a,b)      _mm_div_ps( (a), (b) )
#define vf_min(a,b)      _mm_min_ps( (a), (b) )
#define vf_max(a,b)      _mm_max_ps( (a), (b) )
#define vf_copysign(a,b) _mm_or_ps( _mm_andnot_ps( _mm_set1_ps( -0.f ), (a) ), _mm_and_ps( _mm_set1_ps( -0.f ), (b) ) )
#define vf_flipsign(a,b) _mm_xor_ps( (a), _mm_and_ps( _mm_set1_ps( -0.f ), (b) ) )

#if defined(__FMA__)
#define vf_fma(a,b,c)    _mm_fmadd_ps(  (a), (b), (c) )
#define vf_fms(a,b,c)    _mm_fmsub_ps(  (a), (b), (c) )
#define vf_fnma(a,b,c)   _mm_fnmadd_ps( (a), (b), (c) )
#endif
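
/* Illustrative sketch (not part of the API; the name is hypothetical):
   one way to recover a (nearly) full single precision accuracy
   reciprocal from vf_rcp_fast with two rounds of Newton-Raphson
   polishing, x' = x*(2-a*x). */

static inline vf_t
vf_rcp_example( vf_t a ) {
  vf_t two = vf_bcast( 2.f );
  vf_t x   = vf_rcp_fast( a );                    /* ~12 bit estimate of 1/a */
  x = vf_mul( x, vf_sub( two, vf_mul( a, x ) ) ); /* ~24 bit                 */
  x = vf_mul( x, vf_sub( two, vf_mul( a, x ) ) ); /* (nearly) full accuracy  */
  return x;
}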

/* Binary operations */

/* Note: binary operations are not well defined on vector floats.
   If doing tricks with floating point binary representations, the user
   should use vf_to_vi_raw as necessary. */

/* Logical operations */

/* These all return proper vector conditionals */

#define vf_lnot(a)    _mm_castps_si128( _mm_cmp_ps( (a), _mm_setzero_ps(), _CMP_EQ_OQ  ) ) /* [  !a0  !a1 ...  !a3 ] */
#define vf_lnotnot(a) _mm_castps_si128( _mm_cmp_ps( (a), _mm_setzero_ps(), _CMP_NEQ_OQ ) ) /* [ !!a0 !!a1 ... !!a3 ] */
#define vf_signbit(a) _mm_srai_epi32( _mm_castps_si128( (a) ), 31 ) /* [ signbit(a0) signbit(a1) ... signbit(a3) ] */

#define vf_eq(a,b) _mm_castps_si128( _mm_cmp_ps( (a), (b), _CMP_EQ_OQ  ) ) /* [ a0==b0 a1==b1 ... a3==b3 ] */
#define vf_gt(a,b) _mm_castps_si128( _mm_cmp_ps( (a), (b), _CMP_GT_OQ  ) ) /* [ a0> b0 a1> b1 ... a3> b3 ] */
#define vf_lt(a,b) _mm_castps_si128( _mm_cmp_ps( (a), (b), _CMP_LT_OQ  ) ) /* [ a0< b0 a1< b1 ... a3< b3 ] */
#define vf_ne(a,b) _mm_castps_si128( _mm_cmp_ps( (a), (b), _CMP_NEQ_OQ ) ) /* [ a0!=b0 a1!=b1 ... a3!=b3 ] */
#define vf_ge(a,b) _mm_castps_si128( _mm_cmp_ps( (a), (b), _CMP_GE_OQ  ) ) /* [ a0>=b0 a1>=b1 ... a3>=b3 ] */
#define vf_le(a,b) _mm_castps_si128( _mm_cmp_ps( (a), (b), _CMP_LE_OQ  ) ) /* [ a0<=b0 a1<=b1 ... a3<=b3 ] */

/* Conditional operations */

#define vf_czero(c,f)    _mm_andnot_ps( _mm_castsi128_ps( (c) ), (f) ) /* [ c0?0.f:f0 c1?0.f:f1 ... c3?0.f:f3 ] */
#define vf_notczero(c,f) _mm_and_ps(    _mm_castsi128_ps( (c) ), (f) ) /* [ c0?f0:0.f c1?f1:0.f ... c3?f3:0.f ] */

#define vf_if(c,t,f) _mm_blendv_ps( (f), (t), _mm_castsi128_ps( (c) ) ) /* [ c0?t0:f0 c1?t1:f1 ... c3?t3:f3 ] */
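
/* Illustrative sketch (not part of the API; the name is hypothetical):
   zero out negative lanes branchlessly via a compare / conditional
   (vf_max( a, vf_zero() ) would work equally well here; this just
   demonstrates the idiom). */

static inline vf_t
vf_zero_neg_example( vf_t a ) {
  return vf_czero( vf_lt( a, vf_zero() ), a ); /* [ a0<0?0:a0 ... a3<0?0:a3 ] */
}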

/* Conversion operations */

/* Summarizing:

   vf_to_vc(a)               returns [ !!a0 !!a1 ... !!a3 ]

   vf_to_vi(a)               returns [ (int)a0         (int)a1         ... (int)a3         ]
   vf_to_vi_fast(a)          returns [ (int)rintf(a0)  (int)rintf(a1)  ... (int)rintf(a3)  ]

   vf_to_vu(a)               returns [ (uint)a0        (uint)a1        ... (uint)a3        ]
   vf_to_vu_fast(a)          returns [ (uint)rintf(a0) (uint)rintf(a1) ... (uint)rintf(a3) ]

   vf_to_vd(a,imm_i0,imm_i1) returns [ (double)a(imm_i0) (double)a(imm_i1) ]

   vf_to_vl(a,imm_i0,imm_i1) returns [ (long)a(imm_i0) (long)a(imm_i1) ]

   vf_to_vv(a,imm_i0,imm_i1) returns [ (ulong)a(imm_i0) (ulong)a(imm_i1) ]

   where rintf is configured for round-to-nearest-even rounding (Intel
   architecture defaults to round-nearest-even here ... sigh, they still
   don't fully get it) and imm_i* should be a compile time constant in
   0:3.  That is, the fast variants assume that the floating point
   inputs are already integral values in the appropriate range for the
   output type.

   The raw variants return just the raw bits as the corresponding vector
   type.  vf_to_vi_raw in particular allows doing advanced bit tricks on
   a vector float.  The others are probably dubious but are provided for
   completeness. */

#define vf_to_vc(a)               _mm_castps_si128( _mm_cmp_ps( (a), _mm_setzero_ps(), _CMP_NEQ_OQ ) )
#define vf_to_vi(a)               vf_to_vi_fast(  _mm_round_ps( (a), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) )
#define vf_to_vu(a)               vf_to_vu_fast(  _mm_round_ps( (a), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) )
#define vf_to_vd(a,imm_i0,imm_i1) _mm_cvtps_pd( _mm_permute_ps( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )

#define vf_to_vl(f,imm_i0,imm_i1) (__extension__({                                                                         \
    vf_t _vf_to_vl_tmp = (f);                                                                                              \
    _mm_set_epi64x( (long)vf_extract( _vf_to_vl_tmp, (imm_i1) ), (long)vf_extract( _vf_to_vl_tmp, (imm_i0) ) ); /* sigh */ \
  }))

#define vf_to_vv(f,imm_i0,imm_i1) (__extension__({                                   \
    vf_t _vf_to_vv_tmp = (f);                                                        \
    _mm_set_epi64x( (long)(ulong)vf_extract( _vf_to_vv_tmp, (imm_i1) ),              \
                    (long)(ulong)vf_extract( _vf_to_vv_tmp, (imm_i0) ) ); /* sigh */ \
  }))

#define vf_to_vi_fast(a)          _mm_cvtps_epi32(  (a) )

/* Note: Given that _mm_cvtps_epi32 existed for a long time, Intel
   clearly had the hardware under the hood for _mm_cvtps_epu32 but
   didn't bother to expose it pre-Skylake-X ... sigh (all too typical
   unfortunately).  We use _mm_cvtps_epu32 where supported because it
   is faster and it replicates the same IB behaviors as the compiler
   generated scalar ASM for float to uint casts on these targets.

   Pre-Skylake-X, we emulate it by noting that subtracting 2^31 from
   a float holding an integer in [2^31,2^32) is exact and the result
   can be exactly converted to a signed integer by _mm_cvtps_epi32.
   We then use twos complement hacks to add back any shift.  This also
   replicates the compiler's IB behaviors on these ISAs for float to
   int casts. */

#if defined(__AVX512F__) && defined(__AVX512VL__)
#define vf_to_vu_fast( a ) _mm_cvtps_epu32( (a) )
#else
static inline __m128i vf_to_vu_fast( vf_t a ) { /* FIXME: workaround vu_t isn't declared at this point */
  /**/                                                              /* Assumes a is integer in [0,2^32) */
  vf_t    s  = vf_bcast( (float)(1U<<31) );                         /* 2^31 */
  vc_t    c  = vf_lt ( a, s );                                      /* -1 if a<2^31, 0 o.w. */
  vf_t    as = vf_sub( a, s );                                      /* a-2^31 */
  __m128i u  = _mm_cvtps_epi32( vf_if( c, a, as ) );                /* (uint)(a      if a<2^31, a-2^31 o.w.) */
  __m128i us = _mm_add_epi32( u, _mm_set1_epi32( (int)(1U<<31) ) ); /* (uint)(a+2^31 if a<2^31, a      o.w.) */
  return _mm_castps_si128( _mm_blendv_ps( _mm_castsi128_ps( us ), _mm_castsi128_ps( u ), _mm_castsi128_ps( c ) ) );
}
#endif
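
/* Illustrative sketch (not part of the API; the name is hypothetical):
   round each lane to the nearest integral value via the int path,
   returning the result as floats.  Only valid for values well inside
   the int range (roughly |a|<2^31); _mm_cvtepi32_ps is the raw
   intrinsic for converting int lanes back to float lanes. */

static inline vf_t
vf_round_via_int_example( vf_t a ) {
  return _mm_cvtepi32_ps( vf_to_vi_fast( a ) ); /* rintf then back to float */
}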

#define vf_to_vc_raw(a) _mm_castps_si128( (a) )
#define vf_to_vi_raw(a) _mm_castps_si128( (a) )
#define vf_to_vu_raw(a) _mm_castps_si128( (a) )
#define vf_to_vd_raw(a) _mm_castps_pd(    (a) )
#define vf_to_vl_raw(a) _mm_castps_si128( (a) )
#define vf_to_vv_raw(a) _mm_castps_si128( (a) )

/* Reduction operations */

static inline vf_t
vf_sum_all( vf_t x ) { /* Returns vf_bcast( sum( x ) ) */
  x = _mm_hadd_ps( x, x );    /* x01 x23 ... */
  return _mm_hadd_ps( x, x ); /* xsum ...    */
}

static inline vf_t
vf_min_all( vf_t x ) { /* Returns vf_bcast( min( x ) ) */
  __m128 y;
  y = _mm_permute_ps( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x2  x3  x0  x1 */
  x = _mm_min_ps( x, y );                             /* x02 x13 ...    */
  y = _mm_permute_ps( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x13 x02 ...    */
  x = _mm_min_ps( x, y );                             /* xmin ...       */
  return x;
}

static inline vf_t
vf_max_all( vf_t x ) { /* Returns vf_bcast( max( x ) ) */
  __m128 y;
  y = _mm_permute_ps( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x2  x3  x0  x1 */
  x = _mm_max_ps( x, y );                             /* x02 x13 ...    */
  y = _mm_permute_ps( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x13 x02 ...    */
  x = _mm_max_ps( x, y );                             /* xmax ...       */
  return x;
}
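
/* Illustrative sketch (not part of the API; the name is hypothetical):
   dot product of two n-float arrays with a vector accumulator and a
   final horizontal reduction.  Assumes n is a multiple of 4. */

static inline float
vf_dot_example( float const * x, float const * y, ulong n ) {
  vf_t acc = vf_zero();
  for( ulong i=0UL; i<n; i+=4UL ) acc = vf_add( acc, vf_mul( vf_ldu( x+i ), vf_ldu( y+i ) ) );
  return vf_extract( vf_sum_all( acc ), 0 );
}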

/* Misc operations */

/* vf_gather(b,i) returns [ b[i(0)] b[i(1)] ... b[i(3)] ] where b is a
   "float const *" and i is a vi_t. */

#define vf_gather(b,i) _mm_i32gather_ps( (b), (i), 4 )
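
/* Illustrative sketch (not part of the API; the name is hypothetical):
   gather [ b[3] b[1] b[4] b[1] ] in one shot (assumes b holds at least
   5 floats). */

static inline vf_t
vf_gather_example( float const * b ) {
  return vf_gather( b, _mm_setr_epi32( 3, 1, 4, 1 ) );
}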

/* vf_transpose_4x4 transposes the 4x4 matrix stored in vf_t r0,r1,r2,r3
   and stores the result in 4x4 matrix vf_t c0,c1,c2,c3.  All
   c0,c1,c2,c3 should be different for a well defined result.
   Otherwise, in-place operation and/or using the same vf_t to specify
   multiple rows of r is fine. */

#define vf_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do {                                                                   \
    vf_t _vf_transpose_r0 = (r0); vf_t _vf_transpose_r1 = (r1); vf_t _vf_transpose_r2 = (r2); vf_t _vf_transpose_r3 = (r3); \
    vf_t _vf_transpose_t;                                                                                                   \
    /* Transpose 2x2 blocks */                                                                                              \
    _vf_transpose_t = _vf_transpose_r0; _vf_transpose_r0 = _mm_unpacklo_ps( _vf_transpose_t,  _vf_transpose_r2 );           \
    /**/                                _vf_transpose_r2 = _mm_unpackhi_ps( _vf_transpose_t,  _vf_transpose_r2 );           \
    _vf_transpose_t = _vf_transpose_r1; _vf_transpose_r1 = _mm_unpacklo_ps( _vf_transpose_t,  _vf_transpose_r3 );           \
    /**/                                _vf_transpose_r3 = _mm_unpackhi_ps( _vf_transpose_t,  _vf_transpose_r3 );           \
    /* Transpose 1x1 blocks */                                                                                              \
    /**/                                (c0)             = _mm_unpacklo_ps( _vf_transpose_r0, _vf_transpose_r1 );           \
    /**/                                (c1)             = _mm_unpackhi_ps( _vf_transpose_r0, _vf_transpose_r1 );           \
    /**/                                (c2)             = _mm_unpacklo_ps( _vf_transpose_r2, _vf_transpose_r3 );           \
    /**/                                (c3)             = _mm_unpackhi_ps( _vf_transpose_r2, _vf_transpose_r3 );           \
  } while(0)
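
/* Illustrative sketch (not part of the API; the names are
   hypothetical): use vf_transpose_4x4 to convert 4 xyzw points from
   array-of-structures to structure-of-arrays layout so that the x, y,
   z and w components each end up in their own vector. */

static inline void
vf_aos_to_soa_example( float const * p, /* 16 floats: x0 y0 z0 w0 x1 y1 z1 w1 ... w3 */
                       vf_t * x, vf_t * y, vf_t * z, vf_t * w ) {
  vf_t r0 = vf_ldu( p    );
  vf_t r1 = vf_ldu( p+ 4 );
  vf_t r2 = vf_ldu( p+ 8 );
  vf_t r3 = vf_ldu( p+12 );
  vf_transpose_4x4( r0,r1,r2,r3, *x,*y,*z,*w );
}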
