#ifndef HEADER_fd_src_util_simd_fd_avx_h
#error "Do not include this directly; use fd_avx.h"
#endif

/* Vector float API ***************************************************/

/* A wf_t is a vector where each 32-bit wide lane holds a single
   precision IEEE 754 floating point value (a "float").

   Inputs to all operations assume that the values aren't exotic (no
   NaNs, no +/-Infs, no denorms) and, if the output of an operation
   would produce an exotic value in the IEEE 754 standard, the results
   of that operation are undefined. Additionally, correct handling of
   signed zero is not guaranteed. Lastly, these will not raise floating
   point exceptions or set math errno's.

   Basically, handling of exotics and signed zero will generally be
   reasonable but most of that relies on the underlying compiler and
   hardware having conformant behavior and this is flaky at the best of
   times. So it is best for developers not to assume conformant
   behavior.

   These mirror the other APIs as much as possible. Macros are
   preferred over static inlines when it is possible to do it robustly
   to reduce the risk of the compiler mucking it up. */

#define wf_t __m256

/* Constructors */

/* Given the float values, return ... */

#define wf(f0,f1,f2,f3,f4,f5,f6,f7) /* [ f0 f1 f2 f3 f4 f5 f6 f7 ] */ \
  _mm256_setr_ps( (f0), (f1), (f2), (f3), (f4), (f5), (f6), (f7) )

#define wf_bcast(f0) _mm256_set1_ps( (f0) ) /* [ f0 f0 f0 f0 f0 f0 f0 f0 ] */

static inline wf_t /* [ f0 f1 f0 f1 f0 f1 f0 f1 ] */
wf_bcast_pair( float f0, float f1 ) {
  return _mm256_setr_ps( f0, f1, f0, f1, f0, f1, f0, f1 );
}

static inline wf_t /* [ f0 f0 f0 f0 f1 f1 f1 f1 ] */
wf_bcast_lohi( float f0, float f1 ) {
  return _mm256_setr_ps( f0, f0, f0, f0, f1, f1, f1, f1 );
}

static inline wf_t /* [ f0 f1 f2 f3 f0 f1 f2 f3 ] */
wf_bcast_quad( float f0, float f1, float f2, float f3 ) {
  return _mm256_setr_ps( f0, f1, f2, f3, f0, f1, f2, f3 );
}

static inline wf_t /* [ f0 f0 f1 f1 f2 f2 f3 f3 ] */
wf_bcast_wide( float f0, float f1, float f2, float f3 ) {
  return _mm256_setr_ps( f0, f0, f1, f1, f2, f2, f3, f3 );
}
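
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): composing the constructors
   above. */

static inline wf_t
wf_example_alternating_ramp( void ) {
  wf_t a = wf( 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f ); /* [ 0 1 2 3 4 5 6 7 ] */
  wf_t b = wf_bcast_pair( -1.f, 1.f );                   /* [ -1 1 -1 1 -1 1 -1 1 ] */
  return _mm256_mul_ps( a, b );                          /* [ -0 1 -2 3 -4 5 -6 7 ] (wf_mul is defined below) */
}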

/* No general wf_permute due to cross-128-bit lane limitations in AVX.
   Useful cases are provided below. Given [ f0 f1 f2 f3 f4 f5 f6 f7 ],
   return ... */

#define wf_bcast_even(f)    _mm256_permute_ps( (f), _MM_SHUFFLE(2,2,0,0) ) /* [ f0 f0 f2 f2 f4 f4 f6 f6 ] */
#define wf_bcast_odd(f)     _mm256_permute_ps( (f), _MM_SHUFFLE(3,3,1,1) ) /* [ f1 f1 f3 f3 f5 f5 f7 f7 ] */
#define wf_exch_adj(f)      _mm256_permute_ps( (f), _MM_SHUFFLE(2,3,0,1) ) /* [ f1 f0 f3 f2 f5 f4 f7 f6 ] */
#define wf_exch_adj_pair(f) _mm256_permute_ps( (f), _MM_SHUFFLE(1,0,3,2) ) /* [ f2 f3 f0 f1 f6 f7 f4 f5 ] */

static inline wf_t
wf_exch_adj_quad( wf_t f ) { /* [ f4 f5 f6 f7 f0 f1 f2 f3 ] */
  return _mm256_permute2f128_ps( f, f, 1 );
}

/* Predefined constants */

#define wf_zero() _mm256_setzero_ps()   /* Return [ 0.f 0.f 0.f 0.f 0.f 0.f 0.f 0.f ] */
#define wf_one()  _mm256_set1_ps( 1.f ) /* Return [ 1.f 1.f 1.f 1.f 1.f 1.f 1.f 1.f ] */

/* Memory operations */

/* wf_ld returns the 8 floats at the 32-byte aligned / 32-byte sized
   location p as a vector float. wf_ldu is the same but p does not have
   to be aligned. wf_st writes the vector float to the 32-byte aligned
   / 32-byte sized location p as 8 floats. wf_stu is the same but p
   does not have to be aligned. In all these, lane l will be at p[l].
   FIXME: USE ATTRIBUTES ON P PASSED TO THESE? */

#define wf_ld(p)    _mm256_load_ps( (p) )
#define wf_ldu(p)   _mm256_loadu_ps( (p) )
#define wf_st(p,x)  _mm256_store_ps( (p), (x) )
#define wf_stu(p,x) _mm256_storeu_ps( (p), (x) )

/* wf_ldif is an optimized equivalent to wf_notczero(c,wf_ldu(p)) (may
   have different behavior if c is not a proper vector conditional). It
   is provided for symmetry with the wf_stif operation. wf_stif stores
   x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
   Undefined behavior if c is not a proper vector conditional. */

#define wf_ldif(c,p)   _mm256_maskload_ps(  (p), (c) )
#define wf_stif(c,p,x) _mm256_maskstore_ps( (p), (c), (x) )
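
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): wf_stif can write a partial tail
   of an array without touching memory past the end. The mask
   construction assumes AVX2 integer ops, consistent with the rest of
   this file. */

static inline void
wf_example_store_tail( float * d, wf_t x, int n ) { /* store lanes [0,n) of x to d, n in [0,8] */
  __m256i lane = _mm256_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7 );
  __m256i c    = _mm256_cmpgt_epi32( _mm256_set1_epi32( n ), lane ); /* proper vector conditional, -1 if lane<n */
  wf_stif( c, d, x );
}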

/* Element operations */

/* wf_extract extracts the float in lane imm from the vector float
   as a float. wf_insert returns the vector float formed by replacing
   the value in lane imm of a with the provided float. imm should be a
   compile time constant in 0:7. wf_extract_variable and
   wf_insert_variable are slower but the lane n does not have to be
   known at compile time (should still be in 0:7). */

/* FIXME: ARE THESE BETTER IMPLEMENTED VIA BOUNCING OFF THE STACK (IT
   SEEMS PRETTY CLEAR THAT INTEL DIDN'T INTEND THIS TO BE POSSIBLE,
   ESPECIALLY GIVEN THAT _MM256_CVTSS_F32 IS MISSING IN GCC)?
   ALTERNATIVELY, IS IT WORTHWHILE TO SPECIAL CASE 0 AND 4 EXTRACTION
   AS PER THE BELOW? */

#if FD_USING_CLANG || !FD_HAS_OPTIMIZATION

/* Sigh ... clang is sad and can't handle passing compile time const
   expressions through a static inline */

static inline float
wf_extract( wf_t a, int imm ) {
  union { float f[8]; __m256 v[1]; } t[1];
  _mm256_store_ps( t->f, a );
  return t->f[ imm ];
}

#else

static inline float
wf_extract( wf_t a, int imm ) {
  int avx_lane = imm >> 2; /* compile time eval */
  int sse_lane = imm & 3;  /* compile time eval */
  __m128 t = _mm256_extractf128_ps( a, avx_lane );
  if( sse_lane ) /* compile time eval */
    t = _mm_castsi128_ps( _mm_insert_epi32( _mm_setzero_si128(), _mm_extract_epi32( _mm_castps_si128( t ), sse_lane ), 0 ) );
  return _mm_cvtss_f32( t );
}

#endif

#define wf_insert(a,imm,v)                                               \
  _mm256_castsi256_ps( _mm256_insert_epi32( _mm256_castps_si256( (a) ), \
    _mm_extract_epi32( _mm_castps_si128( _mm_set_ss( (v) ) ), 0 ), (imm) ) )

static inline float
wf_extract_variable( wf_t a, int n ) {
  float f[8] W_ATTR;
  _mm256_store_ps( f, a );
  return f[n];
}

static inline wf_t
wf_insert_variable( wf_t a, int n, float v ) {
  float f[8] W_ATTR;
  _mm256_store_ps( f, a );
  f[n] = v;
  return _mm256_load_ps( f );
}
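
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): lane access with compile time
   and runtime indices. */

static inline float
wf_example_lane_access( wf_t a, int n ) {
  a = wf_insert( a, 3, 2.5f );             /* replace lane 3 (imm is compile time) */
  float f3 = wf_extract( a, 3 );           /* read lane 3 back */
  return f3 + wf_extract_variable( a, n ); /* n need not be compile time (still in 0:7) */
}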

/* Given [a0 a1 a2 a3 a4 a5 a6 a7], [b0 b1 b2 b3 b4 b5 b6 b7] and/or
   [c0 c1 c2 c3 c4 c5 c6 c7], return ... */

/* Arithmetic operations */

/* wf_neg(a)        returns [ -a0 -a1 ... -a7 ] (i.e. -a)
   wf_sign(a)       returns [ signf(a0) signf(a1) ... signf(a7) ]
   wf_abs(a)        returns [ fabsf(a0) fabsf(a1) ... fabsf(a7) ] (i.e. abs(a))
   wf_negabs(a)     returns [ -fabsf(a0) -fabsf(a1) ... -fabsf(a7) ] (i.e. -abs(a))
   wf_ceil(a)       returns [ ceilf(a0) ceilf(a1) ... ceilf(a7) ] (i.e. ceil(a))
   wf_floor(a)      returns [ floorf(a0) floorf(a1) ... floorf(a7) ] (i.e. floor(a))
   wf_rint(a)       returns [ rintf(a0) rintf(a1) ... rintf(a7) ] (i.e. roundb(a))
   wf_trunc(a)      returns [ truncf(a0) truncf(a1) ... truncf(a7) ] (i.e. fix(a))
   wf_sqrt(a)       returns [ sqrtf(a0) sqrtf(a1) ... sqrtf(a7) ] (i.e. sqrt(a))
   wf_rcp_fast(a)   returns [ ~rcpf(a0) ~rcpf(a1) ... ~rcpf(a7) ]
   wf_rsqrt_fast(a) returns [ ~rsqrtf(a0) ~rsqrtf(a1) ... ~rsqrtf(a7) ]

   wf_add(a,b)      returns [ a0+b0 a1+b1 ... a7+b7 ] (i.e. a +b)
   wf_sub(a,b)      returns [ a0-b0 a1-b1 ... a7-b7 ] (i.e. a -b)
   wf_mul(a,b)      returns [ a0*b0 a1*b1 ... a7*b7 ] (i.e. a.*b)
   wf_div(a,b)      returns [ a0/b0 a1/b1 ... a7/b7 ] (i.e. a./b)
   wf_min(a,b)      returns [ fminf(a0,b0) fminf(a1,b1) ... fminf(a7,b7) ] (i.e. min([a;b]), a and b 1x8)
   wf_max(a,b)      returns [ fmaxf(a0,b0) fmaxf(a1,b1) ... fmaxf(a7,b7) ] (i.e. max([a;b]), a and b 1x8)
   wf_copysign(a,b) returns [ copysignf(a0,b0) copysignf(a1,b1) ... copysignf(a7,b7) ]
   wf_flipsign(a,b) returns [ flipsignf(a0,b0) flipsignf(a1,b1) ... flipsignf(a7,b7) ]

   wf_fma(a,b,c)    returns [  fmaf(a0,b0, c0)  fmaf(a1,b1, c1) ...  fmaf(a7,b7, c7) ] (i.e.  a.*b+c)
   wf_fms(a,b,c)    returns [  fmaf(a0,b0,-c0)  fmaf(a1,b1,-c1) ...  fmaf(a7,b7,-c7) ] (i.e.  a.*b-c)
   wf_fnma(a,b,c)   returns [ -fmaf(a0,b0,-c0) -fmaf(a1,b1,-c1) ... -fmaf(a7,b7,-c7) ] (i.e. -a.*b+c)

   where sign(a) is -1. if a's sign bit is set and +1. otherwise,
   rcp(a) is 1./a, rsqrt(a) is 1./sqrt(a), and flipsign(a,b) returns -a
   if b's sign bit is set and a otherwise.

   rintf is in round-to-nearest-even rounding mode (note rintf and
   nearbyintf are identical once floating point exceptions are ignored).

   sqrtf should typically be full accuracy.

   rcp_fast and rsqrt_fast should typically be ~12 or more bits accurate
   (~3 or more decimal digits) such that (nearly) full accuracy can be
   achieved with two to three rounds of Newton-Raphson polishing. Bit
   level replicable code should avoid rcp_fast and rsqrt_fast though as
   the approximations used can vary between various generations /
   steppings / microcode updates of x86 processors (including Intel and
   AMD). */

#define wf_neg(a)        _mm256_xor_ps(    _mm256_set1_ps( -0.f ), (a) )
#define wf_sign(a)       _mm256_xor_ps(    _mm256_set1_ps(  1.f ), _mm256_and_ps( _mm256_set1_ps( -0.f ), (a) ) )
#define wf_abs(a)        _mm256_andnot_ps( _mm256_set1_ps( -0.f ), (a) )
#define wf_negabs(a)     _mm256_or_ps(     _mm256_set1_ps( -0.f ), (a) )
#define wf_ceil(a)       _mm256_ceil_ps(  (a) )
#define wf_floor(a)      _mm256_floor_ps( (a) )
#define wf_rint(a)       _mm256_round_ps( (a), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC )
#define wf_trunc(a)      _mm256_round_ps( (a), _MM_FROUND_TO_ZERO       | _MM_FROUND_NO_EXC )
#define wf_sqrt(a)       _mm256_sqrt_ps(  (a) )
#define wf_rcp_fast(a)   _mm256_rcp_ps(   (a) )
#define wf_rsqrt_fast(a) _mm256_rsqrt_ps( (a) )

#define wf_add(a,b)      _mm256_add_ps( (a), (b) )
#define wf_sub(a,b)      _mm256_sub_ps( (a), (b) )
#define wf_mul(a,b)      _mm256_mul_ps( (a), (b) )
#define wf_div(a,b)      _mm256_div_ps( (a), (b) )
#define wf_min(a,b)      _mm256_min_ps( (a), (b) )
#define wf_max(a,b)      _mm256_max_ps( (a), (b) )
#define wf_copysign(a,b) _mm256_or_ps( _mm256_andnot_ps( _mm256_set1_ps( -0.f ), (a) ), \
                                       _mm256_and_ps(    _mm256_set1_ps( -0.f ), (b) ) )
#define wf_flipsign(a,b) _mm256_xor_ps( (a), _mm256_and_ps( _mm256_set1_ps( -0.f ), (b) ) )

#define wf_fma(a,b,c)  _mm256_fmadd_ps(  (a), (b), (c) )
#define wf_fms(a,b,c)  _mm256_fmsub_ps(  (a), (b), (c) )
#define wf_fnma(a,b,c) _mm256_fnmadd_ps( (a), (b), (c) )
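
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): polishing wf_rcp_fast with
   Newton-Raphson as described above. Each round of r += r*(1-a*r)
   roughly doubles the number of accurate bits, so two rounds take a
   ~12 bit estimate to (nearly) full single precision. */

static inline wf_t
wf_example_rcp( wf_t a ) {
  wf_t r = wf_rcp_fast( a );                     /* r ~ 1/a, ~12 bit accurate */
  r = wf_fma( r, wf_fnma( a, r, wf_one() ), r ); /* r += r*(1 - a*r) */
  r = wf_fma( r, wf_fnma( a, r, wf_one() ), r ); /* second round */
  return r;
}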

/* Binary operations */

/* Note: binary operations are not well defined on vector floats.
   If doing tricks with floating point binary representations, the user
   should use wf_to_wi_raw as necessary. */

/* Logical operations */

/* These all return proper vector conditionals */

#define wf_lnot(a)    _mm256_castps_si256( _mm256_cmp_ps( (a), _mm256_setzero_ps(), _CMP_EQ_OQ  ) ) /* [  !a0  !a1 ...  !a7 ] */
#define wf_lnotnot(a) _mm256_castps_si256( _mm256_cmp_ps( (a), _mm256_setzero_ps(), _CMP_NEQ_OQ ) ) /* [ !!a0 !!a1 ... !!a7 ] */
#define wf_signbit(a) _mm256_srai_epi32( _mm256_castps_si256( (a) ), 31 )

#define wf_eq(a,b) _mm256_castps_si256( _mm256_cmp_ps( (a), (b), _CMP_EQ_OQ  ) ) /* [ a0==b0 a1==b1 ... a7==b7 ] */
#define wf_gt(a,b) _mm256_castps_si256( _mm256_cmp_ps( (a), (b), _CMP_GT_OQ  ) ) /* [ a0> b0 a1> b1 ... a7> b7 ] */
#define wf_lt(a,b) _mm256_castps_si256( _mm256_cmp_ps( (a), (b), _CMP_LT_OQ  ) ) /* [ a0< b0 a1< b1 ... a7< b7 ] */
#define wf_ne(a,b) _mm256_castps_si256( _mm256_cmp_ps( (a), (b), _CMP_NEQ_OQ ) ) /* [ a0!=b0 a1!=b1 ... a7!=b7 ] */
#define wf_ge(a,b) _mm256_castps_si256( _mm256_cmp_ps( (a), (b), _CMP_GE_OQ  ) ) /* [ a0>=b0 a1>=b1 ... a7>=b7 ] */
#define wf_le(a,b) _mm256_castps_si256( _mm256_cmp_ps( (a), (b), _CMP_LE_OQ  ) ) /* [ a0<=b0 a1<=b1 ... a7<=b7 ] */

/* Conditional operations */

#define wf_czero(c,f)    _mm256_andnot_ps( _mm256_castsi256_ps( (c) ), (f) ) /* [ c0?0.f:f0 c1?0.f:f1 ... c7?0.f:f7 ] */
#define wf_notczero(c,f) _mm256_and_ps(    _mm256_castsi256_ps( (c) ), (f) ) /* [ c0?f0:0.f c1?f1:0.f ... c7?f7:0.f ] */

#define wf_if(c,t,f) _mm256_blendv_ps( (f), (t), _mm256_castsi256_ps( (c) ) ) /* [ c0?t0:f0 c1?t1:f1 ... c7?t7:f7 ] */

/* Conversion operations */

/* Summarizing:

   wf_to_wc(a)      returns [ !!a0 !!a1 ... !!a7 ]

   wf_to_wi(a)      returns [ (int)a0 (int)a1 ... (int)a7 ]
   wf_to_wi_fast(a) returns [ (int)rintf(a0) (int)rintf(a1) ... (int)rintf(a7) ]

   wf_to_wu(a)      returns [ (uint)a0 (uint)a1 ... (uint)a7 ]
   wf_to_wu_fast(a) returns [ (uint)rintf(a0) (uint)rintf(a1) ... (uint)rintf(a7) ]

   wf_to_wd(a,0)    returns [ (double)a0 (double)a1 (double)a2 (double)a3 ]
   wf_to_wd(a,1)    returns [ (double)a4 (double)a5 (double)a6 (double)a7 ]

   wf_to_wl(a,0)    returns [ (long)a0 (long)a1 (long)a2 (long)a3 ]
   wf_to_wl(a,1)    returns [ (long)a4 (long)a5 (long)a6 (long)a7 ]

   wf_to_wv(a,0)    returns [ (ulong)a0 (ulong)a1 (ulong)a2 (ulong)a3 ]
   wf_to_wv(a,1)    returns [ (ulong)a4 (ulong)a5 (ulong)a6 (ulong)a7 ]

   where rintf is configured for round-to-nearest-even rounding (Intel
   architecture defaults to round-nearest-even here ... sigh, they still
   don't fully get it) and imm_hi should be a compile time constant.
   That is, the fast variants assume that the floating point inputs are
   already integral values in the appropriate range for the output type.

   For wf_to_{wd,wl,wv}, the permutation used for the conversion is less
   flexible due to cross 128-bit lane limitations in AVX. If imm_hi==0,
   the conversion is done on lanes 0:3. Otherwise, the conversion is
   done on lanes 4:7.

   The raw variants return just the raw bits as the corresponding vector
   type. wf_to_wi_raw in particular allows doing advanced bit tricks on
   a vector float. The others are probably dubious but they are
   provided for completeness. */

#define wf_to_wc(a)        _mm256_castps_si256( _mm256_cmp_ps( (a), _mm256_setzero_ps(), _CMP_NEQ_OQ ) )
#define wf_to_wi(a)        wf_to_wi_fast( _mm256_round_ps( (a), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) )
#define wf_to_wu(a)        wf_to_wu_fast( _mm256_round_ps( (a), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) )
#define wf_to_wd(a,imm_hi) _mm256_cvtps_pd( _mm256_extractf128_ps( (a), !!(imm_hi) ) )

/* FIXME: IS IT FASTER TO USE INSERT / EXTRACT FOR THESE? */

static inline __m256i wf_to_wl( wf_t f, int imm_hi ) { /* FIXME: workaround: wl_t isn't declared at this point */
  union { float f[4]; __m128  v[1]; } t[1];
  union { long  l[4]; __m256i v[1]; } u[1];
  _mm_store_ps( t->f, imm_hi ? _mm256_extractf128_ps( f, 1 ) : _mm256_extractf128_ps( f, 0 ) /* compile time */ );
  u->l[0] = (long)t->f[0];
  u->l[1] = (long)t->f[1];
  u->l[2] = (long)t->f[2];
  u->l[3] = (long)t->f[3];
  return _mm256_load_si256( u->v );
}

static inline __m256i wf_to_wv( wf_t f, int imm_hi ) { /* FIXME: workaround: wv_t isn't declared at this point */
  union { float f[4]; __m128  v[1]; } t[1];
  union { ulong u[4]; __m256i v[1]; } u[1];
  _mm_store_ps( t->f, imm_hi ? _mm256_extractf128_ps( f, 1 ) : _mm256_extractf128_ps( f, 0 ) /* compile time */ );
  u->u[0] = (ulong)t->f[0];
  u->u[1] = (ulong)t->f[1];
  u->u[2] = (ulong)t->f[2];
  u->u[3] = (ulong)t->f[3];
  return _mm256_load_si256( u->v );
}

#define wf_to_wi_fast(a) _mm256_cvtps_epi32( (a) )

/* Note: Given that _mm256_cvtps_epi32 has existed for a long time,
   Intel clearly had the hardware under the hood for _mm256_cvtps_epu32
   but didn't bother to expose it pre-Skylake-X ... sigh (all too
   typical unfortunately). We use _mm256_cvtps_epu32 where supported
   because it is faster and it replicates the same IB behaviors as the
   compiler generated scalar ASM for float to uint casts on these
   targets.

   Pre-Skylake-X, we emulate it by noting that subtracting 2^31 from
   a float holding an integer in [2^31,2^32) is exact and the result can
   be exactly converted to a signed integer by _mm256_cvtps_epi32. We
   then use twos complement hacks to add back any shift. This also
   replicates the compiler's IB behaviors on these ISAs for float to
   uint casts. */

#if defined(__AVX512F__) && defined(__AVX512VL__)
#define wf_to_wu_fast( a ) _mm256_cvtps_epu32( (a) )
#else
static inline __m256i wf_to_wu_fast( wf_t a ) { /* FIXME: workaround: wu_t isn't declared at this point */
  /**/                                                                    /* Assumes a is integer in [0,2^32) */
  wf_t    s  = wf_bcast( (float)(1U<<31) );                               /* 2^31 */
  wc_t    c  = wf_lt ( a, s );                                            /* -1 if a<2^31, 0 o.w. */
  wf_t    as = wf_sub( a, s );                                            /* a-2^31 */
  __m256i u  = _mm256_cvtps_epi32( wf_if( c, a, as ) );                   /* (uint)(a      if a<2^31, a-2^31 o.w.) */
  __m256i us = _mm256_add_epi32( u, _mm256_set1_epi32( (int)(1U<<31) ) ); /* (uint)(a+2^31 if a<2^31, a      o.w.) */
  return _mm256_castps_si256( _mm256_blendv_ps( _mm256_castsi256_ps( us ), _mm256_castsi256_ps( u ), _mm256_castsi256_ps( c ) ) );
}
#endif

#define wf_to_wc_raw(a) _mm256_castps_si256( (a) )
#define wf_to_wi_raw(a) _mm256_castps_si256( (a) )
#define wf_to_wu_raw(a) _mm256_castps_si256( (a) )
#define wf_to_wd_raw(a) _mm256_castps_pd(    (a) )
#define wf_to_wl_raw(a) _mm256_castps_si256( (a) )
#define wf_to_wv_raw(a) _mm256_castps_si256( (a) )
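
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): a bit trick via wf_to_wi_raw
   that extracts the biased IEEE 754 exponent field of each lane.
   Assumes AVX2 integer ops, consistent with the rest of this file. */

static inline __m256i /* wi_t */
wf_example_raw_exponent( wf_t a ) {
  __m256i i = wf_to_wi_raw( a ); /* raw float bits */
  return _mm256_and_si256( _mm256_srli_epi32( i, 23 ), _mm256_set1_epi32( 0xff ) ); /* (bits>>23) & 0xff */
}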

/* Reduction operations */

static inline wf_t
wf_sum_all( wf_t x ) { /* Returns wf_bcast( sum( x ) ) */
  x = _mm256_add_ps( x, _mm256_permute2f128_ps( x, x, 1 ) ); /* x04 x15 x26 x37 ... */
  x = _mm256_hadd_ps( x, x );                                /* x0145 x2367 ... */
  return _mm256_hadd_ps( x, x );                             /* xsum ... */
}

static inline wf_t
wf_min_all( wf_t x ) { /* Returns wf_bcast( min( x ) ) */
  __m256 y = _mm256_permute2f128_ps( x, x, 1 );          /* x4 x5 x6 x7 x0 x1 x2 x3 */
  x = _mm256_min_ps( x, y );                             /* x04 x15 x26 x37 ... */
  y = _mm256_permute_ps( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x26 x37 x04 x15 ... */
  x = _mm256_min_ps( x, y );                             /* x0246 x1357 ... */
  y = _mm256_permute_ps( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x1357 x0246 ... */
  x = _mm256_min_ps( x, y );                             /* xmin ... */
  return x;
}

static inline wf_t
wf_max_all( wf_t x ) { /* Returns wf_bcast( max( x ) ) */
  __m256 y = _mm256_permute2f128_ps( x, x, 1 );          /* x4 x5 x6 x7 x0 x1 x2 x3 */
  x = _mm256_max_ps( x, y );                             /* x04 x15 x26 x37 ... */
  y = _mm256_permute_ps( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x26 x37 x04 x15 ... */
  x = _mm256_max_ps( x, y );                             /* x0246 x1357 ... */
  y = _mm256_permute_ps( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x1357 x0246 ... */
  x = _mm256_max_ps( x, y );                             /* xmax ... */
  return x;
}
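
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): the reductions compose with the
   arithmetic ops, e.g. an 8-element dot product. */

static inline float
wf_example_dot8( float const * x, float const * y ) { /* x,y 32-byte aligned */
  wf_t s = wf_sum_all( wf_mul( wf_ld( x ), wf_ld( y ) ) ); /* [ dot dot ... dot ] */
  return wf_extract( s, 0 );
}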

/* Misc operations */

/* wf_gather(b,i) returns [ b[i(0)] b[i(1)] ... b[i(7)] ] where b is a
   "float const *" and i is a wi_t. */

#define wf_gather(b,i) _mm256_i32gather_ps( (b), (i), 4 )
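
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): an 8-wide table lookup. The
   index vector is built directly as a __m256i since wi_t is declared
   elsewhere in fd_avx.h. */

static inline wf_t
wf_example_reverse_lookup( float const * table ) { /* table indexed 0:7 */
  __m256i idx = _mm256_setr_epi32( 7, 6, 5, 4, 3, 2, 1, 0 );
  return wf_gather( table, idx ); /* [ table[7] table[6] ... table[0] ] */
}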

/* wf_transpose_8x8 transposes the 8x8 matrix stored in wf_t r0,r1,...r7
   and stores the result in 8x8 matrix wf_t c0,c1,...c7. All
   c0,c1,...c7 should be different for a well defined result.
   Otherwise, in-place operation and/or using the same wf_t to specify
   multiple rows of r is fine. */

#define wf_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 ) do {                                           \
    wf_t _wf_transpose_r0 = (r0); wf_t _wf_transpose_r1 = (r1); wf_t _wf_transpose_r2 = (r2); wf_t _wf_transpose_r3 = (r3); \
    wf_t _wf_transpose_r4 = (r4); wf_t _wf_transpose_r5 = (r5); wf_t _wf_transpose_r6 = (r6); wf_t _wf_transpose_r7 = (r7); \
    wf_t _wf_transpose_t;                                                                                                   \
    /* Transpose 4x4 blocks */                                                                                              \
    _wf_transpose_t = _wf_transpose_r0; _wf_transpose_r0 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r4, 0x20 ); \
    /**/                                _wf_transpose_r4 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r4, 0x31 ); \
    _wf_transpose_t = _wf_transpose_r1; _wf_transpose_r1 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r5, 0x20 ); \
    /**/                                _wf_transpose_r5 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r5, 0x31 ); \
    _wf_transpose_t = _wf_transpose_r2; _wf_transpose_r2 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r6, 0x20 ); \
    /**/                                _wf_transpose_r6 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r6, 0x31 ); \
    _wf_transpose_t = _wf_transpose_r3; _wf_transpose_r3 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r7, 0x20 ); \
    /**/                                _wf_transpose_r7 = _mm256_permute2f128_ps( _wf_transpose_t, _wf_transpose_r7, 0x31 ); \
    /* Transpose 2x2 blocks */                                                                                              \
    _wf_transpose_t = _wf_transpose_r0; _wf_transpose_r0 = _mm256_unpacklo_ps( _wf_transpose_t, _wf_transpose_r2 );         \
    /**/                                _wf_transpose_r2 = _mm256_unpackhi_ps( _wf_transpose_t, _wf_transpose_r2 );         \
    _wf_transpose_t = _wf_transpose_r1; _wf_transpose_r1 = _mm256_unpacklo_ps( _wf_transpose_t, _wf_transpose_r3 );         \
    /**/                                _wf_transpose_r3 = _mm256_unpackhi_ps( _wf_transpose_t, _wf_transpose_r3 );         \
    _wf_transpose_t = _wf_transpose_r4; _wf_transpose_r4 = _mm256_unpacklo_ps( _wf_transpose_t, _wf_transpose_r6 );         \
    /**/                                _wf_transpose_r6 = _mm256_unpackhi_ps( _wf_transpose_t, _wf_transpose_r6 );         \
    _wf_transpose_t = _wf_transpose_r5; _wf_transpose_r5 = _mm256_unpacklo_ps( _wf_transpose_t, _wf_transpose_r7 );         \
    /**/                                _wf_transpose_r7 = _mm256_unpackhi_ps( _wf_transpose_t, _wf_transpose_r7 );         \
    /* Transpose 1x1 blocks */                                                                                              \
    /**/ (c0) = _mm256_unpacklo_ps( _wf_transpose_r0, _wf_transpose_r1 );                                                   \
    /**/ (c1) = _mm256_unpackhi_ps( _wf_transpose_r0, _wf_transpose_r1 );                                                   \
    /**/ (c2) = _mm256_unpacklo_ps( _wf_transpose_r2, _wf_transpose_r3 );                                                   \
    /**/ (c3) = _mm256_unpackhi_ps( _wf_transpose_r2, _wf_transpose_r3 );                                                   \
    /**/ (c4) = _mm256_unpacklo_ps( _wf_transpose_r4, _wf_transpose_r5 );                                                   \
    /**/ (c5) = _mm256_unpackhi_ps( _wf_transpose_r4, _wf_transpose_r5 );                                                   \
    /**/ (c6) = _mm256_unpacklo_ps( _wf_transpose_r6, _wf_transpose_r7 );                                                   \
    /**/ (c7) = _mm256_unpackhi_ps( _wf_transpose_r6, _wf_transpose_r7 );                                                   \
  } while(0)
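
/* Usage sketch (illustrative only, not part of the API; the
   wf_example_* name is hypothetical): transposing an 8x8 row-major
   matrix in memory in place. */

static inline void
wf_example_transpose_mem( float * m ) { /* m is 8x8, row-major, stride 8, 32-byte aligned */
  wf_t r0 = wf_ld( m      ); wf_t r1 = wf_ld( m +  8 ); wf_t r2 = wf_ld( m + 16 ); wf_t r3 = wf_ld( m + 24 );
  wf_t r4 = wf_ld( m + 32 ); wf_t r5 = wf_ld( m + 40 ); wf_t r6 = wf_ld( m + 48 ); wf_t r7 = wf_ld( m + 56 );
  wf_t c0; wf_t c1; wf_t c2; wf_t c3; wf_t c4; wf_t c5; wf_t c6; wf_t c7;
  wf_transpose_8x8( r0,r1,r2,r3,r4,r5,r6,r7, c0,c1,c2,c3,c4,c5,c6,c7 );
  wf_st( m,      c0 ); wf_st( m +  8, c1 ); wf_st( m + 16, c2 ); wf_st( m + 24, c3 );
  wf_st( m + 32, c4 ); wf_st( m + 40, c5 ); wf_st( m + 48, c6 ); wf_st( m + 56, c7 );
}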