LCOV - code coverage report
Current view: top level - util/simd - fd_avx_wl.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 125 137 91.2 %
Date: 2025-01-08 12:08:44 Functions: 36 18554 0.2 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_avx_h
       2             : #error "Do not include this directly; use fd_avx.h"
       3             : #endif
       4             : 
       5             : /* Vector long API ****************************************************/
       6             : 
       7             : /* A wl_t is a vector where each adjacent pair of 32-bit wide lanes
       8             :    (e.g. 0-1 / 2-3 / 4-5 / 6-7) holds a signed 64-bit twos-complement
       9             :    integer (a "long").
      10             : 
      11             :    These mirror the other APIs as much as possible.  Macros are
      12             :    preferred over static inlines when it is possible to do it robustly
      13             :    to reduce the risk of the compiler mucking it up. */
      14             : 
      15  3295635064 : #define wl_t __m256i
      16             : 
      17             : /* Constructors */
      18             : 
      19             : /* Given the long values, return ... */
      20             : 
      21     1052832 : #define wl(l0,l1,l2,l3) _mm256_setr_epi64x( (l0), (l1), (l2), (l3) ) /* [ l0 l1 l2 l3 ] */
      22             : 
      23  3140095411 : #define wl_bcast(l0) _mm256_set1_epi64x( (l0) ) /* [ l0 l0 l0 l0 ] */
      24             : 
      25             : static inline wl_t /* [ l0 l1 l0 l1 ] */
      26      196608 : wl_bcast_pair( long l0, long l1 ) {
      27      196608 :   return _mm256_setr_epi64x( l0, l1, l0, l1 );
      28      196608 : }
      29             : 
      30             : static inline wl_t /* [ l0 l0 l1 l1 ] */
      31      196608 : wl_bcast_wide( long l0, long l1 ) {
      32      196608 :   return _mm256_setr_epi64x( l0, l0, l1, l1 );
      33      196608 : }
      34             : 
      35             : /* wl_permute returns [ l(imm_i0) l(imm_i1) l(imm_i2) l(imm_i3) ].
      36             :    imm_i* should be compile time constants in 0:3. */
      37             : 
      38             : #if FD_USING_CLANG /* Sigh ... clang is sad and can't handle passing compile time const expressions through a static inline */
      39             : 
      40             : static inline wl_t
      41     2162688 : wl_permute( wl_t x, int imm_i0, int imm_i1, int imm_i2, int imm_i3 ) {
      42     2162688 :   union { long l[4]; __m256i v[1]; } t, u;
      43     2162688 :   _mm256_store_si256( t.v, x );
      44     2162688 :   u.l[0] = t.l[ imm_i0 ];
      45     2162688 :   u.l[1] = t.l[ imm_i1 ];
      46     2162688 :   u.l[2] = t.l[ imm_i2 ];
      47     2162688 :   u.l[3] = t.l[ imm_i3 ];
      48     2162688 :   return _mm256_load_si256( u.v );
      49     2162688 : }
      50             : 
      51             : #else
      52             : 
      53             : #define wl_permute(x,imm_i0,imm_i1,imm_i2,imm_i3) _mm256_permute4x64_epi64( (x), (imm_i0)+4*(imm_i1)+16*(imm_i2)+64*(imm_i3) )
      54             : 
      55             : #endif
      56             : 
      57             : /* Predefined constants */
      58             : 
      59             : #define wl_zero() _mm256_setzero_si256()   /* Return [ 0L 0L 0L 0L ] */
      60   134873859 : #define wl_one()  _mm256_set1_epi64x( 1L ) /* Return [ 1L 1L 1L 1L ] */
      61             : 
      62             : /* Memory operations */
      63             : 
       64             : /* wl_ld returns the 4 longs at the 32-byte aligned / 32-byte sized
      65             :    location p as a vector long.  wl_ldu is the same but p does not have
      66             :    to be aligned.  wl_st writes the vector long to the 32-byte aligned /
      67             :    32-byte sized location p as 4 longs.  wl_stu is the same but p does
       68             :    not have to be aligned.  In all these 64-bit lane l will be at p[l].
      69             :    FIXME: USE ATTRIBUTES ON P PASSED TO THESE?
      70             : 
      71             :    Note: gcc knows a __m256i may alias. */
      72             : 
      73 12676271251 : static inline wl_t wl_ld( long const * p ) { return _mm256_load_si256(  (__m256i const *)p ); }
      74   134873859 : static inline void wl_st( long * p, wl_t i ) { _mm256_store_si256(  (__m256i *)p, i ); }
      75             : 
      76 13080678208 : static inline wl_t wl_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
      77   539495436 : static inline void wl_stu( void * p, wl_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
      78             : 
      79             : /* wl_ldif is an optimized equivalent to wl_notczero(c,wl_ldu(p)) (may
      80             :    have different behavior if c is not a proper vector conditional).  It
      81             :    is provided for symmetry with the wl_stif operation.  wl_stif stores
      82             :    x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
      83             :    Undefined behavior if c is not a proper vector conditional. */
      84             : 
      85             : #define wl_ldif(c,p)   _mm256_maskload_epi64( (p),(c))
      86             : #define wl_stif(c,p,x) _mm256_maskstore_epi64((p),(c),(x))
      87             : 
      88             : /* Element operations */
      89             : 
      90             : /* wl_extract extracts the long in lane imm from the vector long as a
      91             :    long.  wl_insert returns the vector long formed by replacing the
      92             :    value in lane imm of a with the provided long.  imm should be a
       93             :    compile time constant in 0:3.  wl_extract_variable and
       94             :    wl_insert_variable are slower but the lane n does not have to be
      95             :    known at compile time (should still be in 0:3).
      96             : 
      97             :    Note: C99 TC3 allows type punning through a union. */
      98             : 
      99   539495436 : #define wl_extract(a,imm)  _mm256_extract_epi64( (a), (imm) )
     100             : 
     101   539495436 : #define wl_insert(a,imm,v) _mm256_insert_epi64( (a), (v), (imm) )
     102             : 
     103             : static inline long
     104   539495436 : wl_extract_variable( wl_t a, int n ) {
     105   539495436 :   union { __m256i m[1]; long l[4]; } t[1];
     106   539495436 :   _mm256_store_si256( t->m, a );
     107   539495436 :   return t->l[n];
     108   539495436 : }
     109             : 
     110             : static inline wl_t
     111   539495436 : wl_insert_variable( wl_t a, int n, long v ) {
     112   539495436 :   union { __m256i m[1]; long l[4]; } t[1];
     113   539495436 :   _mm256_store_si256( t->m, a );
     114   539495436 :   t->l[n] = v;
     115   539495436 :   return _mm256_load_si256( t->m );
     116   539495436 : }
     117             : 
     118             : /* Given [a0 a1 a2 a3] and/or [b0 b1 b2 b3], return ... */
     119             : 
     120             : /* Arithmetic operations */
     121             : 
     122             : #define wl_neg(a)   _mm256_sub_epi64( _mm256_setzero_si256(), (a) ) /* [ -a0  -a1  ... -a3  ] (twos complement handling) */
     123             : 
     124             : /* Note: _mm256_{abs,min,max}_epi64 are missing pre AVX-512.  We emulate
     125             :    these below (and use the AVX-512 versions if possible).  Likewise,
     126             :    there is no _mm256_mullo_epi64 pre AVX-512.  Since this is not cheap to
     127             :    emulate, we do not provide a wl_mul for the time being (we could
     128             :    consider exposing it on AVX-512 targets though).  There is a
      129             :    64L*64L->64 multiply (where only the lower 32-bits of each input lane
      130             :    are used, sign extended to 64-bits beforehand) though and that is
      131             :    very useful.  So we do provide that (as wl_mul_ll). */
     132             : 
     133             : #define wl_add(a,b)    _mm256_add_epi64(   (a), (b) ) /* [ a0 +b0     a1 +b1     ... a3 +b3     ] */
     134     5556432 : #define wl_sub(a,b)    _mm256_sub_epi64(   (a), (b) ) /* [ a0 -b0     a1 -b1     ... a3 -b3     ] */
     135             : //#define wl_mul(a,b)  _mm256_mullo_epi64( (a), (b) ) /* [ a0 *b0     a1 *b1     ... a3 *b3     ] */
     136             : #define wl_mul_ll(a,b) _mm256_mul_epi32(   (a), (b) ) /* [ a0l*b0l    a1l*b1l    ... a3l *b3l   ] */
     137             : 
     138             : /* Binary operations */
     139             : 
     140             : /* Note: wl_shl/wl_shr/wl_shru is a left/signed right/unsigned right
     141             :    shift by imm bits; imm should be a compile time constant in 0:63.
     142             :    The variable variants are slower but do not require the shift amount
     143             :    to be known at compile time (should still be in 0:63).  Also, AVX is
     144             :    missing _mm256_sra*_epi64 intrinsics.  We emulate these below. */
     145             : 
     146             : #define wl_not(a) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), (a) ) /* [ ~a0 ~a1 ... ~a3 ] */
     147             : 
     148      463008 : #define wl_shl(a,imm)   _mm256_slli_epi64( (a), (imm) ) /* [ a0<<imm a1<<imm ... a3<<imm ] */
     149             : //#define wl_shr(a,imm) _mm256_srai_epi64( (a), (imm) ) /* [ a0>>imm a1>>imm ... a3>>imm ] (treat a as signed)*/
     150     6019440 : #define wl_shru(a,imm)  _mm256_srli_epi64( (a), (imm) ) /* [ a0>>imm a1>>imm ... a3>>imm ] (treat a as unsigned) */
     151             : 
     152             : #define wl_shl_variable(a,n)   _mm256_sll_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
     153             : //#define wl_shr_variable(a,n) _mm256_sra_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
     154             : #define wl_shru_variable(a,n)  _mm256_srl_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
     155             : 
     156             : #define wl_shl_vector(a,b)   _mm256_sllv_epi64( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a3<<b3 ] */
     157             : //#define wl_shr_vector(a,b) _mm256_srav_epi64( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a3>>b3 ] (treat a as signed) */
     158      463008 : #define wl_shru_vector(a,b)  _mm256_srlv_epi64( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a3>>b3 ] (treat a as unsigned) */
     159             : 
     160             : #define wl_and(a,b)    _mm256_and_si256(    (a), (b) ) /* [   a0 &b0    a1& b1 ...   a3& b3 ] */
     161             : #define wl_andnot(a,b) _mm256_andnot_si256( (a), (b) ) /* [ (~a0)&b0  (~a1)&b1 ... (~a3)&b3 ] */
     162    41943040 : #define wl_or(a,b)     _mm256_or_si256(     (a), (b) ) /* [   a0 |b0    a1 |b1 ...   a3 |b3 ] */
     163             : #define wl_xor(a,b)    _mm256_xor_si256(    (a), (b) ) /* [   a0 ^b0    a1 ^b1 ...   a3 ^b3 ] */
     164             : 
     165             : /* wl_rol(x,n) returns wl( rotate_left (x0,n), rotate_left (x1,n), ... )
     166             :    wl_ror(x,n) returns wl( rotate_right(x0,n), rotate_right(x1,n), ... ) */
     167             : 
     168             : #if FD_HAS_AVX512
     169             : #define wl_rol(a,imm)  _mm256_rol_epi64( (a), (imm) )
     170             : #define wl_ror(a,imm)  _mm256_ror_epi64( (a), (imm) )
     171             : #else
     172     8388608 : static inline wl_t wl_rol( wl_t a, int imm ) { return wl_or( wl_shl(  a, imm & 63 ), wl_shru( a, (-imm) & 63 ) ); }
     173     8388608 : static inline wl_t wl_ror( wl_t a, int imm ) { return wl_or( wl_shru( a, imm & 63 ), wl_shl(  a, (-imm) & 63 ) ); }
     174             : #endif
     175             : 
     176    12582912 : static inline wl_t wl_rol_variable( wl_t a, int n ) { return wl_or( wl_shl_variable(  a, n&63 ), wl_shru_variable( a, (-n)&63 ) ); }
     177    12582912 : static inline wl_t wl_ror_variable( wl_t a, int n ) { return wl_or( wl_shru_variable( a, n&63 ), wl_shl_variable(  a, (-n)&63 ) ); }
     178             : 
     179           0 : static inline wl_t wl_rol_vector( wl_t a, wl_t b ) {
     180           0 :   wl_t m = wl_bcast( 63L );
     181           0 :   return wl_or( wl_shl_vector(  a, wl_and( b, m ) ), wl_shru_vector( a, wl_and( wl_neg( b ), m ) ) );
     182           0 : }
     183             : 
     184           0 : static inline wl_t wl_ror_vector( wl_t a, wl_t b ) {
     185           0 :   wl_t m = wl_bcast( 63L );
     186           0 :   return wl_or( wl_shru_vector( a, wl_and( b, m ) ), wl_shl_vector(  a, wl_and( wl_neg( b ), m ) ) );
     187           0 : }
     188             : 
     189             : /* Logical operations */
     190             : 
     191             : #define wl_lnot(a)    _mm256_cmpeq_epi64( (a), _mm256_setzero_si256() ) /* [  !a0  !a1 ...  !a3 ] */
     192             : #define wl_lnotnot(a)                                                   /* [ !!a0 !!a1 ... !!a3 ] */ \
     193             :   _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpeq_epi64( (a), _mm256_setzero_si256() ) )
     194             : 
     195      463008 : #define wl_eq(a,b) _mm256_cmpeq_epi64( (a), (b) )                                                /* [ a0==b0 a1==b1 ... a3==b3 ] */
     196      463008 : #define wl_gt(a,b) _mm256_cmpgt_epi64( (a), (b) )                                                /* [ a0> b0 a1> b1 ... a3> b3 ] */
     197    25165824 : #define wl_lt(a,b) _mm256_cmpgt_epi64( (b), (a) )                                                /* [ a0< b0 a1< b1 ... a3< b3 ] */
     198             : #define wl_ne(a,b) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpeq_epi64( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a3!=b3 ] */
     199             : #define wl_ge(a,b) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpgt_epi64( (b), (a) ) ) /* [ a0>=b0 a1>=b1 ... a3>=b3 ] */
     200             : #define wl_le(a,b) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpgt_epi64( (a), (b) ) ) /* [ a0<=b0 a1<=b1 ... a3<=b3 ] */
     201             : 
     202             : /* Conditional operations */
     203             : 
     204             : #define wl_czero(c,f)    _mm256_andnot_si256( (c), (f) ) /* [ c0?0L:f0 c1?0L:f1 ... c3?0L:f3 ] */
     205             : #define wl_notczero(c,f) _mm256_and_si256(    (c), (f) ) /* [ c0?f0:0L c1?f1:0L ... c3?f3:0L ] */
     206             : 
     207      917504 : #define wl_if(c,t,f) _mm256_blendv_epi8(  (f), (t), (c) ) /* [ c0?t0:f0 c1?t1:f1 ... c3?t3:f3 ] */
     208             : 
     209             : #if defined(__AVX512F__) && defined(__AVX512VL__) /* See note above */
     210             : #define wl_abs(a)   _mm256_abs_epi64( (a) )
     211      131072 : #define wl_min(a,b) _mm256_min_epi64( (a), (b) )
     212      131072 : #define wl_max(a,b) _mm256_max_epi64( (a), (b) )
     213             : #else
     214      131072 : static inline wl_t wl_abs( wl_t a )         { return wl_if( wl_lt( a, wl_zero() ), wl_neg( a ), a ); }
     215      393216 : static inline wl_t wl_min( wl_t a, wl_t b ) { return wl_if( wl_lt( a, b ), a, b ); }
     216      393216 : static inline wl_t wl_max( wl_t a, wl_t b ) { return wl_if( wl_gt( a, b ), a, b ); }
     217             : #endif
     218             : 
     219    12582912 : static inline wl_t wl_shr( wl_t a, int imm ) {
     220    12582912 :   wc_t c = wl_lt( a, wl_zero() ); /* Note that wc_t is binary compat with wl_t */
     221    12582912 :   return _mm256_xor_si256( _mm256_srli_epi64( _mm256_xor_si256( a, c ), imm ), c );
     222    12582912 : }
     223    12582912 : static inline wl_t wl_shr_variable( wl_t a, int n ) {
     224    12582912 :   wc_t c = wl_lt( a, wl_zero() ); /* Note that wc_t is binary compat with wl_t */
     225    12582912 :   return _mm256_xor_si256( _mm256_srl_epi64( _mm256_xor_si256( a, c ), _mm_insert_epi64( _mm_setzero_si128(), n, 0 ) ), c );
     226    12582912 : }
     227           0 : static inline wl_t wl_shr_vector( wl_t a, wl_t n ) {
     228           0 :   wc_t c = wl_lt( a, wl_zero() ); /* Note that wc_t is binary compat with wl_t */
     229           0 :   return _mm256_xor_si256( _mm256_srlv_epi64( _mm256_xor_si256( a, c ), n ), c );
     230           0 : }
     231             : 
     232             : /* Conversion operations */
     233             : 
     234             : /* Summarizing:
     235             : 
      236             :    wl_to_wc(l)     returns [ !!l0 !!l0 !!l1 !!l1 ... !!l3 !!l3 ]
     237             : 
      238             :    wl_to_wf(l,f,0) returns [ (float)l0 (float)l1 (float)l2 (float)l3 f4 f5 f6 f7 ]
      239             :    wl_to_wf(l,f,1) returns [ f0 f1 f2 f3 (float)l0 (float)l1 (float)l2 (float)l3 ]
     240             : 
     241             :    wl_to_wi(l,i,0) returns [ (int)l0 (int)l1 (int)l2 (int)l3 i4 i5 i6 i7 ]
     242             :    wl_to_wi(l,i,1) returns [ i0 i1 i2 i3 (int)l0 (int)l1 (int)l2 (int)l3 ]
     243             : 
     244             :    wl_to_wu(l,u,0) returns [ (uint)l0 (uint)l1 (uint)l2 (uint)l3 u4 u5 u6 u7 ]
     245             :    wl_to_wu(l,u,1) returns [ u0 u1 u2 u3 (uint)l0 (uint)l1 (uint)l2 (uint)l3 ]
     246             : 
     247             :    wl_to_wd(l)     returns [ (double)l0 (double)l1 (double)l2 (double)l3 ]
     248             : 
     249             :    wl_to_wv(l)     returns [ (ulong)l0 (ulong)l1 (ulong)l2 (ulong)l3 ]
     250             : 
     251             :    The raw variants just treat the raw bits as the corresponding vector
     252             :    type.  For wl_to_wc_raw, the user promises wl contains a proper
     253             :    vector conditional (e.g. 0 or -1 in each lane).  The others are
     254             :    provided to facilitate doing advanced bit tricks on floating point
     255             :    values. */
     256             : 
     257             : #define wl_to_wc(a) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpeq_epi64( (a), _mm256_setzero_si256() ) )
     258             : 
     259      393216 : static inline wf_t wl_to_wf( wl_t l, wf_t f, int imm_hi ) {
     260      393216 :   union { long  l[4]; __m256i v[1]; } t[1];
     261      393216 :   union { float f[4]; __m128  v[1]; } u[1];
     262      393216 :   _mm256_store_si256( t->v, l );
     263      393216 :   u->f[0] = (float)t->l[0];
     264      393216 :   u->f[1] = (float)t->l[1];
     265      393216 :   u->f[2] = (float)t->l[2];
     266      393216 :   u->f[3] = (float)t->l[3];
     267      393216 :   __m128 v = _mm_load_ps( u->f );
     268      393216 :   return imm_hi ? _mm256_insertf128_ps( f, v, 1 ) : _mm256_insertf128_ps( f, v, 0 ); /* compile time */
     269      393216 : }
     270             : 
     271      393216 : static inline wl_t wl_to_wi( wl_t l, wi_t i, int imm_hi ) {
     272      393216 :   __m128  v01 = _mm_castsi128_ps( _mm256_extractf128_si256( l, 0 ) ); /* [ l0l l0h l1l l1h ] */
     273      393216 :   __m128  v23 = _mm_castsi128_ps( _mm256_extractf128_si256( l, 1 ) ); /* [ l2l l2h l3l l3h ] */
     274      393216 :   __m128i v   = _mm_castps_si128( _mm_shuffle_ps( v01, v23, _MM_SHUFFLE(2,0,2,0) ) );
     275      393216 :   return imm_hi ? _mm256_insertf128_si256( i, v, 1 ) : _mm256_insertf128_si256( i, v, 0 ); /* compile time */
     276      393216 : }
     277             : 
     278      393216 : static inline wu_t wl_to_wu( wl_t l, wu_t u, int imm_hi ) {
     279      393216 :   __m128  v01 = _mm_castsi128_ps( _mm256_extractf128_si256( l, 0 ) ); /* [ l0l l0h l1l l1h ] */
     280      393216 :   __m128  v23 = _mm_castsi128_ps( _mm256_extractf128_si256( l, 1 ) ); /* [ l2l l2h l3l l3h ] */
     281      393216 :   __m128i v   = _mm_castps_si128( _mm_shuffle_ps( v01, v23, _MM_SHUFFLE(2,0,2,0) ) );
     282      393216 :   return imm_hi ? _mm256_insertf128_si256( u, v, 1 ) : _mm256_insertf128_si256( u, v, 0 ); /* compile time */
     283      393216 : }
     284             : 
     285             : /* FIXME: IS IT FASTER TO USE INSERT / EXTRACT HERE? */
     286      196608 : static inline wd_t wl_to_wd( wl_t l ) {
     287      196608 :   union { long   l[4]; __m256i v[1]; } t[1];
     288      196608 :   union { double d[4]; __m256d v[1]; } u[1];
     289      196608 :   _mm256_store_si256( t->v, l );
     290      196608 :   u->d[0] = (double)t->l[0];
     291      196608 :   u->d[1] = (double)t->l[1];
     292      196608 :   u->d[2] = (double)t->l[2];
     293      196608 :   u->d[3] = (double)t->l[3];
     294      196608 :   return _mm256_load_pd( u->d );
     295      196608 : }
     296             : 
     297             : #define wl_to_wv(a) (a)
     298             : 
     299             : #define wl_to_wc_raw(a) (a)
     300             : #define wl_to_wf_raw(a) _mm256_castsi256_ps( (a) )
     301             : #define wl_to_wi_raw(a) (a)
     302             : #define wl_to_wu_raw(a) (a)
     303             : #define wl_to_wd_raw(a) _mm256_castsi256_pd( (a) )
     304             : #define wl_to_wv_raw(a) (a)
     305             : 
     306             : /* Reduction operations */
     307             : 
     308             : static inline wl_t
     309      196608 : wl_sum_all( wl_t x ) { /* Returns wl_bcast( sum( x ) ) */
     310      196608 :   x = _mm256_add_epi64( x, _mm256_permute2f128_si256( x, x, 1 ) );
     311      196608 :   return _mm256_add_epi64( x, _mm256_castpd_si256( _mm256_permute_pd( _mm256_castsi256_pd( x ), 5 ) ) );
     312      196608 : }
     313             : 
     314             : static inline wl_t
     315      196608 : wl_min_all( wl_t x ) { /* Returns wl_bcast( min( x ) ) */
     316      196608 :   x = wl_min( x, _mm256_permute2f128_si256( x, x, 1 ) );
     317      196608 :   return wl_min( x, _mm256_castpd_si256( _mm256_permute_pd( _mm256_castsi256_pd( x ), 5 ) ) );
     318      196608 : }
     319             : 
     320             : static inline wl_t
     321      196608 : wl_max_all( wl_t x ) { /* Returns wl_bcast( max( x ) ) */
     322      196608 :   x = wl_max( x, _mm256_permute2f128_si256( x, x, 1 ) );
     323      196608 :   return wl_max( x, _mm256_castpd_si256( _mm256_permute_pd( _mm256_castsi256_pd( x ), 5 ) ) );
     324      196608 : }
     325             : 
     326             : /* Misc operations */
     327             : 
     328             : /* wl_gather(b,i,imm_hi) returns
     329             :      [ b[i(0)] b[i(1)] b[i(2)] b[i(3)] ] if imm_hi is 0 and
     330             :      [ b[i(4)] b[i(5)] b[i(6)] b[i(7)] ] o.w.
     331             :    where b is a "long const*", i is wi_t and imm_hi is a compile time
     332             :    constant.  We use a static inline here instead of a define to keep
     333             :    strict type checking while working around yet another Intel intrinsic
     334             :    type mismatch issue. */
     335             : 
     336   269747718 : static inline wl_t wl_gather( long const * b, wi_t i, int imm_hi ) {
     337             :   /* A compile time branch, but older versions of GCC can't handle the
     338             :      ternary operator with -O0 */
     339   269747718 :   if( imm_hi ) return _mm256_i32gather_epi64( (long long const *)b, _mm256_extractf128_si256( i, 1 ), 8 );
     340   134873859 :   else         return _mm256_i32gather_epi64( (long long const *)b, _mm256_extractf128_si256( i, 0 ), 8 );
     341   269747718 : }
     342             : 
     343             : /* wl_transpose_4x4 transposes the 4x4 matrix stored in wl_t r0,r1,r2,r3
     344             :    and stores the result in 4x4 matrix wl_t c0,c1,c2,c3.  All
     345             :    c0,c1,c2,c3 should be different for a well defined result.
     346             :    Otherwise, in-place operation and/or using the same wl_t to specify
     347             :    multiple rows of r is fine. */
     348             : 
     349      196608 : #define wl_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do {                                                                         \
     350      196608 :     wl_t _wl_transpose_r0 = (r0); wl_t _wl_transpose_r1 = (r1); wl_t _wl_transpose_r2 = (r2); wl_t _wl_transpose_r3 = (r3);       \
     351      196608 :     wl_t _wl_transpose_t;                                                                                                         \
     352      196608 :     /* Transpose 2x2 blocks */                                                                                                    \
     353      196608 :     _wl_transpose_t = _wl_transpose_r0; _wl_transpose_r0 = _mm256_permute2f128_si256( _wl_transpose_t,  _wl_transpose_r2, 0x20 ); \
     354      196608 :     /**/                                _wl_transpose_r2 = _mm256_permute2f128_si256( _wl_transpose_t,  _wl_transpose_r2, 0x31 ); \
     355      196608 :     _wl_transpose_t = _wl_transpose_r1; _wl_transpose_r1 = _mm256_permute2f128_si256( _wl_transpose_t,  _wl_transpose_r3, 0x20 ); \
     356      196608 :     /**/                                _wl_transpose_r3 = _mm256_permute2f128_si256( _wl_transpose_t,  _wl_transpose_r3, 0x31 ); \
     357      196608 :     /* Transpose 1x1 blocks */                                                                                                    \
     358      196608 :     /**/                                (c0)             = _mm256_unpacklo_epi64(     _wl_transpose_r0, _wl_transpose_r1 );       \
     359      196608 :     /**/                                (c1)             = _mm256_unpackhi_epi64(     _wl_transpose_r0, _wl_transpose_r1 );       \
     360      196608 :     /**/                                (c2)             = _mm256_unpacklo_epi64(     _wl_transpose_r2, _wl_transpose_r3 );       \
     361      196608 :     /**/                                (c3)             = _mm256_unpackhi_epi64(     _wl_transpose_r2, _wl_transpose_r3 );       \
     362      196608 :   } while(0)

Generated by: LCOV version 1.14