#ifndef HEADER_fd_src_util_simd_fd_avx_h
#error "Do not include this directly; use fd_avx.h"
#endif

/* Vector ulong API ***************************************************/

/* A wv_t is a vector where each adjacent pair of 32-bit wide lanes
   (e.g. 0-1 / 2-3 / 4-5 / 6-7) holds an unsigned 64-bit integer (a
   "ulong").

   These mirror the other APIs as much as possible.  Macros are
   preferred over static inlines when it is possible to do it robustly
   to reduce the risk of the compiler mucking it up. */

#define wv_t __m256i

/* Constructors */

/* Given the ulong values, return ... */

#define wv(v0,v1,v2,v3) _mm256_setr_epi64x( (long)(v0), (long)(v1), (long)(v2), (long)(v3) ) /* [ v0 v1 v2 v3 ] */

#define wv_bcast(v0) _mm256_set1_epi64x( (long)(v0) ) /* [ v0 v0 v0 v0 ] */

static inline wv_t /* [ v0 v1 v0 v1 ] */
wv_bcast_pair( ulong v0, ulong v1 ) {
  return _mm256_setr_epi64x( (long)v0, (long)v1, (long)v0, (long)v1 );
}

static inline wv_t /* [ v0 v0 v1 v1 ] */
wv_bcast_wide( ulong v0, ulong v1 ) {
  return _mm256_setr_epi64x( (long)v0, (long)v0, (long)v1, (long)v1 );
}
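
/* For example (an illustrative sketch; i is a hypothetical loop
   index), building the lane index vector for a strip mined loop and a
   broadcast constant:

     wv_t idx  = wv( i, i+1UL, i+2UL, i+3UL );   ... [ i i+1 i+2 i+3 ]
     wv_t mask = wv_bcast( 0xffUL );             ... [ 0xff 0xff 0xff 0xff ]
*/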

/* wv_permute returns [ x(imm_i0) x(imm_i1) x(imm_i2) x(imm_i3) ].
   imm_i* should be compile time constants in 0:3. */

#if FD_USING_CLANG /* Sigh ... clang is sad and can't handle passing compile time const expressions through a static inline */

static inline wv_t
wv_permute( wv_t x, int imm_i0, int imm_i1, int imm_i2, int imm_i3 ) {
  union { ulong u[4]; __m256i v[1]; } t, u;
  _mm256_store_si256( t.v, x );
  u.u[0] = t.u[ imm_i0 ];
  u.u[1] = t.u[ imm_i1 ];
  u.u[2] = t.u[ imm_i2 ];
  u.u[3] = t.u[ imm_i3 ];
  return _mm256_load_si256( u.v );
}

#else

#define wv_permute(x,imm_i0,imm_i1,imm_i2,imm_i3) _mm256_permute4x64_epi64( (x), (imm_i0)+4*(imm_i1)+16*(imm_i2)+64*(imm_i3) )

#endif
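
/* For example, wv_permute( x, 3,2,1,0 ) reverses the lane order (an
   illustrative sketch):

     wv_t x = wv( 0UL, 1UL, 2UL, 3UL );
     wv_t y = wv_permute( x, 3,2,1,0 );   ... y is [ 3UL 2UL 1UL 0UL ]

   (On the non-clang path, the four 2-bit indices pack into the
   immediate 3+4*2+16*1+64*0 = 0x1b for _mm256_permute4x64_epi64.) */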

/* Predefined constants */

#define wv_zero() _mm256_setzero_si256()   /* Return [ 0UL 0UL 0UL 0UL ] */
#define wv_one()  _mm256_set1_epi64x( 1L ) /* Return [ 1UL 1UL 1UL 1UL ] */

/* Memory operations */

/* wv_ld returns the 4 ulongs at the 32-byte aligned / 32-byte sized
   location p as a vector ulong.  wv_ldu is the same but p does not have
   to be aligned.  wv_st writes the vector ulong to the 32-byte aligned
   / 32-byte sized location p as 4 ulongs.  wv_stu is the same but p
   does not have to be aligned.  In all these, 64-bit lane l will be at
   p[l].  FIXME: USE ATTRIBUTES ON P PASSED TO THESE?

   Note: gcc knows a __m256i may alias. */

static inline wv_t wv_ld( ulong const * p ) { return _mm256_load_si256(  (__m256i const *)p ); }
static inline void wv_st( ulong * p, wv_t i ) { _mm256_store_si256(  (__m256i *)p, i ); }

static inline wv_t wv_ldu( void const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
static inline void wv_stu( void * p, wv_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
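
/* For example, copying n ulongs (n a multiple of 4) between 32-byte
   aligned buffers, four lanes per iteration (an illustrative sketch;
   example_copy is hypothetical):

     static inline void
     example_copy( ulong * dst, ulong const * src, ulong n ) {
       for( ulong i=0UL; i<n; i+=4UL ) wv_st( dst+i, wv_ld( src+i ) );
     }

   wv_ldu / wv_stu handle unaligned buffers similarly. */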

/* wv_ldif is an optimized equivalent to wv_notczero(c,wv_ldu(p)) (may
   have different behavior if c is not a proper vector conditional).  It
   is provided for symmetry with the wv_stif operation.  wv_stif stores
   x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
   Undefined behavior if c is not a proper vector conditional. */

#define wv_ldif(c,p)   _mm256_maskload_epi64( (p),(c))
#define wv_stif(c,p,x) _mm256_maskstore_epi64((p),(c),(x))
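
/* For example, a masked tail store (an illustrative sketch; rem is a
   hypothetical lane count in [0,4) and p has room for at least rem
   ulongs): write only the first rem lanes of x to p by building a
   vector conditional that is true exactly for lanes below rem:

     wv_t c = wv_gt( wv_bcast( rem ), wv( 0UL, 1UL, 2UL, 3UL ) );
     wv_stif( c, p, x );

   (wv_gt is defined under the logical operations below.) */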

/* Element operations */

/* wv_extract extracts the ulong in lane imm from the vector ulong as a
   ulong.  wv_insert returns the vector ulong formed by replacing the
   value in lane imm of a with the provided ulong.  imm should be a
   compile time known in 0:3.  wv_extract_variable and
   wv_insert_variable are slower but the lane n does not have to be
   known at compile time (should still be in 0:3).

   Note: C99 TC3 allows type punning through a union. */

#define wv_extract(a,imm)  ((ulong)_mm256_extract_epi64( (a), (imm) ))

#define wv_insert(a,imm,v) _mm256_insert_epi64( (a), (long)(v), (imm) )

static inline ulong
wv_extract_variable( wv_t a, int n ) {
  union { __m256i m[1]; ulong u[4]; } t[1];
  _mm256_store_si256( t->m, a );
  return t->u[n];
}

static inline wv_t
wv_insert_variable( wv_t a, int n, ulong v ) {
  union { __m256i m[1]; ulong u[4]; } t[1];
  _mm256_store_si256( t->m, a );
  t->u[n] = v;
  return _mm256_load_si256( t->m );
}
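
/* For example (an illustrative sketch):

     wv_t  x = wv( 10UL, 11UL, 12UL, 13UL );
     ulong v = wv_extract( x, 2 );      ... v is 12UL
     x       = wv_insert( x, 0, 99UL ); ... x is [ 99UL 11UL 12UL 13UL ]
*/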

/* Given [a0 a1 a2 a3] and/or [b0 b1 b2 b3], return ... */

/* Arithmetic operations */

#define wv_neg(a) _mm256_sub_epi64( _mm256_setzero_si256(), (a) ) /* [ -a0  -a1  ... -a3  ] */
#define wv_abs(a) (a)                                             /* [ |a0| |a1| ... |a3| ] */

/* Note: _mm256_{min,max}_epu64 are missing pre AVX-512.  We emulate
   these on pre AVX-512 targets below (and use the AVX-512 versions if
   possible).  Likewise, there is no _mm256_mullo_epi64 pre AVX-512.
   Since this is not cheap to emulate, we do not provide a wv_mul for
   the time being (we could consider exposing it on AVX-512 targets
   though).  There is a 64L*64L->64 multiply (where the lower 32-bits of
   the inputs will be zero extended to 64-bits beforehand) though and
   that is very useful.  So we do provide that. */

#define wv_add(a,b)    _mm256_add_epi64(   (a), (b) ) /* [ a0 +b0     a1 +b1     ... a3 +b3     ] */
#define wv_sub(a,b)    _mm256_sub_epi64(   (a), (b) ) /* [ a0 -b0     a1 -b1     ... a3 -b3     ] */
//#define wv_mul(a,b)  _mm256_mullo_epi64( (a), (b) ) /* [ a0 *b0     a1 *b1     ... a3 *b3     ] */
#define wv_mul_ll(a,b) _mm256_mul_epu32(   (a), (b) ) /* [ a0l*b0l    a1l*b1l    ... a3l*b3l    ] */
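
/* The commented out wv_mul above could be emulated lane-wise from
   wv_mul_ll on pre AVX-512 targets (an illustrative sketch, using
   wv_shl / wv_shr from the binary operations below; example_mul is
   hypothetical):

     static inline wv_t
     example_mul( wv_t a, wv_t b ) {          ... [ a0*b0 ... a3*b3 ] mod 2^64
       wv_t lo    = wv_mul_ll( a, b );        ... al*bl per lane
       wv_t cross = wv_add( wv_mul_ll( wv_shr( a, 32 ), b ),     ... ah*bl
                            wv_mul_ll( a, wv_shr( b, 32 ) ) );   ... al*bh
       return wv_add( lo, wv_shl( cross, 32 ) );
     }
*/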

/* Binary operations */

/* Note: wv_shl/wv_shr is a left/right shift by imm bits; imm should be
   a compile time constant in 0:63.  The variable variants are slower
   but do not require the shift amount to be known at compile time
   (should still be in 0:63). */

#define wv_not(a) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), (a) ) /* [ ~a0 ~a1 ... ~a3 ] */

#define wv_shl(a,imm) _mm256_slli_epi64( (a), (imm) ) /* [ a0<<imm a1<<imm ... a3<<imm ] */
#define wv_shr(a,imm) _mm256_srli_epi64( (a), (imm) ) /* [ a0>>imm a1>>imm ... a3>>imm ] */

#define wv_shl_variable(a,n) _mm256_sll_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )
#define wv_shr_variable(a,n) _mm256_srl_epi64( (a), _mm_insert_epi64( _mm_setzero_si128(), (n), 0 ) )

#define wv_shl_vector(a,b) _mm256_sllv_epi64( (a), (b) ) /* [ a0<<b0 a1<<b1 ... a3<<b3 ] */
#define wv_shr_vector(a,b) _mm256_srlv_epi64( (a), (b) ) /* [ a0>>b0 a1>>b1 ... a3>>b3 ] */

#define wv_and(a,b)    _mm256_and_si256(    (a), (b) ) /* [   a0 &b0    a1 &b1 ...   a3 &b3 ] */
#define wv_andnot(a,b) _mm256_andnot_si256( (a), (b) ) /* [ (~a0)&b0  (~a1)&b1 ... (~a3)&b3 ] */
#define wv_or(a,b)     _mm256_or_si256(     (a), (b) ) /* [   a0 |b0    a1 |b1 ...   a3 |b3 ] */
#define wv_xor(a,b)    _mm256_xor_si256(    (a), (b) ) /* [   a0 ^b0    a1 ^b1 ...   a3 ^b3 ] */

/* wv_rol(x,n) returns wv( rotate_left (x0,n), rotate_left (x1,n), ... )
   wv_ror(x,n) returns wv( rotate_right(x0,n), rotate_right(x1,n), ... ) */

#if FD_HAS_AVX512
#define wv_rol(a,imm)  _mm256_rol_epi64( (a), (imm) )
#define wv_ror(a,imm)  _mm256_ror_epi64( (a), (imm) )
#else
static inline wv_t wv_rol( wv_t a, int imm ) { return wv_or( wv_shl( a, imm & 63 ), wv_shr( a, (-imm) & 63 ) ); }
static inline wv_t wv_ror( wv_t a, int imm ) { return wv_or( wv_shr( a, imm & 63 ), wv_shl( a, (-imm) & 63 ) ); }
#endif

static inline wv_t wv_rol_variable( wv_t a, int n ) { return wv_or( wv_shl_variable( a, n&63 ), wv_shr_variable( a, (-n)&63 ) ); }
static inline wv_t wv_ror_variable( wv_t a, int n ) { return wv_or( wv_shr_variable( a, n&63 ), wv_shl_variable( a, (-n)&63 ) ); }

static inline wv_t wv_rol_vector( wv_t a, wl_t b ) {
  wl_t m = wl_bcast( 63L );
  return wv_or( wv_shl_vector( a, wl_and( b, m ) ), wv_shr_vector( a, wl_and( wl_neg( b ), m ) ) );
}

static inline wv_t wv_ror_vector( wv_t a, wl_t b ) {
  wl_t m = wl_bcast( 63L );
  return wv_or( wv_shr_vector( a, wl_and( b, m ) ), wv_shl_vector( a, wl_and( wl_neg( b ), m ) ) );
}

#define wv_bswap(a) wu_to_wv_raw( wu_bswap( wv_to_wu_raw( wv_rol( (a), 32 ) ) ) )
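
/* wv_bswap works in two steps: wv_rol( a, 32 ) swaps the two 32-bit
   halves of each 64-bit lane and wu_bswap then byte swaps each 32-bit
   lane, which together reverse all 8 bytes.  A scalar sketch of the
   same decomposition (illustrative only):

     ulong bswap64( ulong x ) {
       x = (x<<32) | (x>>32);   ... swap 32-bit halves
       x = ((x & 0x0000ffff0000ffffUL)<<16) | ((x>>16) & 0x0000ffff0000ffffUL);
       x = ((x & 0x00ff00ff00ff00ffUL)<< 8) | ((x>> 8) & 0x00ff00ff00ff00ffUL);
       return x;
     }
*/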

/* Logical operations */

/* As noted below in the converters, Intel clearly has the hardware to
   do a _mm256_cmpgt_epu64 given that _mm256_cmpgt_epi64 exists but
   doesn't expose it in the ISA pre AVX-512.  Sigh ... twos complement
   bit tricks to the rescue for wv_{gt,lt,ge,le}. */

#define wv_lnot(a) _mm256_cmpeq_epi64( (a), _mm256_setzero_si256() )                           /* [  !a0  !a1 ...  !a3 ] */
#define wv_lnotnot(a)                                                                          /* [ !!a0 !!a1 ... !!a3 ] */ \
  _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpeq_epi64( (a), _mm256_setzero_si256() ) )

#define wv_eq(a,b) _mm256_cmpeq_epi64( (a), (b) )                                              /* [ a0==b0 a1==b1 ... a3==b3 ] */
#define wv_gt(a,b)                                                                             /* [ a0> b0 a1> b1 ... a3> b3 ] */ \
  _mm256_cmpgt_epi64( _mm256_sub_epi64( (a), _mm256_set1_epi64x( (long)(1UL<<63) ) ),                                             \
                      _mm256_sub_epi64( (b), _mm256_set1_epi64x( (long)(1UL<<63) ) ) )
#define wv_lt(a,b) wv_gt( (b), (a) )                                                           /* [ a0< b0 a1< b1 ... a3< b3 ] */
#define wv_ne(a,b) _mm256_xor_si256( _mm256_set1_epi64x(-1L), _mm256_cmpeq_epi64( (a), (b) ) ) /* [ a0!=b0 a1!=b1 ... a3!=b3 ] */
#define wv_ge(a,b) _mm256_xor_si256( _mm256_set1_epi64x(-1L), wv_gt( (b), (a) ) )              /* [ a0>=b0 a1>=b1 ... a3>=b3 ] */
#define wv_le(a,b) _mm256_xor_si256( _mm256_set1_epi64x(-1L), wv_gt( (a), (b) ) )              /* [ a0<=b0 a1<=b1 ... a3<=b3 ] */
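
/* The wv_gt trick: subtracting 2^63 from each lane maps unsigned order
   onto signed order, so the signed compare then gives the unsigned
   result.  Scalar sketch of the same identity (illustrative only):

     int ugt( ulong a, ulong b ) {   ... returns a>b as an unsigned compare
       return (long)(a - (1UL<<63)) > (long)(b - (1UL<<63));
     }
*/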

/* Conditional operations */

#define wv_czero(c,f)    _mm256_andnot_si256( (c), (f) )     /* [ c0?0UL:f0 c1?0UL:f1 ... c3?0UL:f3 ] */
#define wv_notczero(c,f) _mm256_and_si256(    (c), (f) )     /* [ c0?f0:0UL c1?f1:0UL ... c3?f3:0UL ] */

#define wv_if(c,t,f)     _mm256_blendv_epi8( (f), (t), (c) ) /* [ c0?t0:f0  c1?t1:f1  ... c3?t3:f3 ] */

#if defined(__AVX512F__) && defined(__AVX512VL__) /* See note above */
#define wv_min(a,b) _mm256_min_epu64( (a), (b) )
#define wv_max(a,b) _mm256_max_epu64( (a), (b) )
#else
static inline wv_t wv_min( wv_t a, wv_t b ) { return wv_if( wv_lt( a, b ), a, b ); }
static inline wv_t wv_max( wv_t a, wv_t b ) { return wv_if( wv_gt( a, b ), a, b ); }
#endif
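
/* For example, branchless per lane clamping to [lo,hi] (an
   illustrative sketch; example_clamp is hypothetical):

     static inline wv_t
     example_clamp( wv_t x, wv_t lo, wv_t hi ) {
       return wv_min( wv_max( x, lo ), hi );   ... [ clamp(x0) ... clamp(x3) ]
     }
*/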

/* Conversion operations */

/* Summarizing:

   wv_to_wc(v)     returns [ !!v0 !!v0 !!v1 !!v1 ... !!v3 !!v3 ]

   wv_to_wf(v,f,0) returns [ (float)v0 (float)v1 (float)v2 (float)v3 f4 f5 f6 f7 ]
   wv_to_wf(v,f,1) returns [ f0 f1 f2 f3 (float)v0 (float)v1 (float)v2 (float)v3 ]

   wv_to_wi(v,i,0) returns [ (int)v0 (int)v1 (int)v2 (int)v3 i4 i5 i6 i7 ]
   wv_to_wi(v,i,1) returns [ i0 i1 i2 i3 (int)v0 (int)v1 (int)v2 (int)v3 ]

   wv_to_wu(v,u,0) returns [ (uint)v0 (uint)v1 (uint)v2 (uint)v3 u4 u5 u6 u7 ]
   wv_to_wu(v,u,1) returns [ u0 u1 u2 u3 (uint)v0 (uint)v1 (uint)v2 (uint)v3 ]

   wv_to_wd(v)     returns [ (double)v0 (double)v1 (double)v2 (double)v3 ]

   wv_to_wl(v)     returns [ (long)v0 (long)v1 (long)v2 (long)v3 ]

   The raw variants just treat the raw bits as the corresponding vector
   type.  For wv_to_wc_raw, the user promises wv contains a proper
   vector conditional (e.g. 0 or -1 in each lane).  The others are
   provided to facilitate doing advanced bit tricks on floating point
   values. */

#define wv_to_wc(a) _mm256_xor_si256( _mm256_set1_epi64x( -1L ), _mm256_cmpeq_epi64( (a), _mm256_setzero_si256() ) )

static inline wf_t wv_to_wf( wv_t v, wf_t f, int imm_hi ) {
  union { ulong u[4]; __m256i v[1]; } t[1];
  union { float f[4]; __m128  v[1]; } u[1];
  _mm256_store_si256( t->v, v );
  u->f[0] = (float)t->u[0];
  u->f[1] = (float)t->u[1];
  u->f[2] = (float)t->u[2];
  u->f[3] = (float)t->u[3];
  __m128 w = _mm_load_ps( u->f );
  return imm_hi ? _mm256_insertf128_ps( f, w, 1 ) : _mm256_insertf128_ps( f, w, 0 ); /* compile time */
}

static inline wi_t wv_to_wi( wv_t v, wi_t i, int imm_hi ) {
  __m128  v01 = _mm_castsi128_ps( _mm256_extractf128_si256( v, 0 ) ); /* [ v0l v0h v1l v1h ] */
  __m128  v23 = _mm_castsi128_ps( _mm256_extractf128_si256( v, 1 ) ); /* [ v2l v2h v3l v3h ] */
  __m128i w   = _mm_castps_si128( _mm_shuffle_ps( v01, v23, _MM_SHUFFLE(2,0,2,0) ) );
  return imm_hi ? _mm256_insertf128_si256( i, w, 1 ) : _mm256_insertf128_si256( i, w, 0 ); /* compile time */
}

static inline wu_t wv_to_wu( wv_t v, wu_t u, int imm_hi ) {
  __m128  v01 = _mm_castsi128_ps( _mm256_extractf128_si256( v, 0 ) ); /* [ v0l v0h v1l v1h ] */
  __m128  v23 = _mm_castsi128_ps( _mm256_extractf128_si256( v, 1 ) ); /* [ v2l v2h v3l v3h ] */
  __m128i w   = _mm_castps_si128( _mm_shuffle_ps( v01, v23, _MM_SHUFFLE(2,0,2,0) ) );
  return imm_hi ? _mm256_insertf128_si256( u, w, 1 ) : _mm256_insertf128_si256( u, w, 0 ); /* compile time */
}

/* FIXME: IS IT FASTER TO USE INSERT / EXTRACT HERE? */
static inline wd_t wv_to_wd( wv_t v ) {
  union { ulong  u[4]; __m256i v[1]; } t[1];
  union { double d[4]; __m256d v[1]; } u[1];
  _mm256_store_si256( t->v, v );
  u->d[0] = (double)t->u[0];
  u->d[1] = (double)t->u[1];
  u->d[2] = (double)t->u[2];
  u->d[3] = (double)t->u[3];
  return _mm256_load_pd( u->d );
}

#define wv_to_wl(a) (a)

#define wv_to_wc_raw(a) (a)
#define wv_to_wf_raw(a) _mm256_castsi256_ps( (a) )
#define wv_to_wi_raw(a) (a)
#define wv_to_wu_raw(a) (a)
#define wv_to_wd_raw(a) _mm256_castsi256_pd( (a) )
#define wv_to_wl_raw(a) (a)

/* Reduction operations */

static inline wv_t
wv_sum_all( wv_t x ) { /* Returns wv_bcast( sum( x ) ) */
  x = _mm256_add_epi64( x, _mm256_permute2f128_si256( x, x, 1 ) );
  return _mm256_add_epi64( x, _mm256_castpd_si256( _mm256_permute_pd( _mm256_castsi256_pd( x ), 5 ) ) );
}

static inline wv_t
wv_min_all( wv_t x ) { /* Returns wv_bcast( min( x ) ) */
  x = wv_min( x, _mm256_permute2f128_si256( x, x, 1 ) );
  return wv_min( x, _mm256_castpd_si256( _mm256_permute_pd( _mm256_castsi256_pd( x ), 5 ) ) );
}

static inline wv_t
wv_max_all( wv_t x ) { /* Returns wv_bcast( max( x ) ) */
  x = wv_max( x, _mm256_permute2f128_si256( x, x, 1 ) );
  return wv_max( x, _mm256_castpd_si256( _mm256_permute_pd( _mm256_castsi256_pd( x ), 5 ) ) );
}
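
/* These reductions work in lg 4 = 2 steps: _mm256_permute2f128_si256
   swaps the two 128-bit halves (combining lane pairs {0,1} and {2,3})
   and _mm256_permute_pd with immediate 5 swaps adjacent 64-bit lanes,
   so the result ends up broadcast to all lanes.  Extracting the scalar
   (an illustrative sketch):

     ulong sum = wv_extract( wv_sum_all( x ), 0 );   ... (x0+x1+x2+x3) mod 2^64
*/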

/* Misc operations */

/* wv_gather(b,i,imm_hi) returns
     [ b[i(0)] b[i(1)] b[i(2)] b[i(3)] ] if imm_hi is 0 and
     [ b[i(4)] b[i(5)] b[i(6)] b[i(7)] ] o.w.
   where b is a "ulong const*", i is a wi_t and imm_hi is a compile
   time constant.  We use a static inline here instead of a define to
   keep strict type checking while working around yet another Intel
   intrinsic type mismatch issue. */

static inline wv_t wv_gather( ulong const * b, wi_t i, int imm_hi ) {
  /* A compile time branch, but older versions of GCC can't handle the
     ternary operator with -O0 */
  if( imm_hi ) return _mm256_i32gather_epi64( (long long const *)b, _mm256_extractf128_si256( i, 1 ), 8 );
  else         return _mm256_i32gather_epi64( (long long const *)b, _mm256_extractf128_si256( i, 0 ), 8 );
}
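
/* For example, gathering four table entries selected by the low four
   lanes of an index vector (an illustrative sketch; table and idx are
   hypothetical):

     static ulong table[8] = { 10UL, 11UL, 12UL, 13UL, 14UL, 15UL, 16UL, 17UL };
     wi_t idx = wi( 3,0,2,1, 0,0,0,0 );
     wv_t y   = wv_gather( table, idx, 0 );   ... [ 13UL 10UL 12UL 11UL ]
*/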

/* wv_transpose_4x4 transposes the 4x4 matrix stored in wv_t r0,r1,r2,r3
   and stores the result in 4x4 matrix wv_t c0,c1,c2,c3.  All
   c0,c1,c2,c3 should be different for a well defined result.
   Otherwise, in-place operation and/or using the same wv_t to specify
   multiple rows of r is fine. */

#define wv_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do {                                                                         \
    wv_t _wv_transpose_r0 = (r0); wv_t _wv_transpose_r1 = (r1); wv_t _wv_transpose_r2 = (r2); wv_t _wv_transpose_r3 = (r3);       \
    wv_t _wv_transpose_t;                                                                                                         \
    /* Transpose 2x2 blocks */                                                                                                    \
    _wv_transpose_t = _wv_transpose_r0; _wv_transpose_r0 = _mm256_permute2f128_si256( _wv_transpose_t,  _wv_transpose_r2, 0x20 ); \
    /**/                                _wv_transpose_r2 = _mm256_permute2f128_si256( _wv_transpose_t,  _wv_transpose_r2, 0x31 ); \
    _wv_transpose_t = _wv_transpose_r1; _wv_transpose_r1 = _mm256_permute2f128_si256( _wv_transpose_t,  _wv_transpose_r3, 0x20 ); \
    /**/                                _wv_transpose_r3 = _mm256_permute2f128_si256( _wv_transpose_t,  _wv_transpose_r3, 0x31 ); \
    /* Transpose 1x1 blocks */                                                                                                    \
    /**/                                (c0)             = _mm256_unpacklo_epi64(     _wv_transpose_r0, _wv_transpose_r1 );       \
    /**/                                (c1)             = _mm256_unpackhi_epi64(     _wv_transpose_r0, _wv_transpose_r1 );       \
    /**/                                (c2)             = _mm256_unpacklo_epi64(     _wv_transpose_r2, _wv_transpose_r3 );       \
    /**/                                (c3)             = _mm256_unpackhi_epi64(     _wv_transpose_r2, _wv_transpose_r3 );       \
  } while(0)
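
/* For example (an illustrative sketch), if r0,r1,r2,r3 hold the rows

     r0 = [  0  1  2  3 ]
     r1 = [  4  5  6  7 ]
     r2 = [  8  9 10 11 ]
     r3 = [ 12 13 14 15 ]

   then wv_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) yields the columns

     c0 = [  0  4  8 12 ]
     c1 = [  1  5  9 13 ]
     c2 = [  2  6 10 14 ]
     c3 = [  3  7 11 15 ]

   (the 2x2 block step swaps 128-bit halves across row pairs and the
   1x1 step interleaves adjacent 64-bit lanes). */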
