LCOV - code coverage report
Current view: top level - util/simd - fd_avx512_wwu.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 169 170 99.4 %
Date: 2024-11-13 11:58:15 Functions: 12 312 3.8 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_avx512_h
       2             : #error "Do not include this directly; use fd_avx512.h"
       3             : #endif
       4             : 
       5             : /* TODO: REDUCE, EXTRACT, ADDITIONAL LANE OPS, ... */
       6             : /* TODO: USE INT FOR THE SCALAR N ROL/ROR (AND IN OTHER ROL/ROR)? */
       7             : /* TODO: BACKPORT UNPACKS TO AVX AND SSE? */
       8             : 
       9             : /* Vector uint API ***************************************************/
      10             : 
      11             : /* A wwu_t is a vector where each 32-bit wide lane holds an unsigned
      12             :    32-bit integer (a "uint").
      13             : 
      14             :    These mirror the other APIs as much as possible.  Macros are
      15             :    preferred over static inlines when it is possible to do it robustly
      16             :    to reduce the risk of the compiler mucking it up. */
      17             : 
      18 10412551087 : #define wwu_t __m512i
      19             : 
      20             : /* Constructors */
      21             : 
      22             : /* wwu(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf)
      23             :    returns the wwu_t [x0 x1 ... xf] where x* are uints */
      24             : 
      25             : #define wwu(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf)                                                 \
      26    36376122 :   _mm512_setr_epi32( (int)(x0), (int)(x1), (int)(x2), (int)(x3), (int)(x4), (int)(x5), (int)(x6), (int)(x7), \
      27    36376122 :                      (int)(x8), (int)(x9), (int)(xa), (int)(xb), (int)(xc), (int)(xd), (int)(xe), (int)(xf) )
      28             : 
      29   118251372 : #define wwu_bcast(x)         _mm512_set1_epi32( (int)(x) ) /* wwu(x, x, ... x) */
      30             : 
      31             : /* wwu_permute(p,x) returns:
      32             :      wwu( x(p(0)), x(p(1)), ... x(p(15)) ).
      33             :    As such p(*) should be uints in [0,15]. */
      34             : 
      35             : #define wwu_permute(p,x)     _mm512_permutexvar_epi32( (p), (x) )
      36             : 
      37             : /* wwu_select(p,x,y) concatenates the wwu_t's x and y into
      38             :      z = [ x0 x1 ... xf y0 y1 ... yf ]
      39             :    and then returns:
      40             :      wwu( z(p(0)), z(p(1)), ... z(p(15)) ).
      41             :    As such p(*) should be uints in [0,31]. */
      42             : 
      43    41246456 : #define wwu_select(p,x,y)    _mm512_permutex2var_epi32( (x), (p), (y) )
      44             : 
      45             : /* Predefined constants */
      46             : 
      47    51534712 : #define wwu_zero()           _mm512_setzero_si512()  /* wwu(0, 0, ... 0) */
      48             : #define wwu_one()            _mm512_set1_epi32( 1 )  /* wwu(1, 1, ... 1) */
      49             : 
      50             : /* Memory operations */
      51             : /* Note: wwu_{ld,st} assume m is 64-byte aligned while wwu_{ldu,stu}
      52             :    allow m to have arbitrary alignment */
      53             : 
      54    42502458 : static inline wwu_t wwu_ld( uint const * m ) { return _mm512_load_epi32( m ); }  /* wwu( m[0], m[1], ... m[15] ) */
      55   171000000 : static inline void  wwu_st( uint * m, wwu_t x ) { _mm512_store_epi32( m, x ); }  /* does m[0] = x0, m[1] = x1, ... m[15] = xf */
      56             : 
      57   935957008 : static inline wwu_t wwu_ldu( void const * m ) { return _mm512_loadu_epi32( m ); } /* wwu( m[0], m[1], ... m[15]) */
      58     1000000 : static inline void  wwu_stu( void * m, wwu_t x ) { _mm512_storeu_epi32( m, x ); } /* does m[0] = x0, m[1] = x1, ... m[15] = xf */
      59             : 
      60             : /* Arithmetic operations */
      61             : 
      62             : #define wwu_neg(x)           _mm512_sub_epi32( _mm512_setzero_si512(), (x) ) /* wwu( -x0, -x1, ... -xf ) */
      63             : #define wwu_abs(x)           (x)                                             /* wwu(  x0,  x1, ...  xf ) */
      64             : 
      65             : #define wwu_min(x,y)         _mm512_min_epu32  ( (x), (y) ) /* wwu( min(x0,y0), min(x1,y1), ... min(xf,yf) ) */
      66             : #define wwu_max(x,y)         _mm512_max_epu32  ( (x), (y) ) /* wwu( max(x0,y0), max(x1,y1), ... max(xf,yf) ) */
      67 17764183152 : #define wwu_add(x,y)         _mm512_add_epi32  ( (x), (y) ) /* wwu( x0+y0,      x1+y1,      ... xf+yf      ) */
      68             : #define wwu_sub(x,y)         _mm512_sub_epi32  ( (x), (y) ) /* wwu( x0-y0,      x1-y1,      ... xf-yf      ) */
      69             : #define wwu_mul(x,y)         _mm512_mullo_epi32( (x), (y) ) /* wwu( x0*y0,      x1*y1,      ... xf*yf      ) */
      70             : 
      71             : /* Binary operations */
      72             : /* Note: shifts assume n and/or y* are in [0,31].  Rotates work for
      73             :    arbitrary values. */
      74             : 
      75     4000000 : #define wwu_not(x)           _mm512_xor_epi32( _mm512_set1_epi32( -1 ), (x) )
      76             : 
      77             : #define wwu_shl(x,n)         _mm512_slli_epi32  ( (x), (uint)(n) ) /* wwu( x0<<n,  x1<<n,  ... xf<<n  ) */
      78           0 : #define wwu_shr(x,n)         _mm512_srli_epi32  ( (x), (uint)(n) ) /* wwu( x0>>n,  x1>>n,  ... xf>>n  ) */
      79   958951476 : #define wwu_shl_vector(x,y)  _mm512_sllv_epi32  ( (x), (y)       ) /* wwu( x0<<y0, x1<<y1, ... xf<<yf ) */
      80             : #define wwu_shr_vector(x,y)  _mm512_srlv_epi32  ( (x), (y)       ) /* wwu( x0>>y0, x1>>y1, ... xf>>yf ) */
      81  1546539543 : #define wwu_and(x,y)         _mm512_and_epi32   ( (x), (y)       ) /* wwu( x0&y0,  x1&y1,  ... xf&yf  ) */
      82             : #define wwu_andnot(x,y)      _mm512_andnot_epi32( (x), (y)       ) /* wwu( ~x0&y0, ~x1&y1, ... ~xf&yf ) */
      83  1550539543 : #define wwu_or(x,y)          _mm512_or_epi32    ( (x), (y)       ) /* wwu( x0|y0,  x1|y1,  ... xf|yf  ) */
      84  1067063805 : #define wwu_xor(x,y)         _mm512_xor_epi32   ( (x), (y)       ) /* wwu( x0^y0,  x1^y1,  ... xf^yf  ) */
      85             : 
      86             : /* wwu_rol(x,n)          returns wwu( rotate_left (x0,n ), rotate_left (x1,n ), ... )
      87             :    wwu_ror(x,n)          returns wwu( rotate_right(x0,n ), rotate_right(x1,n ), ... )
      88             :    wwu_rol_variable(x,n) returns wwu( rotate_left (x0,n ), rotate_left (x1,n ), ... )
      89             :    wwu_ror_variable(x,n) returns wwu( rotate_right(x0,n ), rotate_right(x1,n ), ... )
      90             :    wwu_rol_vector(x,y)   returns wwu( rotate_left (x0,y0), rotate_left (x1,y1), ... )
      91             :    wwu_ror_vector(x,y)   returns wwu( rotate_right(x0,y0), rotate_right(x1,y1), ... )
      92             : 
      93             :    The variable variants are slower but do not require the shift amount
      94             :    to be known at compile time. */
      95             : 
      96     4000000 : #define wwu_rol(a,imm)       _mm512_rol_epi32( (a), (imm)&31U )
      97     4000000 : #define wwu_ror(a,imm)       _mm512_ror_epi32( (a), (imm)&31U )
      98             : 
      99     1000000 : static inline wwu_t wwu_rol_variable( wwu_t a, uint n ) { return wwu_or( wwu_shl( a, n & 31U ), wwu_shr( a, (-n) & 31U ) ); }
     100     1000000 : static inline wwu_t wwu_ror_variable( wwu_t a, uint n ) { return wwu_or( wwu_shr( a, n & 31U ), wwu_shl( a, (-n) & 31U ) ); }
     101             : 
     102     1000000 : static inline wwu_t wwu_rol_vector( wwu_t a, wwu_t b ) {
     103     1000000 :   wwu_t m = wwu_bcast( 31U );
     104     1000000 :   return wwu_or( wwu_shl_vector( a, wwu_and( b, m ) ), wwu_shr_vector( a, wwu_and( wwu_neg( b ), m ) ) );
     105     1000000 : }
     106             : 
     107     1000000 : static inline wwu_t wwu_ror_vector( wwu_t a, wwu_t b ) {
     108     1000000 :   wwu_t m = wwu_bcast( 31U );
     109     1000000 :   return wwu_or( wwu_shr_vector( a, wwu_and( b, m ) ), wwu_shl_vector( a, wwu_and( wwu_neg( b ), m ) ) );
     110     1000000 : }
     111             : 
     112             : /* wwu_bswap(x) returns wwu( bswap(x0), bswap(x1), ... ) */
     113             : 
     114             : #define wwu_bswap( x ) _mm512_shuffle_epi8( (x), _mm512_set_epi8( 12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3, \
     115             :                                                                   12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3, \
     116             :                                                                   12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3, \
     117             :                                                                   12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3 ) )
     118             : 
     119             : /* Comparison operations */
     120             : /* mask(c0,c1,...) means (((int)c0)<<0) | (((int)c1)<<1) | ... */
     121             : 
     122             : #define wwu_eq(x,y) ((int)_mm512_cmpeq_epu32_mask(  (x), (y) )) /* mask( x0==y0, x1==y1, ... ) */
     123             : #define wwu_gt(x,y) ((int)_mm512_cmpgt_epu32_mask(  (x), (y) )) /* mask( x0> y0, x1> y1, ... ) */
     124             : #define wwu_lt(x,y) ((int)_mm512_cmplt_epu32_mask(  (x), (y) )) /* mask( x0< y0, x1< y1, ... ) */
     125             : #define wwu_ne(x,y) ((int)_mm512_cmpneq_epu32_mask( (x), (y) )) /* mask( x0!=y0, x1!=y1, ... ) */
     126             : #define wwu_ge(x,y) ((int)_mm512_cmpge_epu32_mask(  (x), (y) )) /* mask( x0>=y0, x1>=y1, ... ) */
     127             : #define wwu_le(x,y) ((int)_mm512_cmple_epu32_mask(  (x), (y) )) /* mask( x0<=y0, x1<=y1, ... ) */
     128             : 
     129             : #define wwu_lnot(x)    wwu_eq( (x), wwu_zero() )                /* mask(  !x0,  !x1, ... ) */
     130             : #define wwu_lnotnot(x) wwu_ne( (x), wwu_zero() )                /* mask( !!x0, !!x1, ... ) */
     131             : 
     132             : /* Conditional operations */
     133             : /* cn means bit n of c */
     134             : 
     135     2000000 : #define wwu_if(c,x,y)          _mm512_mask_blend_epi32 ( (__mmask16)(c), (y), (x) )    /* wwu( c0? x0    :y0, ... ) */
     136             : 
     137   467478504 : #define wwu_add_if(c,x,y,z)    _mm512_mask_add_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0+y0):z0, ... ) */
     138             : #define wwu_sub_if(c,x,y,z)    _mm512_mask_sub_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0-y0):z0, ... ) */
     139             : 
     140             : #define wwu_and_if(c,x,y,z)    _mm512_mask_and_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0&y0):z0, ... ) */
     141             : #define wwu_andnot_if(c,x,y,z) _mm512_mask_andnot_epi32( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(~x0&y0):z0, ... ) */
     142             : #define wwu_or_if(c,x,y,z)     _mm512_mask_or_epi32    ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0|y0):z0, ... ) */
     143             : #define wwu_xor_if(c,x,y,z)    _mm512_mask_xor_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0^y0):z0, ... ) */
     144             : 
     145             : /* Conversions */
     146             : 
     147             : /* wwu_to_wwi( x )    returns wwi(   (int)x0,   (int)x1, ...   (int)x15 )
     148             : 
     149             :    wwu_to_wwl( x, 0 ) returns wwl(  (long)x0,  (long)x2, ...  (long)x14 )
     150             :    wwu_to_wwl( x, 1 ) returns wwl(  (long)x1,  (long)x3, ...  (long)x15 )
     151             : 
     152             :    wwu_to_wwv( x, 0 ) returns wwv( (ulong)x0, (ulong)x2, ... (ulong)x14 )
     153             :    wwu_to_wwv( x, 1 ) returns wwv( (ulong)x1, (ulong)x3, ... (ulong)x15 )
     154             : 
     155             :    TODO: consider _mm512_cvtepu32_* intrinsics? */
     156             : 
     157             : #define wwu_to_wwi( x ) (x)
     158             : #define wwu_to_wwl( x, odd ) /* trinary should be compile time */ \
     159             :   (__extension__({ wwl_t _wwu_to_wwl_tmp = (x); wwl_shru( (odd) ? _wwu_to_wwl_tmp : wwl_shl( _wwu_to_wwl_tmp, 32 ), 32 ); }))
     160             : #define wwu_to_wwv( x, odd ) /* trinary should be compile time */ \
     161             :   (__extension__({ wwv_t _wwu_to_wwv_tmp = (x); wwv_shr ( (odd) ? _wwu_to_wwv_tmp : wwv_shl( _wwu_to_wwv_tmp, 32 ), 32 ); }))
     162             : 
     163             : #define wwu_to_wwi_raw(x) (x)
     164             : #define wwu_to_wwl_raw(x) (x)
     165             : #define wwu_to_wwv_raw(x) (x)
     166             : 
     167             : /* Misc operations */
     168             : 
     169             : /* wwu_pack_halves(x,imm0,y,imm1) packs half of x and half of y into a
     170             :    wwu.  imm0/imm1 select which half of x and y to pack.  imm0 / imm1
     171             :    should be in [0,1].  That is, this returns:
     172             : 
     173             :      [ if( imm0, x(8:15), x(0:7) ) if( imm1, y(8:15), y(0:7) ) ]
     174             : 
     175             :    wwu_pack_h0_h1(x,y) does the wwu_pack_halves(x,0,y,1) case faster.
     176             :    Hat tip to Philip Taffet for pointing this out. */
     177             : 
     178             : #define wwu_pack_halves(x,imm0,y,imm1) _mm512_shuffle_i32x4( (x), (y), 68+10*(imm0)+160*(imm1) )
     179             : #define wwu_pack_h0_h1(x,y)            _mm512_mask_blend_epi32( (__mmask16)0xFF00, (x), (y) )
     180             : 
     181             : /* wwu_slide(x,y,imm) treats x as a FIFO with the oldest / newest
     182             :    element at lane 0 / 15.  Returns the result of dequeuing x imm times
     183             :    and enqueuing the values y0 ... y{imm-1} in that order.  imm should be
     184             :    in [0,15].  For example, in the imm==5 case, this returns:
     185             :      [ x5 x6 ... xf y0 y1 y2 y3 y4 ]. */
     186             : 
     187             : #define wwu_slide(x,y,imm) _mm512_alignr_epi32( (y), (x), (imm) )
     188             : 
     189             : /* wwu_unpack unpacks the wwu x into its uint components x0,x1,...xf. */
     190             : 
     191     1000000 : #define wwu_unpack( x, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf ) do { \
     192     1000000 :     __m512i _wwu_unpack_x  = (x);                                             \
     193     1000000 :     __m256i _wwu_unpack_xl = _mm512_extracti32x8_epi32( _wwu_unpack_x, 0 );   \
     194     1000000 :     __m256i _wwu_unpack_xh = _mm512_extracti32x8_epi32( _wwu_unpack_x, 1 );   \
     195     1000000 :     (x0) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 0 );                   \
     196     1000000 :     (x1) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 1 );                   \
     197     1000000 :     (x2) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 2 );                   \
     198     1000000 :     (x3) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 3 );                   \
     199     1000000 :     (x4) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 4 );                   \
     200     1000000 :     (x5) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 5 );                   \
     201     1000000 :     (x6) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 6 );                   \
     202     1000000 :     (x7) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 7 );                   \
     203     1000000 :     (x8) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 0 );                   \
     204     1000000 :     (x9) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 1 );                   \
     205     1000000 :     (xa) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 2 );                   \
     206     1000000 :     (xb) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 3 );                   \
     207     1000000 :     (xc) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 4 );                   \
     208     1000000 :     (xd) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 5 );                   \
     209     1000000 :     (xe) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 6 );                   \
     210     1000000 :     (xf) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 7 );                   \
     211     1000000 :   } while(0)
     212             : 
     213             : /* wwu_transpose_16x16 sets wwu_t's c0,c1,...cf to the columns of a
     214             :    16x16 uint matrix given the rows of the matrix in wwu_t's
     215             :    r0,r1,...rf.  In-place operation fine. */
     216             : 
     217             : #define wwu_transpose_16x16( r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,ra,rb,rc,rd,re,rf,                      \
     218    59434813 :                              c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf ) do {                \
     219    59434813 :     wwu_t _wwu_transpose_r0 = (r0); wwu_t _wwu_transpose_r1 = (r1);                                \
     220    59434813 :     wwu_t _wwu_transpose_r2 = (r2); wwu_t _wwu_transpose_r3 = (r3);                                \
     221    59434813 :     wwu_t _wwu_transpose_r4 = (r4); wwu_t _wwu_transpose_r5 = (r5);                                \
     222    59434813 :     wwu_t _wwu_transpose_r6 = (r6); wwu_t _wwu_transpose_r7 = (r7);                                \
     223    59434813 :     wwu_t _wwu_transpose_r8 = (r8); wwu_t _wwu_transpose_r9 = (r9);                                \
     224    59434813 :     wwu_t _wwu_transpose_ra = (ra); wwu_t _wwu_transpose_rb = (rb);                                \
     225    59434813 :     wwu_t _wwu_transpose_rc = (rc); wwu_t _wwu_transpose_rd = (rd);                                \
     226    59434813 :     wwu_t _wwu_transpose_re = (re); wwu_t _wwu_transpose_rf = (rf);                                \
     227    59434813 :                                                                                                    \
     228    59434813 :     /* Outer 4x4 transpose of 4x4 blocks */                                                        \
     229    59434813 :     wwu_t _wwu_transpose_t0  = _mm512_shuffle_i32x4( _wwu_transpose_r0, _wwu_transpose_r4, 0x88 ); \
     230    59434813 :     wwu_t _wwu_transpose_t1  = _mm512_shuffle_i32x4( _wwu_transpose_r1, _wwu_transpose_r5, 0x88 ); \
     231    59434813 :     wwu_t _wwu_transpose_t2  = _mm512_shuffle_i32x4( _wwu_transpose_r2, _wwu_transpose_r6, 0x88 ); \
     232    59434813 :     wwu_t _wwu_transpose_t3  = _mm512_shuffle_i32x4( _wwu_transpose_r3, _wwu_transpose_r7, 0x88 ); \
     233    59434813 :     wwu_t _wwu_transpose_t4  = _mm512_shuffle_i32x4( _wwu_transpose_r0, _wwu_transpose_r4, 0xdd ); \
     234    59434813 :     wwu_t _wwu_transpose_t5  = _mm512_shuffle_i32x4( _wwu_transpose_r1, _wwu_transpose_r5, 0xdd ); \
     235    59434813 :     wwu_t _wwu_transpose_t6  = _mm512_shuffle_i32x4( _wwu_transpose_r2, _wwu_transpose_r6, 0xdd ); \
     236    59434813 :     wwu_t _wwu_transpose_t7  = _mm512_shuffle_i32x4( _wwu_transpose_r3, _wwu_transpose_r7, 0xdd ); \
     237    59434813 :     wwu_t _wwu_transpose_t8  = _mm512_shuffle_i32x4( _wwu_transpose_r8, _wwu_transpose_rc, 0x88 ); \
     238    59434813 :     wwu_t _wwu_transpose_t9  = _mm512_shuffle_i32x4( _wwu_transpose_r9, _wwu_transpose_rd, 0x88 ); \
     239    59434813 :     wwu_t _wwu_transpose_ta  = _mm512_shuffle_i32x4( _wwu_transpose_ra, _wwu_transpose_re, 0x88 ); \
     240    59434813 :     wwu_t _wwu_transpose_tb  = _mm512_shuffle_i32x4( _wwu_transpose_rb, _wwu_transpose_rf, 0x88 ); \
     241    59434813 :     wwu_t _wwu_transpose_tc  = _mm512_shuffle_i32x4( _wwu_transpose_r8, _wwu_transpose_rc, 0xdd ); \
     242    59434813 :     wwu_t _wwu_transpose_td  = _mm512_shuffle_i32x4( _wwu_transpose_r9, _wwu_transpose_rd, 0xdd ); \
     243    59434813 :     wwu_t _wwu_transpose_te  = _mm512_shuffle_i32x4( _wwu_transpose_ra, _wwu_transpose_re, 0xdd ); \
     244    59434813 :     wwu_t _wwu_transpose_tf  = _mm512_shuffle_i32x4( _wwu_transpose_rb, _wwu_transpose_rf, 0xdd ); \
     245    59434813 :                                                                                                    \
     246    59434813 :     /**/  _wwu_transpose_r0  = _mm512_shuffle_i32x4( _wwu_transpose_t0, _wwu_transpose_t8, 0x88 ); \
     247    59434813 :     /**/  _wwu_transpose_r1  = _mm512_shuffle_i32x4( _wwu_transpose_t1, _wwu_transpose_t9, 0x88 ); \
     248    59434813 :     /**/  _wwu_transpose_r2  = _mm512_shuffle_i32x4( _wwu_transpose_t2, _wwu_transpose_ta, 0x88 ); \
     249    59434813 :     /**/  _wwu_transpose_r3  = _mm512_shuffle_i32x4( _wwu_transpose_t3, _wwu_transpose_tb, 0x88 ); \
     250    59434813 :     /**/  _wwu_transpose_r4  = _mm512_shuffle_i32x4( _wwu_transpose_t4, _wwu_transpose_tc, 0x88 ); \
     251    59434813 :     /**/  _wwu_transpose_r5  = _mm512_shuffle_i32x4( _wwu_transpose_t5, _wwu_transpose_td, 0x88 ); \
     252    59434813 :     /**/  _wwu_transpose_r6  = _mm512_shuffle_i32x4( _wwu_transpose_t6, _wwu_transpose_te, 0x88 ); \
     253    59434813 :     /**/  _wwu_transpose_r7  = _mm512_shuffle_i32x4( _wwu_transpose_t7, _wwu_transpose_tf, 0x88 ); \
     254    59434813 :     /**/  _wwu_transpose_r8  = _mm512_shuffle_i32x4( _wwu_transpose_t0, _wwu_transpose_t8, 0xdd ); \
     255    59434813 :     /**/  _wwu_transpose_r9  = _mm512_shuffle_i32x4( _wwu_transpose_t1, _wwu_transpose_t9, 0xdd ); \
     256    59434813 :     /**/  _wwu_transpose_ra  = _mm512_shuffle_i32x4( _wwu_transpose_t2, _wwu_transpose_ta, 0xdd ); \
     257    59434813 :     /**/  _wwu_transpose_rb  = _mm512_shuffle_i32x4( _wwu_transpose_t3, _wwu_transpose_tb, 0xdd ); \
     258    59434813 :     /**/  _wwu_transpose_rc  = _mm512_shuffle_i32x4( _wwu_transpose_t4, _wwu_transpose_tc, 0xdd ); \
     259    59434813 :     /**/  _wwu_transpose_rd  = _mm512_shuffle_i32x4( _wwu_transpose_t5, _wwu_transpose_td, 0xdd ); \
     260    59434813 :     /**/  _wwu_transpose_re  = _mm512_shuffle_i32x4( _wwu_transpose_t6, _wwu_transpose_te, 0xdd ); \
     261    59434813 :     /**/  _wwu_transpose_rf  = _mm512_shuffle_i32x4( _wwu_transpose_t7, _wwu_transpose_tf, 0xdd ); \
     262    59434813 :                                                                                                    \
     263    59434813 :     /* Inner 4x4 transpose of 1x1 blocks */                                                        \
     264    59434813 :     /**/  _wwu_transpose_t0  = _mm512_unpacklo_epi32( _wwu_transpose_r0, _wwu_transpose_r2 );      \
     265    59434813 :     /**/  _wwu_transpose_t1  = _mm512_unpacklo_epi32( _wwu_transpose_r1, _wwu_transpose_r3 );      \
     266    59434813 :     /**/  _wwu_transpose_t2  = _mm512_unpackhi_epi32( _wwu_transpose_r0, _wwu_transpose_r2 );      \
     267    59434813 :     /**/  _wwu_transpose_t3  = _mm512_unpackhi_epi32( _wwu_transpose_r1, _wwu_transpose_r3 );      \
     268    59434813 :     /**/  _wwu_transpose_t4  = _mm512_unpacklo_epi32( _wwu_transpose_r4, _wwu_transpose_r6 );      \
     269    59434813 :     /**/  _wwu_transpose_t5  = _mm512_unpacklo_epi32( _wwu_transpose_r5, _wwu_transpose_r7 );      \
     270    59434813 :     /**/  _wwu_transpose_t6  = _mm512_unpackhi_epi32( _wwu_transpose_r4, _wwu_transpose_r6 );      \
     271    59434813 :     /**/  _wwu_transpose_t7  = _mm512_unpackhi_epi32( _wwu_transpose_r5, _wwu_transpose_r7 );      \
     272    59434813 :     /**/  _wwu_transpose_t8  = _mm512_unpacklo_epi32( _wwu_transpose_r8, _wwu_transpose_ra );      \
     273    59434813 :     /**/  _wwu_transpose_t9  = _mm512_unpacklo_epi32( _wwu_transpose_r9, _wwu_transpose_rb );      \
     274    59434813 :     /**/  _wwu_transpose_ta  = _mm512_unpackhi_epi32( _wwu_transpose_r8, _wwu_transpose_ra );      \
     275    59434813 :     /**/  _wwu_transpose_tb  = _mm512_unpackhi_epi32( _wwu_transpose_r9, _wwu_transpose_rb );      \
     276    59434813 :     /**/  _wwu_transpose_tc  = _mm512_unpacklo_epi32( _wwu_transpose_rc, _wwu_transpose_re );      \
     277    59434813 :     /**/  _wwu_transpose_td  = _mm512_unpacklo_epi32( _wwu_transpose_rd, _wwu_transpose_rf );      \
     278    59434813 :     /**/  _wwu_transpose_te  = _mm512_unpackhi_epi32( _wwu_transpose_rc, _wwu_transpose_re );      \
     279    59434813 :     /**/  _wwu_transpose_tf  = _mm512_unpackhi_epi32( _wwu_transpose_rd, _wwu_transpose_rf );      \
     280    59434813 :                                                                                                    \
     281    59434813 :     /**/  (c0)               = _mm512_unpacklo_epi32( _wwu_transpose_t0, _wwu_transpose_t1 );      \
     282    59434813 :     /**/  (c1)               = _mm512_unpackhi_epi32( _wwu_transpose_t0, _wwu_transpose_t1 );      \
     283    59434813 :     /**/  (c2)               = _mm512_unpacklo_epi32( _wwu_transpose_t2, _wwu_transpose_t3 );      \
     284    59434813 :     /**/  (c3)               = _mm512_unpackhi_epi32( _wwu_transpose_t2, _wwu_transpose_t3 );      \
     285    59434813 :     /**/  (c4)               = _mm512_unpacklo_epi32( _wwu_transpose_t4, _wwu_transpose_t5 );      \
     286    59434813 :     /**/  (c5)               = _mm512_unpackhi_epi32( _wwu_transpose_t4, _wwu_transpose_t5 );      \
     287    59434813 :     /**/  (c6)               = _mm512_unpacklo_epi32( _wwu_transpose_t6, _wwu_transpose_t7 );      \
     288    59434813 :     /**/  (c7)               = _mm512_unpackhi_epi32( _wwu_transpose_t6, _wwu_transpose_t7 );      \
     289    59434813 :     /**/  (c8)               = _mm512_unpacklo_epi32( _wwu_transpose_t8, _wwu_transpose_t9 );      \
     290    59434813 :     /**/  (c9)               = _mm512_unpackhi_epi32( _wwu_transpose_t8, _wwu_transpose_t9 );      \
     291    59434813 :     /**/  (ca)               = _mm512_unpacklo_epi32( _wwu_transpose_ta, _wwu_transpose_tb );      \
     292    59434813 :     /**/  (cb)               = _mm512_unpackhi_epi32( _wwu_transpose_ta, _wwu_transpose_tb );      \
     293    59434813 :     /**/  (cc)               = _mm512_unpacklo_epi32( _wwu_transpose_tc, _wwu_transpose_td );      \
     294    59434813 :     /**/  (cd)               = _mm512_unpackhi_epi32( _wwu_transpose_tc, _wwu_transpose_td );      \
     295    59434813 :     /**/  (ce)               = _mm512_unpacklo_epi32( _wwu_transpose_te, _wwu_transpose_tf );      \
     296    59434813 :     /**/  (cf)               = _mm512_unpackhi_epi32( _wwu_transpose_te, _wwu_transpose_tf );      \
     297    59434813 :   } while(0)
     298             : 
     299             : /* wwu_transpose_2x8x8 transposes the 2 8x8 matrices whose rows are
     300             :    held in the lower and upper halves of wwu_t's r0,r1...r7 and
     301             :    stores the result in c0,c1...c7.  In-place operation fine. */
     302             : 
     303             : #define wwu_transpose_2x8x8( r0,r1,r2,r3,r4,r5,r6,r7,                                                \
     304     5155807 :                              c0,c1,c2,c3,c4,c5,c6,c7 ) do {                                          \
     305     5155807 :     wwu_t _wwu_transpose_r0 = (r0); wwu_t _wwu_transpose_r1 = (r1);                                  \
     306     5155807 :     wwu_t _wwu_transpose_r2 = (r2); wwu_t _wwu_transpose_r3 = (r3);                                  \
     307     5155807 :     wwu_t _wwu_transpose_r4 = (r4); wwu_t _wwu_transpose_r5 = (r5);                                  \
     308     5155807 :     wwu_t _wwu_transpose_r6 = (r6); wwu_t _wwu_transpose_r7 = (r7);                                  \
     309     5155807 :                                                                                                      \
     310     5155807 :     /* Outer 2x2 transpose of 4x4 blocks */                                                          \
     311     5155807 :     /* No _mm256_permute2f128_si256 equiv? sigh ... probably a better method possible here */        \
     312     5155807 :     wwu_t _wwu_transpose_p   = wwu( 0U, 1U, 2U, 3U,16U,17U,18U,19U, 8U, 9U,10U,11U,24U,25U,26U,27U); \
     313     5155807 :     wwu_t _wwu_transpose_q   = wwu( 4U, 5U, 6U, 7U,20U,21U,22U,23U,12U,13U,14U,15U,28U,29U,30U,31U); \
     314     5155807 :     wwu_t _wwu_transpose_t0  = wwu_select( _wwu_transpose_p, _wwu_transpose_r0, _wwu_transpose_r4 ); \
     315     5155807 :     wwu_t _wwu_transpose_t1  = wwu_select( _wwu_transpose_p, _wwu_transpose_r1, _wwu_transpose_r5 ); \
     316     5155807 :     wwu_t _wwu_transpose_t2  = wwu_select( _wwu_transpose_p, _wwu_transpose_r2, _wwu_transpose_r6 ); \
     317     5155807 :     wwu_t _wwu_transpose_t3  = wwu_select( _wwu_transpose_p, _wwu_transpose_r3, _wwu_transpose_r7 ); \
     318     5155807 :     wwu_t _wwu_transpose_t4  = wwu_select( _wwu_transpose_q, _wwu_transpose_r0, _wwu_transpose_r4 ); \
     319     5155807 :     wwu_t _wwu_transpose_t5  = wwu_select( _wwu_transpose_q, _wwu_transpose_r1, _wwu_transpose_r5 ); \
     320     5155807 :     wwu_t _wwu_transpose_t6  = wwu_select( _wwu_transpose_q, _wwu_transpose_r2, _wwu_transpose_r6 ); \
     321     5155807 :     wwu_t _wwu_transpose_t7  = wwu_select( _wwu_transpose_q, _wwu_transpose_r3, _wwu_transpose_r7 ); \
     322     5155807 :                                                                                                      \
     323     5155807 :     /* Inner 4x4 transpose of 1x1 blocks */                                                          \
     324     5155807 :     /**/  _wwu_transpose_r0  = _mm512_unpacklo_epi32( _wwu_transpose_t0, _wwu_transpose_t2 );        \
     325     5155807 :     /**/  _wwu_transpose_r1  = _mm512_unpacklo_epi32( _wwu_transpose_t1, _wwu_transpose_t3 );        \
     326     5155807 :     /**/  _wwu_transpose_r2  = _mm512_unpackhi_epi32( _wwu_transpose_t0, _wwu_transpose_t2 );        \
     327     5155807 :     /**/  _wwu_transpose_r3  = _mm512_unpackhi_epi32( _wwu_transpose_t1, _wwu_transpose_t3 );        \
     328     5155807 :     /**/  _wwu_transpose_r4  = _mm512_unpacklo_epi32( _wwu_transpose_t4, _wwu_transpose_t6 );        \
     329     5155807 :     /**/  _wwu_transpose_r5  = _mm512_unpacklo_epi32( _wwu_transpose_t5, _wwu_transpose_t7 );        \
     330     5155807 :     /**/  _wwu_transpose_r6  = _mm512_unpackhi_epi32( _wwu_transpose_t4, _wwu_transpose_t6 );        \
     331     5155807 :     /**/  _wwu_transpose_r7  = _mm512_unpackhi_epi32( _wwu_transpose_t5, _wwu_transpose_t7 );        \
     332     5155807 :                                                                                                      \
     333     5155807 :     /**/  (c0)               = _mm512_unpacklo_epi32( _wwu_transpose_r0, _wwu_transpose_r1 );        \
     334     5155807 :     /**/  (c1)               = _mm512_unpackhi_epi32( _wwu_transpose_r0, _wwu_transpose_r1 );        \
     335     5155807 :     /**/  (c2)               = _mm512_unpacklo_epi32( _wwu_transpose_r2, _wwu_transpose_r3 );        \
     336     5155807 :     /**/  (c3)               = _mm512_unpackhi_epi32( _wwu_transpose_r2, _wwu_transpose_r3 );        \
     337     5155807 :     /**/  (c4)               = _mm512_unpacklo_epi32( _wwu_transpose_r4, _wwu_transpose_r5 );        \
     338     5155807 :     /**/  (c5)               = _mm512_unpackhi_epi32( _wwu_transpose_r4, _wwu_transpose_r5 );        \
     339     5155807 :     /**/  (c6)               = _mm512_unpacklo_epi32( _wwu_transpose_r6, _wwu_transpose_r7 );        \
     340     5155807 :     /**/  (c7)               = _mm512_unpackhi_epi32( _wwu_transpose_r6, _wwu_transpose_r7 );        \
     341     5155807 :   } while(0)

Generated by: LCOV version 1.14