#ifndef HEADER_fd_src_util_simd_fd_avx512_h
#error "Do not include this directly; use fd_avx512.h"
#endif

/* TODO: REDUCE, EXTRACT, ADDITIONAL LANE OPS, ... */
/* TODO: USE INT FOR THE SCALAR N ROL/ROR (AND IN OTHER ROL/ROR)? */
/* TODO: BACKPORT UNPACKS TO AVX AND SSE? */

/* Vector uint API ***************************************************/

/* A wwu_t is a vector where each 32-bit wide lane holds an unsigned
   32-bit integer (a "uint").

   These mirror the other APIs as much as possible.  Macros are
   preferred over static inlines when it is possible to do it robustly
   to reduce the risk of the compiler mucking it up. */

#define wwu_t __m512i

/* Constructors */

/* wwu(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf)
   returns the wwu_t [x0 x1 ... xf] where x* are uints */

#define wwu(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf)                                                 \
  _mm512_setr_epi32( (int)(x0), (int)(x1), (int)(x2), (int)(x3), (int)(x4), (int)(x5), (int)(x6), (int)(x7), \
                     (int)(x8), (int)(x9), (int)(xa), (int)(xb), (int)(xc), (int)(xd), (int)(xe), (int)(xf) )

#define wwu_bcast(x)         _mm512_set1_epi32( (int)(x) ) /* wwu(x, x, ... x) */
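
/* E.g. (illustrative sketch, not part of the API):

     wwu_t seq = wwu( 0U, 1U, 2U, 3U, 4U, 5U, 6U, 7U,
                      8U, 9U,10U,11U,12U,13U,14U,15U );  yields [ 0 1 ... 15 ]
     wwu_t sev = wwu_bcast( 7U );                        yields [ 7 7 ...  7 ] */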

/* wwu_permute(p,x) returns:
     wwu( x(p(0)), x(p(1)), ... x(p(15)) ).
   As such p(*) should be uints in [0,15]. */

#define wwu_permute(p,x)     _mm512_permutexvar_epi32( (p), (x) )

/* wwu_select(p,x,y) concatenates the wwu_t's x and y into
     z = [ x0 x1 ... xf y0 y1 ... yf ]
   and then returns:
     wwu( z(p(0)), z(p(1)), ... z(p(15)) ).
   As such p(*) should be uints in [0,31]. */

#define wwu_select(p,x,y)    _mm512_permutex2var_epi32( (x), (p), (y) )
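
/* E.g. (illustrative sketch; x and y below are arbitrary wwu_t):

     wwu_t p = wwu( 15U,14U,13U,12U,11U,10U, 9U, 8U,
                     7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U );
     wwu_t r = wwu_permute( p, x );    yields [ xf xe ... x0 ]
     wwu_t q = wwu(  0U,16U, 1U,17U, 2U,18U, 3U,19U,
                     4U,20U, 5U,21U, 6U,22U, 7U,23U );
     wwu_t z = wwu_select( q, x, y );  yields [ x0 y0 x1 y1 ... x7 y7 ] */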

/* Predefined constants */

#define wwu_zero()           _mm512_setzero_si512()  /* wwu(0, 0, ... 0) */
#define wwu_one()            _mm512_set1_epi32( 1 )  /* wwu(1, 1, ... 1) */

/* Memory operations */
/* Note: wwu_{ld,st} assume m is 64-byte aligned while wwu_{ldu,stu}
   allow m to have arbitrary alignment */

static inline wwu_t wwu_ld( uint const * m ) { return _mm512_load_epi32( m ); }  /* wwu( m[0], m[1], ... m[15] ) */
static inline void  wwu_st( uint * m, wwu_t x ) { _mm512_store_epi32( m, x ); }  /* does m[0] = x0, m[1] = x1, ... m[15] = xf */

static inline wwu_t wwu_ldu( void const * m ) { return _mm512_loadu_epi32( m ); } /* wwu( m[0], m[1], ... m[15] ) */
static inline void  wwu_stu( void * m, wwu_t x ) { _mm512_storeu_epi32( m, x ); } /* does m[0] = x0, m[1] = x1, ... m[15] = xf */
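
/* E.g. (illustrative sketch): copying 16 uints via the unaligned
   variants (wwu_ld / wwu_st would require 64-byte aligned pointers):

     static inline void copy16( uint * dst, uint const * src ) {
       wwu_stu( dst, wwu_ldu( src ) );
     } */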

/* Element operations */

/* wwu_extract extracts the uint in lane imm from the vector uint.

   Note: C99 TC3 allows type punning through a union. */

#define wwu_extract(a,imm) __extension__({ wwu_t l = (a); (uint)_mm_extract_epi32( _mm512_castsi512_si128( _mm512_alignr_epi32( l, l, (imm) ) ), 0 ); })
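
/* E.g. (illustrative sketch): imm must be a compile-time constant in
   [0,15]:

     wwu_t v  = wwu( 10U,11U,12U,13U,14U,15U,16U,17U,
                     18U,19U,20U,21U,22U,23U,24U,25U );
     uint  u3 = wwu_extract( v, 3 );  yields 13U */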

/* Arithmetic operations */

#define wwu_neg(x)           _mm512_sub_epi32( _mm512_setzero_si512(), (x) ) /* wwu( -x0, -x1, ... -xf ) */
#define wwu_abs(x)           (x)                                             /* wwu(  x0,  x1, ...  xf ) */

#define wwu_min(x,y)         _mm512_min_epu32  ( (x), (y) ) /* wwu( min(x0,y0), min(x1,y1), ... min(xf,yf) ) */
#define wwu_max(x,y)         _mm512_max_epu32  ( (x), (y) ) /* wwu( max(x0,y0), max(x1,y1), ... max(xf,yf) ) */
#define wwu_add(x,y)         _mm512_add_epi32  ( (x), (y) ) /* wwu( x0+y0,      x1+y1,      ... xf+yf      ) */
#define wwu_sub(x,y)         _mm512_sub_epi32  ( (x), (y) ) /* wwu( x0-y0,      x1-y1,      ... xf-yf      ) */
#define wwu_mul(x,y)         _mm512_mullo_epi32( (x), (y) ) /* wwu( x0*y0,      x1*y1,      ... xf*yf      ) */

/* Binary operations */
/* Note: shifts assume n and y* in [0,31].  Rotates work for
   arbitrary values */

#define wwu_not(x)           _mm512_xor_epi32( _mm512_set1_epi32( -1 ), (x) )

#define wwu_shl(x,n)         _mm512_slli_epi32  ( (x), (uint)(n) ) /* wwu( x0<<n,  x1<<n,  ... xf<<n  ) */
#define wwu_shr(x,n)         _mm512_srli_epi32  ( (x), (uint)(n) ) /* wwu( x0>>n,  x1>>n,  ... xf>>n  ) */
#define wwu_shl_vector(x,y)  _mm512_sllv_epi32  ( (x), (y)       ) /* wwu( x0<<y0, x1<<y1, ... xf<<yf ) */
#define wwu_shr_vector(x,y)  _mm512_srlv_epi32  ( (x), (y)       ) /* wwu( x0>>y0, x1>>y1, ... xf>>yf ) */
#define wwu_and(x,y)         _mm512_and_epi32   ( (x), (y)       ) /* wwu( x0&y0,  x1&y1,  ... xf&yf  ) */
#define wwu_andnot(x,y)      _mm512_andnot_epi32( (x), (y)       ) /* wwu( ~x0&y0, ~x1&y1, ... ~xf&yf ) */
#define wwu_or(x,y)          _mm512_or_epi32    ( (x), (y)       ) /* wwu( x0|y0,  x1|y1,  ... xf|yf  ) */
#define wwu_xor(x,y)         _mm512_xor_epi32   ( (x), (y)       ) /* wwu( x0^y0,  x1^y1,  ... xf^yf  ) */
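
/* E.g. (illustrative sketch): lane-wise average of x and y rounded
   down, computed as (x&y) + ((x^y)>>1) to avoid overflow:

     wwu_t avg = wwu_add( wwu_and( x, y ), wwu_shr( wwu_xor( x, y ), 1 ) ); */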

/* wwu_rol(x,n)          returns wwu( rotate_left (x0,n ), rotate_left (x1,n ), ... )
   wwu_ror(x,n)          returns wwu( rotate_right(x0,n ), rotate_right(x1,n ), ... )
   wwu_rol_variable(x,n) returns wwu( rotate_left (x0,n ), rotate_left (x1,n ), ... )
   wwu_ror_variable(x,n) returns wwu( rotate_right(x0,n ), rotate_right(x1,n ), ... )
   wwu_rol_vector(x,y)   returns wwu( rotate_left (x0,y0), rotate_left (x1,y1), ... )
   wwu_ror_vector(x,y)   returns wwu( rotate_right(x0,y0), rotate_right(x1,y1), ... )

   The variable variants are slower but do not require the rotate
   amount to be known at compile time. */

#define wwu_rol(a,imm)       _mm512_rol_epi32( (a), (imm)&31U )
#define wwu_ror(a,imm)       _mm512_ror_epi32( (a), (imm)&31U )

static inline wwu_t wwu_rol_variable( wwu_t a, uint n ) { return wwu_or( wwu_shl( a, n & 31U ), wwu_shr( a, (-n) & 31U ) ); }
static inline wwu_t wwu_ror_variable( wwu_t a, uint n ) { return wwu_or( wwu_shr( a, n & 31U ), wwu_shl( a, (-n) & 31U ) ); }

static inline wwu_t wwu_rol_vector( wwu_t a, wwu_t b ) {
  wwu_t m = wwu_bcast( 31U );
  return wwu_or( wwu_shl_vector( a, wwu_and( b, m ) ), wwu_shr_vector( a, wwu_and( wwu_neg( b ), m ) ) );
}

static inline wwu_t wwu_ror_vector( wwu_t a, wwu_t b ) {
  wwu_t m = wwu_bcast( 31U );
  return wwu_or( wwu_shr_vector( a, wwu_and( b, m ) ), wwu_shl_vector( a, wwu_and( wwu_neg( b ), m ) ) );
}
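
/* E.g. (illustrative sketch): the below all compute the same left
   rotate by 7 of every lane of x; wwu_rol / wwu_ror compile to a
   single vprold / vprord when the amount is a compile-time constant:

     wwu_t r0 = wwu_rol( x, 7 );
     wwu_t r1 = wwu_ror( x, 25 );
     wwu_t r2 = wwu_rol_variable( x, n );              (n==7 at runtime)
     wwu_t r3 = wwu_rol_vector( x, wwu_bcast( 7U ) ); */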

/* wwu_bswap(x) returns wwu( bswap(x0), bswap(x1), ... ) */

#define wwu_bswap( x ) _mm512_shuffle_epi8( (x), _mm512_set_epi8( 12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3, \
                                                                  12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3, \
                                                                  12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3, \
                                                                  12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3 ) )
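
/* E.g. (illustrative sketch): converting 16 big-endian uints from a
   byte stream to host order (msg is a hypothetical byte pointer of
   arbitrary alignment):

     wwu_t w = wwu_bswap( wwu_ldu( msg ) ); */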

/* Comparison operations */
/* mask(c0,c1,...) means (((int)c0)<<0) | (((int)c1)<<1) | ... */

#define wwu_eq(x,y) ((int)_mm512_cmpeq_epu32_mask(  (x), (y) )) /* mask( x0==y0, x1==y1, ... ) */
#define wwu_gt(x,y) ((int)_mm512_cmpgt_epu32_mask(  (x), (y) )) /* mask( x0> y0, x1> y1, ... ) */
#define wwu_lt(x,y) ((int)_mm512_cmplt_epu32_mask(  (x), (y) )) /* mask( x0< y0, x1< y1, ... ) */
#define wwu_ne(x,y) ((int)_mm512_cmpneq_epu32_mask( (x), (y) )) /* mask( x0!=y0, x1!=y1, ... ) */
#define wwu_ge(x,y) ((int)_mm512_cmpge_epu32_mask(  (x), (y) )) /* mask( x0>=y0, x1>=y1, ... ) */
#define wwu_le(x,y) ((int)_mm512_cmple_epu32_mask(  (x), (y) )) /* mask( x0<=y0, x1<=y1, ... ) */

#define wwu_lnot(x)    wwu_eq( (x), wwu_zero() )                /* mask(  !x0,  !x1, ... ) */
#define wwu_lnotnot(x) wwu_ne( (x), wwu_zero() )                /* mask( !!x0, !!x1, ... ) */
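
/* E.g. (illustrative sketch): testing whether any / all lanes of x
   exceed the corresponding lanes of y:

     int m   = wwu_gt( x, y );   bit i of m is (xi>yi)
     int any = !!m;
     int all = (m==0xffff); */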

/* Conditional operations */
/* cn means bit n of c */

#define wwu_if(c,x,y)          _mm512_mask_blend_epi32 ( (__mmask16)(c), (y), (x) )      /* wwu( c0? x0     :y0, ... ) */

#define wwu_add_if(c,x,y,z)    _mm512_mask_add_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0+y0) :z0, ... ) */
#define wwu_sub_if(c,x,y,z)    _mm512_mask_sub_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0-y0) :z0, ... ) */

#define wwu_and_if(c,x,y,z)    _mm512_mask_and_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0&y0) :z0, ... ) */
#define wwu_andnot_if(c,x,y,z) _mm512_mask_andnot_epi32( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(~x0&y0):z0, ... ) */
#define wwu_or_if(c,x,y,z)     _mm512_mask_or_epi32    ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0|y0) :z0, ... ) */
#define wwu_xor_if(c,x,y,z)    _mm512_mask_xor_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwu( c0?(x0^y0) :z0, ... ) */
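
/* E.g. (illustrative sketch): lane-wise subtract saturating at zero:

     int   c = wwu_lt( x, y );   lanes that would underflow
     wwu_t d = wwu_if( c, wwu_zero(), wwu_sub( x, y ) ); */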

/* Conversions */

/* wwu_to_wwi( x )    returns wwi(   (int)x0,   (int)x1, ...   (int)x15 )

   wwu_to_wwl( x, 0 ) returns wwl(  (long)x0,  (long)x2, ...  (long)x14 )
   wwu_to_wwl( x, 1 ) returns wwl(  (long)x1,  (long)x3, ...  (long)x15 )

   wwu_to_wwv( x, 0 ) returns wwv( (ulong)x0, (ulong)x2, ... (ulong)x14 )
   wwu_to_wwv( x, 1 ) returns wwv( (ulong)x1, (ulong)x3, ... (ulong)x15 )

   TODO: consider _mm512_cvtepu32_* intrinsics? */

#define wwu_to_wwi( x ) (x)
#define wwu_to_wwl( x, odd ) /* ternary should be compile time */ \
  (__extension__({ wwl_t _wwu_to_wwl_tmp = (x); wwl_shru( (odd) ? _wwu_to_wwl_tmp : wwl_shl( _wwu_to_wwl_tmp, 32 ), 32 ); }))
#define wwu_to_wwv( x, odd ) /* ternary should be compile time */ \
  (__extension__({ wwv_t _wwu_to_wwv_tmp = (x); wwv_shr ( (odd) ? _wwu_to_wwv_tmp : wwv_shl( _wwu_to_wwv_tmp, 32 ), 32 ); }))

#define wwu_to_wwi_raw(x) (x)
#define wwu_to_wwl_raw(x) (x)
#define wwu_to_wwv_raw(x) (x)
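
/* E.g. (illustrative sketch): wwu_to_wwl( x, 0 ) zero extends the even
   lanes of x while wwu_to_wwl_raw( x ) just reinterprets the bits, so
   long lane j of the raw result holds the bits of x{2j} (low half) and
   x{2j+1} (high half):

     wwl_t evens = wwu_to_wwl    ( x, 0 );  yields [ (long)x0 (long)x2 ... (long)x14 ]
     wwl_t bits  = wwu_to_wwl_raw( x );     bit pattern unchanged */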

/* Misc operations */

/* wwu_pack_halves(x,imm0,y,imm1) packs half of x and half of y into a
   wwu.  imm0/imm1 select which half of x and y to pack.  imm0 / imm1
   should be in [0,1].  That is, this returns:

     [ if( imm0, x(8:15), x(0:7) ) if( imm1, y(8:15), y(0:7) ) ]

   wwu_pack_h0_h1(x,y) does the wwu_pack_halves(x,0,y,1) case faster.
   Hat tip to Philip Taffet for pointing this out. */

#define wwu_pack_halves(x,imm0,y,imm1) _mm512_shuffle_i32x4( (x), (y), 68+10*(imm0)+160*(imm1) )
#define wwu_pack_h0_h1(x,y)            _mm512_mask_blend_epi32( (__mmask16)0xFF00, (x), (y) )
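
/* E.g. (illustrative sketch):

     wwu_t a = wwu_pack_halves( x,0, y,0 );  yields [ x0 ... x7 y0 ... y7 ]
     wwu_t b = wwu_pack_h0_h1 ( x,   y   );  yields [ x0 ... x7 y8 ... yf ] */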

/* wwu_slide(x,y,imm) treats x as a FIFO with the oldest / newest
   element at lane 0 / 15.  Returns the result of dequeuing x imm times
   and enqueuing the values y0 ... y{imm-1} in that order.  imm should
   be in [0,15].  For example, with imm==5, this returns:
     [ x5 x6 ... xf y0 y1 y2 y3 y4 ]. */

#define wwu_slide(x,y,imm) _mm512_alignr_epi32( (y), (x), (imm) )
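
/* E.g. (illustrative sketch): sliding x against itself rotates its
   lanes:

     wwu_t r = wwu_slide( x, x, 5 );  yields [ x5 x6 ... xf x0 x1 x2 x3 x4 ] */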

/* wwu_unpack unpacks the wwu_t x into its uint components x0,x1,...xf. */

#define wwu_unpack( x, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf ) do { \
    __m512i _wwu_unpack_x  = (x);                                             \
    __m256i _wwu_unpack_xl = _mm512_extracti32x8_epi32( _wwu_unpack_x, 0 );   \
    __m256i _wwu_unpack_xh = _mm512_extracti32x8_epi32( _wwu_unpack_x, 1 );   \
    (x0) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 0 );                   \
    (x1) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 1 );                   \
    (x2) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 2 );                   \
    (x3) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 3 );                   \
    (x4) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 4 );                   \
    (x5) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 5 );                   \
    (x6) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 6 );                   \
    (x7) = (uint)_mm256_extract_epi32( _wwu_unpack_xl, 7 );                   \
    (x8) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 0 );                   \
    (x9) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 1 );                   \
    (xa) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 2 );                   \
    (xb) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 3 );                   \
    (xc) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 4 );                   \
    (xd) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 5 );                   \
    (xe) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 6 );                   \
    (xf) = (uint)_mm256_extract_epi32( _wwu_unpack_xh, 7 );                   \
  } while(0)
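
/* E.g. (illustrative sketch): a scalar horizontal sum of the lanes of
   x (a stand-in until a proper REDUCE op lands per the TODO above):

     uint s0,s1,s2,s3,s4,s5,s6,s7,s8,s9,sa,sb,sc,sd,se,sf;
     wwu_unpack( x, s0,s1,s2,s3,s4,s5,s6,s7,s8,s9,sa,sb,sc,sd,se,sf );
     uint sum = s0+s1+s2+s3+s4+s5+s6+s7+s8+s9+sa+sb+sc+sd+se+sf; */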

/* wwu_transpose_16x16 sets wwu_t's c0,c1,...cf to the columns of a
   16x16 uint matrix given the rows of the matrix in wwu_t's
   r0,r1,...rf.  In-place operation fine. */

#define wwu_transpose_16x16( r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,ra,rb,rc,rd,re,rf,                      \
                             c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf ) do {                \
    wwu_t _wwu_transpose_r0 = (r0); wwu_t _wwu_transpose_r1 = (r1);                                \
    wwu_t _wwu_transpose_r2 = (r2); wwu_t _wwu_transpose_r3 = (r3);                                \
    wwu_t _wwu_transpose_r4 = (r4); wwu_t _wwu_transpose_r5 = (r5);                                \
    wwu_t _wwu_transpose_r6 = (r6); wwu_t _wwu_transpose_r7 = (r7);                                \
    wwu_t _wwu_transpose_r8 = (r8); wwu_t _wwu_transpose_r9 = (r9);                                \
    wwu_t _wwu_transpose_ra = (ra); wwu_t _wwu_transpose_rb = (rb);                                \
    wwu_t _wwu_transpose_rc = (rc); wwu_t _wwu_transpose_rd = (rd);                                \
    wwu_t _wwu_transpose_re = (re); wwu_t _wwu_transpose_rf = (rf);                                \
                                                                                                   \
    /* Outer 4x4 transpose of 4x4 blocks */                                                        \
    wwu_t _wwu_transpose_t0  = _mm512_shuffle_i32x4( _wwu_transpose_r0, _wwu_transpose_r4, 0x88 ); \
    wwu_t _wwu_transpose_t1  = _mm512_shuffle_i32x4( _wwu_transpose_r1, _wwu_transpose_r5, 0x88 ); \
    wwu_t _wwu_transpose_t2  = _mm512_shuffle_i32x4( _wwu_transpose_r2, _wwu_transpose_r6, 0x88 ); \
    wwu_t _wwu_transpose_t3  = _mm512_shuffle_i32x4( _wwu_transpose_r3, _wwu_transpose_r7, 0x88 ); \
    wwu_t _wwu_transpose_t4  = _mm512_shuffle_i32x4( _wwu_transpose_r0, _wwu_transpose_r4, 0xdd ); \
    wwu_t _wwu_transpose_t5  = _mm512_shuffle_i32x4( _wwu_transpose_r1, _wwu_transpose_r5, 0xdd ); \
    wwu_t _wwu_transpose_t6  = _mm512_shuffle_i32x4( _wwu_transpose_r2, _wwu_transpose_r6, 0xdd ); \
    wwu_t _wwu_transpose_t7  = _mm512_shuffle_i32x4( _wwu_transpose_r3, _wwu_transpose_r7, 0xdd ); \
    wwu_t _wwu_transpose_t8  = _mm512_shuffle_i32x4( _wwu_transpose_r8, _wwu_transpose_rc, 0x88 ); \
    wwu_t _wwu_transpose_t9  = _mm512_shuffle_i32x4( _wwu_transpose_r9, _wwu_transpose_rd, 0x88 ); \
    wwu_t _wwu_transpose_ta  = _mm512_shuffle_i32x4( _wwu_transpose_ra, _wwu_transpose_re, 0x88 ); \
    wwu_t _wwu_transpose_tb  = _mm512_shuffle_i32x4( _wwu_transpose_rb, _wwu_transpose_rf, 0x88 ); \
    wwu_t _wwu_transpose_tc  = _mm512_shuffle_i32x4( _wwu_transpose_r8, _wwu_transpose_rc, 0xdd ); \
    wwu_t _wwu_transpose_td  = _mm512_shuffle_i32x4( _wwu_transpose_r9, _wwu_transpose_rd, 0xdd ); \
    wwu_t _wwu_transpose_te  = _mm512_shuffle_i32x4( _wwu_transpose_ra, _wwu_transpose_re, 0xdd ); \
    wwu_t _wwu_transpose_tf  = _mm512_shuffle_i32x4( _wwu_transpose_rb, _wwu_transpose_rf, 0xdd ); \
                                                                                                   \
    /**/  _wwu_transpose_r0  = _mm512_shuffle_i32x4( _wwu_transpose_t0, _wwu_transpose_t8, 0x88 ); \
    /**/  _wwu_transpose_r1  = _mm512_shuffle_i32x4( _wwu_transpose_t1, _wwu_transpose_t9, 0x88 ); \
    /**/  _wwu_transpose_r2  = _mm512_shuffle_i32x4( _wwu_transpose_t2, _wwu_transpose_ta, 0x88 ); \
    /**/  _wwu_transpose_r3  = _mm512_shuffle_i32x4( _wwu_transpose_t3, _wwu_transpose_tb, 0x88 ); \
    /**/  _wwu_transpose_r4  = _mm512_shuffle_i32x4( _wwu_transpose_t4, _wwu_transpose_tc, 0x88 ); \
    /**/  _wwu_transpose_r5  = _mm512_shuffle_i32x4( _wwu_transpose_t5, _wwu_transpose_td, 0x88 ); \
    /**/  _wwu_transpose_r6  = _mm512_shuffle_i32x4( _wwu_transpose_t6, _wwu_transpose_te, 0x88 ); \
    /**/  _wwu_transpose_r7  = _mm512_shuffle_i32x4( _wwu_transpose_t7, _wwu_transpose_tf, 0x88 ); \
    /**/  _wwu_transpose_r8  = _mm512_shuffle_i32x4( _wwu_transpose_t0, _wwu_transpose_t8, 0xdd ); \
    /**/  _wwu_transpose_r9  = _mm512_shuffle_i32x4( _wwu_transpose_t1, _wwu_transpose_t9, 0xdd ); \
    /**/  _wwu_transpose_ra  = _mm512_shuffle_i32x4( _wwu_transpose_t2, _wwu_transpose_ta, 0xdd ); \
    /**/  _wwu_transpose_rb  = _mm512_shuffle_i32x4( _wwu_transpose_t3, _wwu_transpose_tb, 0xdd ); \
    /**/  _wwu_transpose_rc  = _mm512_shuffle_i32x4( _wwu_transpose_t4, _wwu_transpose_tc, 0xdd ); \
    /**/  _wwu_transpose_rd  = _mm512_shuffle_i32x4( _wwu_transpose_t5, _wwu_transpose_td, 0xdd ); \
    /**/  _wwu_transpose_re  = _mm512_shuffle_i32x4( _wwu_transpose_t6, _wwu_transpose_te, 0xdd ); \
    /**/  _wwu_transpose_rf  = _mm512_shuffle_i32x4( _wwu_transpose_t7, _wwu_transpose_tf, 0xdd ); \
                                                                                                   \
    /* Inner 4x4 transpose of 1x1 blocks */                                                        \
    /**/  _wwu_transpose_t0  = _mm512_unpacklo_epi32( _wwu_transpose_r0, _wwu_transpose_r2 );      \
    /**/  _wwu_transpose_t1  = _mm512_unpacklo_epi32( _wwu_transpose_r1, _wwu_transpose_r3 );      \
    /**/  _wwu_transpose_t2  = _mm512_unpackhi_epi32( _wwu_transpose_r0, _wwu_transpose_r2 );      \
    /**/  _wwu_transpose_t3  = _mm512_unpackhi_epi32( _wwu_transpose_r1, _wwu_transpose_r3 );      \
    /**/  _wwu_transpose_t4  = _mm512_unpacklo_epi32( _wwu_transpose_r4, _wwu_transpose_r6 );      \
    /**/  _wwu_transpose_t5  = _mm512_unpacklo_epi32( _wwu_transpose_r5, _wwu_transpose_r7 );      \
    /**/  _wwu_transpose_t6  = _mm512_unpackhi_epi32( _wwu_transpose_r4, _wwu_transpose_r6 );      \
    /**/  _wwu_transpose_t7  = _mm512_unpackhi_epi32( _wwu_transpose_r5, _wwu_transpose_r7 );      \
    /**/  _wwu_transpose_t8  = _mm512_unpacklo_epi32( _wwu_transpose_r8, _wwu_transpose_ra );      \
    /**/  _wwu_transpose_t9  = _mm512_unpacklo_epi32( _wwu_transpose_r9, _wwu_transpose_rb );      \
    /**/  _wwu_transpose_ta  = _mm512_unpackhi_epi32( _wwu_transpose_r8, _wwu_transpose_ra );      \
    /**/  _wwu_transpose_tb  = _mm512_unpackhi_epi32( _wwu_transpose_r9, _wwu_transpose_rb );      \
    /**/  _wwu_transpose_tc  = _mm512_unpacklo_epi32( _wwu_transpose_rc, _wwu_transpose_re );      \
    /**/  _wwu_transpose_td  = _mm512_unpacklo_epi32( _wwu_transpose_rd, _wwu_transpose_rf );      \
    /**/  _wwu_transpose_te  = _mm512_unpackhi_epi32( _wwu_transpose_rc, _wwu_transpose_re );      \
    /**/  _wwu_transpose_tf  = _mm512_unpackhi_epi32( _wwu_transpose_rd, _wwu_transpose_rf );      \
                                                                                                   \
    /**/  (c0)               = _mm512_unpacklo_epi32( _wwu_transpose_t0, _wwu_transpose_t1 );      \
    /**/  (c1)               = _mm512_unpackhi_epi32( _wwu_transpose_t0, _wwu_transpose_t1 );      \
    /**/  (c2)               = _mm512_unpacklo_epi32( _wwu_transpose_t2, _wwu_transpose_t3 );      \
    /**/  (c3)               = _mm512_unpackhi_epi32( _wwu_transpose_t2, _wwu_transpose_t3 );      \
    /**/  (c4)               = _mm512_unpacklo_epi32( _wwu_transpose_t4, _wwu_transpose_t5 );      \
    /**/  (c5)               = _mm512_unpackhi_epi32( _wwu_transpose_t4, _wwu_transpose_t5 );      \
    /**/  (c6)               = _mm512_unpacklo_epi32( _wwu_transpose_t6, _wwu_transpose_t7 );      \
    /**/  (c7)               = _mm512_unpackhi_epi32( _wwu_transpose_t6, _wwu_transpose_t7 );      \
    /**/  (c8)               = _mm512_unpacklo_epi32( _wwu_transpose_t8, _wwu_transpose_t9 );      \
    /**/  (c9)               = _mm512_unpackhi_epi32( _wwu_transpose_t8, _wwu_transpose_t9 );      \
    /**/  (ca)               = _mm512_unpacklo_epi32( _wwu_transpose_ta, _wwu_transpose_tb );      \
    /**/  (cb)               = _mm512_unpackhi_epi32( _wwu_transpose_ta, _wwu_transpose_tb );      \
    /**/  (cc)               = _mm512_unpacklo_epi32( _wwu_transpose_tc, _wwu_transpose_td );      \
    /**/  (cd)               = _mm512_unpackhi_epi32( _wwu_transpose_tc, _wwu_transpose_td );      \
    /**/  (ce)               = _mm512_unpacklo_epi32( _wwu_transpose_te, _wwu_transpose_tf );      \
    /**/  (cf)               = _mm512_unpackhi_epi32( _wwu_transpose_te, _wwu_transpose_tf );      \
  } while(0)
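
/* E.g. (illustrative sketch): transposing a 16x16 uint matrix stored
   row-major in a 64-byte aligned array m of 256 uints, in place:

     wwu_t r[16];
     for( int i=0; i<16; i++ ) r[i] = wwu_ld( m + 16*i );
     wwu_transpose_16x16( r[ 0],r[ 1],r[ 2],r[ 3],r[ 4],r[ 5],r[ 6],r[ 7],
                          r[ 8],r[ 9],r[10],r[11],r[12],r[13],r[14],r[15],
                          r[ 0],r[ 1],r[ 2],r[ 3],r[ 4],r[ 5],r[ 6],r[ 7],
                          r[ 8],r[ 9],r[10],r[11],r[12],r[13],r[14],r[15] );
     for( int i=0; i<16; i++ ) wwu_st( m + 16*i, r[i] ); */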

/* wwu_transpose_2x8x8 transposes the 2 8x8 matrices whose rows are
   held in the lower and upper halves of wwu_t's r0,r1...r7 and
   stores the result in c0,c1...c7.  In-place operation fine. */

#define wwu_transpose_2x8x8( r0,r1,r2,r3,r4,r5,r6,r7,                                                \
                             c0,c1,c2,c3,c4,c5,c6,c7 ) do {                                          \
    wwu_t _wwu_transpose_r0 = (r0); wwu_t _wwu_transpose_r1 = (r1);                                  \
    wwu_t _wwu_transpose_r2 = (r2); wwu_t _wwu_transpose_r3 = (r3);                                  \
    wwu_t _wwu_transpose_r4 = (r4); wwu_t _wwu_transpose_r5 = (r5);                                  \
    wwu_t _wwu_transpose_r6 = (r6); wwu_t _wwu_transpose_r7 = (r7);                                  \
                                                                                                     \
    /* Outer 2x2 transpose of 4x4 blocks */                                                          \
    /* No _mm256_permute2f128_si128 equiv? sigh ... probably a better method possible here */        \
    wwu_t _wwu_transpose_p   = wwu( 0U, 1U, 2U, 3U,16U,17U,18U,19U, 8U, 9U,10U,11U,24U,25U,26U,27U); \
    wwu_t _wwu_transpose_q   = wwu( 4U, 5U, 6U, 7U,20U,21U,22U,23U,12U,13U,14U,15U,28U,29U,30U,31U); \
    wwu_t _wwu_transpose_t0  = wwu_select( _wwu_transpose_p, _wwu_transpose_r0, _wwu_transpose_r4 ); \
    wwu_t _wwu_transpose_t1  = wwu_select( _wwu_transpose_p, _wwu_transpose_r1, _wwu_transpose_r5 ); \
    wwu_t _wwu_transpose_t2  = wwu_select( _wwu_transpose_p, _wwu_transpose_r2, _wwu_transpose_r6 ); \
    wwu_t _wwu_transpose_t3  = wwu_select( _wwu_transpose_p, _wwu_transpose_r3, _wwu_transpose_r7 ); \
    wwu_t _wwu_transpose_t4  = wwu_select( _wwu_transpose_q, _wwu_transpose_r0, _wwu_transpose_r4 ); \
    wwu_t _wwu_transpose_t5  = wwu_select( _wwu_transpose_q, _wwu_transpose_r1, _wwu_transpose_r5 ); \
    wwu_t _wwu_transpose_t6  = wwu_select( _wwu_transpose_q, _wwu_transpose_r2, _wwu_transpose_r6 ); \
    wwu_t _wwu_transpose_t7  = wwu_select( _wwu_transpose_q, _wwu_transpose_r3, _wwu_transpose_r7 ); \
                                                                                                     \
    /* Inner 4x4 transpose of 1x1 blocks */                                                          \
    /**/  _wwu_transpose_r0  = _mm512_unpacklo_epi32( _wwu_transpose_t0, _wwu_transpose_t2 );        \
    /**/  _wwu_transpose_r1  = _mm512_unpacklo_epi32( _wwu_transpose_t1, _wwu_transpose_t3 );        \
    /**/  _wwu_transpose_r2  = _mm512_unpackhi_epi32( _wwu_transpose_t0, _wwu_transpose_t2 );        \
    /**/  _wwu_transpose_r3  = _mm512_unpackhi_epi32( _wwu_transpose_t1, _wwu_transpose_t3 );        \
    /**/  _wwu_transpose_r4  = _mm512_unpacklo_epi32( _wwu_transpose_t4, _wwu_transpose_t6 );        \
    /**/  _wwu_transpose_r5  = _mm512_unpacklo_epi32( _wwu_transpose_t5, _wwu_transpose_t7 );        \
    /**/  _wwu_transpose_r6  = _mm512_unpackhi_epi32( _wwu_transpose_t4, _wwu_transpose_t6 );        \
    /**/  _wwu_transpose_r7  = _mm512_unpackhi_epi32( _wwu_transpose_t5, _wwu_transpose_t7 );        \
                                                                                                     \
    /**/  (c0)               = _mm512_unpacklo_epi32( _wwu_transpose_r0, _wwu_transpose_r1 );        \
    /**/  (c1)               = _mm512_unpackhi_epi32( _wwu_transpose_r0, _wwu_transpose_r1 );        \
    /**/  (c2)               = _mm512_unpacklo_epi32( _wwu_transpose_r2, _wwu_transpose_r3 );        \
    /**/  (c3)               = _mm512_unpackhi_epi32( _wwu_transpose_r2, _wwu_transpose_r3 );        \
    /**/  (c4)               = _mm512_unpacklo_epi32( _wwu_transpose_r4, _wwu_transpose_r5 );        \
    /**/  (c5)               = _mm512_unpackhi_epi32( _wwu_transpose_r4, _wwu_transpose_r5 );        \
    /**/  (c6)               = _mm512_unpacklo_epi32( _wwu_transpose_r6, _wwu_transpose_r7 );        \
    /**/  (c7)               = _mm512_unpackhi_epi32( _wwu_transpose_r6, _wwu_transpose_r7 );        \
  } while(0)
