LCOV - code coverage report
Current view: top level - util/simd - fd_avx512_wwi.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 162 162 100.0 %
Date: 2025-01-08 12:08:44 Functions: 9 312 2.9 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_simd_fd_avx512_h
       2             : #error "Do not include this directly; use fd_avx512.h"
       3             : #endif
       4             : 
       5             : /* TODO: REDUCE, EXTRACT, ADDITIONAL LANE OPS, ... */
       6             : 
       7             : /* Vector int API ****************************************************/
       8             : 
       9             : /* A wwi_t is a vector where each 32-bit wide lane holds a signed two's
      10             :    complement 32-bit integer (an "int").
      11             : 
      12             :    These mirror the other APIs as much as possible.  Macros are
      13             :    preferred over static inlines when it is possible to do it robustly
      14             :    to reduce the risk of the compiler mucking it up. */
      15             : 
      16    90000000 : #define wwi_t __m512i
      17             : 
      18             : /* Constructors */
      19             : 
      20             : /* wwi(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf)
      21             :    returns the wwi_t [x0 x1 ... xf] (x0 in lane 0) where x* are ints */
      22             : 
      23             : #define wwi(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf) \
      24     8000000 :   _mm512_setr_epi32( (x0), (x1), (x2), (x3), (x4), (x5), (x6), (x7), (x8), (x9), (xa), (xb), (xc), (xd), (xe), (xf) )
      25             : 
      26     2000000 : #define wwi_bcast(x)         _mm512_set1_epi32( (x) ) /* wwi(x, x, ... x) */
      27             : 
      28             : /* wwi_permute(p,x) returns:
      29             :      wwi( x(p(0)), x(p(1)), ... x(p(15)) ).
      30             :    As such p(*) should be ints in [0,15] (p is itself a wwi_t). */
      31             : 
      32             : #define wwi_permute(p,x)     _mm512_permutexvar_epi32( (p), (x) )
      33             : 
      34             : /* wwi_select(p,x,y) concatenates the wwi_t's x and y into
      35             :      z = [ x0 x1 ... xf y0 y1 ... yf ]
      36             :    and then returns:
      37             :      wwi( z(p(0)), z(p(1)), ... z(p(15)) ).
      38             :    As such p(*) should be ints in [0,31]. */
      39             : 
      40     8000000 : #define wwi_select(p,x,y)    _mm512_permutex2var_epi32( (x), (p), (y) )
      41             : 
      42             : /* Predefined constants (every lane holds the same value) */
      43             : 
      44             : #define wwi_zero()           _mm512_setzero_si512()  /* wwi(0, 0, ... 0) */
      45             : #define wwi_one()            _mm512_set1_epi32( 1 )  /* wwi(1, 1, ... 1) */
      46             : 
      47             : /* Memory operations */
      48             : /* Note: wwi_{ld,st} assume m is 64-byte aligned while wwi_{ldu,stu}
      49             :    allow m to have arbitrary alignment (and take a void pointer) */
      50             : 
      51     1000000 : static inline wwi_t wwi_ld( int const * m ) { return _mm512_load_epi32( m ); }  /* wwi( m[0], m[1], ... m[15] ) */
      52   172000000 : static inline void  wwi_st( int * m, wwi_t x ) { _mm512_store_epi32( m, x ); }  /* does m[0] = x0, m[1] = x1, ... m[15] = xf */
      53             : 
      54     1000000 : static inline wwi_t wwi_ldu( void const * m ) { return _mm512_loadu_epi32( m ); } /* wwi( m[0], m[1], ... m[15]) */
      55     1000000 : static inline void  wwi_stu( void * m, wwi_t x ) { _mm512_storeu_epi32( m, x ); } /* does m[0] = x0, m[1] = x1, ... m[15] = xf */
      56             : 
      57             : /* Arithmetic operations (lane-wise; wwi_mul keeps the low 32 bits of each product) */
      58             : 
      59             : #define wwi_neg(x)           _mm512_sub_epi32( _mm512_setzero_si512(), (x) ) /* wwi( -x0,  -x1,  ... -xf  ) */
      60             : #define wwi_abs(x)           _mm512_abs_epi32( (x) )                         /* wwi( |x0|, |x1|, ... |xf| ) */
      61             : 
      62             : #define wwi_min(x,y)         _mm512_min_epi32  ( (x), (y) ) /* wwi( min(x0,y0), min(x1,y1), ... min(xf,yf) ) */
      63             : #define wwi_max(x,y)         _mm512_max_epi32  ( (x), (y) ) /* wwi( max(x0,y0), max(x1,y1), ... max(xf,yf) ) */
      64             : #define wwi_add(x,y)         _mm512_add_epi32  ( (x), (y) ) /* wwi( x0+y0,      x1+y1,      ... xf+yf      ) */
      65             : #define wwi_sub(x,y)         _mm512_sub_epi32  ( (x), (y) ) /* wwi( x0-y0,      x1-y1,      ... xf-yf      ) */
      66             : #define wwi_mul(x,y)         _mm512_mullo_epi32( (x), (y) ) /* wwi( x0*y0,      x1*y1,      ... xf*yf      ) */
      67             : 
      68             : /* Binary operations */
      69             : /* Note: shifts assume n and/or y* in [0,31].  Rotates work for
      70             :    arbitrary values */
      71             : 
      72     4000000 : #define wwi_not(x)           _mm512_xor_epi32( _mm512_set1_epi32( -1 ), (x) ) /* wwi( ~x0, ~x1, ... ~xf ) */
      73             : 
      74             : #define wwi_shl(x,n)         _mm512_slli_epi32  ( (x), (uint)(n) ) /* wwi( x0<<n,  x1<<n,  ... xf<<n  ) */
      75             : #define wwi_shr(x,n)         _mm512_srai_epi32  ( (x), (uint)(n) ) /* wwi( x0>>n,  x1>>n,  ... xf>>n  ) */
      76             : #define wwi_shru(x,n)        _mm512_srli_epi32  ( (x), (uint)(n) ) /* wwi( x0>>n,  x1>>n,  ... xf>>n  ) (unsigned right shift) */
      77             : #define wwi_shl_vector(x,y)  _mm512_sllv_epi32  ( (x), (y) )       /* wwi( x0<<y0, x1<<y1, ... xf<<yf ) */
      78             : #define wwi_shr_vector(x,y)  _mm512_srav_epi32  ( (x), (y) )       /* wwi( x0>>y0, x1>>y1, ... xf>>yf ) */
      79             : #define wwi_shru_vector(x,y) _mm512_srlv_epi32  ( (x), (y) )       /* wwi( x0>>y0, x1>>y1, ... xf>>yf ) (unsigned right shift) */
      80             : #define wwi_and(x,y)         _mm512_and_epi32   ( (x), (y) )       /* wwi( x0&y0,  x1&y1,  ... xf&yf  ) */
      81             : #define wwi_andnot(x,y)      _mm512_andnot_epi32( (x), (y) )       /* wwi( ~x0&y0, ~x1&y1, ... ~xf&yf ) */
      82     4000000 : #define wwi_or(x,y)          _mm512_or_epi32    ( (x), (y) )       /* wwi( x0|y0,  x1|y1,  ... xf|yf  ) */
      83             : #define wwi_xor(x,y)         _mm512_xor_epi32   ( (x), (y) )       /* wwi( x0^y0,  x1^y1,  ... xf^yf  ) */
      84             : 
      85             : /* wwi_rol(x,n)          returns wwi( rotate_left (x0,n ), rotate_left (x1,n ), ... )
      86             :    wwi_ror(x,n)          returns wwi( rotate_right(x0,n ), rotate_right(x1,n ), ... )
      87             :    wwi_rol_variable(x,n) returns wwi( rotate_left (x0,n ), rotate_left (x1,n ), ... )
      88             :    wwi_ror_variable(x,n) returns wwi( rotate_right(x0,n ), rotate_right(x1,n ), ... )
      89             :    wwi_rol_vector(x,y)   returns wwi( rotate_left (x0,y0), rotate_left (x1,y1), ... )
      90             :    wwi_ror_vector(x,y)   returns wwi( rotate_right(x0,y0), rotate_right(x1,y1), ... )
      91             : 
      92             :    The variable variants are slower but do not require the shift amount
      93             :    to be known at compile time.  Rotate counts are reduced mod 32 (& 31). */
      94             : 
      95     4000000 : #define wwi_rol(a,imm)       _mm512_rol_epi32( (a), (imm)&31 )
      96     4000000 : #define wwi_ror(a,imm)       _mm512_ror_epi32( (a), (imm)&31 )
      97             : 
      98     1000000 : static inline wwi_t wwi_rol_variable( wwi_t a, int n ) { return wwi_or( wwi_shl ( a, n & 31 ), wwi_shru( a, (-n) & 31 ) ); }
      99     1000000 : static inline wwi_t wwi_ror_variable( wwi_t a, int n ) { return wwi_or( wwi_shru( a, n & 31 ), wwi_shl ( a, (-n) & 31 ) ); }
     100             : 
     101             : 
     102     1000000 : static inline wwi_t wwi_rol_vector( wwi_t a, wwi_t b ) { /* per lane: (a << (b&31)) | (a >> ((-b)&31)), >> being the unsigned shift */
     103     1000000 :   wwi_t m = wwi_bcast( 31 ); /* mask that reduces each lane's rotate count mod 32 */
     104     1000000 :   return wwi_or( wwi_shl_vector ( a, wwi_and( b, m ) ), wwi_shru_vector( a, wwi_and( wwi_neg( b ), m ) ) );
     105     1000000 : }
     106             : 
     107     1000000 : static inline wwi_t wwi_ror_vector( wwi_t a, wwi_t b ) { /* per lane: (a >> (b&31)) | (a << ((-b)&31)), >> being the unsigned shift */
     108     1000000 :   wwi_t m = wwi_bcast( 31 ); /* mask that reduces each lane's rotate count mod 32 */
     109     1000000 :   return wwi_or( wwi_shru_vector( a, wwi_and( b, m ) ), wwi_shl_vector ( a, wwi_and( wwi_neg( b ), m ) ) );
     110     1000000 : }
     111             : 
     112             : /* Comparison operations (lane-wise; each yields a 16-bit lane mask cast to int) */
     113             : /* mask(c0,c1,...) means (((int)c0)<<0) | (((int)c1)<<1) | ... */
     114             : 
     115             : #define wwi_eq(x,y) ((int)_mm512_cmpeq_epi32_mask(  (x), (y) )) /* mask( x0==y0, x1==y1, ... ) */
     116             : #define wwi_gt(x,y) ((int)_mm512_cmpgt_epi32_mask(  (x), (y) )) /* mask( x0> y0, x1> y1, ... ) */
     117             : #define wwi_lt(x,y) ((int)_mm512_cmplt_epi32_mask(  (x), (y) )) /* mask( x0< y0, x1< y1, ... ) */
     118             : #define wwi_ne(x,y) ((int)_mm512_cmpneq_epi32_mask( (x), (y) )) /* mask( x0!=y0, x1!=y1, ... ) */
     119             : #define wwi_ge(x,y) ((int)_mm512_cmpge_epi32_mask(  (x), (y) )) /* mask( x0>=y0, x1>=y1, ... ) */
     120             : #define wwi_le(x,y) ((int)_mm512_cmple_epi32_mask(  (x), (y) )) /* mask( x0<=y0, x1<=y1, ... ) */
     121             : 
     122             : #define wwi_lnot(x)    wwi_eq( (x), wwi_zero() )                /* mask(  !x0,  !x1, ... ) */
     123             : #define wwi_lnotnot(x) wwi_ne( (x), wwi_zero() )                /* mask( !!x0, !!x1, ... ) */
     124             : 
     125             : /* Conditional operations */
     126             : /* cn means bit n of c (c is cast to a 16-bit lane mask, one bit per lane) */
     127             : 
     128     2000000 : #define wwi_if(c,x,y)          _mm512_mask_blend_epi32 ( (__mmask16)(c), (y), (x) )    /* wwi( c0? x0    :y0, ... ) */
     129             : 
     130             : #define wwi_add_if(c,x,y,z)    _mm512_mask_add_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwi( c0?(x0+y0):z0, ... ) */
     131             : #define wwi_sub_if(c,x,y,z)    _mm512_mask_sub_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwi( c0?(x0-y0):z0, ... ) */
     132             : 
     133             : #define wwi_and_if(c,x,y,z)    _mm512_mask_and_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwi( c0?(x0&y0):z0, ... ) */
     134             : #define wwi_andnot_if(c,x,y,z) _mm512_mask_andnot_epi32( (z), (__mmask16)(c), (x), (y) ) /* wwi( c0?(~x0&y0):z0, ... ) */
     135             : #define wwi_or_if(c,x,y,z)     _mm512_mask_or_epi32    ( (z), (__mmask16)(c), (x), (y) ) /* wwi( c0?(x0|y0):z0, ... ) */
     136             : #define wwi_xor_if(c,x,y,z)    _mm512_mask_xor_epi32   ( (z), (__mmask16)(c), (x), (y) ) /* wwi( c0?(x0^y0):z0, ... ) */
     137             : 
     138             : /* Conversions */
     139             : 
     140             : /* wwi_to_wwu( x )    returns wwu(  (uint)x0,  (uint)x1, ...  (uint)x15 )
     141             : 
     142             :    wwi_to_wwl( x, 0 ) returns wwl(  (long)x0,  (long)x2, ...  (long)x14 )
     143             :    wwi_to_wwl( x, 1 ) returns wwl(  (long)x1,  (long)x3, ...  (long)x15 )
     144             : 
     145             :    wwi_to_wwv( x, 0 ) returns wwv( (ulong)x0, (ulong)x2, ... (ulong)x14 )
     146             :    wwi_to_wwv( x, 1 ) returns wwv( (ulong)x1, (ulong)x3, ... (ulong)x15 )
     147             : 
     148             :    TODO: consider _mm512_cvtepi32_* intrinsics? */
     149             : 
     150             : #define wwi_to_wwu( x ) (x)
     151             : #define wwi_to_wwl( x, odd ) /* trinary should be compile time */ \
     152             :   (__extension__({ wwl_t _wwi_to_wwl_tmp = (x); wwl_shr( (odd) ? _wwi_to_wwl_tmp : wwl_shl( _wwi_to_wwl_tmp, 32 ), 32 ); }))
     153             : #define wwi_to_wwv( x, odd ) /* trinary should be compile time (yes, wwl_shr; presumably so the int is sign extended) */ \
     154             :   (__extension__({ wwv_t _wwi_to_wwv_tmp = (x); wwl_shr( (odd) ? _wwi_to_wwv_tmp : wwv_shl( _wwi_to_wwv_tmp, 32 ), 32 ); }))
     155             : 
     156             : #define wwi_to_wwu_raw(x) (x)
     157             : #define wwi_to_wwl_raw(x) (x)
     158             : #define wwi_to_wwv_raw(x) (x)
     159             : 
     160             : /* Misc operations */
     161             : 
     162             : /* wwi_pack_halves(x,imm0,y,imm1) packs half of x and half of y into a
     163             :    wwi.  imm0/imm1 select which half of x and y to pack.  imm0 / imm1
     164             :    should be compile time constants in [0,1].  That is, this returns:
     165             : 
     166             :      [ if( imm0, x(8:15), x(0:7) ) if( imm1, y(8:15), y(0:7) ) ]
     167             : 
     168             :    wwi_pack_h0_h1(x,y) does the wwi_pack_halves(x,0,y,1) case faster.
     169             :    Hat tip to Philip Taffet for pointing this out. */
     170             : 
     171             : #define wwi_pack_halves(x,imm0,y,imm1) _mm512_shuffle_i32x4( (x), (y), 68+10*(imm0)+160*(imm1) )
     172             : #define wwi_pack_h0_h1(x,y)            _mm512_mask_blend_epi32( (__mmask16)0xFF00, (x), (y) )
     173             : 
     174             : /* wwi_slide(x,y,imm) treats x as a FIFO with the oldest / newest
     175             :    element at lane 0 / 15.  Returns the result of dequeuing x imm times
     176             :    and enqueuing the values y0 ... y{imm-1} in that order.  imm should be
     177             :    in [0,15].  For example, in the imm==5 case, returns:
     178             :      [ x5 x6 ... xf y0 y1 y2 y3 y4 ]. */
     179             : 
     180             : #define wwi_slide(x,y,imm) _mm512_alignr_epi32( (y), (x), (imm) )
     181             : 
     182             : /* wwi_unpack unpacks the wwi x into its int components x0,x1,...xf. */
     183             : 
     184     1000000 : #define wwi_unpack( x, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xa,xb,xc,xd,xe,xf ) do { \
     185     1000000 :     __m512i _wwi_unpack_x  = (x);                                             \
     186     1000000 :     __m256i _wwi_unpack_xl = _mm512_extracti32x8_epi32( _wwi_unpack_x, 0 );   \
     187     1000000 :     __m256i _wwi_unpack_xh = _mm512_extracti32x8_epi32( _wwi_unpack_x, 1 );   \
     188     1000000 :     (x0) = _mm256_extract_epi32( _wwi_unpack_xl, 0 );                         \
     189     1000000 :     (x1) = _mm256_extract_epi32( _wwi_unpack_xl, 1 );                         \
     190     1000000 :     (x2) = _mm256_extract_epi32( _wwi_unpack_xl, 2 );                         \
     191     1000000 :     (x3) = _mm256_extract_epi32( _wwi_unpack_xl, 3 );                         \
     192     1000000 :     (x4) = _mm256_extract_epi32( _wwi_unpack_xl, 4 );                         \
     193     1000000 :     (x5) = _mm256_extract_epi32( _wwi_unpack_xl, 5 );                         \
     194     1000000 :     (x6) = _mm256_extract_epi32( _wwi_unpack_xl, 6 );                         \
     195     1000000 :     (x7) = _mm256_extract_epi32( _wwi_unpack_xl, 7 );                         \
     196     1000000 :     (x8) = _mm256_extract_epi32( _wwi_unpack_xh, 0 );                         \
     197     1000000 :     (x9) = _mm256_extract_epi32( _wwi_unpack_xh, 1 );                         \
     198     1000000 :     (xa) = _mm256_extract_epi32( _wwi_unpack_xh, 2 );                         \
     199     1000000 :     (xb) = _mm256_extract_epi32( _wwi_unpack_xh, 3 );                         \
     200     1000000 :     (xc) = _mm256_extract_epi32( _wwi_unpack_xh, 4 );                         \
     201     1000000 :     (xd) = _mm256_extract_epi32( _wwi_unpack_xh, 5 );                         \
     202     1000000 :     (xe) = _mm256_extract_epi32( _wwi_unpack_xh, 6 );                         \
     203     1000000 :     (xf) = _mm256_extract_epi32( _wwi_unpack_xh, 7 );                         \
     204     1000000 :   } while(0)
     205             : 
     206             : /* wwi_transpose_16x16 sets wwi_t's c0,c1,...cf to the columns of a
     207             :    16x16 int matrix given the rows of the matrix in wwi_t's r0,r1,...rf.
     208             :    In-place operation fine (all rows are copied to temporaries before any output is written). */
     209             : 
     210             : #define wwi_transpose_16x16( r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,ra,rb,rc,rd,re,rf,                      \
     211     1000000 :                              c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf ) do {                \
     212     1000000 :     wwi_t _wwi_transpose_r0 = (r0); wwi_t _wwi_transpose_r1 = (r1);                                \
     213     1000000 :     wwi_t _wwi_transpose_r2 = (r2); wwi_t _wwi_transpose_r3 = (r3);                                \
     214     1000000 :     wwi_t _wwi_transpose_r4 = (r4); wwi_t _wwi_transpose_r5 = (r5);                                \
     215     1000000 :     wwi_t _wwi_transpose_r6 = (r6); wwi_t _wwi_transpose_r7 = (r7);                                \
     216     1000000 :     wwi_t _wwi_transpose_r8 = (r8); wwi_t _wwi_transpose_r9 = (r9);                                \
     217     1000000 :     wwi_t _wwi_transpose_ra = (ra); wwi_t _wwi_transpose_rb = (rb);                                \
     218     1000000 :     wwi_t _wwi_transpose_rc = (rc); wwi_t _wwi_transpose_rd = (rd);                                \
     219     1000000 :     wwi_t _wwi_transpose_re = (re); wwi_t _wwi_transpose_rf = (rf);                                \
     220     1000000 :                                                                                                    \
     221     1000000 :     /* Outer 4x4 transpose of 4x4 blocks */                                                        \
     222     1000000 :     wwi_t _wwi_transpose_t0  = _mm512_shuffle_i32x4( _wwi_transpose_r0, _wwi_transpose_r4, 0x88 ); \
     223     1000000 :     wwi_t _wwi_transpose_t1  = _mm512_shuffle_i32x4( _wwi_transpose_r1, _wwi_transpose_r5, 0x88 ); \
     224     1000000 :     wwi_t _wwi_transpose_t2  = _mm512_shuffle_i32x4( _wwi_transpose_r2, _wwi_transpose_r6, 0x88 ); \
     225     1000000 :     wwi_t _wwi_transpose_t3  = _mm512_shuffle_i32x4( _wwi_transpose_r3, _wwi_transpose_r7, 0x88 ); \
     226     1000000 :     wwi_t _wwi_transpose_t4  = _mm512_shuffle_i32x4( _wwi_transpose_r0, _wwi_transpose_r4, 0xdd ); \
     227     1000000 :     wwi_t _wwi_transpose_t5  = _mm512_shuffle_i32x4( _wwi_transpose_r1, _wwi_transpose_r5, 0xdd ); \
     228     1000000 :     wwi_t _wwi_transpose_t6  = _mm512_shuffle_i32x4( _wwi_transpose_r2, _wwi_transpose_r6, 0xdd ); \
     229     1000000 :     wwi_t _wwi_transpose_t7  = _mm512_shuffle_i32x4( _wwi_transpose_r3, _wwi_transpose_r7, 0xdd ); \
     230     1000000 :     wwi_t _wwi_transpose_t8  = _mm512_shuffle_i32x4( _wwi_transpose_r8, _wwi_transpose_rc, 0x88 ); \
     231     1000000 :     wwi_t _wwi_transpose_t9  = _mm512_shuffle_i32x4( _wwi_transpose_r9, _wwi_transpose_rd, 0x88 ); \
     232     1000000 :     wwi_t _wwi_transpose_ta  = _mm512_shuffle_i32x4( _wwi_transpose_ra, _wwi_transpose_re, 0x88 ); \
     233     1000000 :     wwi_t _wwi_transpose_tb  = _mm512_shuffle_i32x4( _wwi_transpose_rb, _wwi_transpose_rf, 0x88 ); \
     234     1000000 :     wwi_t _wwi_transpose_tc  = _mm512_shuffle_i32x4( _wwi_transpose_r8, _wwi_transpose_rc, 0xdd ); \
     235     1000000 :     wwi_t _wwi_transpose_td  = _mm512_shuffle_i32x4( _wwi_transpose_r9, _wwi_transpose_rd, 0xdd ); \
     236     1000000 :     wwi_t _wwi_transpose_te  = _mm512_shuffle_i32x4( _wwi_transpose_ra, _wwi_transpose_re, 0xdd ); \
     237     1000000 :     wwi_t _wwi_transpose_tf  = _mm512_shuffle_i32x4( _wwi_transpose_rb, _wwi_transpose_rf, 0xdd ); \
     238     1000000 :                                                                                                    \
     239     1000000 :     /**/  _wwi_transpose_r0  = _mm512_shuffle_i32x4( _wwi_transpose_t0, _wwi_transpose_t8, 0x88 ); \
     240     1000000 :     /**/  _wwi_transpose_r1  = _mm512_shuffle_i32x4( _wwi_transpose_t1, _wwi_transpose_t9, 0x88 ); \
     241     1000000 :     /**/  _wwi_transpose_r2  = _mm512_shuffle_i32x4( _wwi_transpose_t2, _wwi_transpose_ta, 0x88 ); \
     242     1000000 :     /**/  _wwi_transpose_r3  = _mm512_shuffle_i32x4( _wwi_transpose_t3, _wwi_transpose_tb, 0x88 ); \
     243     1000000 :     /**/  _wwi_transpose_r4  = _mm512_shuffle_i32x4( _wwi_transpose_t4, _wwi_transpose_tc, 0x88 ); \
     244     1000000 :     /**/  _wwi_transpose_r5  = _mm512_shuffle_i32x4( _wwi_transpose_t5, _wwi_transpose_td, 0x88 ); \
     245     1000000 :     /**/  _wwi_transpose_r6  = _mm512_shuffle_i32x4( _wwi_transpose_t6, _wwi_transpose_te, 0x88 ); \
     246     1000000 :     /**/  _wwi_transpose_r7  = _mm512_shuffle_i32x4( _wwi_transpose_t7, _wwi_transpose_tf, 0x88 ); \
     247     1000000 :     /**/  _wwi_transpose_r8  = _mm512_shuffle_i32x4( _wwi_transpose_t0, _wwi_transpose_t8, 0xdd ); \
     248     1000000 :     /**/  _wwi_transpose_r9  = _mm512_shuffle_i32x4( _wwi_transpose_t1, _wwi_transpose_t9, 0xdd ); \
     249     1000000 :     /**/  _wwi_transpose_ra  = _mm512_shuffle_i32x4( _wwi_transpose_t2, _wwi_transpose_ta, 0xdd ); \
     250     1000000 :     /**/  _wwi_transpose_rb  = _mm512_shuffle_i32x4( _wwi_transpose_t3, _wwi_transpose_tb, 0xdd ); \
     251     1000000 :     /**/  _wwi_transpose_rc  = _mm512_shuffle_i32x4( _wwi_transpose_t4, _wwi_transpose_tc, 0xdd ); \
     252     1000000 :     /**/  _wwi_transpose_rd  = _mm512_shuffle_i32x4( _wwi_transpose_t5, _wwi_transpose_td, 0xdd ); \
     253     1000000 :     /**/  _wwi_transpose_re  = _mm512_shuffle_i32x4( _wwi_transpose_t6, _wwi_transpose_te, 0xdd ); \
     254     1000000 :     /**/  _wwi_transpose_rf  = _mm512_shuffle_i32x4( _wwi_transpose_t7, _wwi_transpose_tf, 0xdd ); \
     255     1000000 :                                                                                                    \
     256     1000000 :     /* Inner 4x4 transpose of 1x1 blocks */                                                        \
     257     1000000 :     /**/  _wwi_transpose_t0  = _mm512_unpacklo_epi32( _wwi_transpose_r0, _wwi_transpose_r2 );      \
     258     1000000 :     /**/  _wwi_transpose_t1  = _mm512_unpacklo_epi32( _wwi_transpose_r1, _wwi_transpose_r3 );      \
     259     1000000 :     /**/  _wwi_transpose_t2  = _mm512_unpackhi_epi32( _wwi_transpose_r0, _wwi_transpose_r2 );      \
     260     1000000 :     /**/  _wwi_transpose_t3  = _mm512_unpackhi_epi32( _wwi_transpose_r1, _wwi_transpose_r3 );      \
     261     1000000 :     /**/  _wwi_transpose_t4  = _mm512_unpacklo_epi32( _wwi_transpose_r4, _wwi_transpose_r6 );      \
     262     1000000 :     /**/  _wwi_transpose_t5  = _mm512_unpacklo_epi32( _wwi_transpose_r5, _wwi_transpose_r7 );      \
     263     1000000 :     /**/  _wwi_transpose_t6  = _mm512_unpackhi_epi32( _wwi_transpose_r4, _wwi_transpose_r6 );      \
     264     1000000 :     /**/  _wwi_transpose_t7  = _mm512_unpackhi_epi32( _wwi_transpose_r5, _wwi_transpose_r7 );      \
     265     1000000 :     /**/  _wwi_transpose_t8  = _mm512_unpacklo_epi32( _wwi_transpose_r8, _wwi_transpose_ra );      \
     266     1000000 :     /**/  _wwi_transpose_t9  = _mm512_unpacklo_epi32( _wwi_transpose_r9, _wwi_transpose_rb );      \
     267     1000000 :     /**/  _wwi_transpose_ta  = _mm512_unpackhi_epi32( _wwi_transpose_r8, _wwi_transpose_ra );      \
     268     1000000 :     /**/  _wwi_transpose_tb  = _mm512_unpackhi_epi32( _wwi_transpose_r9, _wwi_transpose_rb );      \
     269     1000000 :     /**/  _wwi_transpose_tc  = _mm512_unpacklo_epi32( _wwi_transpose_rc, _wwi_transpose_re );      \
     270     1000000 :     /**/  _wwi_transpose_td  = _mm512_unpacklo_epi32( _wwi_transpose_rd, _wwi_transpose_rf );      \
     271     1000000 :     /**/  _wwi_transpose_te  = _mm512_unpackhi_epi32( _wwi_transpose_rc, _wwi_transpose_re );      \
     272     1000000 :     /**/  _wwi_transpose_tf  = _mm512_unpackhi_epi32( _wwi_transpose_rd, _wwi_transpose_rf );      \
     273     1000000 :                                                                                                    \
     274     1000000 :     /**/  (c0)               = _mm512_unpacklo_epi32( _wwi_transpose_t0, _wwi_transpose_t1 );      \
     275     1000000 :     /**/  (c1)               = _mm512_unpackhi_epi32( _wwi_transpose_t0, _wwi_transpose_t1 );      \
     276     1000000 :     /**/  (c2)               = _mm512_unpacklo_epi32( _wwi_transpose_t2, _wwi_transpose_t3 );      \
     277     1000000 :     /**/  (c3)               = _mm512_unpackhi_epi32( _wwi_transpose_t2, _wwi_transpose_t3 );      \
     278     1000000 :     /**/  (c4)               = _mm512_unpacklo_epi32( _wwi_transpose_t4, _wwi_transpose_t5 );      \
     279     1000000 :     /**/  (c5)               = _mm512_unpackhi_epi32( _wwi_transpose_t4, _wwi_transpose_t5 );      \
     280     1000000 :     /**/  (c6)               = _mm512_unpacklo_epi32( _wwi_transpose_t6, _wwi_transpose_t7 );      \
     281     1000000 :     /**/  (c7)               = _mm512_unpackhi_epi32( _wwi_transpose_t6, _wwi_transpose_t7 );      \
     282     1000000 :     /**/  (c8)               = _mm512_unpacklo_epi32( _wwi_transpose_t8, _wwi_transpose_t9 );      \
     283     1000000 :     /**/  (c9)               = _mm512_unpackhi_epi32( _wwi_transpose_t8, _wwi_transpose_t9 );      \
     284     1000000 :     /**/  (ca)               = _mm512_unpacklo_epi32( _wwi_transpose_ta, _wwi_transpose_tb );      \
     285     1000000 :     /**/  (cb)               = _mm512_unpackhi_epi32( _wwi_transpose_ta, _wwi_transpose_tb );      \
     286     1000000 :     /**/  (cc)               = _mm512_unpacklo_epi32( _wwi_transpose_tc, _wwi_transpose_td );      \
     287     1000000 :     /**/  (cd)               = _mm512_unpackhi_epi32( _wwi_transpose_tc, _wwi_transpose_td );      \
     288     1000000 :     /**/  (ce)               = _mm512_unpacklo_epi32( _wwi_transpose_te, _wwi_transpose_tf );      \
     289     1000000 :     /**/  (cf)               = _mm512_unpackhi_epi32( _wwi_transpose_te, _wwi_transpose_tf );      \
     290     1000000 :   } while(0)
     291             : 
     292             : /* wwi_transpose_2x8x8 transposes the 2 8x8 matrices whose rows are
     293             :    held in the lower and upper halves of wwi_t's r0,r1...r7 and
     294             :    stores the result in c0,c1...c7.  In-place operation fine (rows are copied to temporaries first).  Expands to one do/while(0) statement. */
     295             : 
     296             : #define wwi_transpose_2x8x8( r0,r1,r2,r3,r4,r5,r6,r7,                                                \
     297     1000000 :                              c0,c1,c2,c3,c4,c5,c6,c7 ) do {                                          \
     298     1000000 :     wwi_t _wwi_transpose_r0 = (r0); wwi_t _wwi_transpose_r1 = (r1);                                  \
     299     1000000 :     wwi_t _wwi_transpose_r2 = (r2); wwi_t _wwi_transpose_r3 = (r3);                                  \
     300     1000000 :     wwi_t _wwi_transpose_r4 = (r4); wwi_t _wwi_transpose_r5 = (r5);                                  \
     301     1000000 :     wwi_t _wwi_transpose_r6 = (r6); wwi_t _wwi_transpose_r7 = (r7);                                  \
     302     1000000 :                                                                                                      \
     303     1000000 :     /* Outer 2x2 transpose of 4x4 blocks */                                                          \
     304     1000000 :     /* No _mm256_permute2f128_si128 equiv? sigh ... probably a better method possible here */        \
     305     1000000 :     wwi_t _wwi_transpose_p   = wwi( 0, 1, 2, 3,16,17,18,19, 8, 9,10,11,24,25,26,27);                 \
     306     1000000 :     wwi_t _wwi_transpose_q   = wwi( 4, 5, 6, 7,20,21,22,23,12,13,14,15,28,29,30,31);                 \
     307     1000000 :     wwi_t _wwi_transpose_t0  = wwi_select( _wwi_transpose_p, _wwi_transpose_r0, _wwi_transpose_r4 ); \
     308     1000000 :     wwi_t _wwi_transpose_t1  = wwi_select( _wwi_transpose_p, _wwi_transpose_r1, _wwi_transpose_r5 ); \
     309     1000000 :     wwi_t _wwi_transpose_t2  = wwi_select( _wwi_transpose_p, _wwi_transpose_r2, _wwi_transpose_r6 ); \
     310     1000000 :     wwi_t _wwi_transpose_t3  = wwi_select( _wwi_transpose_p, _wwi_transpose_r3, _wwi_transpose_r7 ); \
     311     1000000 :     wwi_t _wwi_transpose_t4  = wwi_select( _wwi_transpose_q, _wwi_transpose_r0, _wwi_transpose_r4 ); \
     312     1000000 :     wwi_t _wwi_transpose_t5  = wwi_select( _wwi_transpose_q, _wwi_transpose_r1, _wwi_transpose_r5 ); \
     313     1000000 :     wwi_t _wwi_transpose_t6  = wwi_select( _wwi_transpose_q, _wwi_transpose_r2, _wwi_transpose_r6 ); \
     314     1000000 :     wwi_t _wwi_transpose_t7  = wwi_select( _wwi_transpose_q, _wwi_transpose_r3, _wwi_transpose_r7 ); \
     315     1000000 :                                                                                                      \
     316     1000000 :     /* Inner 4x4 transpose of 1x1 blocks */                                                          \
     317     1000000 :     /**/  _wwi_transpose_r0  = _mm512_unpacklo_epi32( _wwi_transpose_t0, _wwi_transpose_t2 );        \
     318     1000000 :     /**/  _wwi_transpose_r1  = _mm512_unpacklo_epi32( _wwi_transpose_t1, _wwi_transpose_t3 );        \
     319     1000000 :     /**/  _wwi_transpose_r2  = _mm512_unpackhi_epi32( _wwi_transpose_t0, _wwi_transpose_t2 );        \
     320     1000000 :     /**/  _wwi_transpose_r3  = _mm512_unpackhi_epi32( _wwi_transpose_t1, _wwi_transpose_t3 );        \
     321     1000000 :     /**/  _wwi_transpose_r4  = _mm512_unpacklo_epi32( _wwi_transpose_t4, _wwi_transpose_t6 );        \
     322     1000000 :     /**/  _wwi_transpose_r5  = _mm512_unpacklo_epi32( _wwi_transpose_t5, _wwi_transpose_t7 );        \
     323     1000000 :     /**/  _wwi_transpose_r6  = _mm512_unpackhi_epi32( _wwi_transpose_t4, _wwi_transpose_t6 );        \
     324     1000000 :     /**/  _wwi_transpose_r7  = _mm512_unpackhi_epi32( _wwi_transpose_t5, _wwi_transpose_t7 );        \
     325     1000000 :                                                                                                      \
     326     1000000 :     /**/  (c0)               = _mm512_unpacklo_epi32( _wwi_transpose_r0, _wwi_transpose_r1 );        \
     327     1000000 :     /**/  (c1)               = _mm512_unpackhi_epi32( _wwi_transpose_r0, _wwi_transpose_r1 );        \
     328     1000000 :     /**/  (c2)               = _mm512_unpacklo_epi32( _wwi_transpose_r2, _wwi_transpose_r3 );        \
     329     1000000 :     /**/  (c3)               = _mm512_unpackhi_epi32( _wwi_transpose_r2, _wwi_transpose_r3 );        \
     330     1000000 :     /**/  (c4)               = _mm512_unpacklo_epi32( _wwi_transpose_r4, _wwi_transpose_r5 );        \
     331     1000000 :     /**/  (c5)               = _mm512_unpackhi_epi32( _wwi_transpose_r4, _wwi_transpose_r5 );        \
     332     1000000 :     /**/  (c6)               = _mm512_unpacklo_epi32( _wwi_transpose_r6, _wwi_transpose_r7 );        \
     333     1000000 :     /**/  (c7)               = _mm512_unpackhi_epi32( _wwi_transpose_r6, _wwi_transpose_r7 );        \
     334     1000000 :   } while(0)

Generated by: LCOV version 1.14