LCOV - code coverage report
Current view: top level - ballet/blake3 - fd_blake3_avx512.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 536 538 99.6 %
Date: 2025-10-13 04:42:14 Functions: 3 3 100.0 %

          Line data    Source code
       1             : 
       2             : // Source originally from https://github.com/BLAKE3-team/BLAKE3
       3             : // From commit: c0ea395cf91d242f078c23d5f8d87eb9dd5f7b78
       4             : 
       5             : #include "fd_blake3_private.h"
       6             : #include "../../util/simd/fd_avx512.h"
       7             : #include "../../util/simd/fd_avx.h"
       8             : 
       9             : static inline __attribute__((always_inline)) void  /* One round of the BLAKE3 compression function, applied lane-wise to whole wwu_t vectors */
      10             : round_fn16( wwu_t v[16],  /* v: the 16 state words; updated in place */
      11             :             wwu_t m[16],  /* m: the 16 message words; only read here */
      12   732738433 :             ulong r ) {  /* r: row of FD_BLAKE3_MSG_SCHEDULE to use (the caller in this file passes 0..6) */
      13   732738433 :   v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][0]]);  /* Column step on (0,4,8,c) (1,5,9,d) (2,6,a,e) (3,7,b,f); first message add uses schedule entries 0,2,4,6 */
      14   732738433 :   v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][2]]);
      15   732738433 :   v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][4]]);
      16   732738433 :   v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][6]]);
      17   732738433 :   v[0x0] = wwu_add(v[0x0], v[0x4]);
      18   732738433 :   v[0x1] = wwu_add(v[0x1], v[0x5]);
      19   732738433 :   v[0x2] = wwu_add(v[0x2], v[0x6]);
      20   732738433 :   v[0x3] = wwu_add(v[0x3], v[0x7]);
      21   732738433 :   v[0xc] = wwu_xor(v[0xc], v[0x0]);
      22   732738433 :   v[0xd] = wwu_xor(v[0xd], v[0x1]);
      23   732738433 :   v[0xe] = wwu_xor(v[0xe], v[0x2]);
      24   732738433 :   v[0xf] = wwu_xor(v[0xf], v[0x3]);
      25   732738433 :   v[0xc] = wwu_ror(v[0xc], 16);  /* NOTE(review): wwu_ror is presumably a per-lane 32-bit rotate right -- confirm in fd_avx512.h */
      26   732738433 :   v[0xd] = wwu_ror(v[0xd], 16);
      27   732738433 :   v[0xe] = wwu_ror(v[0xe], 16);
      28   732738433 :   v[0xf] = wwu_ror(v[0xf], 16);
      29   732738433 :   v[0x8] = wwu_add(v[0x8], v[0xc]);
      30   732738433 :   v[0x9] = wwu_add(v[0x9], v[0xd]);
      31   732738433 :   v[0xa] = wwu_add(v[0xa], v[0xe]);
      32   732738433 :   v[0xb] = wwu_add(v[0xb], v[0xf]);
      33   732738433 :   v[0x4] = wwu_xor(v[0x4], v[0x8]);
      34   732738433 :   v[0x5] = wwu_xor(v[0x5], v[0x9]);
      35   732738433 :   v[0x6] = wwu_xor(v[0x6], v[0xa]);
      36   732738433 :   v[0x7] = wwu_xor(v[0x7], v[0xb]);
      37   732738433 :   v[0x4] = wwu_ror(v[0x4], 12);
      38   732738433 :   v[0x5] = wwu_ror(v[0x5], 12);
      39   732738433 :   v[0x6] = wwu_ror(v[0x6], 12);
      40   732738433 :   v[0x7] = wwu_ror(v[0x7], 12);
      41   732738433 :   v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][1]]);  /* Second message add of the column step: schedule entries 1,3,5,7 */
      42   732738433 :   v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][3]]);
      43   732738433 :   v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][5]]);
      44   732738433 :   v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][7]]);
      45   732738433 :   v[0x0] = wwu_add(v[0x0], v[0x4]);
      46   732738433 :   v[0x1] = wwu_add(v[0x1], v[0x5]);
      47   732738433 :   v[0x2] = wwu_add(v[0x2], v[0x6]);
      48   732738433 :   v[0x3] = wwu_add(v[0x3], v[0x7]);
      49   732738433 :   v[0xc] = wwu_xor(v[0xc], v[0x0]);
      50   732738433 :   v[0xd] = wwu_xor(v[0xd], v[0x1]);
      51   732738433 :   v[0xe] = wwu_xor(v[0xe], v[0x2]);
      52   732738433 :   v[0xf] = wwu_xor(v[0xf], v[0x3]);
      53   732738433 :   v[0xc] = wwu_ror(v[0xc], 8);
      54   732738433 :   v[0xd] = wwu_ror(v[0xd], 8);
      55   732738433 :   v[0xe] = wwu_ror(v[0xe], 8);
      56   732738433 :   v[0xf] = wwu_ror(v[0xf], 8);
      57   732738433 :   v[0x8] = wwu_add(v[0x8], v[0xc]);
      58   732738433 :   v[0x9] = wwu_add(v[0x9], v[0xd]);
      59   732738433 :   v[0xa] = wwu_add(v[0xa], v[0xe]);
      60   732738433 :   v[0xb] = wwu_add(v[0xb], v[0xf]);
      61   732738433 :   v[0x4] = wwu_xor(v[0x4], v[0x8]);
      62   732738433 :   v[0x5] = wwu_xor(v[0x5], v[0x9]);
      63   732738433 :   v[0x6] = wwu_xor(v[0x6], v[0xa]);
      64   732738433 :   v[0x7] = wwu_xor(v[0x7], v[0xb]);
      65   732738433 :   v[0x4] = wwu_ror(v[0x4], 7);
      66   732738433 :   v[0x5] = wwu_ror(v[0x5], 7);
      67   732738433 :   v[0x6] = wwu_ror(v[0x6], 7);
      68   732738433 :   v[0x7] = wwu_ror(v[0x7], 7);
      69             :   /* Diagonal step: the same add/xor/rotate sequence (rotations 16, 12, 8, 7) on (0,5,a,f) (1,6,b,c) (2,7,8,d) (3,4,9,e), using schedule entries 8..15 */
      70   732738433 :   v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][8]]);
      71   732738433 :   v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][10]]);
      72   732738433 :   v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][12]]);
      73   732738433 :   v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][14]]);
      74   732738433 :   v[0x0] = wwu_add(v[0x0], v[0x5]);
      75   732738433 :   v[0x1] = wwu_add(v[0x1], v[0x6]);
      76   732738433 :   v[0x2] = wwu_add(v[0x2], v[0x7]);
      77   732738433 :   v[0x3] = wwu_add(v[0x3], v[0x4]);
      78   732738433 :   v[0xf] = wwu_xor(v[0xf], v[0x0]);
      79   732738433 :   v[0xc] = wwu_xor(v[0xc], v[0x1]);
      80   732738433 :   v[0xd] = wwu_xor(v[0xd], v[0x2]);
      81   732738433 :   v[0xe] = wwu_xor(v[0xe], v[0x3]);
      82   732738433 :   v[0xf] = wwu_ror(v[0xf], 16);
      83   732738433 :   v[0xc] = wwu_ror(v[0xc], 16);
      84   732738433 :   v[0xd] = wwu_ror(v[0xd], 16);
      85   732738433 :   v[0xe] = wwu_ror(v[0xe], 16);
      86   732738433 :   v[0xa] = wwu_add(v[0xa], v[0xf]);
      87   732738433 :   v[0xb] = wwu_add(v[0xb], v[0xc]);
      88   732738433 :   v[0x8] = wwu_add(v[0x8], v[0xd]);
      89   732738433 :   v[0x9] = wwu_add(v[0x9], v[0xe]);
      90   732738433 :   v[0x5] = wwu_xor(v[0x5], v[0xa]);
      91   732738433 :   v[0x6] = wwu_xor(v[0x6], v[0xb]);
      92   732738433 :   v[0x7] = wwu_xor(v[0x7], v[0x8]);
      93   732738433 :   v[0x4] = wwu_xor(v[0x4], v[0x9]);
      94   732738433 :   v[0x5] = wwu_ror(v[0x5], 12);
      95   732738433 :   v[0x6] = wwu_ror(v[0x6], 12);
      96   732738433 :   v[0x7] = wwu_ror(v[0x7], 12);
      97   732738433 :   v[0x4] = wwu_ror(v[0x4], 12);
      98   732738433 :   v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][9]]);  /* Second message add of the diagonal step: schedule entries 9,11,13,15 */
      99   732738433 :   v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][11]]);
     100   732738433 :   v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][13]]);
     101   732738433 :   v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][15]]);
     102   732738433 :   v[0x0] = wwu_add(v[0x0], v[0x5]);
     103   732738433 :   v[0x1] = wwu_add(v[0x1], v[0x6]);
     104   732738433 :   v[0x2] = wwu_add(v[0x2], v[0x7]);
     105   732738433 :   v[0x3] = wwu_add(v[0x3], v[0x4]);
     106   732738433 :   v[0xf] = wwu_xor(v[0xf], v[0x0]);
     107   732738433 :   v[0xc] = wwu_xor(v[0xc], v[0x1]);
     108   732738433 :   v[0xd] = wwu_xor(v[0xd], v[0x2]);
     109   732738433 :   v[0xe] = wwu_xor(v[0xe], v[0x3]);
     110   732738433 :   v[0xf] = wwu_ror(v[0xf], 8);
     111   732738433 :   v[0xc] = wwu_ror(v[0xc], 8);
     112   732738433 :   v[0xd] = wwu_ror(v[0xd], 8);
     113   732738433 :   v[0xe] = wwu_ror(v[0xe], 8);
     114   732738433 :   v[0xa] = wwu_add(v[0xa], v[0xf]);
     115   732738433 :   v[0xb] = wwu_add(v[0xb], v[0xc]);
     116   732738433 :   v[0x8] = wwu_add(v[0x8], v[0xd]);
     117   732738433 :   v[0x9] = wwu_add(v[0x9], v[0xe]);
     118   732738433 :   v[0x5] = wwu_xor(v[0x5], v[0xa]);
     119   732738433 :   v[0x6] = wwu_xor(v[0x6], v[0xb]);
     120   732738433 :   v[0x7] = wwu_xor(v[0x7], v[0x8]);
     121   732738433 :   v[0x4] = wwu_xor(v[0x4], v[0x9]);
     122   732738433 :   v[0x5] = wwu_ror(v[0x5], 7);
     123   732738433 :   v[0x6] = wwu_ror(v[0x6], 7);
     124   732738433 :   v[0x7] = wwu_ror(v[0x7], 7);
     125   732738433 :   v[0x4] = wwu_ror(v[0x4], 7);
     126   732738433 : }
     127             : 
     128             : void
     129             : fd_blake3_avx512_compress16( ulong                   batch_cnt,
     130             :                              void const   * restrict _batch_data,
     131             :                              uint const   * restrict batch_sz,
     132             :                              ulong const  * restrict ctr_vec,
     133             :                              uint const   * restrict batch_flags,
     134             :                              void * const * restrict _batch_hash,
     135             :                              ushort *       restrict lthash,
     136             :                              uint                    out_sz,
     137    51377067 :                              void const *   restrict batch_cv ) {
     138    51377067 :   if( FD_UNLIKELY( lthash && batch_cnt!=16 ) ) FD_LOG_ERR(( "Lane masking not supported for fd_blake3_avx512_compress16 in LtHash mode" ));
     139    51377067 :   if( FD_UNLIKELY( batch_cnt==0 || batch_cnt>16 ) ) FD_LOG_ERR(( "Invalid batch_cnt %lu", batch_cnt ));
     140             : 
     141             :   /* We can only process input blocks of 64 bytes, but message data size
     142             :      is not necessarily a multiple of 64.  We compute the tail block of
     143             :      each message here.  We then process complete blocks of the original
     144             :      message in place, switching to processing these tail blocks in
     145             :      the same pass toward the end. */
     146             : 
     147    51377067 :   ulong const * batch_data = (ulong const *)_batch_data;
     148             : 
     149    51377067 :   if( FD_UNLIKELY( batch_cnt==1 ) ) {
     150    20410160 :     fd_blake3_sse_compress1( (uchar *)(_batch_hash[0]),
     151    20410160 :                              (uchar const *)(batch_data[0]),
     152    20410160 :                              batch_sz[0],
     153    20410160 :                              ctr_vec[0],
     154    20410160 :                              batch_flags[0],
     155    20410160 :                              NULL,
     156    20410160 :                              NULL );
     157    20410160 :     return;
     158    20410160 :   }
     159             : 
     160             : #if FD_BLAKE3_TRACING
     161             :   /* This log_line buffer is oversized by a fair bit (due to all the
     162             :      NUL terminators) but that's fine */
     163             :   char log_line[
     164             :       sizeof( "fd_blake3_avx512_compress16" )+
     165             :       sizeof( "(batch_cnt=" )+21+
     166             :       sizeof( ",sz=["       )+(16*11)+sizeof( "]" )+
     167             :       sizeof( ",counter=["  )+(16*21)+sizeof( "]" )+
     168             :       sizeof( ",flags=["    )+(16* 2)+sizeof( "]" )+
     169             :       sizeof( ",custom_cv"  )+
     170             :       sizeof( ",lthash" )+
     171             :       sizeof( ")" ) ];
     172             : 
     173             :   char * p = fd_cstr_init( log_line );
     174             :   p = fd_cstr_append_text( p, "fd_blake3_avx512_compress16(batch_cnt=", 38UL );
     175             :   p = fd_cstr_append_ulong_as_text( p, 0, 0, batch_cnt, fd_uchar_base10_dig_cnt( (uchar)batch_cnt ) );
     176             :   p = fd_cstr_append_text( p, ",sz=[", 5UL );
     177             :   for( ulong i=0UL; i<batch_cnt; i++ ) {
     178             :     p = fd_cstr_append_uint_as_text( p, ' ', 0, batch_sz[ i ], fd_uint_base10_dig_cnt( batch_sz[ i ] ) );
     179             :     if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
     180             :   }
     181             :   p = fd_cstr_append_text( p, "],counter=[", 11UL );
     182             :   for( ulong i=0UL; i<batch_cnt; i++ ) {
     183             :     p = fd_cstr_append_ulong_as_text( p, ' ', 0, ctr_vec[ i ], fd_ulong_base10_dig_cnt( ctr_vec[ i ] ) );
     184             :     if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
     185             :   }
     186             :   p = fd_cstr_append_text( p, "],flags=[", 9UL );
     187             :   for( ulong i=0UL; i<batch_cnt; i++ ) {
     188             :     static char const hex_lut[ 16 ] = {
     189             :       '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'
     190             :     };
     191             :     p = fd_cstr_append_char( p, hex_lut[ batch_flags[ i ]&0xf ] );
     192             :     if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
     193             :   }
     194             :   p = fd_cstr_append_char( p, ']' );
     195             :   if( batch_cv ) p = fd_cstr_append_text( p, ",custom_cv", 10UL );
     196             :   if( lthash   ) p = fd_cstr_append_text( p, ",lthash", 7UL );
     197             :   p = fd_cstr_append_char( p, ')' );
     198             :   ulong line_len = (ulong)( p-log_line );
     199             :   fd_cstr_fini( p );
     200             : 
     201             :   FD_BLAKE3_TRACE(( "%.*s", (int)line_len, log_line ));
     202             : #endif
     203             : 
     204    30966907 :   ulong batch_tail_data[ 16 ] __attribute__((aligned(64)));
     205    30966907 :   ulong batch_tail_rem [ 16 ] __attribute__((aligned(64)));
     206             : 
     207    30966907 :   uchar scratch[ 16*FD_BLAKE3_BLOCK_SZ ] __attribute__((aligned(128)));
     208    30966907 :   do {
     209    30966907 :     ulong scratch_free = (ulong)scratch;
     210             : 
     211    30966907 :     wwv_t zero = wwv_zero();
     212             : 
     213   446224478 :     for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {
     214             : 
     215             :       /* Allocate the tail blocks for this message */
     216             : 
     217   415257571 :       ulong data = batch_data[ batch_idx ];
     218   415257571 :       ulong sz   = batch_sz  [ batch_idx ];
     219             : 
     220   415257571 :       ulong tail_data     = scratch_free;
     221   415257571 :       ulong tail_data_sz  = sz & (FD_BLAKE3_BLOCK_SZ-1UL);
     222   415257571 :       ulong tail_data_off = fd_ulong_align_dn( sz, FD_BLAKE3_BLOCK_SZ );
     223             : 
     224   415257571 :       batch_tail_data[ batch_idx ] = tail_data;
     225   415257571 :       batch_tail_rem [ batch_idx ] = (!!tail_data_sz) ^ (!sz);  /* (hash 1 tail block if 0 sz) */
     226             : 
     227   415257571 :       scratch_free += FD_BLAKE3_BLOCK_SZ;
     228             : 
     229             :       /* Populate the tail blocks.  We first clear the blocks.  Then we
     230             :          copy any straggler data bytes into the tail. */
     231             : 
     232   415257571 :       wwv_st( (ulong *) tail_data, zero );
     233             : 
     234   415257571 : #     if 1
     235             :       /* See fd_sha256_private_batch_avx */
     236   415257571 :       ulong src = (ulong)data + tail_data_off;
     237   415257571 :       ulong dst = tail_data;
     238   415257571 :       ulong rem = tail_data_sz;
     239   454450650 :       while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
     240   532947736 :       while( rem>= 8UL ) { *(ulong  *)dst = FD_LOAD( ulong,  src );             dst +=  8UL; src +=  8UL; rem -=  8UL; }
     241   415257571 :       if   ( rem>= 4UL ) { *(uint   *)dst = FD_LOAD( uint,   src );             dst +=  4UL; src +=  4UL; rem -=  4UL; }
     242   415257571 :       if   ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src );             dst +=  2UL; src +=  2UL; rem -=  2UL; }
     243   415257571 :       if   ( rem       ) { *(uchar  *)dst = FD_LOAD( uchar,  src );             dst++;                                 }
     244             : #     else
     245             :       fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
     246             : #     endif
     247   415257571 :     }
     248    30966907 :   } while(0);
     249             : 
     250             : 
     251    30966907 :   wwu_t const iv0 = wwu_bcast( FD_BLAKE3_IV[0] );
     252    30966907 :   wwu_t const iv1 = wwu_bcast( FD_BLAKE3_IV[1] );
     253    30966907 :   wwu_t const iv2 = wwu_bcast( FD_BLAKE3_IV[2] );
     254    30966907 :   wwu_t const iv3 = wwu_bcast( FD_BLAKE3_IV[3] );
     255    30966907 :   wwu_t const iv4 = wwu_bcast( FD_BLAKE3_IV[4] );
     256    30966907 :   wwu_t const iv5 = wwu_bcast( FD_BLAKE3_IV[5] );
     257    30966907 :   wwu_t const iv6 = wwu_bcast( FD_BLAKE3_IV[6] );
     258    30966907 :   wwu_t const iv7 = wwu_bcast( FD_BLAKE3_IV[7] );
     259             : 
     260    30966907 :   wwu_t h0=iv0; wwu_t h1=iv1; wwu_t h2=iv2; wwu_t h3=iv3;
     261    30966907 :   wwu_t h4=iv4; wwu_t h5=iv5; wwu_t h6=iv6; wwu_t h7=iv7;
     262    30966907 :   if( FD_UNLIKELY( batch_cv ) ) {
     263             :     /* If the input chaining value is overridden, transpose the input
     264             :        to AVX512 representation.  (wwu 16x8 transpose)  FIXME There's
     265             :        probably a way to do this using AVX512 instead of AVX. */
     266    23563226 :     __m256i const ** cv_vec = (__m256i const **)batch_cv;
     267    23563226 :     wu_t cv_lo[8]; wu_t cv_hi[8];
     268    23563226 :     cv_lo[ 0 ] = _mm256_loadu_si256( cv_vec[  0 ] );
     269    23563226 :     cv_lo[ 1 ] = _mm256_loadu_si256( cv_vec[  1 ] );
     270    23563226 :     cv_lo[ 2 ] = _mm256_loadu_si256( cv_vec[  2 ] );
     271    23563226 :     cv_lo[ 3 ] = _mm256_loadu_si256( cv_vec[  3 ] );
     272    23563226 :     cv_lo[ 4 ] = _mm256_loadu_si256( cv_vec[  4 ] );
     273    23563226 :     cv_lo[ 5 ] = _mm256_loadu_si256( cv_vec[  5 ] );
     274    23563226 :     cv_lo[ 6 ] = _mm256_loadu_si256( cv_vec[  6 ] );
     275    23563226 :     cv_lo[ 7 ] = _mm256_loadu_si256( cv_vec[  7 ] );
     276    23563226 :     cv_hi[ 0 ] = _mm256_loadu_si256( cv_vec[  8 ] );
     277    23563226 :     cv_hi[ 1 ] = _mm256_loadu_si256( cv_vec[  9 ] );
     278    23563226 :     cv_hi[ 2 ] = _mm256_loadu_si256( cv_vec[ 10 ] );
     279    23563226 :     cv_hi[ 3 ] = _mm256_loadu_si256( cv_vec[ 11 ] );
     280    23563226 :     cv_hi[ 4 ] = _mm256_loadu_si256( cv_vec[ 12 ] );
     281    23563226 :     cv_hi[ 5 ] = _mm256_loadu_si256( cv_vec[ 13 ] );
     282    23563226 :     cv_hi[ 6 ] = _mm256_loadu_si256( cv_vec[ 14 ] );
     283    23563226 :     cv_hi[ 7 ] = _mm256_loadu_si256( cv_vec[ 15 ] );
     284    23563226 :     wu_transpose_8x8( cv_lo[0], cv_lo[1], cv_lo[2], cv_lo[3], cv_lo[4], cv_lo[5], cv_lo[6], cv_lo[7],
     285    23563226 :                       cv_lo[0], cv_lo[1], cv_lo[2], cv_lo[3], cv_lo[4], cv_lo[5], cv_lo[6], cv_lo[7] );
     286    23563226 :     wu_transpose_8x8( cv_hi[0], cv_hi[1], cv_hi[2], cv_hi[3], cv_hi[4], cv_hi[5], cv_hi[6], cv_hi[7],
     287    23563226 :                       cv_hi[0], cv_hi[1], cv_hi[2], cv_hi[3], cv_hi[4], cv_hi[5], cv_hi[6], cv_hi[7] );
     288    23563226 :     h0 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 0 ] ), cv_hi[ 0 ], 1 );
     289    23563226 :     h1 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 1 ] ), cv_hi[ 1 ], 1 );
     290    23563226 :     h2 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 2 ] ), cv_hi[ 2 ], 1 );
     291    23563226 :     h3 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 3 ] ), cv_hi[ 3 ], 1 );
     292    23563226 :     h4 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 4 ] ), cv_hi[ 4 ], 1 );
     293    23563226 :     h5 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 5 ] ), cv_hi[ 5 ], 1 );
     294    23563226 :     h6 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 6 ] ), cv_hi[ 6 ], 1 );
     295    23563226 :     h7 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 7 ] ), cv_hi[ 7 ], 1 );
     296    23563226 :   }
     297             : 
     298    30966907 :   wwu_t ctr_lo = wwu( ctr_vec[ 0],     ctr_vec[ 1],     ctr_vec[ 2],     ctr_vec[ 3],
     299    30966907 :                       ctr_vec[ 4],     ctr_vec[ 5],     ctr_vec[ 6],     ctr_vec[ 7],
     300    30966907 :                       ctr_vec[ 8],     ctr_vec[ 9],     ctr_vec[10],     ctr_vec[11],
     301    30966907 :                       ctr_vec[12],     ctr_vec[13],     ctr_vec[14],     ctr_vec[15] );
     302    30966907 :   wwu_t ctr_hi = wwu( ctr_vec[ 0]>>32, ctr_vec[ 1]>>32, ctr_vec[ 2]>>32, ctr_vec[ 3]>>32,
     303    30966907 :                       ctr_vec[ 4]>>32, ctr_vec[ 5]>>32, ctr_vec[ 6]>>32, ctr_vec[ 7]>>32,
     304    30966907 :                       ctr_vec[ 8]>>32, ctr_vec[ 9]>>32, ctr_vec[10]>>32, ctr_vec[11]>>32,
     305    30966907 :                       ctr_vec[12]>>32, ctr_vec[13]>>32, ctr_vec[14]>>32, ctr_vec[15]>>32 );
     306    30966907 :   wwu_t flags = wwu_ldu( batch_flags );
     307    30966907 :   wwu_t off   = wwu_zero();
     308    30966907 :   wwu_t sz    = wwu_ldu( batch_sz    );
     309             : 
     310    30966907 :   wwv_t zero         = wwv_zero();
     311    30966907 :   wwv_t one          = wwv_one();
     312    30966907 :   wwu_t wwu_64       = wwu_bcast( FD_BLAKE3_BLOCK_SZ );
     313    30966907 :   wwv_t wwv_64       = wwv_bcast( FD_BLAKE3_BLOCK_SZ );
     314    30966907 :   wwv_t W_sentinel   = wwv_bcast( (ulong)scratch );
     315             :   //wwc_t batch_lane   = wc_unpack( (1<<batch_cnt)-1 );
     316             : 
     317    30966907 :   wwv_t tail_lo      = wwv_ld( batch_tail_data   );
     318    30966907 :   wwv_t tail_hi      = wwv_ld( batch_tail_data+8 );
     319             : 
     320    30966907 :   wwv_t tail_rem_lo  = wwv_ld( batch_tail_rem    );
     321    30966907 :   wwv_t tail_rem_hi  = wwv_ld( batch_tail_rem+8  );
     322             : 
     323    30966907 :   wwv_t W_lo         = wwv_ld( batch_data        );
     324    30966907 :   wwv_t W_hi         = wwv_ld( batch_data+8      );
     325             : 
     326    30966907 :   wwv_t batch_sz_lo  = _mm512_cvtepi32_epi64( _mm512_extracti32x8_epi32( sz, 0 ) );
     327    30966907 :   wwv_t batch_sz_hi  = _mm512_cvtepi32_epi64( _mm512_extracti32x8_epi32( sz, 1 ) );
     328             : 
     329    30966907 :   wwv_t block_rem_lo = wwv_if( ((1<<batch_cnt)-1) & 0xff,
     330    30966907 :                                wwv_add( wwv_shr( batch_sz_lo, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_lo ), zero );
     331    30966907 :   wwv_t block_rem_hi = wwv_if( ((1<<batch_cnt)-1) >> 8,
     332    30966907 :                                wwv_add( wwv_shr( batch_sz_hi, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_hi ), zero );
     333             : 
     334             :   /* Upper half of the compression function output.
     335             :      Usually thrown away, but kept in the final compression round if
     336             :      out_sz==64. */
     337    30966907 :   wwu_t hu[8] = {0};
     338             : 
     339    30966907 :   ulong lthash_rem    = lthash ? 32 : 0; /* Number of LtHash (XOF) blocks remaining */
     340    30966907 :   int   compress_done = 0;
     341   105809102 :   for(;;) {
     342             :     /* Switch lanes that have hit the end of their in-place bulk
     343             :        processing to their out-of-place scratch tail regions as
     344             :        necessary. */
     345             : 
     346   105809102 :     W_lo = wwv_if( wwv_eq( block_rem_lo, tail_rem_lo ), tail_lo, W_lo );
     347   105809102 :     W_hi = wwv_if( wwv_eq( block_rem_hi, tail_rem_hi ), tail_hi, W_hi );
     348             : 
     349             :     /* Derive per-block flags and block sizes */
     350             : 
     351   105809102 :     int block_first = wwu_eq( off, wwu_zero() );
     352   105809102 :     int block_last  = wwi_le( sz, wwu_add( off, wwu_bcast( FD_BLAKE3_BLOCK_SZ ) ) );
     353             : 
     354             :     /* Suppress root flag unless last block */
     355             : 
     356   105809102 :     wwu_t root_mask   = wwu_if( block_last, wwu_bcast( UINT_MAX ), wwu_bcast( ~FD_BLAKE3_FLAG_ROOT ) );
     357   105809102 :     wwu_t block_flags = wwu_and( flags, root_mask );
     358             : 
     359             :     /* Mask lanes that completed */
     360             : 
     361   105809102 :     int active_lane_lo;
     362   105809102 :     int active_lane_hi;
     363   105809102 :     if( FD_UNLIKELY( lthash ) ) {
     364             :       /* Compress until root block */
     365     1648055 :       wwu_t all_root = wwu_bcast( FD_BLAKE3_FLAG_ROOT );
     366     1648055 :       int   not_root = wwu_ne( wwu_and( block_flags, all_root ), all_root );
     367     1648055 :       active_lane_lo = (int)(__mmask8)not_root;
     368     1648055 :       active_lane_hi = (int)(__mmask8)(not_root>>8);
     369   104161047 :     } else {
     370             :       /* Complete when there is no more input data */
     371   104161047 :       active_lane_lo = wwv_ne( block_rem_lo, zero );
     372   104161047 :       active_lane_hi = wwv_ne( block_rem_hi, zero );
     373   104161047 :     }
     374             : 
     375             :     /* Suppress CHUNK_{START,END} flags unless leaf node */
     376             : 
     377   105809102 :     int is_parent = wwu_ne( wwu_and( flags, wwu_bcast( FD_BLAKE3_FLAG_PARENT ) ), wwu_zero() );
     378   105809102 :     wwu_t chunk_flags = wwu_if( block_last, wwu_bcast( FD_BLAKE3_FLAG_CHUNK_END ), wwu_zero() );
     379   105809102 :     if( out_sz==32 ) {
     380             :       /* Hacky: out_sz==64 is only used for post-compress XOF hashing,
     381             :          so use that as a hint when to suppress the 'CHUNK_START' flag. */
     382    58682650 :       chunk_flags = wwu_or( chunk_flags, wwu_if( block_first, wwu_bcast( FD_BLAKE3_FLAG_CHUNK_START ), wwu_zero() ) );
     383    58682650 :     }
     384   105809102 :     wwu_t block_sz = wwu_min( wwu_sub( sz, off ), wwu_64 );
     385   105809102 :     block_flags = wwu_or( block_flags, wwu_if( is_parent, wwu_zero(), chunk_flags ) );
     386             : 
     387             :     /* Check if we are done compressing */
     388             : 
     389   105809102 :     compress_done |= !(active_lane_lo | active_lane_hi);
     390   105809102 :     if( FD_UNLIKELY( compress_done ) ) {
     391    30966907 :       if( FD_UNLIKELY( !lthash_rem ) ) break;
     392      200101 :       active_lane_lo = 0xff;
     393      200101 :       active_lane_hi = 0xff;
     394             :       /* Load the next message block and fall through to XOF expansion */
     395      200101 :     }
     396             : 
     397             :     /* At this point, we have at least 1 block in this message segment
     398             :        pass that has not been processed.  Load the next 64 bytes of
     399             :        each unprocessed block.  Inactive lanes (e.g. message segments
     400             :        in this pass for which we've already processed all the blocks)
     401             :        will load garbage from a sentinel location (and the result of
     402             :        the state computations for the inactive lane will be ignored). */
     403             : 
     404    75042296 :     ulong _W0; ulong _W1; ulong _W2; ulong _W3; ulong _W4; ulong _W5; ulong _W6; ulong _W7;
     405    75042296 :     ulong _W8; ulong _W9; ulong _Wa; ulong _Wb; ulong _Wc; ulong _Wd; ulong _We; ulong _Wf;
     406    75042296 :     wwv_unpack( wwv_if( active_lane_lo, W_lo, W_sentinel ), _W0, _W1, _W2, _W3, _W4, _W5, _W6, _W7 );
     407    75042296 :     wwv_unpack( wwv_if( active_lane_hi, W_hi, W_sentinel ), _W8, _W9, _Wa, _Wb, _Wc, _Wd, _We, _Wf );
     408    75042296 :     uchar const * W0 = (uchar const *)_W0; uchar const * W1 = (uchar const *)_W1;
     409    75042296 :     uchar const * W2 = (uchar const *)_W2; uchar const * W3 = (uchar const *)_W3;
     410    75042296 :     uchar const * W4 = (uchar const *)_W4; uchar const * W5 = (uchar const *)_W5;
     411    75042296 :     uchar const * W6 = (uchar const *)_W6; uchar const * W7 = (uchar const *)_W7;
     412    75042296 :     uchar const * W8 = (uchar const *)_W8; uchar const * W9 = (uchar const *)_W9;
     413    75042296 :     uchar const * Wa = (uchar const *)_Wa; uchar const * Wb = (uchar const *)_Wb;
     414    75042296 :     uchar const * Wc = (uchar const *)_Wc; uchar const * Wd = (uchar const *)_Wd;
     415    75042296 :     uchar const * We = (uchar const *)_We; uchar const * Wf = (uchar const *)_Wf;
     416             : 
     417    75042296 :     wwu_t m[16];
     418    75042296 :     m[0x0] = wwu_ldu( W0 );  m[0x1] = wwu_ldu( W1 );
     419    75042296 :     m[0x2] = wwu_ldu( W2 );  m[0x3] = wwu_ldu( W3 );
     420    75042296 :     m[0x4] = wwu_ldu( W4 );  m[0x5] = wwu_ldu( W5 );
     421    75042296 :     m[0x6] = wwu_ldu( W6 );  m[0x7] = wwu_ldu( W7 );
     422    75042296 :     m[0x8] = wwu_ldu( W8 );  m[0x9] = wwu_ldu( W9 );
     423    75042296 :     m[0xa] = wwu_ldu( Wa );  m[0xb] = wwu_ldu( Wb );
     424    75042296 :     m[0xc] = wwu_ldu( Wc );  m[0xd] = wwu_ldu( Wd );
     425    75042296 :     m[0xe] = wwu_ldu( We );  m[0xf] = wwu_ldu( Wf );
     426             : 
     427    75042296 :     wwu_transpose_16x16( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
     428    75042296 :                          m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
     429    75042296 :                          m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
     430    75042296 :                          m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );
     431             : 
     432             :     /* Compute the BLAKE3 compression function updates */
     433             : 
     434    81245427 : compress: (void)0;
     435    81245427 :     wwu_t v[16] = {
     436    81245427 :         h0,     h1,     h2,       h3,
     437    81245427 :         h4,     h5,     h6,       h7,
     438    81245427 :         iv0,    iv1,    iv2,      iv3,
     439    81245427 :         ctr_lo, ctr_hi, block_sz, block_flags,
     440    81245427 :     };
     441             : 
     442             :     /* Debug utility */
     443    81245427 : #define STATE_FMT         "[%u] =\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x"
     444    81245427 : #define STATE_FMT_ARGS(v,i) (uint)i,\
     445    81245427 :         fd_uint_bswap(wwu_extract(v[0x0],i)),fd_uint_bswap(wwu_extract(v[0x1],i)),fd_uint_bswap(wwu_extract(v[0x2],i)),fd_uint_bswap(wwu_extract(v[0x3],i)),\
     446    81245427 :         fd_uint_bswap(wwu_extract(v[0x4],i)),fd_uint_bswap(wwu_extract(v[0x5],i)),fd_uint_bswap(wwu_extract(v[0x6],i)),fd_uint_bswap(wwu_extract(v[0x7],i)),\
     447    81245427 :         fd_uint_bswap(wwu_extract(v[0x8],i)),fd_uint_bswap(wwu_extract(v[0x9],i)),fd_uint_bswap(wwu_extract(v[0xa],i)),fd_uint_bswap(wwu_extract(v[0xb],i)),\
     448    81245427 :         fd_uint_bswap(wwu_extract(v[0xc],i)),fd_uint_bswap(wwu_extract(v[0xd],i)),fd_uint_bswap(wwu_extract(v[0xe],i)),fd_uint_bswap(wwu_extract(v[0xf],i))
     449             : 
     450             :     // FD_LOG_NOTICE(( "pre " STATE_FMT, STATE_FMT_ARGS(v,0) ));
     451    81245427 :     round_fn16( v, m, 0 );
     452    81245427 :     round_fn16( v, m, 1 );
     453    81245427 :     round_fn16( v, m, 2 );
     454    81245427 :     round_fn16( v, m, 3 );
     455    81245427 :     round_fn16( v, m, 4 );
     456    81245427 :     round_fn16( v, m, 5 );
     457    81245427 :     round_fn16( v, m, 6 );
     458             :     // FD_LOG_NOTICE(( "post" STATE_FMT, STATE_FMT_ARGS(v,0) ));
     459             : 
     460    81245427 :     if( FD_LIKELY( !compress_done ) ) {
     461             : 
     462             :       /* Apply the state updates to the active lanes */
     463             : 
     464    74842195 :       int active_lane = active_lane_lo | (active_lane_hi<<8);
     465    74842195 :       FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: compress lanes %02x%02x", active_lane_hi, active_lane_lo ));
     466             : 
     467    74842195 :       if( FD_UNLIKELY( out_sz==64 ) ) {
     468             :         /* FIXME only export in the last iteration */
     469    23563226 :         hu[0] = wwu_xor_if( active_lane, h0, v[ 8], hu[0] );
     470    23563226 :         hu[1] = wwu_xor_if( active_lane, h1, v[ 9], hu[1] );
     471    23563226 :         hu[2] = wwu_xor_if( active_lane, h2, v[10], hu[2] );
     472    23563226 :         hu[3] = wwu_xor_if( active_lane, h3, v[11], hu[3] );
     473    23563226 :         hu[4] = wwu_xor_if( active_lane, h4, v[12], hu[4] );
     474    23563226 :         hu[5] = wwu_xor_if( active_lane, h5, v[13], hu[5] );
     475    23563226 :         hu[6] = wwu_xor_if( active_lane, h6, v[14], hu[6] );
     476    23563226 :         hu[7] = wwu_xor_if( active_lane, h7, v[15], hu[7] );
     477    23563226 :       }
     478    74842195 :       h0 = wwu_xor_if( active_lane, v[ 0], v[ 8], h0 );
     479    74842195 :       h1 = wwu_xor_if( active_lane, v[ 1], v[ 9], h1 );
     480    74842195 :       h2 = wwu_xor_if( active_lane, v[ 2], v[10], h2 );
     481    74842195 :       h3 = wwu_xor_if( active_lane, v[ 3], v[11], h3 );
     482    74842195 :       h4 = wwu_xor_if( active_lane, v[ 4], v[12], h4 );
     483    74842195 :       h5 = wwu_xor_if( active_lane, v[ 5], v[13], h5 );
     484    74842195 :       h6 = wwu_xor_if( active_lane, v[ 6], v[14], h6 );
     485    74842195 :       h7 = wwu_xor_if( active_lane, v[ 7], v[15], h7 );
     486             : 
     487             :       /* Advance to the next message segment blocks.  In pseudo code,
     488             :          the below is:
     489             : 
     490             :            W += 64; if( block_rem ) block_rem--;
     491             : 
     492             :          Since we do not load anything at W(lane) above unless
     493             :          block_rem(lane) is non-zero, we can omit vector conditional
     494             :          operations for W(lane) below. */
     495             : 
     496    74842195 :       W_lo = wwv_add_if( active_lane_lo, W_lo, wwv_64, W_lo );
     497    74842195 :       W_hi = wwv_add_if( active_lane_hi, W_hi, wwv_64, W_hi );
     498    74842195 :       off  = wwu_add_if( active_lane,    off,  wwu_64, off  );
     499             : 
     500    74842195 :       block_rem_lo = wwv_sub_if( active_lane_lo, block_rem_lo, one, block_rem_lo );
     501    74842195 :       block_rem_hi = wwv_sub_if( active_lane_hi, block_rem_hi, one, block_rem_hi );
     502             : 
     503    74842195 :     } else { /* LtHash mode */
     504             : 
     505             :       /* d[i] contains output_off+(i*4) 32-bit words across output[0..8] */
     506     6403232 :       FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: expand lanes" ));
     507     6403232 :       wwu_t d[ 16 ] = {
     508     6403232 :         wwu_xor( v[0x0], v[0x8] ),
     509     6403232 :         wwu_xor( v[0x1], v[0x9] ),
     510     6403232 :         wwu_xor( v[0x2], v[0xa] ),
     511     6403232 :         wwu_xor( v[0x3], v[0xb] ),
     512     6403232 :         wwu_xor( v[0x4], v[0xc] ),
     513     6403232 :         wwu_xor( v[0x5], v[0xd] ),
     514     6403232 :         wwu_xor( v[0x6], v[0xe] ),
     515     6403232 :         wwu_xor( v[0x7], v[0xf] ),
     516     6403232 :         wwu_xor( h0,     v[0x8] ),
     517     6403232 :         wwu_xor( h1,     v[0x9] ),
     518     6403232 :         wwu_xor( h2,     v[0xa] ),
     519     6403232 :         wwu_xor( h3,     v[0xb] ),
     520     6403232 :         wwu_xor( h4,     v[0xc] ),
     521     6403232 :         wwu_xor( h5,     v[0xd] ),
     522     6403232 :         wwu_xor( h6,     v[0xe] ),
     523     6403232 :         wwu_xor( h7,     v[0xf] )
     524     6403232 :       };
     525             : 
     526             :       /* Transpose the 16x16 matrix of 32-bit words held in d (in place) */
     527     6403232 :       wwu_transpose_16x16( d[0x0], d[0x1], d[0x2], d[0x3], d[0x4], d[0x5], d[0x6], d[0x7],
     528     6403232 :                            d[0x8], d[0x9], d[0xa], d[0xb], d[0xc], d[0xd], d[0xe], d[0xf],
     529     6403232 :                            d[0x0], d[0x1], d[0x2], d[0x3], d[0x4], d[0x5], d[0x6], d[0x7],
     530     6403232 :                            d[0x8], d[0x9], d[0xa], d[0xb], d[0xc], d[0xd], d[0xe], d[0xf] );
     531             : 
     532             :       /* Reduce-add into d[0] */
     533     6403232 :       d[0x0] = wwh_add( d[0x0], d[0x1] ); /* sum(l[0 1]) */
     534     6403232 :       d[0x2] = wwh_add( d[0x2], d[0x3] ); /* sum(l[2 3]) */
     535     6403232 :       d[0x4] = wwh_add( d[0x4], d[0x5] ); /* sum(l[4 5]) */
     536     6403232 :       d[0x6] = wwh_add( d[0x6], d[0x7] ); /* sum(l[6 7]) */
     537     6403232 :       d[0x8] = wwh_add( d[0x8], d[0x9] ); /* sum(l[8 9]) */
     538     6403232 :       d[0xa] = wwh_add( d[0xa], d[0xb] ); /* sum(l[a b]) */
     539     6403232 :       d[0xc] = wwh_add( d[0xc], d[0xd] ); /* sum(l[c d]) */
     540     6403232 :       d[0xe] = wwh_add( d[0xe], d[0xf] ); /* sum(l[e f]) */
     541     6403232 :       d[0x0] = wwh_add( d[0x0], d[0x2] ); /* sum(l[0 1 2 3]) */
     542     6403232 :       d[0x4] = wwh_add( d[0x4], d[0x6] ); /* sum(l[4 5 6 7]) */
     543     6403232 :       d[0x8] = wwh_add( d[0x8], d[0xa] ); /* sum(l[8 9 a b]) */
     544     6403232 :       d[0xc] = wwh_add( d[0xc], d[0xe] ); /* sum(l[c d e f]) */
     545     6403232 :       d[0x0] = wwh_add( d[0x0], d[0x4] ); /* sum(l[0 1 2 3 4 5 6 7]) */
     546     6403232 :       d[0x8] = wwh_add( d[0x8], d[0xc] ); /* sum(l[8 9 a b c d e f]) */
     547     6403232 :       d[0x0] = wwh_add( d[0x0], d[0x8] ); /* sum(l[0 1 2 3 4 5 6 7 8 9 a b c d e f]) */
     548     6403232 :       wwh_st( lthash, d[0x0] );
     549             : 
     550             :       /* Wind up for next iteration */
     551     6403232 :       lthash += 32; /* 64 byte stride */
     552     6403232 :       lthash_rem--;
     553     6403232 :       wwu_t ctr_add   = wwu_bcast( 1 );
     554     6403232 :       /**/  ctr_lo    = wwu_add( ctr_lo, ctr_add );
     555     6403232 :       int   ctr_carry = wwi_gt ( wwu_xor( ctr_add, wwu_bcast( 0x80000000 ) ),
     556     6403232 :                                  wwu_xor( ctr_lo,  wwu_bcast( 0x80000000 ) ) );
     557     6403232 :       /**/  ctr_hi    = wwu_add_if( ctr_carry, ctr_hi, wwu_one(), ctr_hi );
     558     6403232 :       if( FD_UNLIKELY( !lthash_rem ) ) {
     559      200101 :         FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: done (lthash para)" ));
     560      200101 :         return;
     561      200101 :       }
     562     6203131 :       goto compress;
     563             : 
     564     6403232 : #   undef STATE_FMT
     565     6403232 : #   undef STATE_FMT_ARGS
     566     6403232 :     }
     567    81245427 :   }
     568             : 
     569             :   /* Store the results */
     570             : 
     571    30766806 :   wwu_t o0; wwu_t o1; wwu_t o2; wwu_t o3; wwu_t o4; wwu_t o5; wwu_t o6; wwu_t o7;
     572    30766806 :   wwu_t o8; wwu_t o9; wwu_t oA; wwu_t oB; wwu_t oC; wwu_t oD; wwu_t oE; wwu_t oF;
     573             : 
     574    30766806 :   wwu_transpose_16x16( h0,   h1,   h2,   h3,   h4,   h5,   h6,   h7,
     575    30766806 :                        hu[0],hu[1],hu[2],hu[3],hu[4],hu[5],hu[6],hu[7],
     576    30766806 :                        o0,   o1,   o2,   o3,   o4,   o5,   o6,   o7,
     577    30766806 :                        o8,   o9,   oA,   oB,   oC,   oD,   oE,   oF );
     578             : 
     579    30766806 :   uint * const * batch_hash = (uint * const *)_batch_hash;
     580    30766806 :   if( FD_LIKELY( out_sz==32 ) ) {
     581     7203580 :     switch( batch_cnt ) { /* application dependent prob */
     582      316956 :     case 16UL: wu_stu( batch_hash[15], _mm512_castsi512_si256( oF ) ); __attribute__((fallthrough));
     583      410197 :     case 15UL: wu_stu( batch_hash[14], _mm512_castsi512_si256( oE ) ); __attribute__((fallthrough));
     584      508669 :     case 14UL: wu_stu( batch_hash[13], _mm512_castsi512_si256( oD ) ); __attribute__((fallthrough));
     585      613180 :     case 13UL: wu_stu( batch_hash[12], _mm512_castsi512_si256( oC ) ); __attribute__((fallthrough));
     586      724777 :     case 12UL: wu_stu( batch_hash[11], _mm512_castsi512_si256( oB ) ); __attribute__((fallthrough));
     587      844769 :     case 11UL: wu_stu( batch_hash[10], _mm512_castsi512_si256( oA ) ); __attribute__((fallthrough));
     588      974880 :     case 10UL: wu_stu( batch_hash[ 9], _mm512_castsi512_si256( o9 ) ); __attribute__((fallthrough));
     589     1117442 :     case  9UL: wu_stu( batch_hash[ 8], _mm512_castsi512_si256( o8 ) ); __attribute__((fallthrough));
     590     1342729 :     case  8UL: wu_stu( batch_hash[ 7], _mm512_castsi512_si256( o7 ) ); __attribute__((fallthrough));
     591     1674518 :     case  7UL: wu_stu( batch_hash[ 6], _mm512_castsi512_si256( o6 ) ); __attribute__((fallthrough));
     592     2055808 :     case  6UL: wu_stu( batch_hash[ 5], _mm512_castsi512_si256( o5 ) ); __attribute__((fallthrough));
     593     2505447 :     case  5UL: wu_stu( batch_hash[ 4], _mm512_castsi512_si256( o4 ) ); __attribute__((fallthrough));
     594     3089878 :     case  4UL: wu_stu( batch_hash[ 3], _mm512_castsi512_si256( o3 ) ); __attribute__((fallthrough));
     595     4457929 :     case  3UL: wu_stu( batch_hash[ 2], _mm512_castsi512_si256( o2 ) ); __attribute__((fallthrough));
     596     7203580 :     case  2UL: wu_stu( batch_hash[ 1], _mm512_castsi512_si256( o1 ) ); __attribute__((fallthrough));
     597     7203580 :     case  1UL: wu_stu( batch_hash[ 0], _mm512_castsi512_si256( o0 ) ); __attribute__((fallthrough));
     598     7203580 :     default: break;
     599     7203580 :     }
     600     7203580 :     FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: done" ));
     601    23563226 :   } else if( out_sz==64 ) {
     602    23563226 :     switch( batch_cnt ) { /* application dependent prob */
     603    23563226 :     case 16UL: wwu_stu( batch_hash[15], oF ); __attribute__((fallthrough));
     604    23563226 :     case 15UL: wwu_stu( batch_hash[14], oE ); __attribute__((fallthrough));
     605    23563226 :     case 14UL: wwu_stu( batch_hash[13], oD ); __attribute__((fallthrough));
     606    23563226 :     case 13UL: wwu_stu( batch_hash[12], oC ); __attribute__((fallthrough));
     607    23563226 :     case 12UL: wwu_stu( batch_hash[11], oB ); __attribute__((fallthrough));
     608    23563226 :     case 11UL: wwu_stu( batch_hash[10], oA ); __attribute__((fallthrough));
     609    23563226 :     case 10UL: wwu_stu( batch_hash[ 9], o9 ); __attribute__((fallthrough));
     610    23563226 :     case  9UL: wwu_stu( batch_hash[ 8], o8 ); __attribute__((fallthrough));
     611    23563226 :     case  8UL: wwu_stu( batch_hash[ 7], o7 ); __attribute__((fallthrough));
     612    23563226 :     case  7UL: wwu_stu( batch_hash[ 6], o6 ); __attribute__((fallthrough));
     613    23563226 :     case  6UL: wwu_stu( batch_hash[ 5], o5 ); __attribute__((fallthrough));
     614    23563226 :     case  5UL: wwu_stu( batch_hash[ 4], o4 ); __attribute__((fallthrough));
     615    23563226 :     case  4UL: wwu_stu( batch_hash[ 3], o3 ); __attribute__((fallthrough));
     616    23563226 :     case  3UL: wwu_stu( batch_hash[ 2], o2 ); __attribute__((fallthrough));
     617    23563226 :     case  2UL: wwu_stu( batch_hash[ 1], o1 ); __attribute__((fallthrough));
     618    23563226 :     case  1UL: wwu_stu( batch_hash[ 0], o0 ); __attribute__((fallthrough));
     619    23563226 :     default: break;
     620    23563226 :     }
     621    23563226 :     FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: done (out_sz=64)" ));
     622    23563226 :   } else {
     623           0 :     FD_LOG_ERR(( "Invalid out_sz %u", out_sz ));
     624           0 :   }
     625    30766806 : }
     626             : 
     627             : void
     628             : fd_blake3_avx512_compress16_fast( uchar const * restrict msg,
     629             :                                   uchar       * restrict out,
     630             :                                   ulong                  counter,
     631     1987987 :                                   uchar                  flags ) {
                         /* Runs the BLAKE3 compression function over 16 equally sized
                            inputs at once, one input per 32-bit lane of a wwu_t, and
                            writes one 8-word result per input.

                            msg     16 inputs laid out back to back.  Input i is read from
                                    msg + (i<<lg_sz) and is sz = 1<<lg_sz bytes long, where
                                    lg_sz is FD_BLAKE3_OUTCHAIN_LG_SZ+1 when flags has
                                    FD_BLAKE3_FLAG_PARENT set and FD_BLAKE3_CHUNK_LG_SZ
                                    otherwise.
                            out     receives 16 results.  Result i is the low 256 bits of
                                    output row i and is stored at
                                    out + (i<<FD_BLAKE3_OUTCHAIN_LG_SZ).
                            counter 64-bit counter of input 0.  Without the PARENT flag,
                                    input i uses counter+i (with carry into the upper
                                    word); with it, every input uses counter unchanged.
                            flags   OR-ed into the flags word of every block.  When PARENT
                                    is not set, CHUNK_START / CHUNK_END are added for the
                                    first / last block of each input.

                            NOTE(review): fd_int_if( c, t, f ) is read here as "t if c is
                            nonzero, else f", which is what the lg_sz selection below
                            implies -- confirm against its definition. */
     632     1987987 :   FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16_fast(msg=%p,out=%p,counter=%lu,flags=%02x)", (void *)msg, (void *)out, counter, flags ));
     633             : 
                         /* parent is nonzero iff the PARENT flag is set; lg_sz / sz are
                            the log2 size and byte size of each lane's input. */
     634     1987987 :   int   parent = flags & FD_BLAKE3_FLAG_PARENT;
     635     1987987 :   int   lg_sz  = fd_int_if( parent, FD_BLAKE3_OUTCHAIN_LG_SZ+1, FD_BLAKE3_CHUNK_LG_SZ );
     636     1987987 :   ulong sz     = 1UL<<lg_sz;
     637             : 
     638             :   /* counters stay the same for each block.  Across chunks, they
     639             :      increment if we are hashing leaves.  Otherwise, they are zero. */
     640             : 
                         /* ctr_add(lane) is the lane index 0..15, masked to zero in every
                            lane when parent is set. */
     641     1987987 :   wwu_t ctr_add   = wwu_and( wwu_bcast( parent ? 0 : UINT_MAX ),
     642     1987987 :                              wwu( 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
     643     1987987 :                                   0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf ) );
                         /* Lower counter word per lane (counter is presumably truncated to
                            its low 32 bits by wwu_bcast -- TODO confirm). */
     644     1987987 :   wwu_t ctr_lo    = wwu_add( wwu_bcast( counter ), ctr_add );
                         /* Carry detection.  XOR-ing both operands with 0x80000000 flips
                            their top bit so that wwi_gt (presumably a signed per-lane
                            greater-than returning a lane mask) orders them as unsigned
                            values.  ctr_add > ctr_lo holds exactly in the lanes where the
                            32-bit add above wrapped. */
     645     1987987 :   int   ctr_carry = wwi_gt ( wwu_xor( ctr_add, wwu_bcast( 0x80000000 ) ),
     646     1987987 :                              wwu_xor( ctr_lo,  wwu_bcast( 0x80000000 ) ) );
                         /* Upper counter word, incremented by one in the lanes flagged by
                            ctr_carry and left as counter>>32 elsewhere. */
     647     1987987 :   wwu_t ctr_hi    = wwu_add_if( ctr_carry, wwu_bcast( counter>>32 ), wwu_one(), wwu_bcast( counter>>32 ) );
                         /* Block length word; placed in v[14] for every block below. */
     648     1987987 :   wwu_t sz_vec    = wwu_bcast( FD_BLAKE3_BLOCK_SZ );
     649             : 
                         /* First eight FD_BLAKE3_IV words, each broadcast to all lanes. */
     650     1987987 :   wwu_t const iv0 = wwu_bcast( FD_BLAKE3_IV[0] );
     651     1987987 :   wwu_t const iv1 = wwu_bcast( FD_BLAKE3_IV[1] );
     652     1987987 :   wwu_t const iv2 = wwu_bcast( FD_BLAKE3_IV[2] );
     653     1987987 :   wwu_t const iv3 = wwu_bcast( FD_BLAKE3_IV[3] );
     654     1987987 :   wwu_t const iv4 = wwu_bcast( FD_BLAKE3_IV[4] );
     655     1987987 :   wwu_t const iv5 = wwu_bcast( FD_BLAKE3_IV[5] );
     656     1987987 :   wwu_t const iv6 = wwu_bcast( FD_BLAKE3_IV[6] );
     657     1987987 :   wwu_t const iv7 = wwu_bcast( FD_BLAKE3_IV[7] );
     658             : 
                         /* Chaining value h0..h7 of every lane starts out as the IV. */
     659     1987987 :   wwu_t h0=iv0; wwu_t h1=iv1; wwu_t h2=iv2; wwu_t h3=iv3;
     660     1987987 :   wwu_t h4=iv4; wwu_t h5=iv5; wwu_t h6=iv6; wwu_t h7=iv7;
     661             : 
                         /* One iteration per FD_BLAKE3_BLOCK_SZ bytes of each lane's
                            input; off is the byte offset of the current block within a
                            lane's input. */
     662     1987987 :   ulong off = 0UL;
     663    23431492 :   do {
     664    23431492 :     ulong const off_next = off+FD_BLAKE3_BLOCK_SZ;
                           /* CHUNK_START on the first block, CHUNK_END on the last one.
                              These are only merged into the flags when parent is not
                              set. */
     665    23431492 :     int chunk_flags =
     666    23431492 :         ( off     ==0UL ? FD_BLAKE3_FLAG_CHUNK_START : 0 ) |
     667    23431492 :         ( off_next==sz  ? FD_BLAKE3_FLAG_CHUNK_END   : 0 );
     668    23431492 :     int flags_ = flags | fd_int_if( parent, 0, chunk_flags );
     669    23431492 :     wwu_t flags_vec = wwu_bcast( flags_ );
     670             : 
                           /* m[i] <- current block of lane i (wwu_ldu is presumably an
                              unaligned load of one full vector -- TODO confirm). */
     671    23431492 :     wwu_t m[16];
     672    23431492 :     m[0x0] = wwu_ldu( msg + (0x0<<lg_sz) + off );
     673    23431492 :     m[0x1] = wwu_ldu( msg + (0x1<<lg_sz) + off );
     674    23431492 :     m[0x2] = wwu_ldu( msg + (0x2<<lg_sz) + off );
     675    23431492 :     m[0x3] = wwu_ldu( msg + (0x3<<lg_sz) + off );
     676    23431492 :     m[0x4] = wwu_ldu( msg + (0x4<<lg_sz) + off );
     677    23431492 :     m[0x5] = wwu_ldu( msg + (0x5<<lg_sz) + off );
     678    23431492 :     m[0x6] = wwu_ldu( msg + (0x6<<lg_sz) + off );
     679    23431492 :     m[0x7] = wwu_ldu( msg + (0x7<<lg_sz) + off );
     680    23431492 :     m[0x8] = wwu_ldu( msg + (0x8<<lg_sz) + off );
     681    23431492 :     m[0x9] = wwu_ldu( msg + (0x9<<lg_sz) + off );
     682    23431492 :     m[0xa] = wwu_ldu( msg + (0xa<<lg_sz) + off );
     683    23431492 :     m[0xb] = wwu_ldu( msg + (0xb<<lg_sz) + off );
     684    23431492 :     m[0xc] = wwu_ldu( msg + (0xc<<lg_sz) + off );
     685    23431492 :     m[0xd] = wwu_ldu( msg + (0xd<<lg_sz) + off );
     686    23431492 :     m[0xe] = wwu_ldu( msg + (0xe<<lg_sz) + off );
     687    23431492 :     m[0xf] = wwu_ldu( msg + (0xf<<lg_sz) + off );
     688             : 
                           /* In-place transpose: afterwards m[j] holds message word j of
                              all 16 lanes, which is the layout round_fn16 indexes via
                              FD_BLAKE3_MSG_SCHEDULE. */
     689    23431492 :     wwu_transpose_16x16( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
     690    23431492 :                          m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
     691    23431492 :                          m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
     692    23431492 :                          m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );
     693             : 
                           /* 16-word state per lane: chaining value, IV[0..3], counter
                              low / high word, block length, flags. */
     694    23431492 :     wwu_t v[16] = {
     695    23431492 :         h0,     h1,     h2,     h3,
     696    23431492 :         h4,     h5,     h6,     h7,
     697    23431492 :         iv0,    iv1,    iv2,    iv3,
     698    23431492 :         ctr_lo, ctr_hi, sz_vec, flags_vec,
     699    23431492 :     };
     700             : 
                           /* Seven rounds, using message schedule rows 0..6. */
     701    23431492 :     round_fn16( v, m, 0 );
     702    23431492 :     round_fn16( v, m, 1 );
     703    23431492 :     round_fn16( v, m, 2 );
     704    23431492 :     round_fn16( v, m, 3 );
     705    23431492 :     round_fn16( v, m, 4 );
     706    23431492 :     round_fn16( v, m, 5 );
     707    23431492 :     round_fn16( v, m, 6 );
     708             : 
                           /* New chaining value: lower state half XOR upper state half. */
     709    23431492 :     h0 = wwu_xor( v[ 0], v[ 8] );
     710    23431492 :     h1 = wwu_xor( v[ 1], v[ 9] );
     711    23431492 :     h2 = wwu_xor( v[ 2], v[10] );
     712    23431492 :     h3 = wwu_xor( v[ 3], v[11] );
     713    23431492 :     h4 = wwu_xor( v[ 4], v[12] );
     714    23431492 :     h5 = wwu_xor( v[ 5], v[13] );
     715    23431492 :     h6 = wwu_xor( v[ 6], v[14] );
     716    23431492 :     h7 = wwu_xor( v[ 7], v[15] );
     717             : 
     718    23431492 :     off = off_next;
     719    23431492 :   } while( off!=sz );
     720             : 
     721     1987987 :   wwu_t o0; wwu_t o1; wwu_t o2; wwu_t o3; wwu_t o4; wwu_t o5; wwu_t o6; wwu_t o7;
     722     1987987 :   wwu_t o8; wwu_t o9; wwu_t oA; wwu_t oB; wwu_t oC; wwu_t oD; wwu_t oE; wwu_t oF;
     723             : 
                         /* Transpose back to one row per lane.  The upper eight input rows
                            are zero, so output row i carries lane i's h0..h7 in its low
                            eight words and zeros above them. */
     724     1987987 :   wwu_t zero = wwu_zero();
     725     1987987 :   wwu_transpose_16x16( h0,   h1,   h2,   h3,   h4,   h5,   h6,   h7,
     726     1987987 :                        zero, zero, zero, zero, zero, zero, zero, zero,
     727     1987987 :                        o0,   o1,   o2,   o3,   o4,   o5,   o6,   o7,
     728     1987987 :                        o8,   o9,   oA,   oB,   oC,   oD,   oE,   oF );
     729             : 
                         /* Store the low 256 bits of each output row, one result every
                            1<<FD_BLAKE3_OUTCHAIN_LG_SZ bytes.
                            NOTE(review): wb_st looks like the aligned 256-bit store of the
                            fd_avx API -- confirm the alignment callers must provide for
                            out. */
     730     1987987 :   wb_st( out + (0x0UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o0 ) );
     731     1987987 :   wb_st( out + (0x1UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o1 ) );
     732     1987987 :   wb_st( out + (0x2UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o2 ) );
     733     1987987 :   wb_st( out + (0x3UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o3 ) );
     734     1987987 :   wb_st( out + (0x4UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o4 ) );
     735     1987987 :   wb_st( out + (0x5UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o5 ) );
     736     1987987 :   wb_st( out + (0x6UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o6 ) );
     737     1987987 :   wb_st( out + (0x7UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o7 ) );
     738     1987987 :   wb_st( out + (0x8UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o8 ) );
     739     1987987 :   wb_st( out + (0x9UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o9 ) );
     740     1987987 :   wb_st( out + (0xaUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oA ) );
     741     1987987 :   wb_st( out + (0xbUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oB ) );
     742     1987987 :   wb_st( out + (0xcUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oC ) );
     743     1987987 :   wb_st( out + (0xdUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oD ) );
     744     1987987 :   wb_st( out + (0xeUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oE ) );
     745     1987987 :   wb_st( out + (0xfUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oF ) );
     746     1987987 : }

Generated by: LCOV version 1.14