LCOV - code coverage report
Current view: top level - ballet/blake3 - fd_blake3_avx2.c (source / functions)
Test: cov.lcov
Date: 2025-10-13 04:42:14

              Hit   Total   Coverage
Lines:        483     485     99.6 %
Functions:      6       6    100.0 %

          Line data    Source code
       1             : 
       2             : // Source originally from https://github.com/BLAKE3-team/BLAKE3
       3             : // From commit: 64747d48ffe9d1fbf4b71e94cabeb8a211461081
       4             : 
       5             : #include "fd_blake3.h"
       6             : #include "fd_blake3_private.h"
       7             : #include "../../util/simd/fd_avx.h"
       8             : #include <assert.h>
       9             : 
      10 17258081008 : #define wu_rot16 wb_exch_adj_pair
      11             : 
      12             : static inline __attribute__((always_inline)) wu_t
      13 17258081008 : wu_rot12( wu_t x ) {
      14 17258081008 :   return wu_ror( x, 12 );
      15 17258081008 : }
      16             : 
      17             : static inline __attribute__((always_inline)) wu_t
      18 17258081008 : wu_rot8( wu_t x ) {
      19 17258081008 :   wb_t const mask =
      20 17258081008 :     wb( 1,2,3,0,  5,6,7,4,  9,10,11,8,  13,14,15,12,
      21 17258081008 :         1,2,3,0,  5,6,7,4,  9,10,11,8,  13,14,15,12 );
      22 17258081008 :   return _mm256_shuffle_epi8( x, mask );
      23 17258081008 : }
      24             : 
      25             : static inline __attribute__((always_inline)) wu_t
      26 17258081008 : wu_rot7( wu_t x ) {
      27 17258081008 :   return wu_ror( x, 7 );
      28 17258081008 : }
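                     : /* The helpers above implement the four rotate-right amounts used by
                     :    the BLAKE3 G function (16, 12, 8, 7), applied lane-wise to eight
                     :    32-bit words at a time.  rot16 and rot8 rotate by whole bytes, so
                     :    they reduce to byte shuffles: wb_exch_adj_pair swaps the two 16-bit
                     :    halves of each lane, and the _mm256_shuffle_epi8 mask (1,2,3,0 per
                     :    lane) rotates each 32-bit lane right by one byte.  rot12 and rot7
                     :    are not byte-aligned and fall back to wu_ror, presumably a
                     :    shift/shift/or sequence. */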
      29             : 
      30             : static inline __attribute__((always_inline)) void
      31             : round_fn8( wu_t  v[16],
      32             :            wu_t  m[16],
      33  2157260126 :            ulong r ) {
      34  2157260126 :   v[ 0] = wu_add(v[0], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][0]]);
      35  2157260126 :   v[ 1] = wu_add(v[1], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][2]]);
      36  2157260126 :   v[ 2] = wu_add(v[2], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][4]]);
      37  2157260126 :   v[ 3] = wu_add(v[3], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][6]]);
      38  2157260126 :   v[ 0] = wu_add(v[0], v[4]);
      39  2157260126 :   v[ 1] = wu_add(v[1], v[5]);
      40  2157260126 :   v[ 2] = wu_add(v[2], v[6]);
      41  2157260126 :   v[ 3] = wu_add(v[3], v[7]);
      42  2157260126 :   v[12] = wu_xor(v[12], v[0]);
      43  2157260126 :   v[13] = wu_xor(v[13], v[1]);
      44  2157260126 :   v[14] = wu_xor(v[14], v[2]);
      45  2157260126 :   v[15] = wu_xor(v[15], v[3]);
      46  2157260126 :   v[12] = wu_rot16(v[12]);
      47  2157260126 :   v[13] = wu_rot16(v[13]);
      48  2157260126 :   v[14] = wu_rot16(v[14]);
      49  2157260126 :   v[15] = wu_rot16(v[15]);
      50  2157260126 :   v[ 8] = wu_add(v[8], v[12]);
      51  2157260126 :   v[ 9] = wu_add(v[9], v[13]);
      52  2157260126 :   v[10] = wu_add(v[10], v[14]);
      53  2157260126 :   v[11] = wu_add(v[11], v[15]);
      54  2157260126 :   v[ 4] = wu_xor(v[4], v[8]);
      55  2157260126 :   v[ 5] = wu_xor(v[5], v[9]);
      56  2157260126 :   v[ 6] = wu_xor(v[6], v[10]);
      57  2157260126 :   v[ 7] = wu_xor(v[7], v[11]);
      58  2157260126 :   v[ 4] = wu_rot12(v[4]);
      59  2157260126 :   v[ 5] = wu_rot12(v[5]);
      60  2157260126 :   v[ 6] = wu_rot12(v[6]);
      61  2157260126 :   v[ 7] = wu_rot12(v[7]);
      62  2157260126 :   v[ 0] = wu_add(v[0], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][1]]);
      63  2157260126 :   v[ 1] = wu_add(v[1], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][3]]);
      64  2157260126 :   v[ 2] = wu_add(v[2], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][5]]);
      65  2157260126 :   v[ 3] = wu_add(v[3], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][7]]);
      66  2157260126 :   v[ 0] = wu_add(v[0], v[4]);
      67  2157260126 :   v[ 1] = wu_add(v[1], v[5]);
      68  2157260126 :   v[ 2] = wu_add(v[2], v[6]);
      69  2157260126 :   v[ 3] = wu_add(v[3], v[7]);
      70  2157260126 :   v[12] = wu_xor(v[12], v[0]);
      71  2157260126 :   v[13] = wu_xor(v[13], v[1]);
      72  2157260126 :   v[14] = wu_xor(v[14], v[2]);
      73  2157260126 :   v[15] = wu_xor(v[15], v[3]);
      74  2157260126 :   v[12] = wu_rot8(v[12]);
      75  2157260126 :   v[13] = wu_rot8(v[13]);
      76  2157260126 :   v[14] = wu_rot8(v[14]);
      77  2157260126 :   v[15] = wu_rot8(v[15]);
      78  2157260126 :   v[ 8] = wu_add(v[8], v[12]);
      79  2157260126 :   v[ 9] = wu_add(v[9], v[13]);
      80  2157260126 :   v[10] = wu_add(v[10], v[14]);
      81  2157260126 :   v[11] = wu_add(v[11], v[15]);
      82  2157260126 :   v[ 4] = wu_xor(v[4], v[8]);
      83  2157260126 :   v[ 5] = wu_xor(v[5], v[9]);
      84  2157260126 :   v[ 6] = wu_xor(v[6], v[10]);
      85  2157260126 :   v[ 7] = wu_xor(v[7], v[11]);
      86  2157260126 :   v[ 4] = wu_rot7(v[4]);
      87  2157260126 :   v[ 5] = wu_rot7(v[5]);
      88  2157260126 :   v[ 6] = wu_rot7(v[6]);
      89  2157260126 :   v[ 7] = wu_rot7(v[7]);
      90             : 
      91  2157260126 :   v[ 0] = wu_add(v[0], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][8]]);
      92  2157260126 :   v[ 1] = wu_add(v[1], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][10]]);
      93  2157260126 :   v[ 2] = wu_add(v[2], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][12]]);
      94  2157260126 :   v[ 3] = wu_add(v[3], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][14]]);
      95  2157260126 :   v[ 0] = wu_add(v[0], v[5]);
      96  2157260126 :   v[ 1] = wu_add(v[1], v[6]);
      97  2157260126 :   v[ 2] = wu_add(v[2], v[7]);
      98  2157260126 :   v[ 3] = wu_add(v[3], v[4]);
      99  2157260126 :   v[15] = wu_xor(v[15], v[0]);
     100  2157260126 :   v[12] = wu_xor(v[12], v[1]);
     101  2157260126 :   v[13] = wu_xor(v[13], v[2]);
     102  2157260126 :   v[14] = wu_xor(v[14], v[3]);
     103  2157260126 :   v[15] = wu_rot16(v[15]);
     104  2157260126 :   v[12] = wu_rot16(v[12]);
     105  2157260126 :   v[13] = wu_rot16(v[13]);
     106  2157260126 :   v[14] = wu_rot16(v[14]);
     107  2157260126 :   v[10] = wu_add(v[10], v[15]);
     108  2157260126 :   v[11] = wu_add(v[11], v[12]);
     109  2157260126 :   v[ 8] = wu_add(v[8], v[13]);
     110  2157260126 :   v[ 9] = wu_add(v[9], v[14]);
     111  2157260126 :   v[ 5] = wu_xor(v[5], v[10]);
     112  2157260126 :   v[ 6] = wu_xor(v[6], v[11]);
     113  2157260126 :   v[ 7] = wu_xor(v[7], v[8]);
     114  2157260126 :   v[ 4] = wu_xor(v[4], v[9]);
     115  2157260126 :   v[ 5] = wu_rot12(v[5]);
     116  2157260126 :   v[ 6] = wu_rot12(v[6]);
     117  2157260126 :   v[ 7] = wu_rot12(v[7]);
     118  2157260126 :   v[ 4] = wu_rot12(v[4]);
     119  2157260126 :   v[ 0] = wu_add(v[0], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][9]]);
     120  2157260126 :   v[ 1] = wu_add(v[1], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][11]]);
     121  2157260126 :   v[ 2] = wu_add(v[2], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][13]]);
     122  2157260126 :   v[ 3] = wu_add(v[3], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][15]]);
     123  2157260126 :   v[ 0] = wu_add(v[0], v[5]);
     124  2157260126 :   v[ 1] = wu_add(v[1], v[6]);
     125  2157260126 :   v[ 2] = wu_add(v[2], v[7]);
     126  2157260126 :   v[ 3] = wu_add(v[3], v[4]);
     127  2157260126 :   v[15] = wu_xor(v[15], v[0]);
     128  2157260126 :   v[12] = wu_xor(v[12], v[1]);
     129  2157260126 :   v[13] = wu_xor(v[13], v[2]);
     130  2157260126 :   v[14] = wu_xor(v[14], v[3]);
     131  2157260126 :   v[15] = wu_rot8(v[15]);
     132  2157260126 :   v[12] = wu_rot8(v[12]);
     133  2157260126 :   v[13] = wu_rot8(v[13]);
     134  2157260126 :   v[14] = wu_rot8(v[14]);
     135  2157260126 :   v[10] = wu_add(v[10], v[15]);
     136  2157260126 :   v[11] = wu_add(v[11], v[12]);
     137  2157260126 :   v[ 8] = wu_add(v[8], v[13]);
     138  2157260126 :   v[ 9] = wu_add(v[9], v[14]);
     139  2157260126 :   v[ 5] = wu_xor(v[5], v[10]);
     140  2157260126 :   v[ 6] = wu_xor(v[6], v[11]);
     141  2157260126 :   v[ 7] = wu_xor(v[7], v[8]);
     142  2157260126 :   v[ 4] = wu_xor(v[4], v[9]);
     143  2157260126 :   v[ 5] = wu_rot7(v[5]);
     144  2157260126 :   v[ 6] = wu_rot7(v[6]);
     145  2157260126 :   v[ 7] = wu_rot7(v[7]);
     146  2157260126 :   v[ 4] = wu_rot7(v[4]);
     147  2157260126 : }
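                     : /* round_fn8 is the 8-way-parallel form of one BLAKE3 round: a column
                     :    step (the first half above) followed by a diagonal step, with
                     :    message words selected through FD_BLAKE3_MSG_SCHEDULE[r].  For a
                     :    single scalar state, each quarter-round G(a,b,c,d,mx,my) is:
                     : 
                     :      a += b + mx;  d ^= a;  d = ror32(d, 16);
                     :      c += d;       b ^= c;  b = ror32(b, 12);
                     :      a += b + my;  d ^= a;  d = ror32(d,  8);
                     :      c += d;       b ^= c;  b = ror32(b,  7);
                     : 
                     :    Here every statement applies the same operation to eight
                     :    independent states at once, one per 32-bit element of the AVX2
                     :    vectors. */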
     148             : 
     149             : void
     150             : fd_blake3_avx_compress8( ulong                   batch_cnt,
     151             :                          void   const * restrict _batch_data,
     152             :                          uint   const * restrict batch_sz,
     153             :                          ulong  const * restrict ctr_vec,
     154             :                          uint   const * restrict batch_flags,
     155             :                          void * const * restrict _batch_hash,
     156             :                          ushort *       restrict lthash,
     157             :                          uint                    out_sz,
     158   137463814 :                          void const *   restrict batch_cv ) {
     159   137463814 :   if( FD_UNLIKELY( lthash && batch_cnt!=8 ) ) FD_LOG_ERR(( "Lane masking not supported for fd_blake3_avx_compress8 in LtHash mode" ));
     160   137463814 :   if( FD_UNLIKELY( batch_cnt==0 || batch_cnt>8 ) ) FD_LOG_ERR(( "Invalid batch_cnt %lu", batch_cnt ));
     161             : 
     162   137463814 :   ulong const * batch_data = (ulong const *)_batch_data;
     163             : 
     164   137463814 :   if( FD_UNLIKELY( batch_cnt==1 ) ) {
     165    41008846 :     fd_blake3_sse_compress1( (uchar *)(_batch_hash[0]),
     166    41008846 :                              (uchar const *)(batch_data[0]),
     167    41008846 :                              batch_sz[0],
     168    41008846 :                              ctr_vec[0],
     169    41008846 :                              batch_flags[0],
     170    41008846 :                              NULL,
     171    41008846 :                              NULL );
     172    41008846 :     return;
     173    41008846 :   }
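                     :   /* A batch of one has no data parallelism to exploit, so the branch
                     :      above defers to the single-lane SSE compressor rather than
                     :      running one active lane out of eight. */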
     174             : 
     175             : #if FD_BLAKE3_TRACING
     176             :   /* This log_line buffer is oversized by a fair bit (due to all the
     177             :      NULL terminators) but that's fine */
     178             :   char log_line[
     179             :       sizeof( "fd_blake3_avx_compress8" )+
     180             :       sizeof( "(batch_cnt=" )+21+
     181             :       sizeof( ",sz=["       )+(8*11)+sizeof( "]" )+
     182             :       sizeof( ",counter=["  )+(8*21)+sizeof( "]" )+
     183             :       sizeof( ",flags=["    )+(8* 2)+sizeof( "]" )+
     184             :       sizeof( ",custom_cv"  )+
     185             :       sizeof( ",lthash" )+
     186             :       sizeof( ")" ) ];
     187             : 
     188             :   char * p = fd_cstr_init( log_line );
     189             :   p = fd_cstr_append_text( p, "fd_blake3_avx_compress8(batch_cnt=", 34UL );
     190             :   p = fd_cstr_append_ulong_as_text( p, 0, 0, batch_cnt, fd_uchar_base10_dig_cnt( (uchar)batch_cnt ) );
     191             :   p = fd_cstr_append_text( p, ",sz=[", 5UL );
     192             :   for( ulong i=0UL; i<batch_cnt; i++ ) {
     193             :     p = fd_cstr_append_uint_as_text( p, ' ', 0, batch_sz[ i ], fd_uint_base10_dig_cnt( batch_sz[ i ] ) );
     194             :     if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
     195             :   }
     196             :   p = fd_cstr_append_text( p, "],counter=[", 11UL );
     197             :   for( ulong i=0UL; i<batch_cnt; i++ ) {
     198             :     p = fd_cstr_append_ulong_as_text( p, ' ', 0, ctr_vec[ i ], fd_ulong_base10_dig_cnt( ctr_vec[ i ] ) );
     199             :     if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
     200             :   }
     201             :   p = fd_cstr_append_text( p, "],flags=[", 9UL );
     202             :   for( ulong i=0UL; i<batch_cnt; i++ ) {
     203             :     static char const hex_lut[ 16 ] = {
     204             :       '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'
     205             :     };
     206             :     p = fd_cstr_append_char( p, hex_lut[ batch_flags[ i ]&0xf ] );
     207             :     if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
     208             :   }
     209             :   p = fd_cstr_append_char( p, ']' );
     210             :   if( batch_cv ) p = fd_cstr_append_text( p, ",custom_cv", 10UL );
     211             :   if( lthash   ) p = fd_cstr_append_text( p, ",lthash", 7UL );
     212             :   p = fd_cstr_append_char( p, ')' );
     213             :   ulong line_len = (ulong)( p-log_line );
     214             :   fd_cstr_fini( p );
     215             : 
     216             :   FD_BLAKE3_TRACE(( "%.*s", (int)line_len, log_line ));
     217             : #endif
     218             : 
     219             :   /* We can only process input blocks of 64 bytes, but message data size
     220             :      is not necessarily a multiple of 64.  We compute the tail block of
     221             :      each message here.  We then process complete blocks of the original
     222             :      message in place, switching to processing these tail blocks in
     223             :      the same pass toward the end. */
     224             : 
     225    96454968 :   ulong batch_tail_data[ 8 ] __attribute__((aligned(32)));
     226    96454968 :   ulong batch_tail_rem [ 8 ] __attribute__((aligned(32)));
     227             : 
     228    96454968 :   uchar scratch[ 8*FD_BLAKE3_BLOCK_SZ ] __attribute__((aligned(128)));
     229    96454968 :   do {
     230    96454968 :     ulong scratch_free = (ulong)scratch;
     231             : 
     232    96454968 :     wv_t zero = wv_zero();
     233             : 
     234   805697588 :     for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {
     235             : 
     236             :       /* Allocate the tail blocks for this message */
     237             : 
     238   709242620 :       ulong data = batch_data[ batch_idx ];
     239   709242620 :       ulong sz   = batch_sz  [ batch_idx ];
     240             : 
     241   709242620 :       ulong tail_data     = scratch_free;
     242   709242620 :       ulong tail_data_sz  = sz & (FD_BLAKE3_BLOCK_SZ-1UL);
     243   709242620 :       ulong tail_data_off = fd_ulong_align_dn( sz, FD_BLAKE3_BLOCK_SZ );
     244             : 
     245   709242620 :       batch_tail_data[ batch_idx ] = tail_data;
     246   709242620 :       batch_tail_rem [ batch_idx ] = (ulong)( (!!tail_data_sz) ^ (!sz) );  /* (hash 1 tail block if 0 sz) */
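                     :       /* i.e. hash one tail block when the message has a partial final
                     :          block (sz not a multiple of 64) and also when sz==0, since an
                     :          empty message still compresses a single all-zero block; a
                     :          non-empty message that is an exact multiple of the block size
                     :          has no tail block. */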
     247             : 
     248   709242620 :       scratch_free += FD_BLAKE3_BLOCK_SZ;
     249             : 
     250             :       /* Populate the tail blocks.  We first clear the blocks.  Then we
     251             :          copy any straggler data bytes into the tail. */
     252             : 
     253   709242620 :       wv_st( (ulong *) tail_data,     zero );
     254   709242620 :       wv_st( (ulong *)(tail_data+32), zero );
     255             : 
     256   709242620 : #     if 1
     257             :       /* See fd_sha256_private_batch_avx */
     258   709242620 :       ulong src = (ulong)data + tail_data_off;
     259   709242620 :       ulong dst = tail_data;
     260   709242620 :       ulong rem = tail_data_sz;
     261   736149132 :       while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
     262   789978078 :       while( rem>= 8UL ) { *(ulong  *)dst = FD_LOAD( ulong,  src );             dst +=  8UL; src +=  8UL; rem -=  8UL; }
     263   709242620 :       if   ( rem>= 4UL ) { *(uint   *)dst = FD_LOAD( uint,   src );             dst +=  4UL; src +=  4UL; rem -=  4UL; }
     264   709242620 :       if   ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src );             dst +=  2UL; src +=  2UL; rem -=  2UL; }
     265   709242620 :       if   ( rem       ) { *(uchar  *)dst = FD_LOAD( uchar,  src );             dst++;                                 }
     266             : #     else
     267             :       fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
     268             : #     endif
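                     :       /* The unrolled copy above moves the (at most 63) straggler bytes
                     :          with one 32-byte, up to three 8-byte, and then 4-/2-/1-byte
                     :          loads.  It is equivalent to the fd_memcpy in the disabled
                     :          branch, presumably avoiding call overhead for these small,
                     :          variable sizes. */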
     269   709242620 :     }
     270    96454968 :   } while(0);
     271             : 
     272             : 
     273    96454968 :   wu_t const iv0 = wu_bcast( FD_BLAKE3_IV[0] );
     274    96454968 :   wu_t const iv1 = wu_bcast( FD_BLAKE3_IV[1] );
     275    96454968 :   wu_t const iv2 = wu_bcast( FD_BLAKE3_IV[2] );
     276    96454968 :   wu_t const iv3 = wu_bcast( FD_BLAKE3_IV[3] );
     277    96454968 :   wu_t const iv4 = wu_bcast( FD_BLAKE3_IV[4] );
     278    96454968 :   wu_t const iv5 = wu_bcast( FD_BLAKE3_IV[5] );
     279    96454968 :   wu_t const iv6 = wu_bcast( FD_BLAKE3_IV[6] );
     280    96454968 :   wu_t const iv7 = wu_bcast( FD_BLAKE3_IV[7] );
     281             : 
     282    96454968 :   wu_t h0=iv0; wu_t h1=iv1; wu_t h2=iv2; wu_t h3=iv3;
     283    96454968 :   wu_t h4=iv4; wu_t h5=iv5; wu_t h6=iv6; wu_t h7=iv7;
     284    96454968 :   if( FD_UNLIKELY( batch_cv ) ) {
     285             :     /* If the input chaining value is overridden, transpose the input to
     286             :        AVX representation (8x8 transpose). */
     287    81452968 :     __m256i const ** cv_vec = (__m256i const **)batch_cv;
     288    81452968 :     wu_t cv[8];
     289   733076712 :     for( ulong i=0UL; i<8UL; i++ ) cv[i] = _mm256_loadu_si256( cv_vec[ i ] );
     290    81452968 :     wu_transpose_8x8( cv[0], cv[1], cv[2], cv[3], cv[4], cv[5], cv[6], cv[7],
     291    81452968 :                       h0,    h1,    h2,    h3,    h4,    h5,    h6,    h7 );
     292    81452968 :   }
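                     :   /* From here on h0..h7 hold the chaining values in transposed
                     :      ("structure of arrays") form: element j of h_i is word i of lane
                     :      j's chaining value. */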
     293             : 
     294    96454968 :   wu_t ctr_lo = wu( ctr_vec[0],     ctr_vec[1],     ctr_vec[2],     ctr_vec[3],
     295    96454968 :                     ctr_vec[4],     ctr_vec[5],     ctr_vec[6],     ctr_vec[7] );
     296    96454968 :   wu_t ctr_hi = wu( ctr_vec[0]>>32, ctr_vec[1]>>32, ctr_vec[2]>>32, ctr_vec[3]>>32,
     297    96454968 :                     ctr_vec[4]>>32, ctr_vec[5]>>32, ctr_vec[6]>>32, ctr_vec[7]>>32 );
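                     :   /* The BLAKE3 compression function takes the 64-bit block counter as
                     :      two 32-bit words (t0 = low half, t1 = high half), so the per-lane
                     :      counters are split into separate lo/hi vectors here. */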
     298    96454968 :   wu_t flags = wu_ldu( batch_flags );
     299    96454968 :   wu_t off   = wu_zero();
     300    96454968 :   wu_t sz    = wu_ldu( batch_sz );
     301             : 
     302    96454968 :   wv_t wv_64        = wv_bcast( FD_BLAKE3_BLOCK_SZ );
     303    96454968 :   wv_t W_sentinel   = wv_bcast( (ulong)scratch );
     304    96454968 :   wc_t batch_lane   = wc_unpack( (1<<batch_cnt)-1 );
     305             : 
     306    96454968 :   wv_t tail_lo      = wv_ld( batch_tail_data   );
     307    96454968 :   wv_t tail_hi      = wv_ld( batch_tail_data+4 );
     308             : 
     309    96454968 :   wv_t tail_rem_lo  = wv_ld( batch_tail_rem    );
     310    96454968 :   wv_t tail_rem_hi  = wv_ld( batch_tail_rem+4  );
     311             : 
     312    96454968 :   wv_t W_lo         = wv_ld( batch_data        );
     313    96454968 :   wv_t W_hi         = wv_ld( batch_data+4      );
     314             : 
     315    96454968 :   wv_t batch_sz_lo  = _mm256_cvtepi32_epi64( _mm256_extractf128_si256( sz, 0 ) );
     316    96454968 :   wv_t batch_sz_hi  = _mm256_cvtepi32_epi64( _mm256_extractf128_si256( sz, 1 ) );
     317             : 
     318    96454968 :   wv_t block_rem_lo = wv_notczero( wc_expand( batch_lane, 0 ),
     319    96454968 :                         wv_add( wv_shr( batch_sz_lo, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_lo ) );
     320    96454968 :   wv_t block_rem_hi = wv_notczero( wc_expand( batch_lane, 1 ),
     321    96454968 :                         wv_add( wv_shr( batch_sz_hi, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_hi ) );
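                     :   /* block_rem_{lo,hi} track, per lane (lanes 0-3 in lo and 4-7 in hi,
                     :      since the 64-bit counts need two AVX registers), how many 64-byte
                     :      blocks remain: sz/64 full blocks plus one tail block when tail_rem
                     :      is set, masked to zero for lanes beyond batch_cnt. */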
     322             : 
     323             :   /* Upper half of the compression function output.
     324             :      Usually thrown away, but kept in the final compression round if
     325             :      out_sz==64. */
     326    96454968 :   wu_t hu[8] = {0};
     327             : 
     328    96454968 :   ulong lthash_rem    = lthash ? 32 : 0; /* Number of LtHash (XOF) blocks remaining */
     329    96454968 :   int   compress_done = 0;
     330   281244472 :   for(;;) {
     331             :     /* Switch lanes that have hit the end of their in-place bulk
     332             :        processing to their out-of-place scratch tail regions as
     333             :        necessary. */
     334             : 
     335   281244472 :     W_lo = wv_if( wv_eq( block_rem_lo, tail_rem_lo ), tail_lo, W_lo );
     336   281244472 :     W_hi = wv_if( wv_eq( block_rem_hi, tail_rem_hi ), tail_hi, W_hi );
     337             : 
     338             :     /* Derive per-block flags and block sizes */
     339             : 
     340   281244472 :     wc_t block_first = wu_eq( off, wu_zero() );
     341   281244472 :     wc_t block_last  = wi_lt( sz,  wu_add( off, wu_bcast( FD_BLAKE3_BLOCK_SZ+1 ) ) );
     342             : 
     343             :     /* Suppress root flag unless last block */
     344             : 
     345   281244472 :     wu_t root_mask = wu_or( block_last, wu_bcast( ~FD_BLAKE3_FLAG_ROOT ) );
     346   281244472 :     wu_t block_flags = wu_and( flags, root_mask );
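                     :     /* block_last is ~0u on lanes where it holds and 0 elsewhere, so
                     :        root_mask is all-ones on each lane's final block and ~FLAG_ROOT
                     :        everywhere else: the caller-provided ROOT flag only ever reaches
                     :        the last block. */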
     347             : 
     348             :     /* LtHash mode ends compression one block early */
     349             : 
     350   281244472 :     wc_t active_lane_lo;
     351   281244472 :     wc_t active_lane_hi;
     352   281244472 :     if( FD_UNLIKELY( lthash ) ) {
     353             :       /* Compress until root block */
     354     4705828 :       wu_t all_root = wu_bcast( FD_BLAKE3_FLAG_ROOT );
     355     4705828 :       wu_t not_root = wu_ne( wu_and( block_flags, all_root ), all_root );
     356     4705828 :       active_lane_lo = _mm256_cvtepi32_epi64( _mm256_extractf128_si256( not_root, 0 ) );
     357     4705828 :       active_lane_hi = _mm256_cvtepi32_epi64( _mm256_extractf128_si256( not_root, 1 ) );
     358   276538644 :     } else {
     359             :       /* Complete when there is no more input data */
     360   276538644 :       active_lane_lo = wv_to_wc( block_rem_lo );
     361   276538644 :       active_lane_hi = wv_to_wc( block_rem_hi );
     362   276538644 :     }
     363             : 
     364             :     /* Suppress CHUNK_{START,END} flags unless leaf node */
     365             : 
     366   281244472 :     wc_t is_parent = wu_shl( flags, 5 );  /* shift FLAG_PARENT into AVX condition bit */
     367   281244472 :     wu_t chunk_flags = wu_if( block_last,  wu_bcast( FD_BLAKE3_FLAG_CHUNK_END   ), wu_zero() );
     368   281244472 :     if( out_sz==32 ) {
     369             :       /* Hacky: out_sz==64 is only used for post-compress XOF hashing,
     370             :          so use that as a hint for when to suppress the 'CHUNK_START' flag. */
     371   118338536 :       chunk_flags = wu_or( chunk_flags, wu_if( block_first, wu_bcast( FD_BLAKE3_FLAG_CHUNK_START ), wu_zero() ) );
     372   118338536 :     }
     373   281244472 :     wu_t block_sz = wu_min( wu_sub( sz, off ), wu_bcast( FD_BLAKE3_BLOCK_SZ ) );
     374   281244472 :     block_flags = wu_or( block_flags, wu_if( is_parent, wu_zero(), chunk_flags ) );
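                     :     /* wu_if appears to be a byte-granular blend keyed on each byte's
                     :        top bit, and all BLAKE3 flag bits live in the low byte, so
                     :        shifting FLAG_PARENT (bit 2) left by 5 parks it in that byte's
                     :        sign bit: parent lanes get their chunk flags suppressed. */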
     375             : 
     376             :     /* Check if we are done compressing */
     377             : 
     378   281244472 :     compress_done |= !wc_any( wc_or( active_lane_lo, active_lane_hi ) );
     379   281244472 :     if( FD_UNLIKELY( compress_done ) ) {
     380    96454968 :       if( FD_UNLIKELY( !lthash_rem ) ) break;
     381      600306 :       active_lane_lo = wc_bcast( INT_MAX );
     382      600306 :       active_lane_hi = wc_bcast( INT_MAX );
     383             :       /* Load the next message block and fall through to XOF expansion */
     384      600306 :     }
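                     :     /* In normal hashing this is where the loop terminates once every
                     :        lane has consumed all of its blocks.  In LtHash mode all lanes
                     :        are forced active again and the loop keeps going: each further
                     :        pass through the compression below emits the next 64-byte XOF
                     :        output block for every lane. */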
     385             : 
     386             :     /* At this point, we have at least 1 block in this message segment
     387             :        pass that has not been processed.  Load the next 64 bytes of
     388             :        each unprocessed block.  Inactive lanes (e.g. message segments
     389             :        in this pass for which we've already processed all the blocks)
     390             :        will load garbage from a sentinel location (and the result of
     391             :        the state computations for the inactive lane will be ignored). */
     392             : 
     393   185389810 :     wv_t W03 = wv_if( active_lane_lo, W_lo, W_sentinel );
     394   185389810 :     uchar const * W0 = (uchar const *)wv_extract( W03, 0 );
     395   185389810 :     uchar const * W1 = (uchar const *)wv_extract( W03, 1 );
     396   185389810 :     uchar const * W2 = (uchar const *)wv_extract( W03, 2 );
     397   185389810 :     uchar const * W3 = (uchar const *)wv_extract( W03, 3 );
     398             : 
     399   185389810 :     wv_t W47 = wv_if( active_lane_hi, W_hi, W_sentinel );
     400   185389810 :     uchar const * W4 = (uchar const *)wv_extract( W47, 0 );
     401   185389810 :     uchar const * W5 = (uchar const *)wv_extract( W47, 1 );
     402   185389810 :     uchar const * W6 = (uchar const *)wv_extract( W47, 2 );
     403   185389810 :     uchar const * W7 = (uchar const *)wv_extract( W47, 3 );
     404             : 
     405   185389810 :     wu_t m[16] = { wu_ldu( W0    ), wu_ldu( W1    ), wu_ldu( W2    ), wu_ldu( W3    ),
     406   185389810 :                    wu_ldu( W4    ), wu_ldu( W5    ), wu_ldu( W6    ), wu_ldu( W7    ),
     407   185389810 :                    wu_ldu( W0+32 ), wu_ldu( W1+32 ), wu_ldu( W2+32 ), wu_ldu( W3+32 ),
     408   185389810 :                    wu_ldu( W4+32 ), wu_ldu( W5+32 ), wu_ldu( W6+32 ), wu_ldu( W7+32 ) };
     409             : 
     410   185389810 :     wu_transpose_8x8( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
     411   185389810 :                       m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7] );
     412   185389810 :     wu_transpose_8x8( m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
     413   185389810 :                       m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );
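                     :     /* The sixteen loads above fetch each lane's 64-byte block row-wise
                     :        (m[i] = words 0..7 of lane i, m[i+8] = words 8..15).  The two
                     :        8x8 transposes convert this to the word-sliced layout the
                     :        vectorized rounds expect: afterwards m[k] holds message word k
                     :        for all eight lanes. */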
     414             : 
     415             :     /* Compute the BLAKE3 compression function updates */
     416             : 
     417   203999296 : compress: (void)0;
     418   203999296 :     wu_t v[16] = {
     419   203999296 :         h0,     h1,     h2,       h3,
     420   203999296 :         h4,     h5,     h6,       h7,
     421   203999296 :         iv0,    iv1,    iv2,      iv3,
     422   203999296 :         ctr_lo, ctr_hi, block_sz, block_flags,
     423   203999296 :     };
     424             : 
     425             :     /* Debug utility */
     426   203999296 : #define STATE_FMT         "state[%u] =\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x"
     427   203999296 : #define STATE_FMT_ARGS(v,i) (uint)i,\
     428   203999296 :         fd_uint_bswap(wu_extract(v[0x0],i)),fd_uint_bswap(wu_extract(v[0x1],i)),fd_uint_bswap(wu_extract(v[0x2],i)),fd_uint_bswap(wu_extract(v[0x3],i)),\
     429   203999296 :         fd_uint_bswap(wu_extract(v[0x4],i)),fd_uint_bswap(wu_extract(v[0x5],i)),fd_uint_bswap(wu_extract(v[0x6],i)),fd_uint_bswap(wu_extract(v[0x7],i)),\
     430   203999296 :         fd_uint_bswap(wu_extract(v[0x8],i)),fd_uint_bswap(wu_extract(v[0x9],i)),fd_uint_bswap(wu_extract(v[0xa],i)),fd_uint_bswap(wu_extract(v[0xb],i)),\
     431   203999296 :         fd_uint_bswap(wu_extract(v[0xc],i)),fd_uint_bswap(wu_extract(v[0xd],i)),fd_uint_bswap(wu_extract(v[0xe],i)),fd_uint_bswap(wu_extract(v[0xf],i))
     432             : 
     433             :     // FD_LOG_NOTICE(( STATE_FMT, STATE_FMT_ARGS(v,0) ));
     434   203999296 :     round_fn8( v, m, 0 );
     435   203999296 :     round_fn8( v, m, 1 );
     436   203999296 :     round_fn8( v, m, 2 );
     437   203999296 :     round_fn8( v, m, 3 );
     438   203999296 :     round_fn8( v, m, 4 );
     439   203999296 :     round_fn8( v, m, 5 );
     440   203999296 :     round_fn8( v, m, 6 );
     441             :     // FD_LOG_NOTICE(( STATE_FMT, STATE_FMT_ARGS(v,0) ));
     442             : 
     443   203999296 :     wu_t d[8] = {
     444   203999296 :       wu_xor( v[ 0], v[ 8] ), wu_xor( v[ 1], v[ 9] ),
     445   203999296 :       wu_xor( v[ 2], v[10] ), wu_xor( v[ 3], v[11] ),
     446   203999296 :       wu_xor( v[ 4], v[12] ), wu_xor( v[ 5], v[13] ),
     447   203999296 :       wu_xor( v[ 6], v[14] ), wu_xor( v[ 7], v[15] )
     448   203999296 :     };
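                     :     /* d[i] = v[i] ^ v[i+8] is word i of the truncated compression
                     :        output, i.e. the next 32-byte chaining value per lane.  The
                     :        upper half of the full 64-byte output is v[8..15] ^ h[0..7] and
                     :        is only materialized when needed (hu[] above, dh[] below). */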
     449             : 
     450   203999296 :     if( FD_LIKELY( !compress_done ) ) {
     451             : 
     452             :       /* Apply the state updates to the active lanes */
     453             : 
     454   184789504 :       wc_t active_lane = wc_narrow( active_lane_lo, active_lane_hi );
     455   184789504 :       if( FD_UNLIKELY( out_sz==64 ) ) {
     456             :         /* FIXME only export in the last iteration */
     457    81452968 :         hu[0] = wu_if( active_lane, wu_xor( h0, v[ 8] ), hu[0] );
     458    81452968 :         hu[1] = wu_if( active_lane, wu_xor( h1, v[ 9] ), hu[1] );
     459    81452968 :         hu[2] = wu_if( active_lane, wu_xor( h2, v[10] ), hu[2] );
     460    81452968 :         hu[3] = wu_if( active_lane, wu_xor( h3, v[11] ), hu[3] );
     461    81452968 :         hu[4] = wu_if( active_lane, wu_xor( h4, v[12] ), hu[4] );
     462    81452968 :         hu[5] = wu_if( active_lane, wu_xor( h5, v[13] ), hu[5] );
     463    81452968 :         hu[6] = wu_if( active_lane, wu_xor( h6, v[14] ), hu[6] );
     464    81452968 :         hu[7] = wu_if( active_lane, wu_xor( h7, v[15] ), hu[7] );
     465    81452968 :       }
     466   184789504 :       h0 = wu_if( active_lane, d[0], h0 );
     467   184789504 :       h1 = wu_if( active_lane, d[1], h1 );
     468   184789504 :       h2 = wu_if( active_lane, d[2], h2 );
     469   184789504 :       h3 = wu_if( active_lane, d[3], h3 );
     470   184789504 :       h4 = wu_if( active_lane, d[4], h4 );
     471   184789504 :       h5 = wu_if( active_lane, d[5], h5 );
     472   184789504 :       h6 = wu_if( active_lane, d[6], h6 );
     473   184789504 :       h7 = wu_if( active_lane, d[7], h7 );
     474             : 
     475             :       /* Advance to the next message segment blocks.  In pseudo code,
     476             :          the below is:
     477             : 
     478             :            W += 64; if( block_rem ) block_rem--;
     479             : 
     480             :          Since wc_to_wv_raw(false/true) is 0UL/~0UL, we can use wv_add /
     481             :          wc_to_wv_raw instead of wv_sub / wc_to_wv to save some ops.
     482             :          (Consider conditional increment / decrement operations?)
     483             : 
     484             :          Also since we do not load anything at W(lane) above unless
     485             :          block_rem(lane) is non-zero, we can omit vector conditional
     486             :          operations for W(lane) below to save some additional ops. */
     487             : 
     488   184789504 :       W_lo = wv_add( W_lo, wv_if( active_lane_lo, wv_64, wv_zero() ) );
     489   184789504 :       W_hi = wv_add( W_hi, wv_if( active_lane_hi, wv_64, wv_zero() ) );
     490   184789504 :       off  = wu_add( off,  wu_if( active_lane, wu_bcast( FD_BLAKE3_BLOCK_SZ ), wv_zero() ) );
     491             : 
     492   184789504 :       block_rem_lo = wv_add( block_rem_lo, wv_if( active_lane_lo, wc_to_wv_raw( active_lane_lo ), wv_zero() ) );
     493   184789504 :       block_rem_hi = wv_add( block_rem_hi, wv_if( active_lane_hi, wc_to_wv_raw( active_lane_hi ), wv_zero() ) );
     494             : 
     495   188123150 :     } else { /* LtHash mode */
     496             : 
     497             :       /* d[i]/dh[i] hold the 32-bit words at byte offsets i*4 / 32+i*4 of the current output block, across outputs 0..7 */
     498    19209792 :       wu_t dh[ 8 ] = {
     499    19209792 :         wu_xor( h0, v[0x8] ),
     500    19209792 :         wu_xor( h1, v[0x9] ),
     501    19209792 :         wu_xor( h2, v[0xa] ),
     502    19209792 :         wu_xor( h3, v[0xb] ),
     503    19209792 :         wu_xor( h4, v[0xc] ),
     504    19209792 :         wu_xor( h5, v[0xd] ),
     505    19209792 :         wu_xor( h6, v[0xe] ),
     506    19209792 :         wu_xor( h7, v[0xf] )
     507    19209792 :       };
     508             : 
     509             :       /* Transpose outer 8x8 blocks */
     510    19209792 :       wu_transpose_8x8( d [0],d [1],d [2],d [3],d [4],d [5],d [6],d [7],
     511    19209792 :                         d [0],d [1],d [2],d [3],d [4],d [5],d [6],d [7] );
     512    19209792 :       wu_transpose_8x8( dh[0],dh[1],dh[2],dh[3],dh[4],dh[5],dh[6],dh[7],
     513    19209792 :                         dh[0],dh[1],dh[2],dh[3],dh[4],dh[5],dh[6],dh[7] );
     514             : 
     515             :       /* d[i] contains output[i]+out_off */
     516             : 
     517             :       /* Reduce-add into d[0] */
     518    19209792 :       d [0] = wh_add( d [0], d [1] ); /* sum(l[0 1]) */
     519    19209792 :       dh[0] = wh_add( dh[0], dh[1] ); /* sum(h[0 1]) */
     520    19209792 :       d [2] = wh_add( d [2], d [3] ); /* sum(l[2 3]) */
     521    19209792 :       dh[2] = wh_add( dh[2], dh[3] ); /* sum(h[2 3]) */
     522    19209792 :       d [4] = wh_add( d [4], d [5] ); /* sum(l[4 5]) */
     523    19209792 :       dh[4] = wh_add( dh[4], dh[5] ); /* sum(h[4 5]) */
     524    19209792 :       d [6] = wh_add( d [6], d [7] ); /* sum(l[6 7]) */
     525    19209792 :       dh[6] = wh_add( dh[6], dh[7] ); /* sum(h[6 7]) */
     526    19209792 :       d [0] = wh_add( d [0], d [2] ); /* sum(l[0 1 2 3]) */
     527    19209792 :       dh[0] = wh_add( dh[0], dh[2] ); /* sum(h[0 1 2 3]) */
     528    19209792 :       d [4] = wh_add( d [4], d [6] ); /* sum(l[4 5 6 7]) */
     529    19209792 :       dh[4] = wh_add( dh[4], dh[6] ); /* sum(h[4 5 6 7]) */
     530    19209792 :       d [0] = wh_add( d [0], d [4] ); /* sum(l[0 1 2 3 4 5 6 7]) */
     531    19209792 :       dh[0] = wh_add( dh[0], dh[4] ); /* sum(h[0 1 2 3 4 5 6 7]) */
     532    19209792 :       wh_st( lthash,    d [0] );
     533    19209792 :       wh_st( lthash+16, dh[0] );
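                     :       /* LtHash accumulation: treat each lane's 64-byte XOF block as 32
                     :          little-endian 16-bit words and sum them element-wise across
                     :          the eight lanes with wrapping u16 adds (wh_add/wh_st presumably
                     :          operate on 16-lane ushort vectors), then store the 32-word
                     :          partial sum for this output offset to lthash. */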
     534             : 
     535             :       /* Wind up for next iteration */
     536    19209792 :       lthash += 32;
     537    19209792 :       lthash_rem--;
     538    19209792 :       wu_t ctr_add   = wu_bcast( 1 );
     539    19209792 :       /**/ ctr_lo    = wu_add( ctr_lo, ctr_add );
     540    19209792 :       wu_t ctr_carry = wi_gt ( wu_xor( ctr_add, wu_bcast( 0x80000000 ) ),
     541    19209792 :                                wu_xor( ctr_lo,  wu_bcast( 0x80000000 ) ) );
     542    19209792 :       /**/ ctr_hi    = wu_sub( ctr_hi, ctr_carry );
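                     :       /* Unsigned 32-bit compare emulated with signed wi_gt by biasing
                     :          both operands with 0x80000000: ctr_carry is ~0 (i.e. -1) on
                     :          lanes where ctr_lo wrapped past zero, so subtracting it bumps
                     :          ctr_hi by one. */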
     543    19209792 :       if( FD_UNLIKELY( !lthash_rem ) ) {
     544      600306 :         FD_BLAKE3_TRACE(( "fd_blake3_avx_compress8: done (lthash para)" ));
     545      600306 :         return;
     546      600306 :       }
     547    18609486 :       goto compress;
     548             : 
     549    19209792 : #   undef STATE_FMT
     550    19209792 : #   undef STATE_FMT_ARGS
     551    19209792 :     }
     552   203999296 :   }
     553             : 
     554             :   /* Store the results */
     555             : 
     556    95854662 :   wu_transpose_8x8( h0, h1, h2, h3, h4, h5, h6, h7,
     557    95854662 :                     h0, h1, h2, h3, h4, h5, h6, h7 );
     558             : 
     559    95854662 :   uint * const * batch_hash = (uint * const *)__builtin_assume_aligned( _batch_hash, 32 );
     560    95854662 :   if( FD_LIKELY( out_sz==32 ) ) {
     561    14401694 :     switch( batch_cnt ) { /* application dependent prob */
     562     1107610 :     case 8UL: wu_st( batch_hash[7], h7 ); __attribute__((fallthrough));
     563     1957670 :     case 7UL: wu_st( batch_hash[6], h6 ); __attribute__((fallthrough));
     564     2917194 :     case 6UL: wu_st( batch_hash[5], h5 ); __attribute__((fallthrough));
     565     4025494 :     case 5UL: wu_st( batch_hash[4], h4 ); __attribute__((fallthrough));
     566     5464670 :     case 4UL: wu_st( batch_hash[3], h3 ); __attribute__((fallthrough));
     567     8540402 :     case 3UL: wu_st( batch_hash[2], h2 ); __attribute__((fallthrough));
     568    14401694 :     case 2UL: wu_st( batch_hash[1], h1 ); __attribute__((fallthrough));
     569    14401694 :     case 1UL: wu_st( batch_hash[0], h0 ); __attribute__((fallthrough));
     570    14401694 :     default: break;
     571    14401694 :     }
     572    81653106 :   } else if( out_sz==64 ) {
     573    81452968 :     wu_transpose_8x8( hu[0], hu[1], hu[2], hu[3], hu[4], hu[5], hu[6], hu[7],
     574    81452968 :                       hu[0], hu[1], hu[2], hu[3], hu[4], hu[5], hu[6], hu[7] );
     575    81452968 :     switch( batch_cnt ) { /* application dependent prob */
     576    81452968 :     case 8UL: wu_st( batch_hash[7],   h7    );
     577    81452968 :               wu_st( batch_hash[7]+8, hu[7] ); __attribute__((fallthrough));
     578    81452968 :     case 7UL: wu_st( batch_hash[6],   h6    );
     579    81452968 :               wu_st( batch_hash[6]+8, hu[6] ); __attribute__((fallthrough));
     580    81452968 :     case 6UL: wu_st( batch_hash[5],   h5    );
     581    81452968 :               wu_st( batch_hash[5]+8, hu[5] ); __attribute__((fallthrough));
     582    81452968 :     case 5UL: wu_st( batch_hash[4],   h4    );
     583    81452968 :               wu_st( batch_hash[4]+8, hu[4] ); __attribute__((fallthrough));
     584    81452968 :     case 4UL: wu_st( batch_hash[3],   h3    );
     585    81452968 :               wu_st( batch_hash[3]+8, hu[3] ); __attribute__((fallthrough));
     586    81452968 :     case 3UL: wu_st( batch_hash[2],   h2    );
     587    81452968 :               wu_st( batch_hash[2]+8, hu[2] ); __attribute__((fallthrough));
     588    81452968 :     case 2UL: wu_st( batch_hash[1],   h1    );
     589    81452968 :               wu_st( batch_hash[1]+8, hu[1] ); __attribute__((fallthrough));
     590    81452968 :     case 1UL: wu_st( batch_hash[0],   h0    );
     591    81452968 :               wu_st( batch_hash[0]+8, hu[0] ); __attribute__((fallthrough));
     592    81452968 :     default: break;
     593    81452968 :     }
     594    81452968 :   } else {
     595           0 :     FD_LOG_ERR(( "Invalid out_sz %u", out_sz ));
     596           0 :   }
     597    95854662 : }
     598             : 
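                     : /* fd_blake3_avx_compress8_fast handles the regular full-throughput
                     :    case: eight equally sized, contiguous inputs with no tail handling
                     :    and no lane masking, either eight full leaf chunks (counter
                     :    increments per lane) or eight 64-byte parent blocks (FLAG_PARENT
                     :    set).  The eight 32-byte output chaining values are stored
                     :    back-to-back in out. */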
     599             : void
     600             : fd_blake3_avx_compress8_fast( uchar const * restrict msg,
     601             :                               uchar       * restrict _out,
     602             :                               ulong                  counter,
     603     8911882 :                               uchar                  flags ) {
     604     8911882 :   FD_BLAKE3_TRACE(( "fd_blake3_avx_compress8_fast(msg=%p,out=%p,counter=%lu,flags=%02x)", (void *)msg, (void *)_out, counter, flags ));
     605             : 
     606     8911882 :   uchar * restrict out = __builtin_assume_aligned( _out, 32 );
     607             : 
     608     8911882 :   int   parent = flags & FD_BLAKE3_FLAG_PARENT;
     609     8911882 :   int   lg_sz  = fd_int_if( parent, FD_BLAKE3_OUTCHAIN_LG_SZ+1, FD_BLAKE3_CHUNK_LG_SZ );
     610     8911882 :   ulong sz     = 1UL<<lg_sz;
     611             : 
     612             :   /* Counters stay the same for each block.  Across chunks, they
     613             :      increment if we are hashing leaves.  Otherwise, they are zero. */
     614             : 
     615     8911882 :   wu_t ctr_add   = wu_and( wu_bcast( parent ? 0 : UINT_MAX ),
     616     8911882 :                            wu( 0, 1, 2, 3, 4, 5, 6, 7 ) );
     617     8911882 :   wu_t ctr_lo    = wu_add( wu_bcast( counter ), ctr_add );
     618     8911882 :   wu_t ctr_carry = wi_gt ( wu_xor( ctr_add, wu_bcast( 0x80000000 ) ),
     619     8911882 :                            wu_xor( ctr_lo,  wu_bcast( 0x80000000 ) ) );
     620     8911882 :   wu_t ctr_hi    = wu_sub( wu_bcast( counter>>32 ), ctr_carry );
     621     8911882 :   wu_t sz_vec    = wu_bcast( FD_BLAKE3_BLOCK_SZ );
     622             : 
     623     8911882 :   wu_t const iv0 = wu_bcast( FD_BLAKE3_IV[0] );
     624     8911882 :   wu_t const iv1 = wu_bcast( FD_BLAKE3_IV[1] );
     625     8911882 :   wu_t const iv2 = wu_bcast( FD_BLAKE3_IV[2] );
     626     8911882 :   wu_t const iv3 = wu_bcast( FD_BLAKE3_IV[3] );
     627     8911882 :   wu_t const iv4 = wu_bcast( FD_BLAKE3_IV[4] );
     628     8911882 :   wu_t const iv5 = wu_bcast( FD_BLAKE3_IV[5] );
     629     8911882 :   wu_t const iv6 = wu_bcast( FD_BLAKE3_IV[6] );
     630     8911882 :   wu_t const iv7 = wu_bcast( FD_BLAKE3_IV[7] );
     631             : 
     632     8911882 :   wu_t h0=iv0; wu_t h1=iv1; wu_t h2=iv2; wu_t h3=iv3;
     633     8911882 :   wu_t h4=iv4; wu_t h5=iv5; wu_t h6=iv6; wu_t h7=iv7;
     634             : 
     635     8911882 :   ulong off = 0UL;
     636   104180722 :   do {
     637   104180722 :     ulong const off_next = off+FD_BLAKE3_BLOCK_SZ;
     638   104180722 :     uint chunk_flags =
     639   104180722 :         ( off     ==0UL ? FD_BLAKE3_FLAG_CHUNK_START : 0u ) |
     640   104180722 :         ( off_next==sz  ? FD_BLAKE3_FLAG_CHUNK_END   : 0u );
     641   104180722 :     uint flags_ = flags | fd_uint_if( parent, 0, chunk_flags );
     642   104180722 :     wu_t flags_vec = wu_bcast( flags_ );
     643             : 
     644   104180722 :     wu_t m[16];
     645   104180722 :     m[ 0] = wu_ldu( msg + (0<<lg_sz) + off      );
     646   104180722 :     m[ 1] = wu_ldu( msg + (1<<lg_sz) + off      );
     647   104180722 :     m[ 2] = wu_ldu( msg + (2<<lg_sz) + off      );
     648   104180722 :     m[ 3] = wu_ldu( msg + (3<<lg_sz) + off      );
     649   104180722 :     m[ 4] = wu_ldu( msg + (4<<lg_sz) + off      );
     650   104180722 :     m[ 5] = wu_ldu( msg + (5<<lg_sz) + off      );
     651   104180722 :     m[ 6] = wu_ldu( msg + (6<<lg_sz) + off      );
     652   104180722 :     m[ 7] = wu_ldu( msg + (7<<lg_sz) + off      );
     653   104180722 :     m[ 8] = wu_ldu( msg + (0<<lg_sz) + off + 32 );
     654   104180722 :     m[ 9] = wu_ldu( msg + (1<<lg_sz) + off + 32 );
     655   104180722 :     m[10] = wu_ldu( msg + (2<<lg_sz) + off + 32 );
     656   104180722 :     m[11] = wu_ldu( msg + (3<<lg_sz) + off + 32 );
     657   104180722 :     m[12] = wu_ldu( msg + (4<<lg_sz) + off + 32 );
     658   104180722 :     m[13] = wu_ldu( msg + (5<<lg_sz) + off + 32 );
     659   104180722 :     m[14] = wu_ldu( msg + (6<<lg_sz) + off + 32 );
     660   104180722 :     m[15] = wu_ldu( msg + (7<<lg_sz) + off + 32 );
     661             : 
     662   104180722 :     wu_transpose_8x8( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
     663   104180722 :                       m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7] );
     664   104180722 :     wu_transpose_8x8( m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
     665   104180722 :                       m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );
     666             : 
     667   104180722 :     wu_t v[16] = {
     668   104180722 :         h0,     h1,     h2,     h3,
     669   104180722 :         h4,     h5,     h6,     h7,
     670   104180722 :         iv0,    iv1,    iv2,    iv3,
     671   104180722 :         ctr_lo, ctr_hi, sz_vec, flags_vec,
     672   104180722 :     };
     673             : 
     674   104180722 :     round_fn8( v, m, 0 );
     675   104180722 :     round_fn8( v, m, 1 );
     676   104180722 :     round_fn8( v, m, 2 );
     677   104180722 :     round_fn8( v, m, 3 );
     678   104180722 :     round_fn8( v, m, 4 );
     679   104180722 :     round_fn8( v, m, 5 );
     680   104180722 :     round_fn8( v, m, 6 );
     681             : 
     682   104180722 :     h0 = wu_xor( v[ 0], v[ 8] );
     683   104180722 :     h1 = wu_xor( v[ 1], v[ 9] );
     684   104180722 :     h2 = wu_xor( v[ 2], v[10] );
     685   104180722 :     h3 = wu_xor( v[ 3], v[11] );
     686   104180722 :     h4 = wu_xor( v[ 4], v[12] );
     687   104180722 :     h5 = wu_xor( v[ 5], v[13] );
     688   104180722 :     h6 = wu_xor( v[ 6], v[14] );
     689   104180722 :     h7 = wu_xor( v[ 7], v[15] );
     690             : 
     691   104180722 :     off = off_next;
     692   104180722 :   } while( off!=sz );
     693             : 
     694     8911882 :   wu_transpose_8x8( h0, h1, h2, h3, h4, h5, h6, h7,
     695     8911882 :                     h0, h1, h2, h3, h4, h5, h6, h7 );
     696             : 
     697     8911882 :   wu_st( (uint *)( out + (0UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h0 );
     698     8911882 :   wu_st( (uint *)( out + (1UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h1 );
     699     8911882 :   wu_st( (uint *)( out + (2UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h2 );
     700     8911882 :   wu_st( (uint *)( out + (3UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h3 );
     701     8911882 :   wu_st( (uint *)( out + (4UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h4 );
     702     8911882 :   wu_st( (uint *)( out + (5UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h5 );
     703     8911882 :   wu_st( (uint *)( out + (6UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h6 );
     704     8911882 :   wu_st( (uint *)( out + (7UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h7 );
     705     8911882 : }

Generated by: LCOV version 1.14