LCOV - code coverage report
Current view: top level - ballet/blake3 - fd_blake3.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 413 442 93.4 %
Date: 2026-06-29 05:51:35 Functions: 26 26 100.0 %

          Line data    Source code
       1             : #include "fd_blake3.h"
       2             : #include "fd_blake3_private.h"
       3             : 
       4             : /* Hash state machine *************************************************/
       5             : 
       6             : static FD_FN_UNUSED fd_blake3_pos_t *
       7             : fd_blake3_pos_init( fd_blake3_pos_t * s,
       8             :                     uchar const *     data,
       9    10973006 :                     ulong             sz ) {
      10    10973006 :   *s = (fd_blake3_pos_t) {
      11    10973006 :     .input    = data,
      12    10973006 :     .input_sz = sz,
      13    10973006 :     .magic    = FD_BLAKE3_MAGIC,
      14    10973006 :   };
      15    10973006 :   return s;
      16    10973006 : }
      17             : 
      18             : /* fd_blake3_l0_complete returns 1 if all leaf nodes have been hashed,
      19             :    0 otherwise. */
      20             : 
      21             : FD_FN_PURE static inline int
      22    31792715 : fd_blake3_l0_complete( fd_blake3_pos_t const * s ) {
      23    31792715 :   return ( s->leaf_idx<<FD_BLAKE3_CHUNK_LG_SZ ) >= fd_ulong_max( s->input_sz, 64 );
      24    31792715 : }
      25             : 
      26             : FD_FN_PURE static inline int
      27             : fd_blake3_is_finished( fd_blake3_pos_t const * s,
      28    14519532 :                        ulong                   tick ) {
      29    14519532 :   int l0_complete = fd_blake3_l0_complete( s );
      30    14519532 :   int ln_complete = s->live_cnt == 1UL;
      31    14519532 :   int idle        = tick >= s->next_tick;
      32    14519532 :   return l0_complete & ln_complete & idle;
      33    14519532 : }
      34             : 
      35             : static fd_blake3_op_t *
      36             : fd_blake3_prepare_leaf( fd_blake3_pos_t * restrict s,
      37             :                         fd_blake3_buf_t * restrict buf,
      38             :                         fd_blake3_op_t *  restrict op,
      39    12419288 :                         ulong                      tick ) {
      40             : 
      41    12419288 :   ulong         msg_off = s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ;
      42    12419288 :   ulong         msg_sz  = fd_ulong_min( s->input_sz - msg_off, 1024UL );
      43    12419288 :   uchar const * msg     = s->input + msg_off;
      44    12419288 :   uchar       * out     = buf->slots[ s->layer ][ s->head.uc[ s->layer ] ];
      45             : 
      46    12419288 :   int flags = fd_int_if( s->input_sz <= FD_BLAKE3_CHUNK_SZ, FD_BLAKE3_FLAG_ROOT, 0 );
      47             : 
      48    12419288 :   *op = (fd_blake3_op_t) {
      49    12419288 :     .msg     = msg,
      50    12419288 :     .out     = out,
      51    12419288 :     .counter = s->leaf_idx,
      52    12419288 :     .sz      = (ushort)msg_sz,
      53    12419288 :     .flags   = (uchar)flags
      54    12419288 :   };
      55             : 
      56    12419288 :   s->head.uc[ 0 ] = (uchar)( s->head.uc[ 0 ]+1 );
      57    12419288 :   s->leaf_idx++;
      58    12419288 :   s->live_cnt++;
      59    12419288 :   s->next_tick = tick+1;
      60             : 
      61    12419288 :   return op;
      62             : 
      63    12419288 : }
      64             : 
      65             : static int
      66             : fd_blake3_seek_branch( fd_blake3_pos_t * restrict s,
      67             :                        fd_blake3_buf_t * restrict buf,
      68    11537475 :                        ulong                      tick ) {
      69             : 
      70    11537475 :   if( s->live_cnt == 1UL )
      71       77571 :     return 0;
      72             : 
      73    11459904 :   if( !fd_blake3_l0_complete( s ) )
      74     1691520 :     return ( s->tail.uc[ s->layer - 1 ] + 1 ) <
      75     1691520 :            ( s->head.uc[ s->layer - 1 ]     );
      76             : 
      77     9768384 : # if FD_HAS_AVX
      78             : 
      79     9768384 :   wb_t diff = wb_sub( s->head.wb, s->tail.wb );
      80             : 
      81     9768384 :   uint mergeable_layers = (uint)_mm256_movemask_epi8( wb_gt( diff, wb_bcast( 1 ) ) );
      82     9768384 :   int  merge_layer = fd_uint_find_lsb_w_default( mergeable_layers, -1 );
      83     9768384 :   if( merge_layer>=0 ) {
      84     8996601 :     if( ((uint)merge_layer >= s->layer) & (tick < s->next_tick) )
      85     1633421 :       return 0;  /* still waiting for previous merge */
      86     7363180 :     s->layer = (uint)merge_layer+1U;
      87     7363180 :     return 1;
      88     8996601 :   }
      89             : 
      90      771783 :   uint single_layers = (uint)_mm256_movemask_epi8( wb_eq( diff, wb_bcast( 1 ) ) );
      91      771783 :   uint single_lo = (uint)fd_uint_find_lsb( single_layers );
      92      771783 :   uint single_hi = (uint)fd_uint_find_lsb( single_layers & ( ~fd_uint_mask_lsb( (int)(single_lo+1U) ) ) );
      93             : 
      94      771783 :   wb_t node = wb_ld( buf->slots[ single_lo ][ s->tail.uc[ single_lo ] ] );
      95      771783 :               wb_st( buf->slots[ single_hi ][ s->head.uc[ single_hi ] ], node );
      96             : 
      97             : # else /* FD_HAS_AVX */
      98             : 
      99             :   uchar diff[ 32 ];
     100             :   for( ulong j=0UL; j<32UL; j++ ) diff[j] = (uchar)( s->head.uc[j] - s->tail.uc[j] );
     101             : 
     102             :   int merge_layer = -1;
     103             :   for( uint j=0U; j<32U; j++ ) {
     104             :     if( diff[j]>1 ) {
     105             :       merge_layer = (int)j;
     106             :       break;
     107             :     }
     108             :   }
     109             :   if( merge_layer>=0 ) {
     110             :     if( ((uint)merge_layer >= s->layer) & (tick < s->next_tick) )
     111             :       return 0;  /* still waiting for previous merge */
     112             :     s->layer = (uint)(merge_layer+1);
     113             :     return 1;
     114             :   }
     115             : 
     116             :   uint j=0U;
     117             :   uint single_lo = 0UL;
     118             :   uint single_hi = 0UL;
     119             :   for( ; j<32U; j++ ) {
     120             :     if( diff[j] ) {
     121             :       single_lo = j;
     122             :       break;
     123             :     }
     124             :   }
     125             :   j++;
     126             :   for( ; j<32U; j++ ) {
     127             :     if( diff[j] ) {
     128             :       single_hi = j;
     129             :       break;
     130             :     }
     131             :   }
     132             : 
     133             :   memcpy( buf->slots[ single_hi ][ s->head.uc[ single_hi ] ],
     134             :           buf->slots[ single_lo ][ s->tail.uc[ single_lo ] ],
     135             :           32UL );
     136             : 
     137             : # endif /* FD_HAS_AVX */
     138             : 
     139      771783 :   FD_BLAKE3_TRACE(( "fd_blake3_seek_branch: moving up %u/%u to %u/%u",
     140      771783 :                     single_lo, s->tail.uc[ single_lo ],
     141      771783 :                     single_hi, s->head.uc[ single_hi ] ));
     142             : 
     143      771783 :   if( ((uint)single_hi >= s->layer) & (tick < s->next_tick) )
     144      263007 :     return 0;  /* still waiting for previous merge */
     145             : 
     146      508776 :   s->head.uc[ single_lo ] = (uchar)( s->head.uc[ single_lo ]-1 );
     147      508776 :   s->head.uc[ single_hi ] = (uchar)( s->head.uc[ single_hi ]+1 );
     148             : 
     149      508776 :   s->layer = (uint)single_hi+1U;
     150      508776 :   return 1;
     151      771783 : }
     152             : 
     153             : static fd_blake3_op_t *
     154             : fd_blake3_prepare_branch( fd_blake3_pos_t * restrict s,
     155             :                           fd_blake3_buf_t * restrict buf,
     156             :                           fd_blake3_op_t *  restrict op,
     157    11537475 :                           ulong                      tick ) {
     158             : 
     159    11537475 :   if( !fd_blake3_seek_branch( s, buf, tick ) )
     160     1973999 :     return NULL;
     161             : 
     162     9563476 :   FD_DCHECK_CRIT( s->layer < FD_BLAKE3_ROW_CNT, "invariant violation" );
     163             : 
     164     9563476 :   uchar const * msg = buf->slots[ s->layer-1U ][ s->tail.uc[ s->layer-1U ] ];
     165     9563476 :   uchar       * out = buf->slots[ s->layer    ][ s->head.uc[ s->layer    ] ];
     166             : 
     167     9563476 :   s->head.uc[ s->layer   ] = (uchar)( s->head.uc[ s->layer   ]+1 );
     168     9563476 :   s->tail.uc[ s->layer-1 ] = (uchar)( s->tail.uc[ s->layer-1 ]+2 );
     169     9563476 :   s->live_cnt--;
     170     9563476 :   s->next_tick = tick+1;
     171             : 
     172     9563476 :   uint flags = FD_BLAKE3_FLAG_PARENT |
     173     9563476 :                fd_uint_if( s->live_cnt==1UL, FD_BLAKE3_FLAG_ROOT, 0u );
     174             : 
     175     9563476 :   *op = (fd_blake3_op_t) {
     176     9563476 :     .msg     = msg,
     177     9563476 :     .out     = out,
     178     9563476 :     .counter = 0UL,
     179     9563476 :     .sz      = 64U,
     180     9563476 :     .flags   = (uchar)flags
     181     9563476 :   };
     182     9563476 :   return op;
     183             : 
     184    11537475 : }
     185             : 
     186             : static void
     187     2738049 : fd_blake3_advance( fd_blake3_pos_t * restrict s ) {
     188             : 
     189     2738049 : # if FD_HAS_AVX
     190             : 
     191     2738049 :   wb_t mask = wb_eq( s->tail.wb, s->head.wb );
     192     2738049 :   s->tail.wb = wb_andnot( mask, s->tail.wb );
     193     2738049 :   s->head.wb = wb_andnot( mask, s->head.wb );
     194             : 
     195             : # else /* FD_HAS_AVX */
     196             : 
     197             :   for( ulong j=0UL; j<32UL; j++ ) {
     198             :     if( s->tail.uc[j] == s->head.uc[j] ) {
     199             :       s->tail.uc[j] = 0;
     200             :       s->head.uc[j] = 0;
     201             :     }
     202             :   }
     203             : 
     204             : # endif /* FD_HAS_AVX */
     205             : 
     206     2738049 :   if( s->head.uc[ s->layer ]==FD_BLAKE3_COL_CNT ) {
     207       93938 :     s->layer++;
     208       93938 :   }
     209     2644111 :   else if( ( s->layer > 0UL ) &&
     210     2644111 :            ( s->tail.uc[ s->layer-1 ] < s->head.uc[ s->layer-1 ] ) ) {
     211             :     /* pass */
     212      785099 :   }
     213     1859012 :   else if( fd_blake3_l0_complete( s ) ) {
     214     1537307 :     s->layer++;
     215     1537307 :   }
     216      321705 :   else if( s->layer > 0UL ) {
     217      116270 :     s->layer = 0UL;
     218      116270 :   }
     219             : 
     220     2738049 : }
     221             : 
     222             : static fd_blake3_op_t *
     223             : fd_blake3_prepare( fd_blake3_pos_t * restrict s,
     224             :                    fd_blake3_buf_t * restrict buf,
     225             :                    fd_blake3_op_t *  restrict op,
     226    13597518 :                    ulong                      tick ) {
     227             : 
     228    13597518 :   FD_DCHECK_CRIT( s->layer < FD_BLAKE3_ROW_CNT, "invariant violation" );
     229             : 
     230    13597518 :   if( fd_blake3_is_finished( s, tick ) )
     231           0 :     return NULL;
     232             : 
     233    13597518 :   if( tick >= s->next_tick )
     234     2738049 :     fd_blake3_advance( s );
     235             : 
     236    13597518 :   if( s->layer != 0 )
     237    11537475 :     return fd_blake3_prepare_branch( s, buf, op, tick );
     238             : 
     239     2060043 :   if( ( s->head.uc[0] >= FD_BLAKE3_COL_CNT ) |
     240     2060043 :       ( fd_blake3_l0_complete( s )         ) ) {
     241      291927 :     return NULL;
     242      291927 :   }
     243             : 
     244     1768116 :   return fd_blake3_prepare_leaf( s, buf, op, tick );
     245             : 
     246     2060043 : }
     247             : 
     248             : #if FD_BLAKE3_PARA_MAX>1
     249             : 
     250             : /* fd_blake3_prepare_fast does streamlined hashing of full chunks or
     251             :    full branches. */
     252             : 
     253             : static fd_blake3_op_t *
     254             : fd_blake3_prepare_fast( fd_blake3_pos_t * restrict s,
     255             :                         fd_blake3_buf_t * restrict buf,
     256             :                         fd_blake3_op_t *  restrict op,
     257             :                         ulong                      n,
     258     8436075 :                         ulong                      min ) {
     259             : 
     260     8436075 :   if( s->layer && s->head.uc[ s->layer-1 ]==FD_BLAKE3_COL_CNT ) {
     261     3803638 :     op->msg     = buf->rows[ s->layer-1 ];
     262     3803638 :     op->out     = buf->rows[ s->layer ] + (s->head.uc[ s->layer ]<<FD_BLAKE3_OUTCHAIN_LG_SZ);
     263     3803638 :     op->counter = 0UL;
     264     3803638 :     op->flags   = FD_BLAKE3_FLAG_PARENT;
     265             : 
     266             :     /* Assume that branch layer is fully hashed (up to col cnt) */
     267     3803638 :     s->head.uc[ s->layer-1 ] =  0;
     268     3803638 :     s->head.uc[ s->layer   ] = (uchar)( (ulong)s->head.uc[ s->layer ]+n );
     269     3803638 :     s->live_cnt -= n;
     270     3803638 :     s->layer = fd_uint_if( s->head.uc[ s->layer ]==FD_BLAKE3_COL_CNT,
     271     3803638 :                            s->layer+1U, 0U );
     272             : 
     273     3803638 :     return op;
     274     3803638 :   }
     275             : 
     276     4632437 :   ulong pos   = s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ;
     277     4632437 :   ulong avail = fd_ulong_align_dn( s->input_sz - pos, FD_BLAKE3_CHUNK_SZ ) >> FD_BLAKE3_CHUNK_LG_SZ;
     278     4632437 :   n = fd_ulong_min( n, avail );
     279             : 
     280             :   /* This constants controls the threshold when to use the (slow)
     281             :      scheduler instead of fast single-message hashing.  Carefully tuned
     282             :      for best overall performance. */
     283     4632437 :   if( n<min ) return NULL;
     284             : 
     285     4606460 :   op->msg     = s->input + (s->leaf_idx<<FD_BLAKE3_CHUNK_LG_SZ);
     286     4606460 :   op->out     = buf->rows[0] + (s->head.uc[0]<<FD_BLAKE3_OUTCHAIN_LG_SZ);
     287     4606460 :   op->counter = s->leaf_idx;
     288     4606460 :   op->flags   = 0;
     289             : 
     290     4606460 :   s->head.uc[0] = (uchar)( (ulong)s->head.uc[0]+n );
     291     4606460 :   s->leaf_idx   += n;
     292     4606460 :   s->live_cnt   += n;
     293     4606460 :   s->layer      =  fd_uint_if( s->head.uc[0]==FD_BLAKE3_COL_CNT, 1U, 0U );
     294             : 
     295     4606460 :   return op;
     296     4632437 : }
     297             : 
     298             : static void
     299             : fd_blake3_batch_hash( fd_blake3_op_t const * ops,
     300     2494404 :                       ulong                  op_cnt ) {
     301     2494404 :   uchar const * batch_data   [ FD_BLAKE3_PARA_MAX ] __attribute__((aligned(64)));
     302     2494404 :   uint          batch_data_sz[ FD_BLAKE3_PARA_MAX ] = {0};
     303     2494404 :   uchar *       batch_hash   [ FD_BLAKE3_PARA_MAX ] __attribute__((aligned(64)));
     304     2494404 :   ulong         batch_ctr    [ FD_BLAKE3_PARA_MAX ];
     305     2494404 :   uint          batch_flags  [ FD_BLAKE3_PARA_MAX ];
     306    13582351 :   for( ulong j=0UL; j<op_cnt; j++ ) {
     307    11087947 :     batch_data   [ j ] = ops[ j ].msg;
     308    11087947 :     batch_hash   [ j ] = ops[ j ].out;
     309    11087947 :     batch_data_sz[ j ] = ops[ j ].sz;
     310    11087947 :     batch_ctr    [ j ] = ops[ j ].counter;
     311    11087947 :     batch_flags  [ j ] = ops[ j ].flags;
     312    11087947 :   }
     313      827402 : #if FD_HAS_AVX512
     314      827402 :   fd_blake3_avx512_compress16( op_cnt, batch_data, batch_data_sz, batch_ctr, batch_flags, fd_type_pun( batch_hash ), NULL, 32U, NULL );
     315             : #elif FD_HAS_AVX
     316     1667002 :   fd_blake3_avx_compress8    ( op_cnt, batch_data, batch_data_sz, batch_ctr, batch_flags, fd_type_pun( batch_hash ), NULL, 32U, NULL );
     317             : #else
     318             :   #error "FIXME missing para support"
     319             : #endif
     320     2494404 : }
     321             : 
     322             : #endif
     323             : 
     324             : /* Simple API *********************************************************/
     325             : 
     326             : ulong
     327          66 : fd_blake3_align( void ) {
     328          66 :   return FD_BLAKE3_ALIGN;
     329          66 : }
     330             : 
     331             : ulong
     332          21 : fd_blake3_footprint( void ) {
     333          21 :   return FD_BLAKE3_FOOTPRINT;
     334          21 : }
     335             : 
     336             : void *
     337          24 : fd_blake3_new( void * shmem ) {
     338          24 :   fd_blake3_t * sha = (fd_blake3_t *)shmem;
     339             : 
     340          24 :   if( FD_UNLIKELY( !shmem ) ) {
     341           3 :     FD_LOG_WARNING(( "NULL shmem" ));
     342           3 :     return NULL;
     343           3 :   }
     344             : 
     345          21 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, fd_blake3_align() ) ) ) {
     346           3 :     FD_LOG_WARNING(( "misaligned shmem" ));
     347           3 :     return NULL;
     348           3 :   }
     349             : 
     350          18 :   ulong footprint = fd_blake3_footprint();
     351             : 
     352          18 :   fd_memset( sha, 0, footprint );
     353             : 
     354          18 :   FD_COMPILER_MFENCE();
     355          18 :   FD_VOLATILE( sha->pos.magic ) = FD_BLAKE3_MAGIC;
     356          18 :   FD_COMPILER_MFENCE();
     357             : 
     358          18 :   return (void *)sha;
     359          21 : }
     360             : 
     361             : fd_blake3_t *
     362          24 : fd_blake3_join( void * shsha ) {
     363             : 
     364          24 :   if( FD_UNLIKELY( !shsha ) ) {
     365           3 :     FD_LOG_WARNING(( "NULL shsha" ));
     366           3 :     return NULL;
     367           3 :   }
     368             : 
     369          21 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shsha, fd_blake3_align() ) ) ) {
     370           3 :     FD_LOG_WARNING(( "misaligned shsha" ));
     371           3 :     return NULL;
     372           3 :   }
     373             : 
     374          18 :   fd_blake3_t * sha = (fd_blake3_t *)shsha;
     375             : 
     376          18 :   if( FD_UNLIKELY( sha->pos.magic!=FD_BLAKE3_MAGIC ) ) {
     377           0 :     FD_LOG_WARNING(( "bad magic" ));
     378           0 :     return NULL;
     379           0 :   }
     380             : 
     381          18 :   return sha;
     382          18 : }
     383             : 
     384             : void *
     385          21 : fd_blake3_leave( fd_blake3_t * sha ) {
     386             : 
     387          21 :   if( FD_UNLIKELY( !sha ) ) {
     388           3 :     FD_LOG_WARNING(( "NULL sha" ));
     389           3 :     return NULL;
     390           3 :   }
     391             : 
     392          18 :   return (void *)sha;
     393          21 : }
     394             : 
     395             : void *
     396          24 : fd_blake3_delete( void * shsha ) {
     397             : 
     398          24 :   if( FD_UNLIKELY( !shsha ) ) {
     399           3 :     FD_LOG_WARNING(( "NULL shsha" ));
     400           3 :     return NULL;
     401           3 :   }
     402             : 
     403          21 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shsha, fd_blake3_align() ) ) ) {
     404           3 :     FD_LOG_WARNING(( "misaligned shsha" ));
     405           3 :     return NULL;
     406           3 :   }
     407             : 
     408          18 :   fd_blake3_t * sha = (fd_blake3_t *)shsha;
     409             : 
     410          18 :   if( FD_UNLIKELY( sha->pos.magic!=FD_BLAKE3_MAGIC ) ) {
     411           0 :     FD_LOG_WARNING(( "bad magic" ));
     412           0 :     return NULL;
     413           0 :   }
     414             : 
     415          18 :   FD_COMPILER_MFENCE();
     416          18 :   FD_VOLATILE( sha->pos.magic ) = 0UL;
     417          18 :   FD_COMPILER_MFENCE();
     418             : 
     419          18 :   return (void *)sha;
     420          18 : }
     421             : 
     422             : 
     423             : fd_blake3_t *
     424    10947029 : fd_blake3_init( fd_blake3_t * sha ) {
     425    10947029 :   FD_BLAKE3_TRACE(( "fd_blake3_init(sha=%p)", (void *)sha ));
     426    10947029 :   fd_blake3_pos_init( &sha->pos, NULL, 0UL );
     427    10947029 :   sha->block_sz = 0UL;
     428    10947029 :   return sha;
     429    10947029 : }
     430             : 
     431             : #if FD_BLAKE3_PARA_MAX>1
     432             : 
     433             : static void
     434             : fd_blake3_append_blocks( fd_blake3_pos_t * s,
     435             :                          fd_blake3_buf_t * tbl,
     436             :                          uchar const *     data,
     437      350331 :                          ulong             buf_cnt ) {
     438      350331 :   s->input = data - (s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ); /* TODO HACKY!! */
     439     4266433 :   for( ulong i=0UL; i<buf_cnt; i++ ) {
     440     3916102 :     fd_blake3_op_t op[1];
     441     7117589 :     do {
     442     7117589 :       if( !fd_blake3_prepare_fast( s, tbl, op, FD_BLAKE3_PARA_MAX, FD_BLAKE3_PARA_MAX ) )
     443           0 :         return;
     444     1336483 : #if FD_HAS_AVX512
     445     1336483 :       fd_blake3_avx512_compress16_fast( op->msg, op->out, op->counter, op->flags );
     446             : #elif FD_HAS_AVX
     447     5781106 :       fd_blake3_avx_compress8_fast( op->msg, op->out, op->counter, op->flags );
     448             : #else
     449             :       #error "missing para support"
     450             : #endif
     451     7117589 :     } while( op->flags & FD_BLAKE3_FLAG_PARENT );
     452     3916102 :   }
     453      350331 : }
     454             : 
     455             : #else
     456             : 
     457             : static void
     458             : fd_blake3_append_blocks( fd_blake3_pos_t * s,
     459             :                          fd_blake3_buf_t * tbl,
     460             :                          uchar const *     data,
     461             :                          ulong             buf_cnt ) {
     462             :   (void)buf_cnt;
     463             :   s->input = data - (s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ); /* TODO HACKY!! */
     464             :   fd_blake3_op_t op[1];
     465             :   while( buf_cnt ) {
     466             :     if( !fd_blake3_prepare( s, tbl, op, s->next_tick ) ) {
     467             :       FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: no more ops to prepare" ));
     468             :       break;
     469             :     }
     470             :     if( op->flags & FD_BLAKE3_FLAG_PARENT ) {
     471             :       FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: compressing output chaining values (layer %u)", s->layer ));
     472             :       fd_blake3_ref_compress1( op->out, op->msg, 64UL, op->counter, op->flags, NULL, NULL );
     473             :     } else {
     474             :       FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: compressing %lu leaf chunks", FD_BLAKE3_COL_CNT ));
     475             :       fd_blake3_ref_compress1( op->out, op->msg, FD_BLAKE3_CHUNK_SZ, op->counter, op->flags, NULL, NULL );
     476             :       buf_cnt--;
     477             :     }
     478             :     s->next_tick++;
     479             :   }
     480             : }
     481             : 
     482             : #endif
     483             : 
     484             : fd_blake3_t *
     485             : fd_blake3_append( fd_blake3_t * sha,
     486             :                   void const *  _data,
     487    11130437 :                   ulong         sz ) {
     488             : 
     489             :   /* If no data to append, we are done */
     490             : 
     491    11130437 :   if( FD_UNLIKELY( !sz ) ) return sha;
     492    11091205 :   FD_BLAKE3_TRACE(( "fd_blake3_append(sha=%p,data=%p,sz=%lu)", (void *)sha, _data, sz ));
     493             : 
     494             :   /* Unpack inputs */
     495             : 
     496    11091205 :   fd_blake3_pos_t * s        = &sha->pos;
     497    11091205 :   fd_blake3_buf_t * tbl      = &sha->buf;
     498    11091205 :   uchar *           buf      = sha->block;
     499    11091205 :   ulong             buf_used = sha->block_sz;
     500             : 
     501    11091205 :   uchar const * data = (uchar const *)_data;
     502             : 
     503             :   /* Update input_sz */
     504             : 
     505    11091205 :   s->input_sz += sz;
     506             : 
     507             :   /* Edge case: For the first completed 1024 bytes of input, don't
     508             :      immediately hash, since it is not clear whether this chunk has
     509             :      the root flag set. */
     510    11091205 :   if( FD_UNLIKELY( FD_BLAKE3_PARA_MAX==1 && s->input_sz==1024UL ) ) {
     511           0 :     fd_memcpy( buf + buf_used, data, sz );
     512           0 :     sha->block_sz = FD_BLAKE3_CHUNK_SZ;
     513           0 :     return sha;
     514           0 :   }
     515             : 
     516             :   /* Handle buffered bytes from previous appends */
     517             : 
     518    11091205 :   if( FD_UNLIKELY( buf_used ) ) { /* optimized for well aligned use of append */
     519             : 
     520             :     /* If the append isn't large enough to complete the current block,
     521             :        buffer these bytes too and return */
     522             : 
     523      148190 :     ulong buf_rem = FD_BLAKE3_PRIVATE_BUF_MAX - buf_used; /* In (0,FD_BLAKE3_PRIVATE_BUF_MAX) */
     524      148190 :     if( FD_UNLIKELY( sz < buf_rem ) ) { /* optimize for large append */
     525       88211 :       fd_memcpy( buf + buf_used, data, sz );
     526       88211 :       sha->block_sz = buf_used + sz;
     527       88211 :       return sha;
     528       88211 :     }
     529             : 
     530             :     /* Otherwise, buffer enough leading bytes of data to complete the
     531             :        block, update the hash and then continue processing any remaining
     532             :        bytes of data. */
     533             : 
     534       59979 :     fd_memcpy( buf + buf_used, data, buf_rem );
     535       59979 :     data += buf_rem;
     536       59979 :     sz   -= buf_rem;
     537             : 
     538       59979 :     fd_blake3_append_blocks( s, tbl, buf, 1UL );
     539       59979 :     sha->block_sz = 0UL;
     540       59979 :   }
     541             : 
     542             :   /* Append the bulk of the data */
     543             : 
     544    11002994 :   ulong buf_cnt = sz >> FD_BLAKE3_PRIVATE_LG_BUF_MAX;
     545    11002994 :   if( FD_LIKELY( buf_cnt ) ) fd_blake3_append_blocks( s, tbl, data, buf_cnt ); /* optimized for large append */
     546             : 
     547             :   /* Buffer any leftover bytes */
     548             : 
     549    11002994 :   buf_used = sz & (FD_BLAKE3_PRIVATE_BUF_MAX-1UL); /* In [0,FD_BLAKE3_PRIVATE_BUF_MAX) */
     550    11002994 :   if( FD_UNLIKELY( buf_used ) ) { /* optimized for well aligned use of append */
     551    11002920 :     fd_memcpy( buf, data + (buf_cnt << FD_BLAKE3_PRIVATE_LG_BUF_MAX), buf_used );
     552    11002920 :     sha->block_sz = buf_used; /* In (0,FD_BLAKE3_PRIVATE_BUF_MAX) */
     553    11002920 :   }
     554             : 
     555    11002994 :   FD_BLAKE3_TRACE(( "fd_blake3_append: done" ));
     556    11002994 :   return sha;
     557    11091205 : }
     558             : 
     559             : static void const *
     560             : fd_blake3_single_hash( fd_blake3_pos_t * s,
     561       78189 :                        fd_blake3_buf_t * tbl ) {
     562       78189 : #if FD_BLAKE3_PARA_MAX>1
     563       78189 :   ulong tick = 0UL;
     564      922014 :   while( !fd_blake3_is_finished( s, tick ) ) {
     565      843825 :     fd_blake3_op_t ops[ FD_BLAKE3_PARA_MAX ] = {0};
     566      843825 :     ulong          op_cnt = 0UL;
     567     4304248 :     while( op_cnt<FD_BLAKE3_PARA_MAX ) {
     568     4246194 :       fd_blake3_op_t * op = &ops[ op_cnt ];
     569     4246194 :       if( !fd_blake3_prepare( s, tbl, op, tick ) )
     570      785771 :         break;
     571     3460423 :       op_cnt++;
     572     3460423 :     }
     573             : 
     574      843825 :     fd_blake3_batch_hash( ops, op_cnt );
     575      843825 :     tick++;
     576      843825 :   }
     577             : #else
     578             :   while( !fd_blake3_is_finished( s, s->next_tick ) ) {
     579             :     fd_blake3_op_t op[1] = {0};
     580             :     if( !fd_blake3_prepare( s, tbl, op, s->next_tick ) )
     581             :       break;
     582             :     s->next_tick++;
     583             :     FD_BLAKE3_TRACE(( "fd_blake3_single_hash: compressing %hu bytes at layer %u, counter %lu, flags 0x%x",
     584             :                       op->sz, s->layer, op->counter, op->flags ));
     585             : #   if FD_HAS_SSE
     586             :     fd_blake3_sse_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
     587             : #   else
     588             :     fd_blake3_ref_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
     589             : #   endif
     590             :   }
     591             : #endif
     592       78189 :   return tbl->slots[ s->layer ][0];
     593       78189 : }
     594             : 
     595             : void *
     596             : fd_blake3_fini( fd_blake3_t * sha,
     597       52212 :                 void *        hash ) {
     598             : 
     599             :   /* Unpack inputs */
     600             : 
     601       52212 :   fd_blake3_pos_t * s        = &sha->pos;
     602       52212 :   fd_blake3_buf_t * tbl      = &sha->buf;
     603       52212 :   uchar *           buf      = sha->block;
     604       52212 :   ulong             buf_used = sha->block_sz;
     605       52212 :   FD_BLAKE3_TRACE(( "fd_blake3_fini(sha=%p,sz=%lu)", (void *)sha, s->input_sz ));
     606             : 
     607             :   /* TODO HACKY!! */
     608       52212 :   s->input    = buf - ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ );
     609       52212 :   s->input_sz = ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ ) + buf_used;
     610             : 
     611       52212 :   void const * hash_ = fd_blake3_single_hash( s, tbl );
     612       52212 :   memcpy( hash, hash_, 32UL );
     613       52212 :   return hash;
     614       52212 : }
     615             : 
     616             : /* fd_blake3_fini_xof_compress performs BLAKE3 compression (input
     617             :    hashing) for all blocks in the hash tree except for the root block.
     618             :    Root compression inputs are returned via the function's out pointers:
     619             :    On return, root_msg[0..64] contains the padded message input for the
     620             :    root block, root_cv_pre[0..64] contains the output chaining value of
     621             :    the previous block (or the BLAKE3 IV if root block is the only block
     622             :    in the hash operation, i.e. <=64 byte hash input).
     623             :    Other values (counter, flags, size) are re-derived by the XOF
     624             :    implementation using the blake3 state object. */
     625             : 
     626             : void
     627             : fd_blake3_fini_xof_compress( fd_blake3_t * sha,
     628             :                              uchar *       root_msg,
     629    10894817 :                              uchar *       root_cv_pre ) {
     630    10894817 :   fd_blake3_pos_t * s        = &sha->pos;
     631    10894817 :   fd_blake3_buf_t * tbl      = &sha->buf;
     632    10894817 :   uchar *           buf      = sha->block;
     633    10894817 :   ulong             buf_used = sha->block_sz;
     634             : 
     635             :   /* TODO HACKY!! */
     636    10894817 :   s->input    = buf - ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ );
     637    10894817 :   s->input_sz = ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ ) + buf_used;
     638             : 
     639             :   /* The root block is contained in a leaf.  Process all but the last
     640             :      blocks of the chunk.  (The last block is the "root" block) */
     641    10894817 :   if( s->input_sz<=FD_BLAKE3_CHUNK_SZ ) {
     642    10651172 :     fd_blake3_op_t op[1];
     643    10651172 :     if( !fd_blake3_prepare_leaf( s, tbl, op, s->next_tick ) )
     644           0 :       FD_LOG_ERR(( "fd_blake3_fini_xof_compress invariant violation: failed to prepare compression of <=1024 byte message (duplicate call to fini?)" ));
     645    10651172 : #if FD_HAS_SSE
     646    10651172 :     fd_blake3_sse_compress1( root_msg, op->msg, op->sz, op->counter, op->flags, root_cv_pre, NULL );
     647             : #else
     648             :     fd_blake3_ref_compress1( root_msg, op->msg, op->sz, op->counter, op->flags, root_cv_pre, NULL );
     649             : #endif
     650    10651172 :     return;
     651    10651172 :   }
     652             : 
     653             :   /* The root block is a branch node.  Continue working until there are
     654             :      only two blocks remaining. */
     655      243645 :   ulong tick = sha->pos.next_tick+1;
     656     1894224 :   for(;;) {
     657     1894224 :     int l0_complete = fd_blake3_l0_complete( s );
     658     1894224 :     int ln_complete = s->live_cnt == 2UL;
     659     1894224 :     if( l0_complete & ln_complete ) break;
     660             : 
     661     1650579 : #if FD_BLAKE3_PARA_MAX>1
     662     1650579 :     fd_blake3_op_t ops[ FD_BLAKE3_PARA_MAX ] = {0};
     663     1650579 :     ulong          op_cnt = 0UL;
     664     9278103 :     while( op_cnt<FD_BLAKE3_PARA_MAX ) {
     665     9107679 :       fd_blake3_op_t * op = &ops[ op_cnt ];
     666     9107679 :       if( !fd_blake3_prepare( s, tbl, op, tick ) )
     667     1480155 :         break;
     668     7627524 :       op_cnt++;
     669     7627524 :     }
     670     1650579 :     if( FD_UNLIKELY( !op_cnt ) ) {
     671           0 :       FD_LOG_ERR(( "fd_blake3_fini_xof_compress invariant violation: failed to prepare branch compression with live_cnt=%lu (duplicate call to fini?)", s->live_cnt ));
     672           0 :     }
     673             : 
     674     1650579 :     fd_blake3_batch_hash( ops, op_cnt );
     675             : #else
     676             :     fd_blake3_op_t op[1] = {0};
     677             :     if( !fd_blake3_prepare( s, tbl, op, tick ) )
     678             :       break;
     679             : #   if FD_HAS_SSE
     680             :     fd_blake3_sse_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
     681             : #   else
     682             :     fd_blake3_ref_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
     683             : #   endif
     684             : #endif
     685     1650579 :     tick++;
     686     1650579 :   }
     687      243645 : }
     688             : 
     689             : void *
     690             : fd_blake3_fini_2048( fd_blake3_t * sha,
     691    10894753 :                      void *        hash ) {
     692    10894753 :   FD_BLAKE3_TRACE(( "fd_blake3_fini_2048(sha=%p,hash=%p)", (void *)sha, hash ));
     693             : 
     694             :   /* Compress input until the last remaining piece of work is the BLAKE3
     695             :      root block.  This root block is put through the compression
     696             :      function repeatedly to "expand" the hash output (XOF hashing).
     697             :      Solana uses this to generate a 2048 byte 'LtHash' value.
     698             :      fd_blake3 does this SIMD-parallel for better performance. */
     699    10894753 :   uchar root_msg   [ 64 ] __attribute__((aligned(64)));
     700    10894753 :   uchar root_cv_pre[ 32 ] __attribute__((aligned(32)));
     701    10894753 :   fd_blake3_fini_xof_compress( sha, root_msg, root_cv_pre );
     702             : 
     703             :   /* Restore root block details */
     704    10894753 :   uint          last_block_sz    = 64u;
     705    10894753 :   uint          last_block_flags = FD_BLAKE3_FLAG_ROOT | FD_BLAKE3_FLAG_PARENT;
     706    10894753 :   ulong         ctr0             = 0UL;
     707    10894753 :   if( sha->pos.input_sz<=FD_BLAKE3_CHUNK_SZ ) {
     708    10651108 :     last_block_sz    = (uint)sha->pos.input_sz & 63u;
     709    10651108 :     if( fd_ulong_is_aligned( sha->pos.input_sz, 64 ) ) last_block_sz = 64;
     710    10651108 :     if( FD_UNLIKELY( sha->pos.input_sz==0UL        ) ) last_block_sz = 0u;
     711    10651108 :     last_block_flags = FD_BLAKE3_FLAG_ROOT | FD_BLAKE3_FLAG_CHUNK_END;
     712    10651108 :     if( sha->pos.input_sz<=FD_BLAKE3_BLOCK_SZ ) last_block_flags |= FD_BLAKE3_FLAG_CHUNK_START;
     713    10651108 :     ctr0             = sha->pos.leaf_idx-1UL;
     714    10651108 :   } else {
     715      243645 :     fd_blake3_op_t op[1];
     716      243645 :     if( FD_UNLIKELY( !fd_blake3_prepare( &sha->pos, &sha->buf, op, sha->pos.next_tick+1UL ) ) ) {
     717           0 :       FD_LOG_ERR(( "fd_blake3_fini_2048 invariant violation: failed to prepare branch root compression (duplicate call to fini?)" ));
     718           0 :     }
     719      243645 :     memcpy( root_msg,    op->msg,      64UL );
     720      243645 :     memcpy( root_cv_pre, FD_BLAKE3_IV, 32UL );
     721      243645 :   }
     722    10894753 :   FD_BLAKE3_TRACE(( "fd_blake3_fini_2048: sz=%lu ctr0=%lu flags=%x",
     723    10894753 :                     sha->pos.input_sz, ctr0, last_block_flags ));
     724             : 
     725             :   /* Expand LtHash
     726             :      For now, this uses the generic AVX2/AVX512 compress backend.
     727             :      Could write a more optimized version in the future saving some of
     728             :      the matrix transpose work. */
     729    45069519 :   for( ulong i=0UL; i<32UL; i+=FD_BLAKE3_PARA_MAX ) {
     730     9404246 : #if FD_HAS_AVX512
     731     9404246 :     ulong  batch_data [ 16 ] __attribute__((aligned(64)));
     732   159872182 :     /*                     */ for( ulong j=0; j<16; j++ ) batch_data [ j ] = (ulong)root_msg;
     733   159872182 :     uint   batch_sz   [ 16 ]; for( ulong j=0; j<16; j++ ) batch_sz   [ j ] = last_block_sz;
     734   159872182 :     ulong  batch_ctr  [ 16 ]; for( ulong j=0; j<16; j++ ) batch_ctr  [ j ] = ctr0+i+j;
     735   159872182 :     uint   batch_flags[ 16 ]; for( ulong j=0; j<16; j++ ) batch_flags[ j ] = last_block_flags;
     736   159872182 :     void * batch_hash [ 16 ]; for( ulong j=0; j<16; j++ ) batch_hash [ j ] = (uchar *)hash + (i+j)*64;
     737   159872182 :     void * batch_cv   [ 16 ]; for( ulong j=0; j<16; j++ ) batch_cv   [ j ] = root_cv_pre;
     738     9404246 :     fd_blake3_avx512_compress16( 16UL, batch_data, batch_sz, batch_ctr, batch_flags, batch_hash, NULL, 64U, batch_cv );
     739             : #elif FD_HAS_AVX
     740   222934680 :     ulong  batch_data [ 8 ]; for( ulong j=0; j<8; j++ ) batch_data [ j ] = (ulong)root_msg;
     741   222934680 :     uint   batch_sz   [ 8 ]; for( ulong j=0; j<8; j++ ) batch_sz   [ j ] = last_block_sz;
     742   222934680 :     ulong  batch_ctr  [ 8 ]; for( ulong j=0; j<8; j++ ) batch_ctr  [ j ] = ctr0+i+j;
     743   222934680 :     uint   batch_flags[ 8 ]; for( ulong j=0; j<8; j++ ) batch_flags[ j ] = last_block_flags;
     744   222934680 :     void * batch_hash [ 8 ]; for( ulong j=0; j<8; j++ ) batch_hash [ j ] = (uchar *)hash + (i+j)*64;
     745   222934680 :     void * batch_cv   [ 8 ]; for( ulong j=0; j<8; j++ ) batch_cv   [ j ] = root_cv_pre;
     746    24770520 :     fd_blake3_avx_compress8( 8UL, batch_data, batch_sz, batch_ctr, batch_flags, batch_hash, NULL, 64U, batch_cv );
     747             : #elif FD_HAS_SSE
     748             :     fd_blake3_sse_compress1( (uchar *)hash+i*64, root_msg, last_block_sz, ctr0+i, last_block_flags, NULL, root_cv_pre );
     749             : #else
     750             :     fd_blake3_ref_compress1( (uchar *)hash+i*64, root_msg, last_block_sz, ctr0+i, last_block_flags, NULL, root_cv_pre );
     751             : #endif
     752    34174766 :   }
     753             : 
     754    10894753 :   FD_BLAKE3_TRACE(( "fd_blake3_fini_2048: done" ));
     755    10894753 :   return hash;
     756    10894753 : }
     757             : 
     758             : void *
     759             : fd_blake3_hash( void const * data,
     760             :                 ulong        sz,
     761       25977 :                 void *       hash ) {
     762             : 
     763       25977 :   fd_blake3_buf_t tbl[1];
     764       25977 :   fd_blake3_pos_t s[1];
     765       25977 :   fd_blake3_pos_init( s, data, sz );
     766             : 
     767       25977 : #if FD_BLAKE3_PARA_MAX>1
     768     1318486 :   for(;;) {
     769     1318486 :     fd_blake3_op_t op[1];
     770     1318486 :     if( !fd_blake3_prepare_fast( s, tbl, op, FD_BLAKE3_PARA_MAX, 4 ) )
     771       25977 :       break;
     772      251627 : #if FD_HAS_AVX512
     773      251627 :     fd_blake3_avx512_compress16_fast( op->msg, op->out, op->counter, op->flags );
     774             : #elif FD_HAS_AVX
     775     1040882 :     fd_blake3_avx_compress8_fast( op->msg, op->out, op->counter, op->flags );
     776             : #else
     777             :     #error "missing para support"
     778             : #endif
     779     1292509 :   }
     780       25977 : #endif
     781             : 
     782       25977 :   void const * hash_ = fd_blake3_single_hash( s, tbl );
     783       25977 :   memcpy( hash, hash_, 32UL );
     784       25977 :   return hash;
     785       25977 : }
     786             : 
     787             : #if FD_HAS_AVX
     788             : 
     789             : void
     790             : fd_blake3_lthash_batch8(
     791             :     void const * batch_data[8],  /* align=32 ele_align=1 */
     792             :     uint const   batch_sz  [8],  /* align=32 */
     793             :     void *       out_lthash      /* align=32 */
     794     1378478 : ) {
     795     1378478 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_data, 32 ) ) ) {
     796           0 :     FD_LOG_ERR(( "misaligned batch_data: %p", (void *)batch_data ));
     797           0 :   }
     798     1378478 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_sz, 32 ) ) ) {
     799           0 :     FD_LOG_ERR(( "misaligned batch_sz: %p", (void *)batch_sz ));
     800           0 :   }
     801     1378478 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)out_lthash, 32 ) ) ) {
     802           0 :     FD_LOG_ERR(( "misaligned out_lthash: %p", (void *)out_lthash ));
     803           0 :   }
     804             : 
     805     1378478 :   ulong batch_ctr  [ 8 ] = {0};
     806    12406302 :   uint  batch_flags[ 8 ]; for( uint i=0; i<8; i++ ) batch_flags[ i ] = FD_BLAKE3_FLAG_ROOT;
     807     1378478 :   fd_blake3_avx_compress8( 8UL, batch_data, batch_sz, batch_ctr, batch_flags, NULL, out_lthash, 32U, NULL );
     808     1378478 : }
     809             : 
     810             : #endif
     811             : 
     812             : #if FD_HAS_AVX512
     813             : 
     814             : void
     815             : fd_blake3_lthash_batch16(
     816             :     void const * batch_data[16],  /* align=32 ele_align=1 */
     817             :     uint const   batch_sz  [16],  /* align=32 */
     818             :     void *       out_lthash      /* align=32 */
     819      369256 : ) {
     820      369256 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_data, 64 ) ) ) {
     821           0 :     FD_LOG_ERR(( "misaligned batch_data: %p", (void *)batch_data ));
     822           0 :   }
     823      369256 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_sz, 64 ) ) ) {
     824           0 :     FD_LOG_ERR(( "misaligned batch_sz: %p", (void *)batch_sz ));
     825           0 :   }
     826      369256 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)out_lthash, 64 ) ) ) {
     827           0 :     FD_LOG_ERR(( "misaligned out_lthash: %p", (void *)out_lthash ));
     828           0 :   }
     829             : 
     830      369256 :   ulong batch_ctr  [ 16 ] = {0};
     831     6277352 :   uint  batch_flags[ 16 ]; for( uint i=0; i<16; i++ ) batch_flags[ i ] = FD_BLAKE3_FLAG_ROOT;
     832      369256 :   fd_blake3_avx512_compress16( 16UL, batch_data, batch_sz, batch_ctr, batch_flags, NULL, out_lthash, 32U, NULL );
     833      369256 : }
     834             : 
     835             : #endif

Generated by: LCOV version 1.14