LCOV - code coverage report
Current view: top level - ballet/blake3 - fd_blake3.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 413 442 93.4 %
Date: 2025-09-18 04:41:32 Functions: 26 26 100.0 %

          Line data    Source code
       1             : #include "fd_blake3.h"
       2             : #include "fd_blake3_private.h"
       3             : #include <assert.h>
       4             : 
       5             : /* Hash state machine *************************************************/
       6             : 
       7             : static FD_FN_UNUSED fd_blake3_pos_t *
       8             : fd_blake3_pos_init( fd_blake3_pos_t * s,
       9             :                     uchar const *     data,
      10    88117259 :                     ulong             sz ) {
      11    88117259 :   *s = (fd_blake3_pos_t) {
      12    88117259 :     .input    = data,
      13    88117259 :     .input_sz = sz,
      14    88117259 :     .magic    = FD_BLAKE3_MAGIC,
      15    88117259 :   };
      16    88117259 :   return s;
      17    88117259 : }
      18             : 
      19             : /* fd_blake3_l0_complete returns 1 if all leaf nodes have been hashed,
      20             :    0 otherwise. */
      21             : 
      22             : FD_FN_PURE static inline int
      23   665827251 : fd_blake3_l0_complete( fd_blake3_pos_t const * s ) {
      24   665827251 :   return ( s->leaf_idx<<FD_BLAKE3_CHUNK_LG_SZ ) >= fd_ulong_max( s->input_sz, 64 );
      25   665827251 : }
      26             : 
      27             : FD_FN_PURE static inline int
      28             : fd_blake3_is_finished( fd_blake3_pos_t const * s,
      29   354966304 :                        ulong                   tick ) {
      30   354966304 :   int l0_complete = fd_blake3_l0_complete( s );
      31   354966304 :   int ln_complete = s->live_cnt == 1UL;
      32   354966304 :   int idle        = tick >= s->next_tick;
      33   354966304 :   return l0_complete & ln_complete & idle;
      34   354966304 : }
      35             : 
      36             : static fd_blake3_op_t *
      37             : fd_blake3_prepare_leaf( fd_blake3_pos_t * restrict s,
      38             :                         fd_blake3_buf_t * restrict buf,
      39             :                         fd_blake3_op_t *  restrict op,
      40   111629180 :                         ulong                      tick ) {
      41             : 
      42   111629180 :   ulong         msg_off = s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ;
      43   111629180 :   ulong         msg_sz  = fd_ulong_min( s->input_sz - msg_off, 1024UL );
      44   111629180 :   uchar const * msg     = s->input + msg_off;
      45   111629180 :   uchar       * out     = buf->slots[ s->layer ][ s->head.uc[ s->layer ] ];
      46             : 
      47   111629180 :   int flags = fd_int_if( s->input_sz <= FD_BLAKE3_CHUNK_SZ, FD_BLAKE3_FLAG_ROOT, 0 );
      48             : 
      49   111629180 :   *op = (fd_blake3_op_t) {
      50   111629180 :     .msg     = msg,
      51   111629180 :     .out     = out,
      52   111629180 :     .counter = s->leaf_idx,
      53   111629180 :     .sz      = (ushort)msg_sz,
      54   111629180 :     .flags   = (uchar)flags
      55   111629180 :   };
      56             : 
      57   111629180 :   s->head.uc[ 0 ] = (uchar)( s->head.uc[ 0 ]+1 );
      58   111629180 :   s->leaf_idx++;
      59   111629180 :   s->live_cnt++;
      60   111629180 :   s->next_tick = tick+1;
      61             : 
      62   111629180 :   return op;
      63             : 
      64   111629180 : }
      65             : 
      66             : static int
      67             : fd_blake3_seek_branch( fd_blake3_pos_t * restrict s,
      68             :                        fd_blake3_buf_t * restrict buf,
      69    85432477 :                        ulong                      tick ) {
      70             : 
      71    85432477 :   if( s->live_cnt == 1UL )
      72     6725649 :     return 0;
      73             : 
      74    78706828 :   if( !fd_blake3_l0_complete( s ) )
      75     1255872 :     return ( s->tail.uc[ s->layer - 1 ] + 1 ) <
      76     1255872 :            ( s->head.uc[ s->layer - 1 ]     );
      77             : 
      78    77450956 : # if FD_HAS_AVX
      79             : 
      80    77450956 :   wb_t diff = wb_sub( s->head.wb, s->tail.wb );
      81             : 
      82    77450956 :   uint mergeable_layers = (uint)_mm256_movemask_epi8( wb_gt( diff, wb_bcast( 1 ) ) );
      83    77450956 :   int  merge_layer = fd_uint_find_lsb_w_default( mergeable_layers, -1 );
      84    77450956 :   if( merge_layer>=0 ) {
      85    66010141 :     if( ((uint)merge_layer >= s->layer) & (tick < s->next_tick) )
      86    12212666 :       return 0;  /* still waiting for previous merge */
      87    53797475 :     s->layer = (uint)merge_layer+1U;
      88    53797475 :     return 1;
      89    66010141 :   }
      90             : 
      91    11440815 :   uint single_layers = (uint)_mm256_movemask_epi8( wb_eq( diff, wb_bcast( 1 ) ) );
      92    11440815 :   uint single_lo = (uint)fd_uint_find_lsb( single_layers );
      93    11440815 :   uint single_hi = (uint)fd_uint_find_lsb( single_layers & ( ~fd_uint_mask_lsb( (int)(single_lo+1U) ) ) );
      94             : 
      95    11440815 :   wb_t node = wb_ld( buf->slots[ single_lo ][ s->tail.uc[ single_lo ] ] );
      96    11440815 :               wb_st( buf->slots[ single_hi ][ s->head.uc[ single_hi ] ], node );
      97             : 
      98             : # else /* FD_HAS_AVX */
      99             : 
     100             :   uchar diff[ 32 ];
     101             :   for( ulong j=0UL; j<32UL; j++ ) diff[j] = (uchar)( s->head.uc[j] - s->tail.uc[j] );
     102             : 
     103             :   int merge_layer = -1;
     104             :   for( uint j=0U; j<32U; j++ ) {
     105             :     if( diff[j]>1 ) {
     106             :       merge_layer = (int)j;
     107             :       break;
     108             :     }
     109             :   }
     110             :   if( merge_layer>=0 ) {
     111             :     if( ((uint)merge_layer >= s->layer) & (tick < s->next_tick) )
     112             :       return 0;  /* still waiting for previous merge */
     113             :     s->layer = (uint)(merge_layer+1);
     114             :     return 1;
     115             :   }
     116             : 
     117             :   uint j=0U;
     118             :   uint single_lo = 0UL;
     119             :   uint single_hi = 0UL;
     120             :   for( ; j<32U; j++ ) {
     121             :     if( diff[j] ) {
     122             :       single_lo = j;
     123             :       break;
     124             :     }
     125             :   }
     126             :   j++;
     127             :   for( ; j<32U; j++ ) {
     128             :     if( diff[j] ) {
     129             :       single_hi = j;
     130             :       break;
     131             :     }
     132             :   }
     133             : 
     134             :   memcpy( buf->slots[ single_hi ][ s->head.uc[ single_hi ] ],
     135             :           buf->slots[ single_lo ][ s->tail.uc[ single_lo ] ],
     136             :           32UL );
     137             : 
     138             : # endif /* FD_HAS_AVX */
     139             : 
     140    11440815 :   FD_BLAKE3_TRACE(( "fd_blake3_seek_branch: moving up %u/%u to %u/%u",
     141    11440815 :                     single_lo, s->tail.uc[ single_lo ],
     142    11440815 :                     single_hi, s->head.uc[ single_hi ] ));
     143             : 
     144    11440815 :   if( ((uint)single_hi >= s->layer) & (tick < s->next_tick) )
     145     4706778 :     return 0;  /* still waiting for previous merge */
     146             : 
     147     6734037 :   s->head.uc[ single_lo ] = (uchar)( s->head.uc[ single_lo ]-1 );
     148     6734037 :   s->head.uc[ single_hi ] = (uchar)( s->head.uc[ single_hi ]+1 );
     149             : 
     150     6734037 :   s->layer = (uint)single_hi+1U;
     151     6734037 :   return 1;
     152    11440815 : }
     153             : 
     154             : static fd_blake3_op_t *
     155             : fd_blake3_prepare_branch( fd_blake3_pos_t * restrict s,
     156             :                           fd_blake3_buf_t * restrict buf,
     157             :                           fd_blake3_op_t *  restrict op,
     158    85432477 :                           ulong                      tick ) {
     159             : 
     160    85432477 :   if( !fd_blake3_seek_branch( s, buf, tick ) )
     161    23645093 :     return NULL;
     162             : 
     163    85432477 :   assert( s->layer < FD_BLAKE3_ROW_CNT );
     164             : 
     165    61787384 :   uchar const * msg = buf->slots[ s->layer-1U ][ s->tail.uc[ s->layer-1U ] ];
     166    61787384 :   uchar       * out = buf->slots[ s->layer    ][ s->head.uc[ s->layer    ] ];
     167             : 
     168    61787384 :   s->head.uc[ s->layer   ] = (uchar)( s->head.uc[ s->layer   ]+1 );
     169    61787384 :   s->tail.uc[ s->layer-1 ] = (uchar)( s->tail.uc[ s->layer-1 ]+2 );
     170    61787384 :   s->live_cnt--;
     171    61787384 :   s->next_tick = tick+1;
     172             : 
     173    61787384 :   uint flags = FD_BLAKE3_FLAG_PARENT |
     174    61787384 :                fd_uint_if( s->live_cnt==1UL, FD_BLAKE3_FLAG_ROOT, 0u );
     175             : 
     176    61787384 :   *op = (fd_blake3_op_t) {
     177    61787384 :     .msg     = msg,
     178    61787384 :     .out     = out,
     179    61787384 :     .counter = 0UL,
     180    61787384 :     .sz      = 64U,
     181    61787384 :     .flags   = (uchar)flags
     182    61787384 :   };
     183    61787384 :   return op;
     184             : 
     185    61787384 : }
     186             : 
     187             : static void
     188    85741963 : fd_blake3_advance( fd_blake3_pos_t * restrict s ) {
     189             : 
     190    85741963 : # if FD_HAS_AVX
     191             : 
     192    85741963 :   wb_t mask = wb_eq( s->tail.wb, s->head.wb );
     193    85741963 :   s->tail.wb = wb_andnot( mask, s->tail.wb );
     194    85741963 :   s->head.wb = wb_andnot( mask, s->head.wb );
     195             : 
     196             : # else /* FD_HAS_AVX */
     197             : 
     198             :   for( ulong j=0UL; j<32UL; j++ ) {
     199             :     if( s->tail.uc[j] == s->head.uc[j] ) {
     200             :       s->tail.uc[j] = 0;
     201             :       s->head.uc[j] = 0;
     202             :     }
     203             :   }
     204             : 
     205             : # endif /* FD_HAS_AVX */
     206             : 
     207    85741963 :   if( s->head.uc[ s->layer ]==FD_BLAKE3_COL_CNT ) {
     208      275418 :     s->layer++;
     209      275418 :   }
     210    85466545 :   else if( ( s->layer > 0UL ) &&
     211    85466545 :            ( s->tail.uc[ s->layer-1 ] < s->head.uc[ s->layer-1 ] ) ) {
     212             :     /* pass */
     213     7407838 :   }
     214    78058707 :   else if( fd_blake3_l0_complete( s ) ) {
     215    19975764 :     s->layer++;
     216    19975764 :   }
     217    58082943 :   else if( s->layer > 0UL ) {
     218       90281 :     s->layer = 0UL;
     219       90281 :   }
     220             : 
     221    85741963 : }
     222             : 
     223             : static fd_blake3_op_t *
     224             : fd_blake3_prepare( fd_blake3_pos_t * restrict s,
     225             :                    fd_blake3_buf_t * restrict buf,
     226             :                    fd_blake3_op_t *  restrict op,
     227   226389913 :                    ulong                      tick ) {
     228             : 
     229   226389913 :   assert( s->layer < FD_BLAKE3_ROW_CNT );
     230             : 
     231   226389913 :   if( fd_blake3_is_finished( s, tick ) )
     232           0 :     return NULL;
     233             : 
     234   226389913 :   if( tick >= s->next_tick )
     235    85741963 :     fd_blake3_advance( s );
     236             : 
     237   226389913 :   if( s->layer != 0 )
     238    85432477 :     return fd_blake3_prepare_branch( s, buf, op, tick );
     239             : 
     240   140957436 :   if( ( s->head.uc[0] >= FD_BLAKE3_COL_CNT ) |
     241   140957436 :       ( fd_blake3_l0_complete( s )         ) ) {
     242    57954621 :     return NULL;
     243    57954621 :   }
     244             : 
     245    83002815 :   return fd_blake3_prepare_leaf( s, buf, op, tick );
     246             : 
     247   140957436 : }
     248             : 
     249             : #if FD_BLAKE3_PARA_MAX>1
     250             : 
     251             : /* fd_blake3_prepare_fast does streamlined hashing of full chunks or
     252             :    full branches. */
     253             : 
     254             : static fd_blake3_op_t *
     255             : fd_blake3_prepare_fast( fd_blake3_pos_t * restrict s,
     256             :                         fd_blake3_buf_t * restrict buf,
     257             :                         fd_blake3_op_t *  restrict op,
     258             :                         ulong                      n,
     259    38072253 :                         ulong                      min ) {
     260             : 
     261    38072253 :   if( s->layer && s->head.uc[ s->layer-1 ]==FD_BLAKE3_COL_CNT ) {
     262     2718646 :     op->msg     = buf->rows[ s->layer-1 ];
     263     2718646 :     op->out     = buf->rows[ s->layer ] + (s->head.uc[ s->layer ]<<FD_BLAKE3_OUTCHAIN_LG_SZ);
     264     2718646 :     op->counter = 0UL;
     265     2718646 :     op->flags   = FD_BLAKE3_FLAG_PARENT;
     266             : 
     267             :     /* Assume that branch layer is fully hashed (up to col cnt) */
     268     2718646 :     s->head.uc[ s->layer-1 ] =  0;
     269     2718646 :     s->head.uc[ s->layer   ] = (uchar)( (ulong)s->head.uc[ s->layer ]+n );
     270     2718646 :     s->live_cnt -= n;
     271     2718646 :     s->layer = fd_uint_if( s->head.uc[ s->layer ]==FD_BLAKE3_COL_CNT,
     272     2718646 :                            s->layer+1U, 0U );
     273             : 
     274     2718646 :     return op;
     275     2718646 :   }
     276             : 
     277    35353607 :   ulong pos   = s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ;
     278    35353607 :   ulong avail = fd_ulong_align_dn( s->input_sz - pos, FD_BLAKE3_CHUNK_SZ ) >> FD_BLAKE3_CHUNK_LG_SZ;
     279    35353607 :   n = fd_ulong_min( n, avail );
     280             : 
     281             :   /* This constants controls the threshold when to use the (slow)
     282             :      scheduler instead of fast single-message hashing.  Carefully tuned
     283             :      for best overall performance. */
     284    35353607 :   if( n<min ) return NULL;
     285             : 
     286     7380416 :   op->msg     = s->input + (s->leaf_idx<<FD_BLAKE3_CHUNK_LG_SZ);
     287     7380416 :   op->out     = buf->rows[0] + (s->head.uc[0]<<FD_BLAKE3_OUTCHAIN_LG_SZ);
     288     7380416 :   op->counter = s->leaf_idx;
     289     7380416 :   op->flags   = 0;
     290             : 
     291     7380416 :   s->head.uc[0] = (uchar)( (ulong)s->head.uc[0]+n );
     292     7380416 :   s->leaf_idx   += n;
     293     7380416 :   s->live_cnt   += n;
     294     7380416 :   s->layer      =  fd_uint_if( s->head.uc[0]==FD_BLAKE3_COL_CNT, 1U, 0U );
     295             : 
     296     7380416 :   return op;
     297    35353607 : }
     298             : 
     299             : static void
     300             : fd_blake3_batch_hash( fd_blake3_op_t const * ops,
     301    82223473 :                       ulong                  op_cnt ) {
     302    82223473 :   uchar const * batch_data   [ FD_BLAKE3_PARA_MAX ] __attribute__((aligned(64)));
     303    82223473 :   uint          batch_data_sz[ FD_BLAKE3_PARA_MAX ] = {0};
     304    82223473 :   uchar *       batch_hash   [ FD_BLAKE3_PARA_MAX ] __attribute__((aligned(64)));
     305    82223473 :   ulong         batch_ctr    [ FD_BLAKE3_PARA_MAX ];
     306    82223473 :   uint          batch_flags  [ FD_BLAKE3_PARA_MAX ];
     307   223495182 :   for( ulong j=0UL; j<op_cnt; j++ ) {
     308   141271709 :     batch_data   [ j ] = ops[ j ].msg;
     309   141271709 :     batch_hash   [ j ] = ops[ j ].out;
     310   141271709 :     batch_data_sz[ j ] = ops[ j ].sz;
     311   141271709 :     batch_ctr    [ j ] = ops[ j ].counter;
     312   141271709 :     batch_flags  [ j ] = ops[ j ].flags;
     313   141271709 :   }
     314    27413539 : #if FD_HAS_AVX512
     315    27413539 :   fd_blake3_avx512_compress16( op_cnt, batch_data, batch_data_sz, batch_ctr, batch_flags, fd_type_pun( batch_hash ), NULL, 32U, NULL );
     316             : #elif FD_HAS_AVX
     317    54809934 :   fd_blake3_avx_compress8    ( op_cnt, batch_data, batch_data_sz, batch_ctr, batch_flags, fd_type_pun( batch_hash ), NULL, 32U, NULL );
     318             : #else
     319             :   #error "FIXME missing para support"
     320             : #endif
     321    82223473 : }
     322             : 
     323             : #endif
     324             : 
     325             : /* Simple API *********************************************************/
     326             : 
     327             : ulong
     328          75 : fd_blake3_align( void ) {
     329          75 :   return FD_BLAKE3_ALIGN;
     330          75 : }
     331             : 
     332             : ulong
     333          24 : fd_blake3_footprint( void ) {
     334          24 :   return FD_BLAKE3_FOOTPRINT;
     335          24 : }
     336             : 
     337             : void *
     338          27 : fd_blake3_new( void * shmem ) {
     339          27 :   fd_blake3_t * sha = (fd_blake3_t *)shmem;
     340             : 
     341          27 :   if( FD_UNLIKELY( !shmem ) ) {
     342           3 :     FD_LOG_WARNING(( "NULL shmem" ));
     343           3 :     return NULL;
     344           3 :   }
     345             : 
     346          24 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, fd_blake3_align() ) ) ) {
     347           3 :     FD_LOG_WARNING(( "misaligned shmem" ));
     348           3 :     return NULL;
     349           3 :   }
     350             : 
     351          21 :   ulong footprint = fd_blake3_footprint();
     352             : 
     353          21 :   fd_memset( sha, 0, footprint );
     354             : 
     355          21 :   FD_COMPILER_MFENCE();
     356          21 :   FD_VOLATILE( sha->pos.magic ) = FD_BLAKE3_MAGIC;
     357          21 :   FD_COMPILER_MFENCE();
     358             : 
     359          21 :   return (void *)sha;
     360          24 : }
     361             : 
     362             : fd_blake3_t *
     363          27 : fd_blake3_join( void * shsha ) {
     364             : 
     365          27 :   if( FD_UNLIKELY( !shsha ) ) {
     366           3 :     FD_LOG_WARNING(( "NULL shsha" ));
     367           3 :     return NULL;
     368           3 :   }
     369             : 
     370          24 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shsha, fd_blake3_align() ) ) ) {
     371           3 :     FD_LOG_WARNING(( "misaligned shsha" ));
     372           3 :     return NULL;
     373           3 :   }
     374             : 
     375          21 :   fd_blake3_t * sha = (fd_blake3_t *)shsha;
     376             : 
     377          21 :   if( FD_UNLIKELY( sha->pos.magic!=FD_BLAKE3_MAGIC ) ) {
     378           0 :     FD_LOG_WARNING(( "bad magic" ));
     379           0 :     return NULL;
     380           0 :   }
     381             : 
     382          21 :   return sha;
     383          21 : }
     384             : 
     385             : void *
     386          24 : fd_blake3_leave( fd_blake3_t * sha ) {
     387             : 
     388          24 :   if( FD_UNLIKELY( !sha ) ) {
     389           3 :     FD_LOG_WARNING(( "NULL sha" ));
     390           3 :     return NULL;
     391           3 :   }
     392             : 
     393          21 :   return (void *)sha;
     394          24 : }
     395             : 
     396             : void *
     397          27 : fd_blake3_delete( void * shsha ) {
     398             : 
     399          27 :   if( FD_UNLIKELY( !shsha ) ) {
     400           3 :     FD_LOG_WARNING(( "NULL shsha" ));
     401           3 :     return NULL;
     402           3 :   }
     403             : 
     404          24 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shsha, fd_blake3_align() ) ) ) {
     405           3 :     FD_LOG_WARNING(( "misaligned shsha" ));
     406           3 :     return NULL;
     407           3 :   }
     408             : 
     409          21 :   fd_blake3_t * sha = (fd_blake3_t *)shsha;
     410             : 
     411          21 :   if( FD_UNLIKELY( sha->pos.magic!=FD_BLAKE3_MAGIC ) ) {
     412           0 :     FD_LOG_WARNING(( "bad magic" ));
     413           0 :     return NULL;
     414           0 :   }
     415             : 
     416          21 :   FD_COMPILER_MFENCE();
     417          21 :   FD_VOLATILE( sha->pos.magic ) = 0UL;
     418          21 :   FD_COMPILER_MFENCE();
     419             : 
     420          21 :   return (void *)sha;
     421          21 : }
     422             : 
     423             : 
     424             : fd_blake3_t *
     425    60144068 : fd_blake3_init( fd_blake3_t * sha ) {
     426    60144068 :   FD_BLAKE3_TRACE(( "fd_blake3_init(sha=%p)", (void *)sha ));
     427    60144068 :   fd_blake3_pos_init( &sha->pos, NULL, 0UL );
     428    60144068 :   sha->block_sz = 0UL;
     429    60144068 :   return sha;
     430    60144068 : }
     431             : 
     432             : #if FD_BLAKE3_PARA_MAX>1
     433             : 
     434             : static void
     435             : fd_blake3_append_blocks( fd_blake3_pos_t * s,
     436             :                          fd_blake3_buf_t * tbl,
     437             :                          uchar const *     data,
     438     1957190 :                          ulong             buf_cnt ) {
     439     1957190 :   s->input = data - (s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ); /* TODO HACKY!! */
     440     6193811 :   for( ulong i=0UL; i<buf_cnt; i++ ) {
     441     4236621 :     fd_blake3_op_t op[1];
     442     6024808 :     do {
     443     6024808 :       if( !fd_blake3_prepare_fast( s, tbl, op, FD_BLAKE3_PARA_MAX, FD_BLAKE3_PARA_MAX ) )
     444           0 :         return;
     445      822624 : #if FD_HAS_AVX512
     446      822624 :       fd_blake3_avx512_compress16_fast( op->msg, op->out, op->counter, op->flags );
     447             : #elif FD_HAS_AVX
     448     5202184 :       fd_blake3_avx_compress8_fast( op->msg, op->out, op->counter, op->flags );
     449             : #else
     450             :       #error "missing para support"
     451             : #endif
     452     6024808 :     } while( op->flags & FD_BLAKE3_FLAG_PARENT );
     453     4236621 :   }
     454     1957190 : }
     455             : 
     456             : #else
     457             : 
     458             : static void
     459             : fd_blake3_append_blocks( fd_blake3_pos_t * s,
     460             :                          fd_blake3_buf_t * tbl,
     461             :                          uchar const *     data,
     462             :                          ulong             buf_cnt ) {
     463             :   (void)buf_cnt;
     464             :   s->input = data - (s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ); /* TODO HACKY!! */
     465             :   fd_blake3_op_t op[1];
     466             :   while( buf_cnt ) {
     467             :     if( !fd_blake3_prepare( s, tbl, op, s->next_tick ) ) {
     468             :       FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: no more ops to prepare" ));
     469             :       break;
     470             :     }
     471             :     if( op->flags & FD_BLAKE3_FLAG_PARENT ) {
     472             :       FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: compressing output chaining values (layer %u)", s->layer ));
     473             :       fd_blake3_ref_compress1( op->out, op->msg, 64UL, op->counter, op->flags, NULL, NULL );
     474             :     } else {
     475             :       FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: compressing %lu leaf chunks", FD_BLAKE3_COL_CNT ));
     476             :       fd_blake3_ref_compress1( op->out, op->msg, FD_BLAKE3_CHUNK_SZ, op->counter, op->flags, NULL, NULL );
     477             :       buf_cnt--;
     478             :     }
     479             :     s->next_tick++;
     480             :   }
     481             : }
     482             : 
     483             : #endif
     484             : 
     485             : fd_blake3_t *
     486             : fd_blake3_append( fd_blake3_t * sha,
     487             :                   void const *  _data,
     488    60227120 :                   ulong         sz ) {
     489             : 
     490             :   /* If no data to append, we are done */
     491             : 
     492    60227120 :   if( FD_UNLIKELY( !sz ) ) return sha;
     493    60187996 :   FD_BLAKE3_TRACE(( "fd_blake3_append(sha=%p,data=%p,sz=%lu)", (void *)sha, _data, sz ));
     494             : 
     495             :   /* Unpack inputs */
     496             : 
     497    60187996 :   fd_blake3_pos_t * s        = &sha->pos;
     498    60187996 :   fd_blake3_buf_t * tbl      = &sha->buf;
     499    60187996 :   uchar *           buf      = sha->block;
     500    60187996 :   ulong             buf_used = sha->block_sz;
     501             : 
     502    60187996 :   uchar const * data = (uchar const *)_data;
     503             : 
     504             :   /* Update input_sz */
     505             : 
     506    60187996 :   s->input_sz += sz;
     507             : 
     508             :   /* Edge case: For the first completed 1024 bytes of input, don't
     509             :      immediately hash, since it is not clear whether this chunk has
     510             :      the root flag set. */
     511    60187996 :   if( FD_UNLIKELY( FD_BLAKE3_PARA_MAX==1 && s->input_sz==1024UL ) ) {
     512           0 :     fd_memcpy( buf + buf_used, data, sz );
     513           0 :     sha->block_sz = FD_BLAKE3_CHUNK_SZ;
     514           0 :     return sha;
     515           0 :   }
     516             : 
     517             :   /* Handle buffered bytes from previous appends */
     518             : 
     519    60187996 :   if( FD_UNLIKELY( buf_used ) ) { /* optimized for well aligned use of append */
     520             : 
     521             :     /* If the append isn't large enough to complete the current block,
     522             :        buffer these bytes too and return */
     523             : 
     524       47942 :     ulong buf_rem = FD_BLAKE3_PRIVATE_BUF_MAX - buf_used; /* In (0,FD_BLAKE3_PRIVATE_BUF_MAX) */
     525       47942 :     if( FD_UNLIKELY( sz < buf_rem ) ) { /* optimize for large append */
     526        8649 :       fd_memcpy( buf + buf_used, data, sz );
     527        8649 :       sha->block_sz = buf_used + sz;
     528        8649 :       return sha;
     529        8649 :     }
     530             : 
     531             :     /* Otherwise, buffer enough leading bytes of data to complete the
     532             :        block, update the hash and then continue processing any remaining
     533             :        bytes of data. */
     534             : 
     535       39293 :     fd_memcpy( buf + buf_used, data, buf_rem );
     536       39293 :     data += buf_rem;
     537       39293 :     sz   -= buf_rem;
     538             : 
     539       39293 :     fd_blake3_append_blocks( s, tbl, buf, 1UL );
     540       39293 :     sha->block_sz = 0UL;
     541       39293 :   }
     542             : 
     543             :   /* Append the bulk of the data */
     544             : 
     545    60179347 :   ulong buf_cnt = sz >> FD_BLAKE3_PRIVATE_LG_BUF_MAX;
     546    60179347 :   if( FD_LIKELY( buf_cnt ) ) fd_blake3_append_blocks( s, tbl, data, buf_cnt ); /* optimized for large append */
     547             : 
     548             :   /* Buffer any leftover bytes */
     549             : 
     550    60179347 :   buf_used = sz & (FD_BLAKE3_PRIVATE_BUF_MAX-1UL); /* In [0,FD_BLAKE3_PRIVATE_BUF_MAX) */
     551    60179347 :   if( FD_UNLIKELY( buf_used ) ) { /* optimized for well aligned use of append */
     552    59876262 :     fd_memcpy( buf, data + (buf_cnt << FD_BLAKE3_PRIVATE_LG_BUF_MAX), buf_used );
     553    59876262 :     sha->block_sz = buf_used; /* In (0,FD_BLAKE3_PRIVATE_BUF_MAX) */
     554    59876262 :   }
     555             : 
     556    60179347 :   FD_BLAKE3_TRACE(( "fd_blake3_append: done" ));
     557    60179347 :   return sha;
     558    60187996 : }
     559             : 
     560             : static void const *
     561             : fd_blake3_single_hash( fd_blake3_pos_t * s,
     562    55972404 :                        fd_blake3_buf_t * tbl ) {
     563    55972404 : #if FD_BLAKE3_PARA_MAX>1
     564    55972404 :   ulong tick = 0UL;
     565   128576391 :   while( !fd_blake3_is_finished( s, tick ) ) {
     566    72603987 :     fd_blake3_op_t ops[ FD_BLAKE3_PARA_MAX ] = {0};
     567    72603987 :     ulong          op_cnt = 0UL;
     568   179647468 :     while( op_cnt<FD_BLAKE3_PARA_MAX ) {
     569   179349375 :       fd_blake3_op_t * op = &ops[ op_cnt ];
     570   179349375 :       if( !fd_blake3_prepare( s, tbl, op, tick ) )
     571    72305894 :         break;
     572   107043481 :       op_cnt++;
     573   107043481 :     }
     574             : 
     575    72603987 :     fd_blake3_batch_hash( ops, op_cnt );
     576    72603987 :     tick++;
     577    72603987 :   }
     578             : #else
     579             :   while( !fd_blake3_is_finished( s, s->next_tick ) ) {
     580             :     fd_blake3_op_t op[1] = {0};
     581             :     if( !fd_blake3_prepare( s, tbl, op, s->next_tick ) )
     582             :       break;
     583             :     s->next_tick++;
     584             :     FD_BLAKE3_TRACE(( "fd_blake3_single_hash: compressing %hu bytes at layer %u, counter %lu, flags 0x%x",
     585             :                       op->sz, s->layer, op->counter, op->flags ));
     586             : #   if FD_HAS_SSE
     587             :     fd_blake3_sse_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
     588             : #   else
     589             :     fd_blake3_ref_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
     590             : #   endif
     591             :   }
     592             : #endif
     593    55972404 :   return tbl->slots[ s->layer ][0];
     594    55972404 : }
     595             : 
     596             : void *
     597             : fd_blake3_fini( fd_blake3_t * sha,
     598    27999213 :                 void *        hash ) {
     599             : 
     600             :   /* Unpack inputs */
     601             : 
     602    27999213 :   fd_blake3_pos_t * s        = &sha->pos;
     603    27999213 :   fd_blake3_buf_t * tbl      = &sha->buf;
     604    27999213 :   uchar *           buf      = sha->block;
     605    27999213 :   ulong             buf_used = sha->block_sz;
     606    27999213 :   FD_BLAKE3_TRACE(( "fd_blake3_fini(sha=%p,sz=%lu)", (void *)sha, s->input_sz ));
     607             : 
     608             :   /* TODO HACKY!! */
     609    27999213 :   s->input    = buf - ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ );
     610    27999213 :   s->input_sz = ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ ) + buf_used;
     611             : 
     612    27999213 :   void const * hash_ = fd_blake3_single_hash( s, tbl );
     613    27999213 :   memcpy( hash, hash_, 32UL );
     614    27999213 :   return hash;
     615    27999213 : }
     616             : 
     617             : /* fd_blake3_fini_xof_compress performs BLAKE3 compression (input
     618             :    hashing) for all blocks in the hash tree except for the root block.
     619             :    Root compression inputs are returned via the function's out pointers:
     620             :    On return, root_msg[0..64] contains the padded message input for the
     621             :    root block, root_cv_pre[0..64] contains the output chaining value of
     622             :    the previous block (or the BLAKE3 IV if root block is the only block
     623             :    in the hash operation, i.e. <=64 byte hash input).
     624             :    Other values (counter, flags, size) are re-derived by the XOF
     625             :    implementation using the blake3 state object. */
     626             : 
     627             : void
     628             : fd_blake3_fini_xof_compress( fd_blake3_t * sha,
     629             :                              uchar *       root_msg,
     630    32144855 :                              uchar *       root_cv_pre ) {
     631    32144855 :   fd_blake3_pos_t * s        = &sha->pos;
     632    32144855 :   fd_blake3_buf_t * tbl      = &sha->buf;
     633    32144855 :   uchar *           buf      = sha->block;
     634    32144855 :   ulong             buf_used = sha->block_sz;
     635             : 
     636             :   /* TODO HACKY!! */
     637    32144855 :   s->input    = buf - ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ );
     638    32144855 :   s->input_sz = ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ ) + buf_used;
     639             : 
     640             :   /* The root block is contained in a leaf.  Process all but the last
     641             :      blocks of the chunk.  (The last block is the "root" block) */
     642    32144855 :   if( s->input_sz<=FD_BLAKE3_CHUNK_SZ ) {
     643    28626365 :     fd_blake3_op_t op[1];
     644    28626365 :     if( !fd_blake3_prepare_leaf( s, tbl, op, s->next_tick ) )
     645           0 :       FD_LOG_ERR(( "fd_blake3_fini_xof_compress invariant violation: failed to prepare compression of <=1024 byte message (duplicate call to fini?)" ));
     646    28626365 : #if FD_HAS_SSE
     647    28626365 :     fd_blake3_sse_compress1( root_msg, op->msg, op->sz, op->counter, op->flags, root_cv_pre, NULL );
     648             : #else
     649             :     fd_blake3_ref_compress1( root_msg, op->msg, op->sz, op->counter, op->flags, root_cv_pre, NULL );
     650             : #endif
     651    28626365 :     return;
     652    28626365 :   }
     653             : 
     654             :   /* The root block is a branch node.  Continue working until there are
     655             :      only two blocks remaining. */
     656     3518490 :   ulong tick = sha->pos.next_tick+1;
     657    13137976 :   for(;;) {
     658    13137976 :     int l0_complete = fd_blake3_l0_complete( s );
     659    13137976 :     int ln_complete = s->live_cnt == 2UL;
     660    13137976 :     if( l0_complete & ln_complete ) break;
     661             : 
     662     9619486 : #if FD_BLAKE3_PARA_MAX>1
     663     9619486 :     fd_blake3_op_t ops[ FD_BLAKE3_PARA_MAX ] = {0};
     664     9619486 :     ulong          op_cnt = 0UL;
     665    43847714 :     while( op_cnt<FD_BLAKE3_PARA_MAX ) {
     666    43522048 :       fd_blake3_op_t * op = &ops[ op_cnt ];
     667    43522048 :       if( !fd_blake3_prepare( s, tbl, op, tick ) )
     668     9293820 :         break;
     669    34228228 :       op_cnt++;
     670    34228228 :     }
     671     9619486 :     if( FD_UNLIKELY( !op_cnt ) ) {
     672           0 :       FD_LOG_ERR(( "fd_blake3_fini_xof_compress invariant violation: failed to prepare branch compression with live_cnt=%lu (duplicate call to fini?)", s->live_cnt ));
     673           0 :     }
     674             : 
     675     9619486 :     fd_blake3_batch_hash( ops, op_cnt );
     676             : #else
     677             :     fd_blake3_op_t op[1] = {0};
     678             :     if( !fd_blake3_prepare( s, tbl, op, tick ) )
     679             :       break;
     680             : #   if FD_HAS_SSE
     681             :     fd_blake3_sse_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
     682             : #   else
     683             :     fd_blake3_ref_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
     684             : #   endif
     685             : #endif
     686     9619486 :     tick++;
     687     9619486 :   }
     688     3518490 : }
     689             : 
     690             : void *
     691             : fd_blake3_fini_2048( fd_blake3_t * sha,
     692    32144791 :                      void *        hash ) {
     693    32144791 :   FD_BLAKE3_TRACE(( "fd_blake3_fini_2048(sha=%p,hash=%p)", (void *)sha, hash ));
     694             : 
     695             :   /* Compress input until the last remaining piece of work is the BLAKE3
     696             :      root block.  This root block is put through the compression
     697             :      function repeatedly to "expand" the hash output (XOF hashing).
     698             :      Solana uses this to generate a 2048 byte 'LtHash' value.
     699             :      fd_blake3 does this SIMD-parallel for better performance. */
     700    32144791 :   uchar root_msg   [ 64 ] __attribute__((aligned(64)));
     701    32144791 :   uchar root_cv_pre[ 32 ] __attribute__((aligned(32)));
     702    32144791 :   fd_blake3_fini_xof_compress( sha, root_msg, root_cv_pre );
     703             : 
     704             :   /* Restore root block details */
     705    32144791 :   uint          last_block_sz    = 64u;
     706    32144791 :   uint          last_block_flags = FD_BLAKE3_FLAG_ROOT | FD_BLAKE3_FLAG_PARENT;
     707    32144791 :   ulong         ctr0             = 0UL;
     708    32144791 :   if( sha->pos.input_sz<=FD_BLAKE3_CHUNK_SZ ) {
     709    28626301 :     last_block_sz    = (uint)sha->pos.input_sz & 63u;
     710    28626301 :     if( fd_ulong_is_aligned( sha->pos.input_sz, 64 ) ) last_block_sz = 64;
     711    28626301 :     if( FD_UNLIKELY( sha->pos.input_sz==0UL        ) ) last_block_sz = 0u;
     712    28626301 :     last_block_flags = FD_BLAKE3_FLAG_ROOT | FD_BLAKE3_FLAG_CHUNK_END;
     713    28626301 :     if( sha->pos.input_sz<=FD_BLAKE3_BLOCK_SZ ) last_block_flags |= FD_BLAKE3_FLAG_CHUNK_START;
     714    28626301 :     ctr0             = sha->pos.leaf_idx-1UL;
     715    28626301 :   } else {
     716     3518490 :     fd_blake3_op_t op[1];
     717     3518490 :     if( FD_UNLIKELY( !fd_blake3_prepare( &sha->pos, &sha->buf, op, sha->pos.next_tick+1UL ) ) ) {
     718           0 :       FD_LOG_ERR(( "fd_blake3_fini_2048 invariant violation: failed to prepare branch root compression (duplicate call to fini?)" ));
     719           0 :     }
     720     3518490 :     memcpy( root_msg,    op->msg,      64UL );
     721     3518490 :     memcpy( root_cv_pre, FD_BLAKE3_IV, 32UL );
     722     3518490 :   }
     723    32144791 :   FD_BLAKE3_TRACE(( "fd_blake3_fini_2048: sz=%lu ctr0=%lu flags=%x",
     724    32144791 :                     sha->pos.input_sz, ctr0, last_block_flags ));
     725             : 
     726             :   /* Expand LtHash
     727             :      For now, this uses the generic AVX2/AVX512 compress backend.
     728             :      Could write a more optimized version in the future saving some of
     729             :      the matrix transpose work. */
     730   137160761 :   for( ulong i=0UL; i<32UL; i+=FD_BLAKE3_PARA_MAX ) {
     731    23563194 : #if FD_HAS_AVX512
     732    23563194 :     ulong  batch_data [ 16 ] __attribute__((aligned(64)));
     733   400574298 :     /*                     */ for( ulong j=0; j<16; j++ ) batch_data [ j ] = (ulong)root_msg;
     734   400574298 :     uint   batch_sz   [ 16 ]; for( ulong j=0; j<16; j++ ) batch_sz   [ j ] = last_block_sz;
     735   400574298 :     ulong  batch_ctr  [ 16 ]; for( ulong j=0; j<16; j++ ) batch_ctr  [ j ] = ctr0+i+j;
     736   400574298 :     uint   batch_flags[ 16 ]; for( ulong j=0; j<16; j++ ) batch_flags[ j ] = last_block_flags;
     737   400574298 :     void * batch_hash [ 16 ]; for( ulong j=0; j<16; j++ ) batch_hash [ j ] = (uchar *)hash + (i+j)*64;
     738   400574298 :     void * batch_cv   [ 16 ]; for( ulong j=0; j<16; j++ ) batch_cv   [ j ] = root_cv_pre;
     739    23563194 :     fd_blake3_avx512_compress16( 16UL, batch_data, batch_sz, batch_ctr, batch_flags, batch_hash, NULL, 64U, batch_cv );
     740             : #elif FD_HAS_AVX
     741   733074984 :     ulong  batch_data [ 8 ]; for( ulong j=0; j<8; j++ ) batch_data [ j ] = (ulong)root_msg;
     742   733074984 :     uint   batch_sz   [ 8 ]; for( ulong j=0; j<8; j++ ) batch_sz   [ j ] = last_block_sz;
     743   733074984 :     ulong  batch_ctr  [ 8 ]; for( ulong j=0; j<8; j++ ) batch_ctr  [ j ] = ctr0+i+j;
     744   733074984 :     uint   batch_flags[ 8 ]; for( ulong j=0; j<8; j++ ) batch_flags[ j ] = last_block_flags;
     745   733074984 :     void * batch_hash [ 8 ]; for( ulong j=0; j<8; j++ ) batch_hash [ j ] = (uchar *)hash + (i+j)*64;
     746   733074984 :     void * batch_cv   [ 8 ]; for( ulong j=0; j<8; j++ ) batch_cv   [ j ] = root_cv_pre;
     747    81452776 :     fd_blake3_avx_compress8( 8UL, batch_data, batch_sz, batch_ctr, batch_flags, batch_hash, NULL, 64U, batch_cv );
     748             : #elif FD_HAS_SSE
     749             :     fd_blake3_sse_compress1( (uchar *)hash+i*64, root_msg, last_block_sz, ctr0+i, last_block_flags, NULL, root_cv_pre );
     750             : #else
     751             :     fd_blake3_ref_compress1( (uchar *)hash+i*64, root_msg, last_block_sz, ctr0+i, last_block_flags, NULL, root_cv_pre );
     752             : #endif
     753   105015970 :   }
     754             : 
     755    32144791 :   FD_BLAKE3_TRACE(( "fd_blake3_fini_2048: done" ));
     756    32144791 :   return hash;
     757    32144791 : }
     758             : 
     759             : void *
     760             : fd_blake3_hash( void const * data,
     761             :                 ulong        sz,
     762    27973191 :                 void *       hash ) {
     763             : 
     764    27973191 :   fd_blake3_buf_t tbl[1];
     765    27973191 :   fd_blake3_pos_t s[1];
     766    27973191 :   fd_blake3_pos_init( s, data, sz );
     767             : 
     768    27973191 : #if FD_BLAKE3_PARA_MAX>1
     769    32047445 :   for(;;) {
     770    32047445 :     fd_blake3_op_t op[1];
     771    32047445 :     if( !fd_blake3_prepare_fast( s, tbl, op, FD_BLAKE3_PARA_MAX, 4 ) )
     772    27973191 :       break;
     773      965162 : #if FD_HAS_AVX512
     774      965162 :     fd_blake3_avx512_compress16_fast( op->msg, op->out, op->counter, op->flags );
     775             : #elif FD_HAS_AVX
     776     3109092 :     fd_blake3_avx_compress8_fast( op->msg, op->out, op->counter, op->flags );
     777             : #else
     778             :     #error "missing para support"
     779             : #endif
     780     4074254 :   }
     781    27973191 : #endif
     782             : 
     783    27973191 :   void const * hash_ = fd_blake3_single_hash( s, tbl );
     784    27973191 :   memcpy( hash, hash_, 32UL );
     785    27973191 :   return hash;
     786    27973191 : }
     787             : 
     788             : #if FD_HAS_AVX
     789             : 
     790             : void
     791             : fd_blake3_lthash_batch8(
     792             :     void const * batch_data[8],  /* align=32 ele_align=1 */
     793             :     uint const   batch_sz  [8],  /* align=32 */
     794             :     void *       out_lthash      /* align=32 */
     795      300006 : ) {
     796      300006 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_data, 32 ) ) ) {
     797           0 :     FD_LOG_ERR(( "misaligned batch_data: %p", (void *)batch_data ));
     798           0 :   }
     799      300006 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_sz, 32 ) ) ) {
     800           0 :     FD_LOG_ERR(( "misaligned batch_sz: %p", (void *)batch_sz ));
     801           0 :   }
     802      300006 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)out_lthash, 32 ) ) ) {
     803           0 :     FD_LOG_ERR(( "misaligned out_lthash: %p", (void *)out_lthash ));
     804           0 :   }
     805             : 
     806      300006 :   ulong batch_ctr  [ 8 ] = {0};
     807     2700054 :   uint  batch_flags[ 8 ]; for( uint i=0; i<8; i++ ) batch_flags[ i ] = FD_BLAKE3_FLAG_ROOT;
     808      300006 :   fd_blake3_avx_compress8( 8UL, batch_data, batch_sz, batch_ctr, batch_flags, NULL, out_lthash, 32U, NULL );
     809      300006 : }
     810             : 
     811             : #endif
     812             : 
     813             : #if FD_HAS_AVX512
     814             : 
     815             : void
     816             : fd_blake3_lthash_batch16(
     817             :     void const * batch_data[16],  /* align=32 ele_align=1 */
     818             :     uint const   batch_sz  [16],  /* align=32 */
     819             :     void *       out_lthash      /* align=32 */
     820      100001 : ) {
     821      100001 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_data, 64 ) ) ) {
     822           0 :     FD_LOG_ERR(( "misaligned batch_data: %p", (void *)batch_data ));
     823           0 :   }
     824      100001 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_sz, 64 ) ) ) {
     825           0 :     FD_LOG_ERR(( "misaligned batch_sz: %p", (void *)batch_sz ));
     826           0 :   }
     827      100001 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)out_lthash, 64 ) ) ) {
     828           0 :     FD_LOG_ERR(( "misaligned out_lthash: %p", (void *)out_lthash ));
     829           0 :   }
     830             : 
     831      100001 :   ulong batch_ctr  [ 16 ] = {0};
     832     1700017 :   uint  batch_flags[ 16 ]; for( uint i=0; i<16; i++ ) batch_flags[ i ] = FD_BLAKE3_FLAG_ROOT;
     833      100001 :   fd_blake3_avx512_compress16( 16UL, batch_data, batch_sz, batch_ctr, batch_flags, NULL, out_lthash, 32U, NULL );
     834      100001 : }
     835             : 
     836             : #endif

Generated by: LCOV version 1.14