LCOV - code coverage report
Current view: top level - ballet/blake3 - fd_blake3.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 15 15 100.0 %
Date: 2025-09-18 04:41:32 Functions: 0 0 -

          Line data    Source code
       1             : #ifndef HEADER_fd_src_ballet_blake3_fd_blake3_h
       2             : #define HEADER_fd_src_ballet_blake3_fd_blake3_h
       3             : 
       4             : #include "../fd_ballet_base.h"
       5             : #if FD_HAS_AVX
       6             : #include "../../util/simd/fd_avx.h"
       7             : #endif
       8             : 
       9             : /* fd_blake3 provides APIs for BLAKE3 hashing of messages.
      10             : 
      11             :    The BLAKE3 specification is available here:
      12             :    https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf
      13             : 
      14             :    ### High-level overview
      15             : 
      16             :    fd_blake3 provides the "hash" mode of BLAKE3 with variable size
      17             :    output.  Keyed hashing and key derivation are not supported.  For
      18             :    hashes with more than 1024 bytes of input data, uses SIMD parallelism
      19             :    depending on hardware capabilities.  For smaller message sizes, use
      20             :    the batch API to process multiple independent inputs in parallel.
      21             : 
      22             :    ### Usage (simple)
      23             : 
      24             :      fd_blake3_t hasher[1];
      25             :      fd_blake3_init( hasher );
      26             :      fd_blake3_append( hasher, data, sz );
      27             :      uchar hash[ 32 ];
      28             :      fd_blake3_fini( hasher, hash );
      29             : 
      30             :    ### Usage (batched)
      31             : 
      32             :      ... TODO ...
      33             : 
      34             :    ### Hash Construction
      35             : 
      36             :    The "core" of BLAKE3 is an add-rotate-xor compression function with
      37             :    a 512-bit state size.  This state is created from the following
      38             :    896-bit input:
      39             : 
      40             :    - 256-bit chaining value (optionally used to create a hash chain)
      41             :    - 512-bit input data
      42             :    -  64-bit counter
      43             :    -  32-bit input data size
      44             :    -  32-bit flags
      45             : 
      46             :    The BLAKE3 hash is constructed purely by repeated invocation of the
      47             :    compression function while mixing in input data and metadata.
      48             : 
      49             :    At a high-level, there exist two phases: Compress, and expand.
      50             :    The data dependencies of the compression phase form a hash tree,
      51             :    ending in a 896-bit root input.  In the expand phase, the compression
      52             :    function is repeatedly applied on the root input with increasing
      53             :    counter values (each call producing 512-bit of final output data).
      54             : 
      55             :    The compress phase is further divided into the chunk phase and the
      56             :    tree phase.  In the chunk phase, each 8192-bit input is hashed to
      57             :    a 256-bit output via serial calls to the compression function.
      58             :    (Note that each chunk can be computed independently)
      59             : 
      60             :    In the tree phase, the chunks are joined pairwise into a hash tree.
      61             : 
      62             :    Figure 1 illustrates a BLAKE3 hash tree with a 2170 byte input
      63             :    (34 chunks in{X}), one branch nodes (b{Y}), the root state (RS), and
      64             :    a 192 byte hash output (h{Z}). */
      65             : 
      66             :      /*** Figure 1: BLAKE3 Hash Tree ******************************
      67             :      *                                                            *
      68             :      *          ┌────┐    ┌────┐    ┌────┐     ─┐                 *
      69             :      *          │ h0 │    │ h1 │    │ h2 │      │                 *
      70             :      *          └──▲─┘    └─▲──┘    └─▲──┘      ├─ Expand         *
      71             :      *             │        │         │         │                 *
      72             :      *             └───────┐│┌────────┘        ─┘                 *
      73             :      *                     │││                                    *
      74             :      *                    ┌┴┴┴─┐               ─┐                 *
      75             :      *            ┌──────►│ RS ├───────┐        │                 *
      76             :      *            │       └────┘       │        │                 *
      77             :      *            │                    │        ├─ Compress Tree  *
      78             :      *          ┌─┴──┐                 │        │                 *
      79             :      *      ┌──►│ b0 │◄──┐             │        │                 *
      80             :      *      │   └────┘   │             │       ─┘                 *
      81             :      *      │            │             │                          *
      82             :      *   ┌──┴───┐     ┌──┴───┐      ┌──┴───┐   ─┐                 *
      83             :      *   │ in15 │     │ in31 │      │ in33 │    │                 *
      84             :      *   └──▲───┘     └──▲───┘      └──▲───┘    │                 *
      85             :      *      │            │             │        │                 *
      86             :      *     ...          ...         ┌──┴───┐    │                 *
      87             :      *      ▲            ▲          │ in32 │    │                 *
      88             :      *      │            │          └──────┘    ├─ Compress Chunk *
      89             :      *   ┌──┴───┐     ┌──┴───┐                  │                 *
      90             :      *   │ in1  │     │ in17 │                  │                 *
      91             :      *   └──▲───┘     └──▲───┘                  │                 *
      92             :      *      │            │                      │                 *
      93             :      *   ┌──┴───┐     ┌──┴───┐                  │                 *
      94             :      *   │ in0  │     │ in16 │                  │                 *
      95             :      *   └──────┘     └──────┘                 ─┘                 *
      96             :      *                                                            *
      97             :      **************************************************************/
      98             : 
      99             : /* ### Implementation
     100             : 
     101             :    fd_blake3 consists of three major parts:
     102             : 
     103             :      (1) Hash state machines, which track the progress of hash
     104             :          calculations and prepare operations to advance them;
     105             :      (2) Schedulers, which accumulate batches of operations from state
     106             :          machines, then send them to hash backends;
     107             :      (3) Hash backends (SSE, AVX2, AVX512) which work off a static size
     108             :          vector of independent hash operations.
     109             : 
     110             :    The goal is to maximize throughput.  The fastest backend usually is
     111             :    the widest, creating a scheduling problem.  The scheduler should be
     112             :    able to flexibly schedule operations in parallel without taking up
     113             :    valuable time that could be used for hashing.
     114             : 
     115             :    The simplest opportunity to parallelize is during chunk compression.
     116             :    The bulk of the work is done in the chunk phase, independently for
     117             :    each 1024 bytes of input data.  This is effective for inputs of size
     118             :    (width*FD_CHUNK_SZ), i.e. >=8192 bytes of input for AVX2.
     119             : 
     120             :    To accelerate processing of smaller inputs, a batch API is offered.
     121             :    Batching allows the scheduler to process operations over multiple
     122             :    independent messages at once. This has a significantly higher
     123             :    scheduling overhead though.
     124             : 
     125             :    It is worth noting that compression operations require a variable
     126             :    amount of compression function calls.  (Recall that each call
     127             :    processes 64 bytes of input data, but a chunk can have up to 1024
     128             :    bytes of data)  fd_blake3 therefore has an internal clock that ticks
     129             :    each time a hash backend processes a vector of blocks.  When a state
     130             :    machine schedules an op with a 1024 byte input, it knows that the op
     131             :    completes 16 ticks into the future. */
     132             : 
     133             : 
     134             : /* Protocol constants *************************************************/
     135             : 
     136             : /* FD_BLAKE3_BLOCK_SZ is the byte size of the inputs to the internal
     137             :    compression function.  This is a protocol constant. */
     138             : 
     139             : #define FD_BLAKE3_BLOCK_LG_SZ (6)
     140  4287908907 : #define FD_BLAKE3_BLOCK_SZ    (64UL)
     141             : 
     142             : /* FD_BLAKE3_OUTCHAIN_SZ is the byte size of an "output chaining
     143             :    value".  This is a protocol constant. */
     144             : 
     145   124101779 : #define FD_BLAKE3_OUTCHAIN_LG_SZ (5)
     146    28626365 : #define FD_BLAKE3_OUTCHAIN_SZ    (32UL)
     147             : 
     148             : /* FD_BLAKE3_CHUNK_SZ is the max number of input bytes of a leaf node.
     149             :    This is a protocol constant.
     150             :    (1<<FD_BLAKE3_CHUNK_LG_SZ)==FD_BLAKE3_CHUNK_SZ */
     151             : 
     152  1168972154 : #define FD_BLAKE3_CHUNK_LG_SZ (10)
     153   211873033 : #define FD_BLAKE3_CHUNK_SZ    (1024UL)
     154             : 
     155             : /* FD_BLAKE3_KEY_SZ is the byte size of the optional key in expanded
     156             :    form.  This is a protocol constant. */
     157             : 
     158             : #define FD_BLAKE3_KEY_SZ (32UL)
     159             : 
     160             : /* Implementation constants *******************************************/
     161             : 
     162             : /* FD_BLAKE3_ROW_CNT is the max supported tree height of fd_blake3. */
     163             : 
     164             : #define FD_BLAKE3_ROW_CNT (32UL)
     165             : 
     166             : /* FD_BLAKE3_INPUT_MAX_SZ is the max supported message size of
     167             :    fd_blake3, derived by FD_BLAKE3_ROW_CNT.  (About 4.40 terabytes) */
     168             : 
     169             : #define FD_BLAKE3_INPUT_MAX_SZ ((1UL<<FD_BLAKE3_ROW_CNT)<<FD_BLAKE3_CHUNK_LG_SZ)
     170             : 
     171             : /* FD_BLAKE3_COL_CNT is the max number of adjacent tree nodes to be
     172             :    buffered per hash state.  Used for parallel processing.
     173             :    (1<<FD_BLAKE3_COL_LG_CNT) == FD_BLAKE3_COL_CNT */
     174             : 
     175             : #if FD_HAS_AVX512
     176             : #define FD_BLAKE3_COL_LG_CNT ( 5UL)
     177    81834475 : #define FD_BLAKE3_COL_CNT    (32UL)
     178             : #else
     179             : #define FD_BLAKE3_COL_LG_CNT ( 4UL)
     180   157682632 : #define FD_BLAKE3_COL_CNT    (16UL)
     181             : #endif
     182             : 
     183             : /* FD_BLAKE3_{ALIGN,FOOTPRINT} describe the alignment and footprint needed
     184             :    for a memory region to hold a fd_blake3_t.  ALIGN is a positive
     185             :    integer power of 2.  FOOTPRINT is a multiple of align.  ALIGN is
     186             :    recommended to be at least double cache line to mitigate various
     187             :    kinds of false sharing.  These are provided to facilitate compile
     188             :    time declarations. */
     189             : 
     190          75 : #define FD_BLAKE3_ALIGN (128UL)
     191             : 
     192             : /* A fd_blake3_t should be treated as an opaque handle of a blake3
     193             :    calculation state.  (It technically isn't here facilitate compile
     194             :    time declarations of fd_blake3_t memory.) */
     195             : 
     196    88117280 : #define FD_BLAKE3_MAGIC (0xF17EDA2CEB1A4E30) /* FIREDANCE BLAKE3 V0 */
     197             : 
     198             : /* Hash state machine *************************************************/
     199             : 
     200             : /* fd_blake3_pos_t is a hash state machine.  The user should consider
     201             :    this struct implementation-defined.  It prepares inputs to all
     202             :    compression function calls.  It also tracks dependencies between
     203             :    those calls.  For every fd_blake3_pos_t, there is a fd_blake3_buf_t.
     204             :    Depending on input size, it may be able to prepare multiple ops that
     205             :    can be worked on in parallel. */
     206             : 
     207             : struct __attribute__((aligned(FD_BLAKE3_ALIGN))) fd_blake3_pos {
     208             : 
     209             :   /* The tail and head arrays track the hash progress of each tree
     210             :      layer.  head.uc[n] is the number of nodes buffered for that layer.
     211             :      tail.uc[n] is the number of nodes already hashed into the next
     212             :      layer.  The 32-byte "output chaining value" for that node is stored
     213             :      in fd_blake3_buf_t.   */
     214             : 
     215             :   /* This point is 128-byte aligned */
     216             : 
     217             : # if FD_HAS_AVX
     218             :   union { uchar uc[ 32 ]; wb_t wb; } tail;
     219             :   union { uchar uc[ 32 ]; wb_t wb; } head;
     220             : # else
     221             :   union { uchar uc[ 32 ]; } tail;
     222             :   union { uchar uc[ 32 ]; } head;
     223             : # endif
     224             : 
     225             :   /* leaf_idx is the number of leaf chunks processed so far.  All but
     226             :      the last leaf chunk are of size FD_CHUNK_SZs.  live_cnt is the
     227             :      number of nodes for which an output chaining value is buffered and
     228             :      awaiting further processing.  next_tick keeps track of relative
     229             :      time to inform scheduling when a batch of operations will complete.
     230             :      layer is the tree layer that the scheduler will work on next. */
     231             : 
     232             :   /* This point is 64-byte aligned */
     233             : 
     234             :   ulong leaf_idx;
     235             :   ulong live_cnt;
     236             :   ulong next_tick;
     237             :   uint  layer;
     238             :   uchar _pad[4];
     239             : 
     240             :   /* [input,input+input_sz) is the user-provided memory region
     241             :      containing the hash input.  May be unaligned.  */
     242             : 
     243             :   /* This point is 32-byte aligned */
     244             : 
     245             :   uchar const * input;
     246             :   ulong         input_sz;
     247             : 
     248             :   /* magic==FD_BLAKE3_MAGIC (useful for debugging and detecting memory
     249             :      corruption) */
     250             : 
     251             :   ulong magic;
     252             : 
     253             : };
     254             : 
     255             : typedef struct fd_blake3_pos fd_blake3_pos_t;
     256             : 
     257             : /* fd_blake3_buf_t contains intermediate results of hash tree
     258             :    construction.  Internally, it is a table of output chaining values.
     259             :    Each row contains a contiguous window of output chaining values for
     260             :    the nodes at a specific tree layer.  Row 0 is the leaf layer. */
     261             : 
     262             : union __attribute__((aligned(FD_BLAKE3_ALIGN))) fd_blake3_buf {
     263             : 
     264             :   uchar slots[ FD_BLAKE3_ROW_CNT ][ FD_BLAKE3_COL_CNT ][ FD_BLAKE3_OUTCHAIN_SZ ];
     265             :   uchar rows [ FD_BLAKE3_ROW_CNT ][ FD_BLAKE3_COL_CNT *  FD_BLAKE3_OUTCHAIN_SZ ];
     266             : 
     267             : };
     268             : 
     269             : typedef union fd_blake3_buf fd_blake3_buf_t;
     270             : 
     271             : /* Simple API *********************************************************/
     272             : 
     273             : #if FD_HAS_AVX512
     274   178496379 : #define FD_BLAKE3_PARA_LG_MAX (4UL)
     275             : #elif FD_HAS_AVX
     276   374394732 : #define FD_BLAKE3_PARA_LG_MAX (3UL)
     277             : #else
     278             : #define FD_BLAKE3_PARA_LG_MAX (0UL)
     279             : #endif
     280   372608213 : #define FD_BLAKE3_PARA_MAX (1<<FD_BLAKE3_PARA_LG_MAX)
     281             : 
     282   180282898 : #define FD_BLAKE3_PRIVATE_LG_BUF_MAX (FD_BLAKE3_PARA_LG_MAX+FD_BLAKE3_CHUNK_LG_SZ)
     283    60227289 : #define FD_BLAKE3_PRIVATE_BUF_MAX    (1UL<<FD_BLAKE3_PRIVATE_LG_BUF_MAX)
     284             : 
     285             : struct fd_blake3 {
     286             :   fd_blake3_buf_t buf;
     287             :   uchar           block[ FD_BLAKE3_PRIVATE_BUF_MAX ];
     288             :   fd_blake3_pos_t pos;
     289             :   ulong           block_sz;
     290             : };
     291             : 
     292             : typedef struct fd_blake3 fd_blake3_t;
     293             : 
     294          24 : #define FD_BLAKE3_FOOTPRINT (sizeof(fd_blake3_t))
     295             : 
     296             : FD_PROTOTYPES_BEGIN
     297             : 
     298             : /* fd_blake3_{align,footprint,new,join,leave,delete} usage is identical to
     299             :    that of their fd_sha512 counterparts.  See ../sha512/fd_sha512.h */
     300             : 
     301             : FD_FN_CONST ulong
     302             : fd_blake3_align( void );
     303             : 
     304             : FD_FN_CONST ulong
     305             : fd_blake3_footprint( void );
     306             : 
     307             : void *
     308             : fd_blake3_new( void * shmem );
     309             : 
     310             : fd_blake3_t *
     311             : fd_blake3_join( void * shsha );
     312             : 
     313             : void *
     314             : fd_blake3_leave( fd_blake3_t * sha );
     315             : 
     316             : void *
     317             : fd_blake3_delete( void * shsha );
     318             : 
     319             : /* fd_blake3_init starts a blake3 calculation.  sha is assumed to be a
     320             :    current local join to a blake3 calculation state with no other
     321             :    concurrent operation that would modify the state while this is
     322             :    executing.  Any preexisting state for an in-progress or recently
     323             :    completed calculation will be discarded.  Returns sha (on return, sha
     324             :    will have the state of a new in-progress calculation). */
     325             : 
     326             : fd_blake3_t *
     327             : fd_blake3_init( fd_blake3_t * sha );
     328             : 
     329             : /* fd_blake3_append adds sz bytes locally pointed to by data an
     330             :    in-progress blake3 calculation.  sha, data and sz are assumed to be
     331             :    valid (i.e. sha is a current local join to a blake3 calculation state
     332             :    with no other concurrent operations that would modify the state while
     333             :    this is executing, data points to the first of the sz bytes and will
     334             :    be unmodified while this is running with no interest retained after
     335             :    return ... data==NULL is fine if sz==0).  Returns sha (on return, sha
     336             :    will have the updated state of the in-progress calculation).
     337             : 
     338             :    It does not matter how the user group data bytes for a blake3
     339             :    calculation; the final hash will be identical.  It is preferable for
     340             :    performance to try to append as many bytes as possible as a time
     341             :    though.  It is also preferable for performance if sz is a multiple of
     342             :    64 for all but the last append (it is also preferable if sz is less
     343             :    than 56 for the last append). */
     344             : 
     345             : fd_blake3_t *
     346             : fd_blake3_append( fd_blake3_t * sha,
     347             :                   void const *  data,
     348             :                   ulong         sz );
     349             : 
     350             : /* fd_blake3_fini finishes a a blake3 calculation.  sha and hash are
     351             :    assumed to be valid (i.e. sha is a local join to a blake3 calculation
     352             :    state that has an in-progress calculation with no other concurrent
     353             :    operations that would modify the state while this is executing and
     354             :    hash points to the first byte of a 32-byte memory region where the
     355             :    result of the calculation should be stored).  Returns hash (on
     356             :    return, there will be no calculation in-progress on sha and 32-byte
     357             :    buffer pointed to by hash will be populated with the calculation
     358             :    result). */
     359             : 
     360             : void *
     361             : fd_blake3_fini( fd_blake3_t * sha,
     362             :                 void *        hash );
     363             : 
     364             : void *
     365             : fd_blake3_fini_2048( fd_blake3_t * sha,
     366             :                      void *        hash );
     367             : 
     368             : void *
     369             : fd_blake3_hash( void const * data,
     370             :                 ulong        sz,
     371             :                 void *       hash );
     372             : 
     373             : /* fd_blake3_lthash_batch{n} calculate a batch of n independent BLAKE3
     374             :    hash operations with 2048 byte XOF output, and up to 1024 byte input.
     375             :    The outputs are reduced down to a single value using 'LtHash'
     376             :    group-add arithmetic.
     377             : 
     378             :    batch_data[i] give a pointer to the input message.  batch_data is
     379             :    assumed to be 64-byte aligned.  batch_data[i] does not have to be
     380             :    aligned.  batch_sz[i] give the input size (in [0,1024]).  batch_sz is
     381             :    assumed to be 64-byte aligned.  On return, 2048 bytes of output are
     382             :    written to out_lthash.  out_lthash is assumed to be 64-byte aligned.
     383             : 
     384             :    Execution time is bound by the largest batch_sz[i] input. */
     385             : 
     386             : #if FD_HAS_AVX
     387             : 
     388             : void
     389             : fd_blake3_lthash_batch8(
     390             :     void const * batch_data[8],  /* align=32 ele_align=1 */
     391             :     uint const   batch_sz  [8],  /* align=32 */
     392             :     void *       out_lthash      /* align=32 */
     393             : );
     394             : 
     395             : #endif
     396             : 
     397             : #if FD_HAS_AVX512
     398             : 
     399             : void
     400             : fd_blake3_lthash_batch16(
     401             :     void const * batch_data[16],  /* align=64 ele_align=1 */
     402             :     uint const   batch_sz  [16],  /* align=64 */
     403             :     void *       out_lthash       /* align=64 */
     404             : );
     405             : 
     406             : #endif
     407             : 
     408             : FD_PROTOTYPES_END
     409             : 
     410             : #endif /* HEADER_fd_src_ballet_blake3_fd_blake3_h */

Generated by: LCOV version 1.14