LCOV - code coverage report
Current view: top level - ballet/bmtree - fd_bmtree.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 3 3 100.0 %
Date: 2025-07-01 05:00:49 Functions: 1 279 0.4 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_ballet_bmtree_fd_bmtree_h
       2             : #define HEADER_fd_src_ballet_bmtree_fd_bmtree_h
       3             : 
       4             : /* fd_bmtree provides APIs for working with binary Merkle trees that use
       5             :    the SHA256 hash function. */
       6             : 
       7             : #include "../../util/fd_util_base.h"
       8             : /* Binary Merkle trees are generally used as a vector commitment scheme
       9             :    wherein the root node of the tree commits the vector of leaf nodes.
      10             : 
      11             :    All methods provided by this Merkle tree derive from the following
      12             :    three basic operations:
      13             : 
      14             :      1. Construct leaf node:
      15             : 
      16             :         (leaf blob) -> (node)
      17             : 
      18             :      2. Construct branch node with two children:
      19             : 
      20             :         (node, node) -> (node)
      21             : 
      22             :      3. Construct branch node with one child:
      23             : 
      24             :         (node) -> (node)
      25             : 
      26             :    Example derived methods.
      27             :    (TODO now these all are more or less implemented, but not exactly as
      28             :    described. Is this distinction between basic and derived even useful
      29             :    though?)
      30             : 
      31             :      4. Construct full tree:
      32             : 
      33             :         (vector of leaf blobs) -> (tree of nodes)
      34             : 
      35             :      5. Create inclusion proof from tree data
      36             : 
      37             :         (tree of nodes, node index) -> (inclusion proof)
      38             : 
      39             :      6. Verify node inclusion proof
      40             : 
      41             :         (node, root node, inclusion proof) -> (bool)
      42             : 
      43             :    **Topology**
      44             : 
      45             :    Tree topology has the following constraints:
      46             : 
      47             :     - All leaf nodes are in the bottom level
      48             : 
      49             :     - If a given layer `l`
      50             :       with number of nodes `N_l` ...
      51             : 
      52             :       ... has exactly one node,
      53             :           this one node is the root node
      54             :           and forms the uppermost layer.
      55             : 
      56             :       ... has more than one node
      57             :           ... and `N_l % 2 == 0`,
      58             :               the layer above contains N_l/2 nodes
      59             : 
      60             :           ... and `N_l % 2 == 1`,
      61             :               the layer above contains (N_l+1)/2 nodes.
      62             : 
      63             :    A simple algorithm to approach such a a tree is as follows:
      64             :    (Note that the code uses here uses an optimized approach)
      65             : 
      66             :     - Start with the smallest complete binary tree that has at least
      67             :        `n` leaf nodes.
      68             :     - Label the leaf nodes from left to right `L_0`, `L_1`, ... `L_(n-1)`
      69             :     - Delete any un-labeled leaf nodes, and then recursively delete any
      70             :       nodes with no children.
      71             :     - For any nodes with a single remaining child, duplicate the link to
      72             :       the child.
      73             :     - Each non-leaf node now has exactly two children, counting
      74             :       duplicates.
      75             : 
      76             :    Example: Tree with 1 leaf node (root node = leaf node)
      77             : 
      78             :     L0
      79             : 
      80             :    Example: Tree with 4 leaf nodes
      81             : 
      82             :            Iδ
      83             :           /  \
      84             :          /    \
      85             :        Iα      Iβ
      86             :       /  \    /  \
      87             :      L0  L1  L2  L3
      88             : 
      89             :    Example: Tree with 5 leaf nodes
      90             : 
      91             :               Iζ
      92             :              /  \
      93             :             /    \
      94             :            Iδ     Iε
      95             :           /  \     \\
      96             :          /    \     \\
      97             :        Iα      Iβ    Iγ
      98             :       /  \    /  \   ||
      99             :      L0  L1  L2  L3  L4
     100             : 
     101             :    **Construction**
     102             : 
     103             :    The input data is a vector of arbitrary-sized binary blobs.
     104             :    First, each blob is converted to a fixed-size leaf node by hashing
     105             :    the blob in the `FD_BMTREE_PREFIX_LEAF` hash domain.
     106             : 
     107             :    Then, the hash function is recursively applied over pairs of nodes
     108             :    until there is only one node left (the root).
     109             : 
     110             :    `fd_bmtree_32` uses the full SHA-256 digest for each tree node
     111             :    and is thus considered cryptographically secure.
     112             : 
     113             :    `fd_bmtree_20` uses SHA-256 digests truncated to 160 bits.
     114             : 
     115             :    **Inclusion Proofs**
     116             : 
     117             :    Inclusion proofs are used to verify whether a set of leaf nodes is
     118             :    part of a commitment (identified by the root node).
     119             : 
     120             :    At a high level, inclusion proofs present a sequence of hash
     121             :    instructions that when executed result in the root commitment.
     122             : 
     123             :    Inclusion proof size is O(log n) with regards to tree node count.
     124             : 
     125             :    Various types of inclusion proofs exist:
     126             : 
     127             :      - Single inclusion proofs (over one leaf node)
     128             :      - Range inclusion proofs (over a contiguous range of leaf nodes)
     129             :      - Sparse inclusion proofs (over an arbitrary subset of leaf nodes) */
     130             : 
     131             : 
     132             : /* As of https://github.com/solana-labs/solana/pull/29339, Solana
     133             :    changed the second preimage resistance strategy to depend on whether
     134             :    it's the 20B shred tree or the 32B runtime tree.  In the 20B case,
     135             :    they prepend the full leaf_prefix (excluding the nul terminator).  In
     136             :    the 32B case, they just prepend a single 0x00 byte.  Similarly for
     137             :    internal nodes.  These prefixes are aligned and padded to facilitate
     138             :    use with AVX. */
     139     1449558 : #define FD_BMTREE_LONG_PREFIX_SZ  26UL
     140             : #define FD_BMTREE_SHORT_PREFIX_SZ 1UL
     141             : static uchar const fd_bmtree_leaf_prefix[32UL] __attribute__((aligned(32))) = "\x00SOLANA_MERKLE_SHREDS_LEAF";
     142             : static uchar const fd_bmtree_node_prefix[32UL] __attribute__((aligned(32))) = "\x01SOLANA_MERKLE_SHREDS_NODE";
     143             : 
     144             : 
     145             : /* bmtree_node_t is the hash of a tree node (e.g. SHA256-160 / SHA256
     146             :    for a 20 / 32 byte node size).  We declare it this way to make the
     147             :    structure very AVX friendly and to allow SHA256 to write directly
     148             :    into the hash even if BMTREE_HASH_SZ isn't 32. */
     149             : struct __attribute__((packed)) fd_bmtree_node {
     150             :   uchar hash[ 32 ]; /* Last bytes may not be meaningful */
     151             : };
     152             : typedef struct fd_bmtree_node fd_bmtree_node_t;
     153             : 
     154             : /* bmtree_hash_leaf computes `SHA-256(prefix|data), where prefix is the
     155             :    first prefix_sz bytes of fd_bmtree_leaf_prefix.  prefix_sz is
     156             :    typically FD_BMTREE_LONG_PREFIX_SZ or FD_BMTREE_SHORT_PREFIX_SZ.
     157             :    This is the first step in the creation of a Merkle tree.  Returns
     158             :    node.  U.B. if `node` and `data` overlap. */
     159             : fd_bmtree_node_t * fd_bmtree_hash_leaf( fd_bmtree_node_t * node, void const * data, ulong data_sz, ulong prefix_sz );
     160             : 
     161             : /* A fd_bmtree_commit_t stores intermediate state used to compute the
     162             :    root of a binary Merkle tree built incrementally.  It can be used for
     163             :    two different typed of calculations:
     164             :      * leaf-based commitment calculations  (fd_bmtree_commit_*)
     165             :      * proof-based commitment calculations (fd_bmtree_commitp_*)
     166             :     but only one at a time.
     167             : 
     168             :    For leaf-based commitment calculations, it theoretically requires
     169             :    O(log n) space with regard to the number of nodes, although n is
     170             :    currently capped (at an astronomical value), so it requires constant
     171             :    space.
     172             : 
     173             :    During the accumulation phase, the data structure consumes all tree
     174             :    leaf nodes sequentially while calculating and buffering branch nodes
     175             :    of upper layers along the way.
     176             : 
     177             :    In the finalization phase, the buffered branch node data is hashed to
     178             :    derive the final root hash.
     179             : 
     180             :    The separation of the accumulation and finalization phases is
     181             :    required for trees with leaf counts that are not powers of two.
     182             :    Those contain at least one branch node with only one child node.
     183             : 
     184             :    The node_buf is large enough to handle trees with ~2^63 leaves.  This
     185             :    is orders of magnitude more leaves are practical (it would take
     186             :    ~30,000 years at a rate of ~100 microsecond per leaf insert but if
     187             :    you are willing to wait to make a larger tree, increase 63 below). */
     188             : 
     189             : struct fd_bmtree_commit_private {
     190             :   /* Explanation of the above internal state in bmtree_commit_t:
     191             : 
     192             :      - `leaf_cnt` contains the number of leaf nodes that have been
     193             :      accumulated so far. It is synonymous to the index of within the
     194             :      vector of leaf nodes.
     195             : 
     196             :      This is used to check how many branch nodes in the upper layers can
     197             :      be derived with the currently known information.
     198             : 
     199             :      The current depth of the layers above the leaf nodes is the number
     200             :      of times the `leaf_cnt` is divisible by 2.
     201             : 
     202             :      - `node_buf` is indexed by layer, with 0 being the leaf layer.
     203             : 
     204             :      Given a layer `L` containing a vector of nodes known so far,
     205             :      `node_buf[L]` contains the right-most node in layer `L` (counting
     206             :      from the bottom) that is a left child of its parent.
     207             : 
     208             :      More precisely:
     209             : 
     210             :      The subset `L_left` contains all nodes with index `i` within that
     211             :      layer where `i%2==0`. Then, `node_buf[L]` contains the node with
     212             :      the largest index `i`within `L_left`.
     213             : 
     214             :    **Example**
     215             : 
     216             :    Step-by-step walkthrough of the internal state:
     217             : 
     218             :    Initialize
     219             :    - leaf_cnt    <- 0
     220             : 
     221             :    Insert leaf `l_0`
     222             :    - node_buf[0] <- l_0
     223             :    - leaf_cnt    <- 1
     224             : 
     225             :    Insert leaf `l_1`
     226             :    - b_0         <- hash_branch( node_buf[0], l_1 )
     227             :    - node_buf[1] <- b_0
     228             :    - leaf_cnt    <- 2
     229             : 
     230             :    Insert leaf `l_2`
     231             :    - node_buf[0] <- l_2
     232             :    - leaf_cnt    <- 3
     233             : 
     234             :    Insert leaf `l_3
     235             :    - b_0         <- hash_branch( node_buf[0], l_3 )
     236             :    - b_1         <- hash_branch( node_buf[1], b_0 )
     237             :    - node_buf[2] <- b_1
     238             :    - leaf_cnt    <- 4
     239             : 
     240             :    inclusion_proofs stores hashes of internal nodes from previous
     241             :    computation.  If 0 <= i < inclusion_proof_sz, then
     242             :    inclusion_proofs[i] stores the hash at node i of the tree numbered
     243             :    in complete binary search tree order.  E.g.
     244             : 
     245             :                   3
     246             :                 /   \
     247             :               1       5
     248             :              / \     //
     249             :             0   2   4
     250             : 
     251             :    This is a superset of what is stored in node_buf, but in order to not
     252             :    lose the log(n) cache utilization features when we don't care about
     253             :    inclusion proofs and are only trying to derive the root hash, we
     254             :    store them separately.
     255             : 
     256             :    In general, this binary search tree order is fairly friendly.  To
     257             :    find the layer of a node, you count the number of trailing 1 bits in
     258             :    its index.  To get the left/right child, you add/subtract 1<<layer.
     259             :    This ordering makes sense for these trees, because they grow up and
     260             :    to the right, the same way the numbers increase, so a node's index
     261             :    never changes as more leaves are added to the tree.
     262             : 
     263             :    The biggest subtlety comes when there are nodes with incomplete left
     264             :    subtrees, e.g.
     265             : 
     266             :                          7
     267             :                       /     \
     268             :                     /         \
     269             :                    3          ??
     270             :                  /   \        /
     271             :                 1     5      9
     272             :                / \   / \    /
     273             :               0   2  4  6  8
     274             : 
     275             :    What number belongs in the ?? spot?  By binary search order, it
     276             :    should be 10.  The natural index of that node is 7+4=11 though.  The
     277             :    indexing gets very complicated and error-prone if we try to store it
     278             :    in 10, so we prefer to store it in 11.  That means the size of our
     279             :    storage depends only on the maximum number of layers we expect in
     280             :    the tree, which is somewhat convenient.  This does waste up to
     281             :    O(leaf_cnt) space though. */
     282             : 
     283             :   ulong             leaf_cnt;         /* Number of leaves added so far */
     284             :   ulong             hash_sz;          /* <= 32 bytes */
     285             :   ulong             prefix_sz;        /* <= 26 bytes */
     286             :   ulong             inclusion_proof_sz;
     287             :   fd_bmtree_node_t  node_buf[ 63UL ];
     288             :   /* Dense bit set. Array indexed [0, ceil((inclusion_proof_sz+1)/64)).
     289             :      Points to memory just after the end of the inclusion_proofs array
     290             :      and included in the footprint.  Only used or set in proof-based
     291             :      commits, because it's implicit in the leaf_cnt in leaf-based
     292             :      commits. */
     293             :   ulong *           inclusion_proofs_valid;
     294             :   /* inclusion_proofs is indexed [0, inclusion_proof_sz] where index
     295             :      inclusion_proof_sz is a dummy index used to avoid branches. */
     296             :   fd_bmtree_node_t  inclusion_proofs[ 1 ];
     297             : };
     298             : 
     299             : typedef struct fd_bmtree_commit_private fd_bmtree_commit_t;
     300             : 
     301             : #define FD_BMTREE_COMMIT_FOOTPRINT( inclusion_proof_layer_cnt ) ((((sizeof(fd_bmtree_commit_t) + \
     302             :                                                                  ((1UL<<(inclusion_proof_layer_cnt))-1UL)*sizeof(fd_bmtree_node_t)+\
     303             :                                                                  ((1UL<<(inclusion_proof_layer_cnt))+63UL)/64UL*sizeof(ulong))+31UL)/32UL) * 32UL)
     304        1857 : #define FD_BMTREE_COMMIT_ALIGN                         (32UL)
     305             : 
     306             : FD_PROTOTYPES_BEGIN
     307             : 
     308             : /* bmtree_commit_{footprint,align} return the alignment and footprint
     309             :    required for a memory region to be used as a bmtree_commit_t.  If the
     310             :    tree does not exceed inclusion_proof_layer_cnt layers, then all
     311             :    inclusion proofs can be retrieved after finalization. */
     312             : ulong          fd_bmtree_commit_align    ( void );
     313             : ulong          fd_bmtree_commit_footprint( ulong inclusion_proof_layer_cnt );
     314             : 
     315             : /* bmtree_commit_init starts a vector commitment calculation of either
     316             :    type.  Assumes mem unused with required alignment and footprint.
     317             :    Returns mem as a bmtree_commit_t *, commit will be in a calc.
     318             :    prefix_sz is the size (in bytes) of the second-preimage resistance
     319             :    prefix used.  It's typically FD_BMTREE_LONG_PREFIX_SZ or
     320             :    FD_BMTREE_SHORT_PREFIX_SZ and must not be greater than
     321             :    FD_BMTREE_LONG_PREFIX_SZ.
     322             : 
     323             :    The calculation can also save some inclusion proof information such
     324             :    that if the final tree has no more than inclusion_proof_layer_cnt layers,
     325             :    inclusion proofs will be available for all leaves.  If the tree grows
     326             :    beyond inclusion_proof_layer_cnt layers, then inclusion proofs may
     327             :    not be available for any leaves.
     328             : 
     329             :    For proof-based commitments, inclusion_proof_layers must be at least
     330             :    as large as the number of layers in the tree.
     331             :    */
     332             : fd_bmtree_commit_t * fd_bmtree_commit_init     ( void * mem, ulong hash_sz, ulong prefix_sz, ulong inclusion_proof_layer_cnt );
     333             : 
     334             : /* bmtree_commit_leaf_cnt returns the number of leafs appended thus
     335             :    far.  Assumes state is valid. */
     336     6098781 : FD_FN_PURE static inline ulong fd_bmtree_commit_leaf_cnt ( fd_bmtree_commit_t const * bmt ) { return bmt->leaf_cnt; }
     337             : 
     338             : /* fd_bmtree_depth and fd_bmtree_node_cnt respectively return the number
     339             :    of layers (counting leaf and root) and total number of nodes in a
     340             :    binary Merkle tree with leaf_cnt leaves.  leaf_cnt in [0, ULONG_MAX].
     341             :    */
     342             : FD_FN_CONST ulong fd_bmtree_depth(    ulong leaf_cnt );
     343             : FD_FN_CONST ulong fd_bmtree_node_cnt( ulong leaf_cnt );
     344             : 
     345             : /* bmtree_commit_append appends a range of leaf nodes.  Assumes that
     346             :    leaf_cnt + new_leaf_cnt << 2^63 (which, unless planning on running
     347             :    for millennia, is always true). */
     348             : fd_bmtree_commit_t *                                                         /* Returns state */
     349             : fd_bmtree_commit_append( fd_bmtree_commit_t *                 state,         /* Assumed valid and in a leaf-based calc */
     350             :                          fd_bmtree_node_t const * FD_RESTRICT new_leaf,      /* Indexed [0,new_leaf_cnt) */
     351             :                          ulong                                new_leaf_cnt );
     352             : 
     353             : /* bmtree_commit_fini seals the commitment calculation by deriving the
     354             :    root node.  Assumes state is valid, in a leaf-based calc on entry
     355             :    with at least one leaf in the tree.  The state will be valid but no
     356             :    longer in a calc on return.  Returns a pointer in the caller's
     357             :    address space to the first byte of a memory region of BMTREE_HASH_SZ
     358             :    with to the root hash on success.  The lifetime of the returned
     359             :    pointer is that of the state or until the memory used for state gets
     360             :    initialized for a new calc. */
     361             : uchar * fd_bmtree_commit_fini( fd_bmtree_commit_t * state );
     362             : 
     363             : 
     364             : /* bmtree_get_proof writes an inclusion proof for the leaf
     365             :    with index leaf_idx to the memory at dest.  state must be a valid
     366             :    sealed bmtree commitment (leaf-based or proof-based) with at least
     367             :    leaf_idx+1 leaves.  state must have been initialized with
     368             :    inclusion_proof_layers_cnt >= the height of the tree, which you can
     369             :    get from fd_bmtree_depth( fd_bmtree_commit_leaf_cnt( state ) ).
     370             : 
     371             :    If these conditions are met, upon return, dest[ i ] for
     372             :    0<=i<hash_sz*(tree depth-1) will contain the inclusion proof, and the
     373             :    function will return the number of hashes written.  If
     374             :    inclusion_proof_layers_cnt was initialized to too small of a value,
     375             :    this function will return -1 and the memory pointed to by dest will
     376             :    not be modified.
     377             : 
     378             :    The inclusion proof is ordered from leaf to root but excludes the
     379             :    actual root of the tree. */
     380             : /* FIXME: Returning -1 is pretty bad here, but 0 is the legitimate
     381             :    proof size of a 1 node tree.  Is that case worth distinguishing? */
     382             : int
     383             : fd_bmtree_get_proof( fd_bmtree_commit_t * state,
     384             :                      uchar *              dest,
     385             :                      ulong                leaf_idx );
     386             : 
     387             : /* fd_bmtree_from_proof derives the root of a Merkle tree where the
     388             :    element with hash `leaf` is the leaf_idx^th leaf and proof+hash_sz*i
     389             :    contains its sibling at the ith level (counting from the bottom, with
     390             :    the sibling leaf being the i=0 value).  The full root hash (i.e.
     391             :    untruncated regardless of hash_sz) will be stored in root upon
     392             :    return.
     393             :    Does not retain any read or write interests after returning, and it
     394             :    operates independently of normal tree construction, so it neither
     395             :    starts nor ends a calc, and it can safely be done in the middle of a
     396             :    calc.
     397             : 
     398             :    Memory regions should not overlap.
     399             : 
     400             :    The proof consists of proof_depth hashes, each hash_sz bytes
     401             :    concatenated with no padding ordered from leaf to root, excluding the
     402             :    root.
     403             : 
     404             :    Returns root if the proof is valid and NULL otherwise.  If the proof
     405             :    is invalid, the root will not be stored.  A proof can only be invalid
     406             :    if it is too short to possibly correspond to the leaf_idx^th node. */
     407             : /* TODO: Write the caching version of this */
     408             : fd_bmtree_node_t *
     409             : fd_bmtree_from_proof( fd_bmtree_node_t const * leaf,
     410             :                       ulong                    leaf_idx, /* in [0, ULONG_MAX) */
     411             :                       fd_bmtree_node_t *       root,
     412             :                       uchar const *            proof,
     413             :                       ulong                    proof_depth, /* in [0, 63] */
     414             :                       ulong                    hash_sz, /* in [1, 32] */
     415             :                       ulong                    prefix_sz /* either LONG_PREFIX_SZ or SHORT_PREFIX_SZ */ );
     416             : 
     417             : 
     418             : /* fd_bmtree_commitp_insert_with_proof inserts a leaf at index idx in
     419             :    the proof-based calc, optionally with some proof.  Returns 1 if
     420             :    the leaf and proof are consistent with everything previously added to
     421             :    this calc, or 0 if not.
     422             : 
     423             :    fd_bmtree_depth( idx+1 ) must be <= inclusion_proof_layer_cnt used in
     424             :    init.  Additionally, proof_depth<inclusion_proof_layer_cnt.
     425             : 
     426             :    Like all the other functions in this file that deal with inclusion
     427             :    proofs, the proof format is leaf to root, excluding the root, where
     428             :    each of the proof_depth hashes occupies hash_sz bytes and there is no
     429             :    padding.  Truncated proof_depths are fine and are interpreted as the
     430             :    first proof_depth elements of the proof, i.e. the ones closer to the
     431             :    leaf.  In particular, a proof_depth of 0 is fine, in which case
     432             :    proof==NULL is fine.
     433             : 
     434             :    If this returns success and opt_root is not NULL, the highest node
     435             :    (closest to the root) in the branch containing idx that is known will
     436             :    be written to the memory pointed to by opt_root.  In the case that
     437             :    the inclusion proof is full (contains all the nodes except the root),
     438             :    the node that is stored is the root of the tree; however, in general
     439             :    the tree can grow beyond this, so it isn't possible to guarantee that
     440             :    it is the root in other cases.  If the function returns failure (0)
     441             :    or opt_root==NULL, then the memory pointed to by opt_root will not be
     442             :    accessed.
     443             : 
     444             :    If this returns 0, the commitment state will not be modified.  If it
     445             :    returns 1, then the information provided will be cached to speed up
     446             :    other validations in this calc.
     447             : 
     448             :    Note that a return value of 1 does not necessarily imply the leaf is
     449             :    correct, as there may not be enough information to determine it yet.
     450             :    In that case, fd_bmtreep_fini or another call to fd_bmtreep_insert
     451             :    will return 0. */
     452             : 
     453             : int
     454             : fd_bmtree_commitp_insert_with_proof( fd_bmtree_commit_t *     state,
     455             :                                      ulong                    idx,
     456             :                                      fd_bmtree_node_t const * new_leaf,
     457             :                                      uchar            const * proof,
     458             :                                      ulong                    proof_depth,
     459             :                                      fd_bmtree_node_t       * opt_root );
     460             : 
     461             : /* fd_bmtree_commitp_fini finalizes a proof-based calc.  Returns the
     462             :    root of the tree if it can conclusively determine that the entire
     463             :    tree is correct for a commitment of leaf_cnt leaf nodes and NULL
     464             :    otherwise. */
     465             : uchar * fd_bmtree_commitp_fini( fd_bmtree_commit_t * state, ulong leaf_cnt );
     466             : 
     467             : FD_PROTOTYPES_END
     468             : #endif /* HEADER_fd_src_ballet_bmtree_fd_bmtree_h */

Generated by: LCOV version 1.14