LCOV - code coverage report
Current view: top level - ballet/bmtree - fd_bmtree.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 3 3 100.0 %
Date: 2024-11-13 11:58:15 Functions: 1 181 0.6 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_ballet_bmtree_fd_bmtree_h
       2             : #define HEADER_fd_src_ballet_bmtree_fd_bmtree_h
       3             : 
       4             : /* fd_bmtree provides APIs for working with binary Merkle trees that use
       5             :    the SHA256 hash function. */
       6             : 
       7             : #include "../../util/fd_util_base.h"
       8             : /* Binary Merkle trees are generally used as a vector commitment scheme
       9             :    wherein the root node of the tree commits the vector of leaf nodes.
      10             : 
      11             :    All methods provided by this Merkle tree derive from the following
      12             :    three basic operations:
      13             : 
      14             :      1. Construct leaf node:
      15             : 
      16             :         (leaf blob) -> (node)
      17             : 
      18             :      2. Construct branch node with two children:
      19             : 
      20             :         (node, node) -> (node)
      21             : 
      22             :      3. Construct branch node with one child:
      23             : 
      24             :         (node) -> (node)
      25             : 
      26             :    Example derived methods.
      27             :    (TODO now these all are more or less implemented, but not exactly as
      28             :    described. Is this distinction between basic and derived even useful
      29             :    though?)
      30             : 
      31             :      4. Construct full tree:
      32             : 
      33             :         (vector of leaf blobs) -> (tree of nodes)
      34             : 
      35             :      5. Create inclusion proof from tree data
      36             : 
      37             :         (tree of nodes, node index) -> (inclusion proof)
      38             : 
      39             :      6. Verify node inclusion proof
      40             : 
      41             :         (node, root node, inclusion proof) -> (bool)
      42             : 
      43             :    **Topology**
      44             : 
      45             :    Tree topology has the following constraints:
      46             : 
      47             :     - All leaf nodes are in the bottom level
      48             : 
      49             :     - If a given layer `l`
      50             :       with number of nodes `N_l` ...
      51             : 
      52             :       ... has exactly one node,
      53             :           this one node is the root node
      54             :           and forms the uppermost layer.
      55             : 
      56             :       ... has more than one node
      57             :           ... and `N_l % 2 == 0`,
      58             :               the layer above contains N_l/2 nodes
      59             : 
      60             :           ... and `N_l % 2 == 1`,
      61             :               the layer above contains (N_l+1)/2 nodes.
      62             : 
      63             :    A simple algorithm to approach such a a tree is as follows:
      64             :    (Note that the code uses here uses an optimized approach)
      65             : 
      66             :     - Start with the smallest complete binary tree that has at least
      67             :        `n` leaf nodes.
      68             :     - Label the leaf nodes from left to right `L_0`, `L_1`, ... `L_(n-1)`
      69             :     - Delete any un-labeled leaf nodes, and then recursively delete any
      70             :       nodes with no children.
      71             :     - For any nodes with a single remaining child, duplicate the link to
      72             :       the child.
      73             :     - Each non-leaf node now has exactly two children, counting
      74             :       duplicates.
      75             : 
      76             :    Example: Tree with 1 leaf node (root node = leaf node)
      77             : 
      78             :     L0
      79             : 
      80             :    Example: Tree with 4 leaf nodes
      81             : 
      82             :            Iδ
      83             :           /  \
      84             :          /    \
      85             :        Iα      Iβ
      86             :       /  \    /  \
      87             :      L0  L1  L2  L3
      88             : 
      89             :    Example: Tree with 5 leaf nodes
      90             : 
      91             :               Iζ
      92             :              /  \
      93             :             /    \
      94             :            Iδ     Iε
      95             :           /  \     \\
      96             :          /    \     \\
      97             :        Iα      Iβ    Iγ
      98             :       /  \    /  \   ||
      99             :      L0  L1  L2  L3  L4
     100             : 
     101             :    **Construction**
     102             : 
     103             :    The input data is a vector of arbitrary-sized binary blobs.
     104             :    First, each blob is converted to a fixed-size leaf node by hashing
     105             :    the blob in the `FD_BMTREE_PREFIX_LEAF` hash domain.
     106             : 
     107             :    Then, the hash function is recursively applied over pairs of nodes
     108             :    until there is only one node left (the root).
     109             : 
     110             :    `fd_bmtree_32` uses the full SHA-256 digest for each tree node
     111             :    and is thus considered cryptographically secure.
     112             : 
     113             :    `fd_bmtree_20` uses SHA-256 digests truncated to 160 bits.
     114             : 
     115             :    **Inclusion Proofs**
     116             : 
     117             :    Inclusion proofs are used to verify whether a set of leaf nodes is
     118             :    part of a commitment (identified by the root node).
     119             : 
     120             :    At a high level, inclusion proofs present a sequence of hash
     121             :    instructions that when executed result in the root commitment.
     122             : 
     123             :    Inclusion proof size is O(log n) with regards to tree node count.
     124             : 
     125             :    Various types of inclusion proofs exist:
     126             : 
     127             :      - Single inclusion proofs (over one leaf node)
     128             :      - Range inclusion proofs (over a contiguous range of leaf nodes)
     129             :      - Sparse inclusion proofs (over an arbitrary subset of leaf nodes) */
     130             : 
     131             : 
     132             : /* As of https://github.com/solana-labs/solana/pull/29339, Solana
     133             :    changed the second preimage resistance strategy to depend on whether
     134             :    it's the 20B shred tree or the 32B runtime tree.  In the 20B case,
     135             :    they prepend the full leaf_prefix (excluding the nul terminator).  In
     136             :    the 32B case, they just prepend a single 0x00 byte.  Similarly for
     137             :    internal nodes.  These prefixes are aligned and padded to facilitate
     138             :    use with AVX. */
     139     1488330 : #define FD_BMTREE_LONG_PREFIX_SZ  26UL
     140             : #define FD_BMTREE_SHORT_PREFIX_SZ 1UL
     141             : static uchar const fd_bmtree_leaf_prefix[32UL] __attribute__((aligned(32))) = "\x00SOLANA_MERKLE_SHREDS_LEAF";
     142             : static uchar const fd_bmtree_node_prefix[32UL] __attribute__((aligned(32))) = "\x01SOLANA_MERKLE_SHREDS_NODE";
     143             : 
     144             : 
     145             : /* bmtree_node_t is the hash of a tree node (e.g. SHA256-160 / SHA256
     146             :    for a 20 / 32 byte node size).  We declare it this way to make the
     147             :    structure very AVX friendly and to allow SHA256 to write directly
     148             :    into the hash even if BMTREE_HASH_SZ isn't 32. */
     149             : struct __attribute__((packed)) fd_bmtree_node {
     150             :   uchar hash[ 32 ]; /* Last bytes may not be meaningful */
     151             : };
     152             : 
     153             : typedef struct fd_bmtree_node fd_bmtree_node_t;
     154             : 
     155             : /* bmtree_hash_leaf computes `SHA-256(prefix|data), where prefix is the
     156             :    first prefix_sz bytes of fd_bmtree_leaf_prefix.  prefix_sz is
     157             :    typically FD_BMTREE_LONG_PREFIX_SZ or FD_BMTREE_SHORT_PREFIX_SZ.
     158             :    This is the first step in the creation of a Merkle tree.  Returns
     159             :    node.  U.B. if `node` and `data` overlap. */
     160             : fd_bmtree_node_t * fd_bmtree_hash_leaf( fd_bmtree_node_t * node, void const * data, ulong data_sz, ulong prefix_sz );
     161             : 
     162             : /* A fd_bmtree_commit_t stores intermediate state used to compute the
     163             :    root of a binary Merkle tree built incrementally.  It can be used for
     164             :    two different typed of calculations:
     165             :      * leaf-based commitment calculations  (fd_bmtree_commit_*)
     166             :      * proof-based commitment calculations (fd_bmtree_commitp_*)
     167             :     but only one at a time.
     168             : 
     169             :    For leaf-based commitment calculations, it theoretically requires
     170             :    O(log n) space with regard to the number of nodes, although n is
     171             :    currently capped (at an astronomical value), so it requires constant
     172             :    space.
     173             : 
     174             :    During the accumulation phase, the data structure consumes all tree
     175             :    leaf nodes sequentially while calculating and buffering branch nodes
     176             :    of upper layers along the way.
     177             : 
     178             :    In the finalization phase, the buffered branch node data is hashed to
     179             :    derive the final root hash.
     180             : 
     181             :    The separation of the accumulation and finalization phases is
     182             :    required for trees with leaf counts that are not powers of two.
     183             :    Those contain at least one branch node with only one child node.
     184             : 
     185             :    The node_buf is large enough to handle trees with ~2^63 leaves.  This
     186             :    is orders of magnitude more leaves are practical (it would take
     187             :    ~30,000 years at a rate of ~100 microsecond per leaf insert but if
     188             :    you are willing to wait to make a larger tree, increase 63 below). */
     189             : 
     190             : struct fd_bmtree_commit_private {
     191             :   /* Explanation of the above internal state in bmtree_commit_t:
     192             : 
     193             :      - `leaf_cnt` contains the number of leaf nodes that have been
     194             :      accumulated so far. It is synonymous to the index of within the
     195             :      vector of leaf nodes.
     196             : 
     197             :      This is used to check how many branch nodes in the upper layers can
     198             :      be derived with the currently known information.
     199             : 
     200             :      The current depth of the layers above the leaf nodes is the number
     201             :      of times the `leaf_cnt` is divisible by 2.
     202             : 
     203             :      - `node_buf` is indexed by layer, with 0 being the leaf layer.
     204             : 
     205             :      Given a layer `L` containing a vector of nodes known so far,
     206             :      `node_buf[L]` contains the right-most node in layer `L` (counting
     207             :      from the bottom) that is a left child of its parent.
     208             : 
     209             :      More precisely:
     210             : 
     211             :      The subset `L_left` contains all nodes with index `i` within that
     212             :      layer where `i%2==0`. Then, `node_buf[L]` contains the node with
     213             :      the largest index `i`within `L_left`.
     214             : 
     215             :    **Example**
     216             : 
     217             :    Step-by-step walkthrough of the internal state:
     218             : 
     219             :    Initialize
     220             :    - leaf_cnt    <- 0
     221             : 
     222             :    Insert leaf `l_0`
     223             :    - node_buf[0] <- l_0
     224             :    - leaf_cnt    <- 1
     225             : 
     226             :    Insert leaf `l_1`
     227             :    - b_0         <- hash_branch( node_buf[0], l_1 )
     228             :    - node_buf[1] <- b_0
     229             :    - leaf_cnt    <- 2
     230             : 
     231             :    Insert leaf `l_2`
     232             :    - node_buf[0] <- l_2
     233             :    - leaf_cnt    <- 3
     234             : 
     235             :    Insert leaf `l_3
     236             :    - b_0         <- hash_branch( node_buf[0], l_3 )
     237             :    - b_1         <- hash_branch( node_buf[1], b_0 )
     238             :    - node_buf[2] <- b_1
     239             :    - leaf_cnt    <- 4
     240             : 
     241             :    inclusion_proofs stores hashes of internal nodes from previous
     242             :    computation.  If 0 <= i < inclusion_proof_sz, then
     243             :    inclusion_proofs[i] stores the hash at node i of the tree numbered
     244             :    in complete binary search tree order.  E.g.
     245             : 
     246             :                   3
     247             :                 /   \
     248             :               1       5
     249             :              / \     //
     250             :             0   2   4
     251             : 
     252             :    This is a superset of what is stored in node_buf, but in order to not
     253             :    lose the log(n) cache utilization features when we don't care about
     254             :    inclusion proofs and are only trying to derive the root hash, we
     255             :    store them separately.
     256             : 
     257             :    In general, this binary search tree order is fairly friendly.  To
     258             :    find the layer of a node, you count the number of trailing 1 bits in
     259             :    its index.  To get the left/right child, you add/subtract 1<<layer.
     260             :    This ordering makes sense for these trees, because they grow up and
     261             :    to the right, the same way the numbers increase, so a node's index
     262             :    never changes as more leaves are added to the tree.
     263             : 
     264             :    The biggest subtlety comes when there are nodes with incomplete left
     265             :    subtrees, e.g.
     266             : 
     267             :                          7
     268             :                       /     \
     269             :                     /         \
     270             :                    3          ??
     271             :                  /   \        /
     272             :                 1     5      9
     273             :                / \   / \    /
     274             :               0   2  4  6  8
     275             : 
     276             :    What number belongs in the ?? spot?  By binary search order, it
     277             :    should be 10.  The natural index of that node is 7+4=11 though.  The
     278             :    indexing gets very complicated and error-prone if we try to store it
     279             :    in 10, so we prefer to store it in 11.  That means the size of our
     280             :    storage depends only on the maximum number of layers we expect in
     281             :    the tree, which is somewhat convenient.  This does waste up to
     282             :    O(leaf_cnt) space though. */
     283             : 
     284             :   ulong             leaf_cnt;         /* Number of leaves added so far */
     285             :   ulong             hash_sz;          /* <= 32 bytes */
     286             :   ulong             prefix_sz;        /* <= 26 bytes */
     287             :   ulong             inclusion_proof_sz;
     288             :   fd_bmtree_node_t  node_buf[ 63UL ];
     289             :   /* Dense bit set. Array indexed [0, ceil((inclusion_proof_sz+1)/64)).
     290             :      Points to memory just after the end of the inclusion_proofs array
     291             :      and included in the footprint.  Only used or set in proof-based
     292             :      commits, because it's implicit in the leaf_cnt in leaf-based
     293             :      commits. */
     294             :   ulong *           inclusion_proofs_valid;
     295             :   /* inclusion_proofs is indexed [0, inclusion_proof_sz] where index
     296             :      inclusion_proof_sz is a dummy index used to avoid branches. */
     297             :   fd_bmtree_node_t  inclusion_proofs[ 1 ];
     298             : };
     299             : 
     300             : typedef struct fd_bmtree_commit_private fd_bmtree_commit_t;
     301             : 
     302             : #define FD_BMTREE_COMMIT_FOOTPRINT( inclusion_proof_layer_cnt ) ((((sizeof(fd_bmtree_commit_t) + \
     303             :                                                                  ((1UL<<(inclusion_proof_layer_cnt))-1UL)*sizeof(fd_bmtree_node_t)+\
     304             :                                                                  ((1UL<<(inclusion_proof_layer_cnt))+63UL)/64UL*sizeof(ulong))+31UL)/32UL) * 32UL)
     305         771 : #define FD_BMTREE_COMMIT_ALIGN                         (32UL)
     306             : 
     307             : FD_PROTOTYPES_BEGIN
     308             : 
     309             : /* bmtree_commit_{footprint,align} return the alignment and footprint
     310             :    required for a memory region to be used as a bmtree_commit_t.  If the
     311             :    tree does not exceed inclusion_proof_layer_cnt layers, then all
     312             :    inclusion proofs can be retrieved after finalization. */
     313             : ulong          fd_bmtree_commit_align    ( void );
     314             : ulong          fd_bmtree_commit_footprint( ulong inclusion_proof_layer_cnt );
     315             : 
     316             : /* bmtree_commit_init starts a vector commitment calculation of either
     317             :    type.  Assumes mem unused with required alignment and footprint.
     318             :    Returns mem as a bmtree_commit_t *, commit will be in a calc.
     319             :    prefix_sz is the size (in bytes) of the second-preimage resistance
     320             :    prefix used.  It's typically FD_BMTREE_LONG_PREFIX_SZ or
     321             :    FD_BMTREE_SHORT_PREFIX_SZ and must not be greater than
     322             :    FD_BMTREE_LONG_PREFIX_SZ.
     323             : 
     324             :    The calculation can also save some inclusion proof information such
     325             :    that if the final tree has no more than inclusion_proof_layer_cnt layers,
     326             :    inclusion proofs will be available for all leaves.  If the tree grows
     327             :    beyond inclusion_proof_layer_cnt layers, then inclusion proofs may
     328             :    not be available for any leaves.
     329             : 
     330             :    For proof-based commitments, inclusion_proof_layers must be at least
     331             :    as large as the number of layers in the tree.
     332             :    */
     333             : fd_bmtree_commit_t * fd_bmtree_commit_init     ( void * mem, ulong hash_sz, ulong prefix_sz, ulong inclusion_proof_layer_cnt );
     334             : 
     335             : /* bmtree_commit_leaf_cnt returns the number of leafs appended thus
     336             :    far.  Assumes state is valid. */
     337     6098781 : FD_FN_PURE static inline ulong fd_bmtree_commit_leaf_cnt ( fd_bmtree_commit_t const * bmt ) { return bmt->leaf_cnt; }
     338             : 
     339             : /* fd_bmtree_depth and fd_bmtree_node_cnt respectively return the number
     340             :    of layers (counting leaf and root) and total number of nodes in a
     341             :    binary Merkle tree with leaf_cnt leaves.  leaf_cnt in [0, ULONG_MAX].
     342             :    */
     343             : FD_FN_CONST ulong fd_bmtree_depth(    ulong leaf_cnt );
     344             : FD_FN_CONST ulong fd_bmtree_node_cnt( ulong leaf_cnt );
     345             : 
     346             : /* bmtree_commit_append appends a range of leaf nodes.  Assumes that
     347             :    leaf_cnt + new_leaf_cnt << 2^63 (which, unless planning on running
     348             :    for millennia, is always true). */
     349             : fd_bmtree_commit_t *                                                         /* Returns state */
     350             : fd_bmtree_commit_append( fd_bmtree_commit_t *                 state,         /* Assumed valid and in a leaf-based calc */
     351             :                          fd_bmtree_node_t const * FD_RESTRICT new_leaf,      /* Indexed [0,new_leaf_cnt) */
     352             :                          ulong                                new_leaf_cnt );
     353             : 
     354             : /* bmtree_commit_fini seals the commitment calculation by deriving the
     355             :    root node.  Assumes state is valid, in a leaf-based calc on entry
     356             :    with at least one leaf in the tree.  The state will be valid but no
     357             :    longer in a calc on return.  Returns a pointer in the caller's
     358             :    address space to the first byte of a memory region of BMTREE_HASH_SZ
     359             :    with to the root hash on success.  The lifetime of the returned
     360             :    pointer is that of the state or until the memory used for state gets
     361             :    initialized for a new calc. */
     362             : uchar * fd_bmtree_commit_fini( fd_bmtree_commit_t * state );
     363             : 
     364             : 
     365             : /* bmtree_get_proof writes an inclusion proof for the leaf
     366             :    with index leaf_idx to the memory at dest.  state must be a valid
     367             :    sealed bmtree commitment (leaf-based or proof-based) with at least
     368             :    leaf_idx+1 leaves.  state must have been initialized with
     369             :    inclusion_proof_layers_cnt >= the height of the tree, which you can
     370             :    get from fd_bmtree_depth( fd_bmtree_commit_leaf_cnt( state ) ).
     371             : 
     372             :    If these conditions are met, upon return, dest[ i ] for
     373             :    0<=i<hash_sz*(tree depth-1) will contain the inclusion proof, and the
     374             :    function will return the number of hashes written.  If
     375             :    inclusion_proof_layers_cnt was initialized to too small of a value,
     376             :    this function will return -1 and the memory pointed to by dest will
     377             :    not be modified.
     378             : 
     379             :    The inclusion proof is ordered from leaf to root but excludes the
     380             :    actual root of the tree. */
     381             : /* FIXME: Returning -1 is pretty bad here, but 0 is the legitimate
     382             :    proof size of a 1 node tree.  Is that case worth distinguishing? */
     383             : int
     384             : fd_bmtree_get_proof( fd_bmtree_commit_t * state,
     385             :                      uchar *              dest,
     386             :                      ulong                leaf_idx );
     387             : 
     388             : /* fd_bmtree_from_proof derives the root of a Merkle tree where the
     389             :    element with hash `leaf` is the leaf_idx^th leaf and proof+hash_sz*i
     390             :    contains its sibling at the ith level (counting from the bottom, with
     391             :    the sibling leaf being the i=0 value).  The full root hash (i.e.
     392             :    untruncated regardless of hash_sz) will be stored in root upon
     393             :    return.
     394             :    Does not retain any read or write interests after returning, and it
     395             :    operates independently of normal tree construction, so it neither
     396             :    starts nor ends a calc, and it can safely be done in the middle of a
     397             :    calc.
     398             : 
     399             :    Memory regions should not overlap.
     400             : 
     401             :    The proof consists of proof_depth hashes, each hash_sz bytes
     402             :    concatenated with no padding ordered from leaf to root, excluding the
     403             :    root.
     404             : 
     405             :    Returns root if the proof is valid and NULL otherwise.  If the proof
     406             :    is invalid, the root will not be stored.  A proof can only be invalid
     407             :    if it is too short to possibly correspond to the leaf_idx^th node. */
     408             : /* TODO: Write the caching version of this */
     409             : fd_bmtree_node_t *
     410             : fd_bmtree_from_proof( fd_bmtree_node_t const * leaf,
     411             :                       ulong                    leaf_idx, /* in [0, ULONG_MAX) */
     412             :                       fd_bmtree_node_t *       root,
     413             :                       uchar const *            proof,
     414             :                       ulong                    proof_depth, /* in [0, 63] */
     415             :                       ulong                    hash_sz, /* in [1, 32] */
     416             :                       ulong                    prefix_sz /* either LONG_PREFIX_SZ or SHORT_PREFIX_SZ */ );
     417             : 
     418             : 
     419             : /* fd_bmtree_commitp_insert_with_proof inserts a leaf at index idx in
     420             :    the proof-based calc, optionally with some proof.  Returns 1 if
     421             :    the leaf and proof are consistent with everything previously added to
     422             :    this calc, or 0 if not.
     423             : 
     424             :    fd_bmtree_depth( idx+1 ) must be <= inclusion_proof_layer_cnt used in
     425             :    init.  Additionally, proof_depth<inclusion_proof_layer_cnt.
     426             : 
     427             :    Like all the other functions in this file that deal with inclusion
     428             :    proofs, the proof format is leaf to root, excluding the root, where
     429             :    each of the proof_depth hashes occupies hash_sz bytes and there is no
     430             :    padding.  Truncated proof_depths are fine and are interpreted as the
     431             :    first proof_depth elements of the proof, i.e. the ones closer to the
     432             :    leaf.  In particular, a proof_depth of 0 is fine, in which case
     433             :    proof==NULL is fine.
     434             : 
     435             :    If this returns success and opt_root is not NULL, the highest node
     436             :    (closest to the root) in the branch containing idx that is known will
     437             :    be written to the memory pointed to by opt_root.  In the case that
     438             :    the inclusion proof is full (contains all the nodes except the root),
     439             :    the node that is stored is the root of the tree; however, in general
     440             :    the tree can grow beyond this, so it isn't possible to guarantee that
     441             :    it is the root in other cases.  If the function returns failure (0)
     442             :    or opt_root==NULL, then the memory pointed to by opt_root will not be
     443             :    accessed.
     444             : 
     445             :    If this returns 0, the commitment state will not be modified.  If it
     446             :    returns 1, then the information provided will be cached to speed up
     447             :    other validations in this calc.
     448             : 
     449             :    Note that a return value of 1 does not necessarily imply the leaf is
     450             :    correct, as there may not be enough information to determine it yet.
     451             :    In that case, fd_bmtreep_fini or another call to fd_bmtreep_insert
     452             :    will return 0. */
     453             : 
     454             : int
     455             : fd_bmtree_commitp_insert_with_proof( fd_bmtree_commit_t *     state,
     456             :                                      ulong                    idx,
     457             :                                      fd_bmtree_node_t const * new_leaf,
     458             :                                      uchar            const * proof,
     459             :                                      ulong                    proof_depth,
     460             :                                      fd_bmtree_node_t       * opt_root );
     461             : 
     462             : /* fd_bmtree_commitp_fini finalizes a proof-based calc.  Returns the
     463             :    root of the tree if it can conclusively determine that the entire
     464             :    tree is correct for a commitment of leaf_cnt leaf nodes and NULL
     465             :    otherwise. */
     466             : uchar * fd_bmtree_commitp_fini( fd_bmtree_commit_t * state, ulong leaf_cnt );
     467             : 
     468             : FD_PROTOTYPES_END
     469             : #endif /* HEADER_fd_src_ballet_bmtree_fd_bmtree_h */

Generated by: LCOV version 1.14