#ifndef HEADER_fd_src_flamenco_runtime_txncache_h
#define HEADER_fd_src_flamenco_runtime_txncache_h

#include "../fd_flamenco_base.h"
#include "../types/fd_types.h"

/* A txn cache is a concurrent map for saving the result (status) of
   transactions that have executed.  In addition to supporting fast
   concurrent insertion and query of transaction results, the txn
   cache supports serialization of its state into a standard bincode
   format for serving the state to other nodes via snapshot responses,
   and then restoring snapshots produced by other nodes.

   The txn cache is designed to do two operations fast,

     (a) Insertion.  Insertion is done by both the leader pipeline and
         the replay stage, with an insertion for every transaction that
         is executed, potentially over 1M per second.

     (b) Query.  The leader pipeline queries the status of transactions
         before executing them to ensure that it does not execute
         anything twice.

   Both of these operations are concurrent and lockless, assuming there
   are no other (non-insert/query) operations occurring on the txn
   cache.  Most other operations lock the entire structure and will
   prevent both insertion and query from proceeding.

   The txn cache is both CPU and memory sensitive.  A transaction
   result is 1 byte, and the stored transaction hashes are 20 bytes, so
   without any overhead, just storing 150 slots of transactions in a
   flat array would require

      524,288 * 150 * 21 ~ 1.5GiB

   of memory.  But we can't query, insert, and remove quickly from a
   flat array.  We make a few trade-offs to achieve good performance
   without bloating memory completely.  In particular:

     - The transactions to be queried are stored in a structure that
       looks like a

         hash_map<blockhash, hash_map<txnhash, vec<(slot, status)>>>

       The top level hash_map is a probed hash map, and the txnhash map
       is a chained hash map, where the items come from a pool of pages
       of transactions.  We use pages of transactions to support fast
       removal of a blockhash from the top level map: we need to return
       at most 4,800 pages back to the pool rather than 78,643,200
       individual transactions.

       This adds additional memory overhead: a blockhash with only one
       transaction in it will still consume a full page (16,384
       transactions) of memory.  Allocating a new transaction page to a
       chained map is rare (once every 16,384 inserts) so the cost
       amortizes to zero.  Creating a blockhash happens once per
       blockhash, so it also amortizes to zero.  The only operation we
       care about is then the simple insert of a transaction into a
       non-full transaction page of an existing blockhash.  This can be
       done with two compare-and-swaps,

         // 1. Find the blockhash in the probed hash map.

            let by_blockhash = hash_map[ blockhash ];

         // 2. Request space for the transaction from the current page

            let page = by_blockhash.pages.back();
            let idx = page.used.compare_and_swap( current, current+1 );

         // 3. Write the transaction into the page and the map

            page.txns[ idx ] = txn;
            page.txns[ idx ].next = by_blockhash.txns[ txnhash ].head;
            by_blockhash.txns[ txnhash ].head.compare_and_swap( current, idx );

       If either compare-and-swap fails, the current value is simply
       re-read and the step retried, so inserts remain lock-free.

       Removal of a blockhash from this structure is simple because it
       does not need to be concurrent (the caller will only remove
       between executing slots, so there's no contention and it can take
       a full write lock).  We take a write lock, restore the pages in
       the blockhash to the pool, and then mark the space in the
       hash_map as empty.  This is fast since there are at most 4,800
       pages to restore and restoration is a simple memcpy.

     - Another structure is required to support serialization of
       snapshots from the cache.  Serialization must produce a binary
       structure that encodes, essentially,

         hash_map<slot, hash_map<blockhash, vec<(txnhash, txn_result)>>>

       for each slot that is rooted.  Observe that we can't reuse the
       structure used for queries, since it's not grouped by slot; we
       would have to iterate all of the transactions to do this
       grouping, which is not feasible.

       We can't add the slot to that index either, since then query
       would not be fast.  Queries are just against (blockhash, txnhash)
       pairs; they don't know which slot might have included the
       transaction.

       So we need a second index for this information.  The index is
       going to line up with what we described above: the root hash map
       will be probed, and the nested hash map will be chained.  The
       vector of results will also use a pool of pages.

       In this case the page pool is for a different reason: we know a
       sensible upper bound for the number of transactions that could be
       alive in the entire cache at any point in time, but we don't know
       quite how to bound it for a single slot or blockhash.  Well, we
       do: the bound is 524,288 per (slot, blockhash) pair, but we would
       need to allocate all that memory up front.  Instead, we bound the
       total number of slots and transactions per slot, assume the
       transactions must be distributed somehow within the blockhashes,
       and then give some overhead for unoccupied page entries.

       The insertion process for this structure is similar to the one
       above, except the vector is not a chained map but just a regular
       vector.

         // 1. Find the (slot, blockhash) in the probed hash map.

            let by_slot_blockhash = hash_map[ slot ][ blockhash ];

         // 2. Request space for the transaction from the current page

            let page = by_slot_blockhash.pages.back();
            let idx = page.used.fetch_add( 1 );

         // 3. Write the transaction into the page and the map

            page.txns[ idx ] = txn;
            by_slot_blockhash.txn_count += 1;

       The final increment is OK even with concurrent inserts because
       no reader will try to observe the txn_count until the slot is
       rooted, at which point nothing can still be inserting into it
       and the txn_count will be final. */

#define FD_TXNCACHE_ALIGN (128UL)

#define FD_TXNCACHE_MAGIC (0xF17EDA2CE5CAC4E0) /* FIREDANCE SCACHE V0 */

/* The duration of history to keep around in the txn cache before aging
   it out.  This must be at least 150, otherwise we could forget about
   transactions for blockhashes that are still valid to use, and let
   duplicate transactions make it into a produced block.

   Beyond 150, any value is valid.  The value used here, 300, comes
   from Agave, and corresponds to roughly 2 minutes of history assuming
   there are no forks, with an extra 12.8 seconds of history for slots
   that are in progress but have not rooted yet.  On a production
   validator without RPC support, the value should probably be
   configurable and always set to strictly 150. */

#define FD_TXNCACHE_DEFAULT_MAX_ROOTED_SLOTS (300UL)

/* A note on live slots ...

   The maximum number of live slots is the sum of the rooted and
   unrooted slots.  The rooted slots are explicitly capped at 300 for
   this structure (implying we keep roughly 2 minutes of history around
   for queries and snapshots).

   For the unrooted slots, we must root at least one slot in an epoch
   for an epoch transition to occur successfully to the next one, so
   assuming every slot is unconfirmed for some reason, and the prior
   epoch was rooted at the first slot in the epoch, and the next epoch
   is rooted at the last slot, there could be

      432,000 + 432,000 - 31 = 863,969

   live slots on the validator.  This is clearly impractical as each
   bank consumes a lot of memory to store slot state, so the validator
   would crash long before this.

   For now we just pick a number: 2048, based on running on mainnet.
   We take into account that apart from the 450 recent blockhashes,
   there are also nonce accounts used as blockhashes by transactions
   and they contribute significantly to entries being filled up.

   TODO: This number might be unbounded, more testing required with
   mainnet before deciding an actual number. */

#define FD_TXNCACHE_DEFAULT_MAX_LIVE_SLOTS (2048UL)

/* The Solana consensus protocol has an implied restriction on the
   number of transactions in a slot.  A slot might have at most
   48,000,000 CUs, but a transaction requires at least around 1500 CUs,
   so there could be at most 32,000 transactions in a slot.

   For Firedancer, we respect this limit when running in production,
   but for development and performance tuning this limit is removed,
   and instead we will produce and accept at most 524,288 transactions
   per slot.  This is chosen arbitrarily and works out to around 1.3M
   TPS, however such a value does not exist in the consensus
   protocol. */

#define FD_TXNCACHE_DEFAULT_MAX_TRANSACTIONS_PER_SLOT (524288UL)

/* This number is not a strict bound but is a reasonable maximum on the
   number of slots that can be constipated.  As of the writing of this
   comment, the only use case for constipating the status cache is to
   generate a snapshot.  We use constipation here because we want the
   root to stay frozen while we generate the full state of the node for
   a given rooted slot.  This max size gives us roughly 1024 slots *
   0.4 secs/slot / 60 secs/min ~ 6.8 minutes from when we root a slot
   to when the status cache is done getting serialized into the
   snapshot format.  This SHOULD be enough time because serializing the
   status cache into a Solana snapshot is done on the order of seconds
   and is one of the first things that is done during snapshot
   creation. */

#define FD_TXNCACHE_DEFAULT_MAX_CONSTIPATED_SLOTS (1024UL)

struct fd_txncache_insert {
  uchar const * blockhash;
  uchar const * txnhash;
  ulong         slot;
  uchar const * result;
};

typedef struct fd_txncache_insert fd_txncache_insert_t;

struct fd_txncache_query {
  uchar const * blockhash;
  uchar const * txnhash;
};

typedef struct fd_txncache_query fd_txncache_query_t;

struct fd_txncache_snapshot_entry {
  ulong slot;
  uchar blockhash[ 32 ];
  uchar txnhash[ 20 ];
  ulong txn_idx;
  uchar result;
};

typedef struct fd_txncache_snapshot_entry fd_txncache_snapshot_entry_t;

/* Forward declare opaque handle */
struct fd_txncache_private;
typedef struct fd_txncache_private fd_txncache_t;

FD_PROTOTYPES_BEGIN

/* fd_txncache_{align,footprint} give the needed alignment and
   footprint of a memory region suitable to hold a txn cache.
   fd_txncache_align returns the same value as FD_TXNCACHE_ALIGN; the
   footprint depends on the provided limits.

   fd_txncache_new formats a memory region with suitable alignment and
   footprint for holding a txn cache.  Assumes shmem points in the
   caller's address space to the first byte of the memory region owned
   by the caller to use.  Returns shmem on success and NULL on failure
   (logs details).  The memory region will be owned by the state on
   successful return.  The caller is not joined on return.

   fd_txncache_join joins the caller to a txn cache.  Assumes shtc
   points to the first byte of the memory region holding the state.
   Returns a local handle to the join on success (this is not
   necessarily a simple cast of the address) and NULL on failure (logs
   details).

   fd_txncache_leave leaves the caller's current local join to a txn
   cache.  Returns a pointer to the memory region holding the state on
   success (this is not necessarily a simple cast of the address) and
   NULL on failure (logs details).  The caller is not joined on
   successful return.

   fd_txncache_delete unformats a memory region that holds a txn cache.
   Assumes shtc points in the caller's address space to the first byte
   of the memory region holding the state and that nobody is joined.
   Returns a pointer to the memory region on success and NULL on
   failure (logs details).  The caller has ownership of the memory
   region on successful return. */

FD_FN_CONST ulong
fd_txncache_align( void );

FD_FN_CONST ulong
fd_txncache_footprint( ulong max_rooted_slots,
                       ulong max_live_slots,
                       ulong max_txn_per_slot,
                       ulong max_constipated_slots );

void *
fd_txncache_new( void * shmem,
                 ulong  max_rooted_slots,
                 ulong  max_live_slots,
                 ulong  max_txn_per_slot,
                 ulong  max_constipated_slots );

fd_txncache_t *
fd_txncache_join( void * shtc );

void *
fd_txncache_leave( fd_txncache_t * tc );

void *
fd_txncache_delete( void * shtc );
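
/* For illustration, a minimal lifecycle sketch using the default
   bounds.  This assumes a plain C11 aligned heap allocation (stdlib.h)
   rather than an fd workspace, and elides error handling:

     ulong align     = fd_txncache_align();
     ulong footprint = fd_txncache_footprint( FD_TXNCACHE_DEFAULT_MAX_ROOTED_SLOTS,
                                              FD_TXNCACHE_DEFAULT_MAX_LIVE_SLOTS,
                                              FD_TXNCACHE_DEFAULT_MAX_TRANSACTIONS_PER_SLOT,
                                              FD_TXNCACHE_DEFAULT_MAX_CONSTIPATED_SLOTS );
     void * shmem = aligned_alloc( align, footprint );
     fd_txncache_t * tc = fd_txncache_join( fd_txncache_new( shmem,
                                                             FD_TXNCACHE_DEFAULT_MAX_ROOTED_SLOTS,
                                                             FD_TXNCACHE_DEFAULT_MAX_LIVE_SLOTS,
                                                             FD_TXNCACHE_DEFAULT_MAX_TRANSACTIONS_PER_SLOT,
                                                             FD_TXNCACHE_DEFAULT_MAX_CONSTIPATED_SLOTS ) );

     // ... insert, query, snapshot ...

     free( fd_txncache_delete( fd_txncache_leave( tc ) ) ); */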

/* fd_txncache_register_root_slot registers a root slot in a txn cache.
   Only the provided limit of roots (typically 300) can exist in the
   txn cache at once, after which the oldest roots will be purged.

   Purging a root means it cannot be served in a snapshot response,
   although transactions in the root may or may not still be present.
   Transaction status is removed once all roots referencing the
   blockhash of the transaction are removed from the txn cache.

   This is neither cheap nor expensive: it will pause all insertion and
   query operations, but only momentarily until any old slots can be
   purged from the cache. */

void
fd_txncache_register_root_slot( fd_txncache_t * tc,
                                ulong           slot );

/* fd_txncache_register_constipated_slot is the "constipated" version
   of fd_txncache_register_root_slot.  While the cache is constipated,
   older root slots will not be purged, nor will newer slots actually
   be rooted.  All of the slots that are marked as constipated will be
   flushed down to the set of rooted slots when
   fd_txncache_flush_constipated_slots is called. */

void
fd_txncache_register_constipated_slot( fd_txncache_t * tc,
                                       ulong           slot );

void
fd_txncache_flush_constipated_slots( fd_txncache_t * tc );
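
/* For illustration, a hedged sketch of how constipation might be used
   while serializing a snapshot.  The exact call ordering is an
   assumption, not a documented contract; write_fn, ctx, and slot are
   hypothetical:

     fd_txncache_set_is_constipated( tc, 1 );           // freeze the rooted state
     fd_txncache_register_constipated_slot( tc, slot ); // buffer new roots meanwhile
     fd_txncache_snapshot( tc, ctx, write_fn );         // serialize the frozen state
     fd_txncache_set_is_constipated( tc, 0 );
     fd_txncache_flush_constipated_slots( tc );         // apply the buffered roots */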

/* fd_txncache_root_slots returns the list of root slots currently
   tracked by the txn cache.  There will be at most max_rooted_slots
   slots, which will be written into the provided out_slots.  It is
   assumed tc points to a txn cache and out_slots has space for at
   least max_rooted_slots results.  If there are fewer than
   max_rooted_slots slots in the cache, the front part of out_slots
   will be filled in, and all the remaining slots will be set to
   ULONG_MAX.

   This is a fast operation, but it will lock the whole structure and
   cause a temporary pause in insert and query operations. */

void
fd_txncache_root_slots( fd_txncache_t * tc,
                        ulong *         out_slots );
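
/* For illustration, a minimal caller sketch, assuming the cache was
   created with FD_TXNCACHE_DEFAULT_MAX_ROOTED_SLOTS roots:

     ulong out_slots[ FD_TXNCACHE_DEFAULT_MAX_ROOTED_SLOTS ];
     fd_txncache_root_slots( tc, out_slots );
     for( ulong i=0UL; i<FD_TXNCACHE_DEFAULT_MAX_ROOTED_SLOTS; i++ ) {
       if( out_slots[ i ]==ULONG_MAX ) break; // remaining entries are unused
       // ... out_slots[ i ] is a root slot tracked by the cache ...
     } */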

/* fd_txncache_snapshot writes the current state of a txn cache into a
   binary format suitable for serving to other nodes via snapshot
   responses.  The write function is called in a streaming fashion with
   the binary data, the size of the data, and the ctx pointer provided
   to this function.  The write function should return 0 on success and
   -1 on failure.  This function will propagate a failure to write back
   to the caller immediately, so this function also returns 0 on
   success and -1 on failure.

   IMPORTANT!  THIS ASSUMES THERE ARE NO CONCURRENT INSERTS OCCURRING
   ON THE TXN CACHE AT THE ROOT SLOTS DURING SNAPSHOTTING.  OTHERWISE
   THE SNAPSHOT MIGHT NOT CONTAIN ALL OF THE DATA, ALTHOUGH IT WILL NOT
   CAUSE CORRUPTION.  THIS IS ASSUMED OK BECAUSE YOU CANNOT MODIFY A
   ROOTED SLOT.

   This is a cheap operation and will not cause any pause in insertion
   or query operations. */

int
fd_txncache_snapshot( fd_txncache_t * tc,
                      void *          ctx,
                      int ( * write )( uchar const * data, ulong data_sz, void * ctx ) );
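
/* For illustration, a minimal write callback sketch that streams the
   serialized state into a stdio FILE.  The file target is a
   hypothetical stand-in for real snapshot packaging code:

     static int
     write_fn( uchar const * data, ulong data_sz, void * ctx ) {
       FILE * file = (FILE *)ctx;
       return fwrite( data, 1UL, data_sz, file )==data_sz ? 0 : -1;
     }

     // ... elsewhere ...
     FILE * file = fopen( "status_cache.bin", "wb" );
     int err = fd_txncache_snapshot( tc, file, write_fn ); */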

/* fd_txncache_insert_batch inserts a batch of transaction results into
   a txn cache.  Assumes tc points to a txn cache, txns is a list of
   transaction results to be inserted, and txns_cnt is the count of the
   results list.  The insert copies data from the results and does not
   have any lifetime interest in the results memory provided once this
   call returns.

   Returns 1 on success and 0 on failure.  The only reason insertion
   can fail is because the txn cache is full, which should never happen
   in practice if the caller sizes the bounds correctly.  This is
   mostly here to support testing and fuzzing.

   This is a cheap, high performance, concurrent operation and can
   occur at the same time as queries and arbitrary other insertions. */

int
fd_txncache_insert_batch( fd_txncache_t *              tc,
                          fd_txncache_insert_t const * txns,
                          ulong                        txns_cnt );
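
/* For illustration, a minimal single transaction insert.  The zeroed
   blockhash, txnhash, and result are hypothetical placeholders for
   values produced by transaction execution, and slot is assumed to be
   the slot the transaction executed in:

     uchar blockhash[ 32 ] = { 0 };
     uchar txnhash  [ 20 ] = { 0 };
     uchar result          = 0;

     fd_txncache_insert_t ins = {
       .blockhash = blockhash,
       .txnhash   = txnhash,
       .slot      = slot,
       .result    = &result,
     };
     if( FD_UNLIKELY( !fd_txncache_insert_batch( tc, &ins, 1UL ) ) )
       FD_LOG_ERR(( "txn cache full" )); */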

/* fd_txncache_query_batch queries a batch of transactions to determine
   if they exist in the txn cache or not.  The queries have an
   ambiguous slot, but must match both the blockhash and txnhash.  In
   addition, if the query_func is not NULL, the query_func will be
   called with the slot of the txn and the query_func_ctx that was
   provided, and the transaction will be considered present only if the
   query_func also returns 1.

   Assumes tc points to a txn cache, queries is a list of queries to be
   executed, and queries_cnt is the count of the queries list.
   out_results must be at least as large as queries_cnt; each entry
   will be set to 1 if the corresponding transaction is present and 0
   if it is not.

   This is a cheap, high performance, concurrent operation and can
   occur at the same time as insertions and arbitrary other queries. */

void
fd_txncache_query_batch( fd_txncache_t *             tc,
                         fd_txncache_query_t const * queries,
                         ulong                       queries_cnt,
                         void *                      query_func_ctx,
                         int ( * query_func )( ulong slot, void * ctx ),
                         int *                       out_results );
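
/* For illustration, a minimal query with a slot filter.  The filter
   shown (only consider statuses at or before some maximum slot) is a
   hypothetical policy, not the one used by the leader pipeline, and
   blockhash/txnhash are assumed to point at the hashes being queried:

     static int
     slot_at_most( ulong slot, void * ctx ) {
       return slot<=*(ulong *)ctx;
     }

     // ... elsewhere ...
     fd_txncache_query_t q = { .blockhash = blockhash, .txnhash = txnhash };
     ulong max_slot = 12345UL;
     int   present[ 1 ];
     fd_txncache_query_batch( tc, &q, 1UL, &max_slot, slot_at_most, present ); */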

/* fd_txncache_set_txnhash_offset sets the correct offset value for the
   txn hash "key slice" in the blockcache and slotblockcache.  This is
   used primarily for snapshot restore, since in Firedancer we always
   use an offset of 0.  Returns an error if the cache entry isn't
   found. */

int
fd_txncache_set_txnhash_offset( fd_txncache_t * tc,
                                ulong           slot,
                                uchar           blockhash[ 32 ],
                                ulong           txnhash_offset );

/* fd_txncache_is_rooted_slot returns 1 if `slot` is rooted, 0
   otherwise. */

int
fd_txncache_is_rooted_slot( fd_txncache_t * tc,
                            ulong           slot );

/* fd_txncache_get_entries is responsible for converting the rooted
   state of the status cache back into fd_bank_slot_deltas_t, which is
   the decoded format used by Agave.  This is a helper method used to
   generate Agave-compatible snapshots.
   TODO: Currently all allocations are done via scratch.  This should
   probably be changed in the future. */

int
fd_txncache_get_entries( fd_txncache_t *         tc,
                         fd_bank_slot_deltas_t * bank_slot_deltas );

/* fd_txncache_{get,set}_is_constipated get and set whether the status
   cache is currently in a constipated state. */

int
fd_txncache_get_is_constipated( fd_txncache_t * tc );

int
fd_txncache_set_is_constipated( fd_txncache_t * tc, int is_constipated );

FD_PROTOTYPES_END

#endif /* HEADER_fd_src_flamenco_runtime_txncache_h */
