#ifndef HEADER_fd_src_flamenco_runtime_txncache_h
#define HEADER_fd_src_flamenco_runtime_txncache_h

#include "../fd_flamenco_base.h"
#include "../types/fd_types.h"
#include <math.h>

/* A txn cache is a concurrent map for saving the result (status) of
   transactions that have executed.  In addition to supporting fast
   concurrent insertion and query of transaction results, the txn
   cache supports serialization of its state into a standard bincode
   format for serving the state to other nodes via snapshot responses,
   and for restoring snapshots produced by other nodes.

   The txn cache is designed to do two operations fast,

     (a) Insertion.  Insertion is done by both the leader pipeline and
         the replay stage, with an insertion for every transaction that
         is executed, potentially over 1M per second.

     (b) Query.  The leader pipeline queries the status of transactions
         before executing them to ensure that it does not execute
         anything twice.

   Both of these operations are concurrent and lockless, assuming there
   are no other (non-insert/query) operations occurring on the txn
   cache.  Most other operations lock the entire structure and will
   prevent both insertion and query from proceeding.

   The txn cache is both CPU and memory sensitive.  A transaction
   result is 1 byte and the stored transaction hashes are 20 bytes, so
   without any overhead just storing 150 slots of transactions in a
   flat array would require

      524,288 * 150 * 21 ~ 1.5 GiB

   of memory.  But we can't query, insert, and remove quickly from a
   flat array.  We make a few trade-offs to achieve good performance
   without completely bloating memory.  In particular:

     - The transactions to be queried are stored in a structure that
       looks like a

         hash_map<blockhash, hash_map<txnhash, vec<(slot, status)>>>

       The top level hash_map is a probed hash map, and the txnhash map
       is a chained hash map, where the items come from a pool of pages
       of transactions.  We use pages of transactions to support fast
       removal of a blockhash from the top level map: we need to return
       at most 4,800 pages back to the pool rather than 78,643,200
       individual transactions.

       This adds additional memory overhead: a blockhash with only one
       transaction in it will still consume a full page (16,384
       transactions) of memory.  Allocating a new transaction page to a
       chained map is rare (once every 16,384 inserts) so the cost
       amortizes to zero.  Creating a blockhash happens once per
       blockhash, so it also amortizes to zero.  The only operation we
       care about is then the simple insert case of a not-yet-full
       transaction page into an existing blockhash.  This can be done
       with two compare-and-swaps (a C sketch is given after this
       comment),

         // 1. Find the blockhash in the probed hash map.

            let by_blockhash = hash_map[ blockhash ];

         // 2. Request space for the transaction from the current page

            let page = by_blockhash.pages.back();
            let idx = page.used.compare_and_swap( current, current+1 );

         // 3. Write the transaction into the page and the map

            page.txns[ idx ] = txn;
            page.txns[ idx ].next = by_blockhash.txns[ txnhash ].idx;
            by_blockhash.txns[ txnhash ].head.compare_and_swap( current, idx );

       Removal of a blockhash from this structure is simple because it
       does not need to be concurrent (the caller will only remove
       between executing slots, so there's no contention and it can
       take a full write lock).  We take a write lock, restore the
       pages in the blockhash to the pool, and then mark the space in
       the hash_map as empty.  This is fast since there are at most
       4,800 pages to restore and restoration is a simple memcpy.

     - Another structure is required to support serialization of
       snapshots from the cache.  Serialization must produce a binary
       structure that encodes essentially

         hash_map<slot, hash_map<blockhash, vec<(txnhash, txn_result)>>>

       for each slot that is rooted.  Observe that we can't reuse the
       structure used for queries: it's not grouped by slot, so we
       would have to iterate all of the transactions to do this
       grouping, which is not feasible.

       We can't add the slot to that index either, since then query
       would not be fast.  Queries are just against (blockhash,
       txnhash) pairs; they don't know which slot might have included
       the transaction.

       So we need a second index for this information.  The index
       lines up with what we described above: the root hash map is
       probed, and the nested hash map is chained.  The vector of
       results also uses a pool of pages.

       In this case the page pool is for a different reason: we know a
       sensible upper bound for the number of transactions that could
       be alive in the entire cache at any point in time, but we don't
       know quite how to bound it for a single slot or blockhash.
       Well, we do: the bound is 524,288 per (slot, blockhash) pair,
       but we would need to allocate all that memory up front.
       Instead, we bound the total number of slots and transactions
       per slot, assume the transactions must be distributed somehow
       within the blockhashes, and then give some overhead for
       unoccupied page entries.

       The insertion process for this structure is similar to the one
       above, except the vector is not a chained map but just a
       regular vector.

         // 1. Find the (slot, blockhash) in the probed hash map.

            let by_slot_blockhash = hash_map[ slot ][ blockhash ];

         // 2. Request space for the transaction from the current page

            let page = by_slot_blockhash.pages.back();
            let idx = page.used.fetch_add( 1 );

         // 3. Write the transaction into the page and the map

            page.txns[ idx ] = txn;
            by_slot_blockhash.txn_count += 1;

       The final increment is OK even with concurrent inserts because
       no reader will try to observe the txn_count until the slot is
       rooted, at which point nothing can be inserted into it and the
       txn_count will be finalized. */
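
/* For illustration, a minimal C sketch of the lock-free insert fast
   path described in steps 1-3 above, written with GCC/Clang __atomic
   builtins.  The types and field names here (txn_t, txn_page_t,
   blockhash_entry_t) are hypothetical stand-ins, not the private
   types used by the implementation; the sketch only shows how a slot
   in the current page is claimed and then published at the head of
   the per-txnhash chain with two compare-and-swaps.

     typedef struct { uchar result; uint next; uchar txnhash[ 20 ]; } txn_t;

     typedef struct {
       uint  used;              // slots claimed so far in this page
       txn_t txns[ 16384 ];     // fixed size page of transactions
     } txn_page_t;

     static int
     insert_fast_path( blockhash_entry_t * by_blockhash,  // entry found in the probed map
                       txn_page_t *        page,          // current (last) page of the entry
                       ulong               chain_idx,     // chain slot for this txnhash
                       txn_t const *       txn ) {
       // 1. Claim a slot in the current page.
       uint idx = __atomic_load_n( &page->used, __ATOMIC_ACQUIRE );
       for(;;) {
         if( idx>=16384U ) return -1;  // page full; slow path allocates a new page
         if( __atomic_compare_exchange_n( &page->used, &idx, idx+1U, 0,
                                          __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE ) ) break;
       }

       // 2. Write the transaction, then publish it as the new chain head.
       page->txns[ idx ] = *txn;
       uint head = __atomic_load_n( &by_blockhash->head[ chain_idx ], __ATOMIC_ACQUIRE );
       do {
         page->txns[ idx ].next = head;  // link to the previous head
       } while( !__atomic_compare_exchange_n( &by_blockhash->head[ chain_idx ], &head, idx, 0,
                                              __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE ) );
       return 0;
     }
*/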

#include "../fd_flamenco_base.h"

#define FD_TXNCACHE_ALIGN (128UL)

#define FD_TXNCACHE_MAGIC (0xF17EDA2CE5CAC4E0) /* FIREDANCE SCACHE V0 */

/* The duration of history to keep around in the txn cache before aging
   it out.  This must be at least 150, otherwise we could forget about
   transactions for blockhashes that are still valid to use, and let
   duplicate transactions make it into a produced block.

   Beyond 150, any value is valid.  The value used here, 300, comes
   from Agave, and at 0.4 seconds per slot corresponds to roughly 2
   minutes (300 * 0.4 = 120 seconds) of history assuming there are no
   forks, with an extra 12.8 seconds of history for slots that are in
   progress but have not rooted yet.  On a production validator
   without RPC support, the value should probably be configurable and
   always set to exactly 150. */

#define FD_TXNCACHE_DEFAULT_MAX_ROOTED_SLOTS (300UL)

/* A note on live slots ...

   The maximum number of live slots is the sum of the rooted and
   unrooted slots.  The rooted slots are explicitly capped at 300 for
   this structure (implying we keep around 2 minutes of history for
   queries and snapshots).

   For the unrooted slots, we must root at least one slot in an epoch
   for an epoch transition to occur successfully to the next one, so
   assuming every slot is unconfirmed for some reason, and the prior
   epoch was rooted at the first slot in the epoch, and the next epoch
   is rooted at the last slot, there could be

      432,000 + 432,000 - 31 = 863,969

   live slots on the validator.  This is clearly impractical as each
   bank consumes a lot of memory to store slot state, so the validator
   would crash long before this.

   For now we just pick a number: 2048, based on running on mainnet.
   We take into account that apart from the 450 recent blockhashes,
   there are also nonce accounts used as blockhashes by transactions,
   and they contribute significantly to entries being filled up.

   TODO: This number might be unbounded, more testing required with
   mainnet before deciding on an actual number. */

#define FD_TXNCACHE_DEFAULT_MAX_LIVE_SLOTS (2048UL)

/* The Solana consensus protocol has an implied restriction on the
   number of transactions in a slot.  A slot might have 50,000,000+
   CUs, but a transaction requires at least around 1,500 CUs, so there
   could be at most ~33,333 transactions in a slot.

   For Firedancer, we respect this limit when running in production,
   but for development and performance tuning this limit is removed,
   and instead we will produce and accept at most 524,288 transactions
   per slot.  This is chosen arbitrarily and works out to around ~1.3M
   TPS, however such a value does not exist in the consensus
   protocol. */

#define FD_TXNCACHE_DEFAULT_MAX_TRANSACTIONS_PER_SLOT (524288UL)

/* This number is not a strict bound, but is a reasonable maximum on
   the number of slots that can be constipated.  As of the writing of
   this comment, the only use case for constipating the status cache
   is to generate a snapshot.  We use constipation here because we
   want the root to stay frozen while we generate the full state of
   the node for a given rooted slot.  This maximum gives us roughly
   1024 slots * 0.4 secs / 60 secs/min = ~6.8 minutes from when we
   root a slot to when the status cache is done being serialized into
   the snapshot format.  This SHOULD be enough time because
   serializing the status cache into a Solana snapshot is done on the
   order of seconds and is one of the first things done during
   snapshot creation. */

#define FD_TXNCACHE_DEFAULT_MAX_CONSTIPATED_SLOTS (1024UL)

struct fd_txncache_insert {
  uchar const * blockhash;
  uchar const * txnhash;
  ulong         slot;
  uchar const * result;
};

typedef struct fd_txncache_insert fd_txncache_insert_t;

struct fd_txncache_query {
  uchar const * blockhash;
  uchar const * txnhash;
};

typedef struct fd_txncache_query fd_txncache_query_t;

struct fd_txncache_snapshot_entry {
  ulong slot;
  uchar blockhash[ 32 ];
  uchar txnhash[ 20 ];
  ulong txn_idx;
  uchar result;
};

typedef struct fd_txncache_snapshot_entry fd_txncache_snapshot_entry_t;

/* Forward declare opaque handle */
struct fd_txncache_private;
typedef struct fd_txncache_private fd_txncache_t;

FD_PROTOTYPES_BEGIN

/* fd_txncache_{align,footprint} give the needed alignment and
   footprint of a memory region suitable to hold a txn cache.
   fd_txncache_{align,footprint} return the same value as
   FD_TXNCACHE_{ALIGN,FOOTPRINT}.

   fd_txncache_new formats a memory region with suitable alignment and
   footprint for holding a txn cache.  Assumes shmem points in the
   caller's address space to the first byte of the memory region owned
   by the caller to use.  Returns shmem on success and NULL on failure
   (logs details).  The memory region will be owned by the state on
   successful return.  The caller is not joined on return.

   fd_txncache_join joins the caller to a txn cache.  Assumes shtc
   points to the first byte of the memory region holding the state.
   Returns a local handle to the join on success (this is not
   necessarily a simple cast of the address) and NULL on failure (logs
   details).

   fd_txncache_leave leaves the caller's current local join to a txn
   cache.  Returns a pointer to the memory region holding the state on
   success (this is not necessarily a simple cast of the address) and
   NULL on failure (logs details).  The caller is not joined on
   successful return.

   fd_txncache_delete unformats a memory region that holds a txn
   cache.  Assumes shtc points in the caller's address space to the
   first byte of the memory region holding the state and that nobody
   is joined.  Returns a pointer to the memory region on success and
   NULL on failure (logs details).  The caller has ownership of the
   memory region on successful return. */

FD_FN_CONST ulong
fd_txncache_align( void );

FD_FN_CONST ulong
fd_txncache_footprint( ulong max_rooted_slots,
                       ulong max_live_slots,
                       ulong max_txn_per_slot,
                       ulong max_constipated_slots );

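/* fd_txncache_max_constipated_slots_est gives a rough estimate of the
   number of constipated slots needed to cover a stall of
   stall_duration_secs seconds, assuming ~0.4 seconds per slot.  For
   example, a 60 second stall works out to ceil( 60/0.4 ) = 150
   slots. */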
static inline ulong
fd_txncache_max_constipated_slots_est( ulong stall_duration_secs ) {
  /* At ~0.4 seconds per slot, a stall of N seconds spans N/0.4 slots;
     round up so the estimate does not under-provision. */
  double stall_slots_d = (double)stall_duration_secs / 0.4;
  ulong  stall_slots   = (ulong)ceil( stall_slots_d );
  return stall_slots;
}

void *
fd_txncache_new( void * shmem,
                 ulong  max_rooted_slots,
                 ulong  max_live_slots,
                 ulong  max_txn_per_slot,
                 ulong  max_constipated_slots );

fd_txncache_t *
fd_txncache_join( void * shtc );

void *
fd_txncache_leave( fd_txncache_t * tc );

void *
fd_txncache_delete( void * shtc );
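
/* A minimal usage sketch of the constructor/join lifecycle.  This is
   illustrative only: it assumes the caller has a workspace wksp with
   enough space and uses example limits equal to the defaults above.

     ulong max_rooted = FD_TXNCACHE_DEFAULT_MAX_ROOTED_SLOTS;
     ulong max_live   = FD_TXNCACHE_DEFAULT_MAX_LIVE_SLOTS;
     ulong max_txn    = FD_TXNCACHE_DEFAULT_MAX_TRANSACTIONS_PER_SLOT;
     ulong max_const  = FD_TXNCACHE_DEFAULT_MAX_CONSTIPATED_SLOTS;

     void * mem = fd_wksp_alloc_laddr( wksp, fd_txncache_align(),
                                       fd_txncache_footprint( max_rooted, max_live, max_txn, max_const ),
                                       1UL );
     fd_txncache_t * tc = fd_txncache_join( fd_txncache_new( mem, max_rooted, max_live, max_txn, max_const ) );

     // ... insert, query, snapshot ...

     fd_wksp_free_laddr( fd_txncache_delete( fd_txncache_leave( tc ) ) );
*/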

/* fd_txncache_register_root_slot registers a root slot in a txn
   cache.  Only the provided limit of roots (typically 300) can exist
   in the txn cache at once, after which the oldest roots will be
   purged.

   Purging a root means it cannot be served in a snapshot response,
   although transactions in the root may or may not still be present.
   Transaction status is removed once all roots referencing the
   blockhash of the transaction are removed from the txn cache.

   This is neither cheap nor expensive: it will pause all insertion
   and query operations, but only momentarily until any old slots can
   be purged from the cache. */

void
fd_txncache_register_root_slot( fd_txncache_t * tc,
                                ulong           slot );

/* fd_txncache_register_constipated_slot is the "constipated" version
   of fd_txncache_register_root_slot.  While constipated, older root
   slots will not be purged, nor will newer root slots actually be
   rooted.  All slots marked as constipated are flushed down into the
   set of rooted slots when fd_txncache_flush_constipated_slots is
   called. */

void
fd_txncache_register_constipated_slot( fd_txncache_t * tc,
                                       ulong           slot );

void
fd_txncache_flush_constipated_slots( fd_txncache_t * tc );
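
/* One plausible sequencing of the constipation calls around snapshot
   creation, for illustration only (the snapshot service decides the
   actual ordering):

     fd_txncache_set_is_constipated( tc, 1 );           // freeze the rooted state

     // While the snapshot is being produced, newly rooted slots are
     // registered as constipated instead of being rooted directly.
     fd_txncache_register_constipated_slot( tc, slot );

     // ... serialize the frozen rooted state with fd_txncache_snapshot ...

     fd_txncache_flush_constipated_slots( tc );         // apply the buffered roots
     fd_txncache_set_is_constipated( tc, 0 );
*/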

/* fd_txncache_root_slots returns the list of root slots currently
   tracked by the txn cache.  There will be at most max_rooted_slots
   slots, which will be written into the provided out_slots.  It is
   assumed tc points to a txn cache and out_slots has space for at
   least max_rooted_slots results.  If there are fewer than
   max_rooted_slots slots in the cache, the front part of out_slots
   will be filled in, and all the remaining entries will be set to
   ULONG_MAX.

   This is a fast operation, but it will lock the whole structure and
   cause a temporary pause in insert and query operations. */

void
fd_txncache_root_slots( fd_txncache_t * tc,
                        ulong *         out_slots );
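
/* For example, assuming the cache was created with max_rooted_slots
   equal to FD_TXNCACHE_DEFAULT_MAX_ROOTED_SLOTS (an illustrative
   sketch):

     ulong roots[ FD_TXNCACHE_DEFAULT_MAX_ROOTED_SLOTS ];
     fd_txncache_root_slots( tc, roots );
     for( ulong i=0UL; i<FD_TXNCACHE_DEFAULT_MAX_ROOTED_SLOTS; i++ ) {
       if( roots[ i ]==ULONG_MAX ) break;  // remaining entries are unused
       // ... process rooted slot roots[ i ] ...
     }
*/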

/* fd_txncache_snapshot writes the current state of a txn cache into a
   binary format suitable for serving to other nodes via snapshot
   responses.  The write function is called in a streaming fashion
   with the binary data, the size of the data, and the ctx pointer
   provided to this function.  The write function should return 0 on
   success and -1 on failure.  This function will propagate a failure
   to write back to the caller immediately, so this function also
   returns 0 on success and -1 on failure.

   IMPORTANT!  THIS ASSUMES THERE ARE NO CONCURRENT INSERTS OCCURRING
   ON THE TXN CACHE AT THE ROOT SLOTS DURING SNAPSHOTTING.  OTHERWISE
   THE SNAPSHOT MIGHT NOT CONTAIN ALL OF THE DATA, ALTHOUGH IT WILL
   NOT CAUSE CORRUPTION.  THIS IS ASSUMED OK BECAUSE YOU CANNOT MODIFY
   A ROOTED SLOT.

   This is a cheap operation and will not cause any pause in insertion
   or query operations. */

int
fd_txncache_snapshot( fd_txncache_t * tc,
                      void *          ctx,
                      int ( * write )( uchar const * data, ulong data_sz, void * ctx ) );
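
/* A sketch of a write callback that streams the snapshot into a stdio
   FILE.  The callback name, file name, and use of stdio are
   illustrative, not part of this API:

     static int
     snapshot_write_file( uchar const * data, ulong data_sz, void * ctx ) {
       FILE * f = (FILE *)ctx;
       return fwrite( data, 1UL, data_sz, f )==data_sz ? 0 : -1;
     }

     // ... then ...
     FILE * f = fopen( "status_cache.bin", "wb" );
     int err = fd_txncache_snapshot( tc, f, snapshot_write_file );
     fclose( f );
*/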

/* fd_txncache_insert_batch inserts a batch of transaction results
   into a txn cache.  Assumes tc points to a txn cache, txns is a list
   of transaction results to be inserted, and txns_cnt is the count of
   the results list.  The insert copies data from the results and does
   not have any lifetime interest in the results memory provided once
   this call returns.

   Returns 1 on success and 0 on failure.  The only reason insertion
   can fail is because the txn cache is full, which should never
   happen in practice if the caller sizes the bounds correctly.  This
   is mostly here to support testing and fuzzing.

   This is a cheap, high performance, concurrent operation and can
   occur at the same time as queries and arbitrary other insertions. */

int
fd_txncache_insert_batch( fd_txncache_t *              tc,
                          fd_txncache_insert_t const * txns,
                          ulong                        txns_cnt );
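
/* For example, inserting the result of a single executed transaction.
   The variable names (recent_blockhash, txn_message_hash, slot,
   txn_result) are illustrative caller-owned values:

     fd_txncache_insert_t insert = {
       .blockhash = recent_blockhash,  // 32 byte blockhash the txn referenced
       .txnhash   = txn_message_hash,  // txn hash (the cache stores 20 bytes of it)
       .slot      = slot,              // slot the txn executed in
       .result    = &txn_result        // 1 byte status
     };
     if( FD_UNLIKELY( !fd_txncache_insert_batch( tc, &insert, 1UL ) ) ) {
       FD_LOG_ERR(( "txn cache full" ));
     }
*/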

/* fd_txncache_query_batch queries a batch of transactions to
   determine if they exist in the txn cache or not.  The queries have
   an ambiguous slot, but must match both the blockhash and txnhash.
   In addition, if the query_func is not NULL, the query_func will be
   called with the slot of the txn and the query_func_ctx that was
   provided, and the transaction will be considered present only if
   the query_func also returns 1.

   Assumes tc points to a txn cache, queries is a list of queries to
   be executed, and queries_cnt is the count of the queries list.
   out_results must be at least as large as queries_cnt and will be
   filled with 0 or 1 if the transaction is not present or present
   respectively.

   This is a cheap, high performance, concurrent operation and can
   occur at the same time as queries and arbitrary other insertions. */

void
fd_txncache_query_batch( fd_txncache_t *             tc,
                         fd_txncache_query_t const * queries,
                         ulong                       queries_cnt,
                         void *                      query_func_ctx,
                         int ( * query_func )( ulong slot, void * ctx ),
                         int *                       out_results );
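
/* A sketch of a duplicate check before execution.  The is_ancestor
   helper, fork_ctx_t type, and fork_contains_slot call are
   hypothetical; the query_func is typically used to restrict matches
   to slots on the fork being built on:

     static int
     is_ancestor( ulong slot, void * ctx ) {
       fork_ctx_t * fork = (fork_ctx_t *)ctx;    // hypothetical fork tracking context
       return fork_contains_slot( fork, slot );  // 1 if slot is an ancestor of our tip
     }

     fd_txncache_query_t query = { .blockhash = recent_blockhash, .txnhash = txn_message_hash };
     int present;
     fd_txncache_query_batch( tc, &query, 1UL, fork, is_ancestor, &present );
     if( present ) {
       // ... skip executing the duplicate transaction ...
     }
*/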

/* fd_txncache_set_txnhash_offset sets the correct offset value for
   the txn hash "key slice" in the blockcache and slotblockcache.
   This is used primarily for snapshot restore, since in Firedancer we
   always use an offset of 0.  Returns an error if the cache entry
   isn't found. */

int
fd_txncache_set_txnhash_offset( fd_txncache_t * tc,
                                ulong           slot,
                                uchar           blockhash[ 32 ],
                                ulong           txnhash_offset );

/* fd_txncache_is_rooted_slot returns 1 if `slot` is rooted, 0
   otherwise. */

int
fd_txncache_is_rooted_slot( fd_txncache_t * tc,
                            ulong           slot );

/* fd_txncache_get_entries is responsible for converting the rooted
   state of the status cache back into fd_bank_slot_deltas_t, which is
   the decoded format used by Agave.  This is a helper method used to
   generate Agave-compatible snapshots. */

int
fd_txncache_get_entries( fd_txncache_t *         tc,
                         fd_bank_slot_deltas_t * bank_slot_deltas,
                         fd_spad_t *             spad );

/* fd_txncache_{get,set}_is_constipated is used to determine and set
   whether the status cache is currently in a constipated state. */

int
fd_txncache_get_is_constipated( fd_txncache_t * tc );

int
fd_txncache_set_is_constipated( fd_txncache_t * tc, int is_constipated );

FD_PROTOTYPES_END

#endif /* HEADER_fd_src_flamenco_runtime_txncache_h */