LCOV - code coverage report
Current view: top level - discof/replay - fd_rdisp.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 2 3 66.7 %
Date: 2025-10-14 04:31:44 Functions: 0 0 -

          Line data    Source code
       1             : #ifndef HEADER_fd_src_discof_replay_fd_rdisp_h
       2             : #define HEADER_fd_src_discof_replay_fd_rdisp_h
       3             : 
       4             : #include "../../disco/fd_disco_base.h"
       5             : 
       6             : /* fd_rdisp defines methods for building a DAG (directed acyclic graph)
       7             :    of transactions, and executing them in the appropriate order with the
       8             :    maximum amount of parallelism.
       9             : 
      10             :    Transactions must appear to execute in the order in which they occur
      11             :    in the block (the "serial fiction").  However, when two transactions
      12             :    are independent, they can actually be scheduled in either order, or
      13             :    in parallel.  Two transactions are independent unless one writes to
      14             :    an account that the other reads or writes.  If two transactions are
      15             :    not independent, the one that comes earlier in the block is a
      16             :    predecessor transaction of the one that comes later in the block.  In
      17             :    order to ensure correct execution, a transaction cannot be executed
      18             :    until all its predecessor transactions have completed.  In this file,
      19             :    a transaction completing means that the results of its account writes
      20             :    will be visible to a subsequent transaction executing on any core.
      21             : 
      22             :    Shockingly, this dispatcher does not need to keep a copy of
      23             :    transactions (more on this later), which means it operates almost
      24             :    entirely on transaction indices.  An index is a positive integer
      25             :    (uint) up to some maximum.  The 0 index is a sentinel, and doesn't
      26             :    correspond to a real transaction.  It's up to the caller to maintain
      27             :    the mapping between each transaction index and the transaction itself.
      28             : 
      29             :    A transaction index is in one of the following states:
      30             :      * FREE, which means it doesn't correspond to a transaction.
      31             :      * PENDING, which means it corresponds to a transaction that can't
      32             :        be scheduled yet because it must come after transactions that
      33             :        have not completed execution yet.
      34             :      * READY, which means all predecessor transactions have completed
      35             :        execution, but this transaction index has not been returned by
      36             :        get_next_ready yet.
      37             :      * ZOMBIE, which means the transaction completed execution, and
      38             :        successor transactions may be transitioned to the READY state,
      39             :        but the transaction index should not be recycled yet, because
      40             :        there are outstanding non-execution tasks associated with this
      41             :        transaction.
      42             :      * DISPATCHED, which means this transaction index was returned by
      43             :        get_next_ready but has not been completed yet.
      44             : 
      45             : 
      46             :                         --------------> PENDING
      47             :                 add_txn |                  |
      48             :          FREE  ---------|                  |
      49             :           ^             |                  V    get_next_ready
      50             :           |             -------------->  READY ----------------
      51             :           |                                                   |
      52             :           |                                                   |
      53             :           |               complete_txn(reclaim)               V
      54             :           |<-------------------------------------------- DISPATCHED
      55             :           |                                                   |
      56             :           |                                                   | complete_txn(noreclaim)
      57             :           |               complete_txn(reclaim)               V
      58             :           |<---------------------------------------------- ZOMBIE
      59             : 
      60             : 
      61             :    Additionally, fd_rdisp is block-aware, and even somewhat fork-aware.
      62             :    Prior to inserting a transaction, a block must be added.  Then, when
      63             :    a transaction is inserted, the caller must specify the block it is
      64             :    part of.
      65             : 
      66             :    At all times, each block is either STAGED or UNSTAGED.  Blocks that
      67             :    are STAGED benefit from the full parallelization-maximizing dispatch,
      68             :    but at most four linear chains of blocks can be STAGED at any time.
      69             :    Assuming the limit of four is still satisfied, when a block is added,
      70             :    it may be added as STAGED.  Otherwise, if it's added as UNSTAGED, it
      71             :    may be promoted to STAGED later, though this is worse for performance
      72             :    than adding it as STAGED initially.  Blocks may be demoted from
      73             :    STAGED to UNSTAGED only if the linear chain doesn't contain any
      74             :    PENDING, READY, or DISPATCHED transactions.
      75             : 
      76             :    Prior to properly introducing staging lanes, we need to introduce two
      77             :    more concepts.  A block can be insert-ready, schedule-ready, neither,
      78             :    or both.  A block must be insert-ready to call add_txn on
      79             :    it; a block must be schedule-ready to call get_next_ready on it.
      80             :    Various functions either require or modify these properties of a
      81             :    block.  These properties are necessary but not sufficient for the
      82             :    respective functions to succeed; for example, a block may be
      83             :    schedule-ready but empty, in which case, scheduling will still not
      84             :    succeed.  An UNSTAGED block is always both insert-ready and
      85             :    schedule-ready.
      86             : 
      87             :    A staging lane can contain a single block or a sequence of blocks.
      88             :    When a staging lane contains more than one block, some restrictions
      89             :    apply, namely, only the first block in the sequence is
      90             :    schedule-ready, and only the last block in the sequence is
      91             :    insert-ready, where first and last are determined by the order in
      92             :    which the block is staged.  Although this restriction makes the
      93             :    interface a bit more complicated, it's driven by performance
      94             :    requirements and common use cases.  Basically, there's no use for
      95             :    being able to replay a block before we've replayed its parent block.
      96             : 
      97             :    Basically, rdisp is designed to handle three situations well:
      98             :     * The normal case, where replay is pretty much caught up, and there
      99             :       are only one or two forks, containing only one or two un-replayed
     100             :       blocks each.
     101             :     * The startup case, where repair is far ahead of replay, but there's
     102             :       only a single fork, or very few forks
     103             :     * The DoS case, where there are many forks, perhaps duplicate
     104             :       blocks, and we don't know which to prioritize yet, but we will
     105             :       execute some, and prune the rest later.  This includes the case
     106             :       where we stop receiving transactions from some of the forks but
     107             :       don't know that the block has ended.
     108             : 
     109             :    In the normal case, there are few enough unreplayed blocks that it
     110             :    doesn't really matter how the staging lanes are used.  For the
     111             :    startup case, the long linear chains of blocks can all be STAGED using
     112             :    the same lane with no performance degradation.  In the DoS case, most
     113             :    of the forks will be UNSTAGED, and using some combination of
     114             :    replaying, cancelling, and demoting blocks, staging lanes can be freed
     115             :    up so that the canonical chain can emerge.
     116             : 
     117             :    Consider the following example of storing a fork tree in staging
     118             :    lanes:
     119             :     - Slot 10's parent is not specified
     120             :     - Slot 11's parent is slot 10
     121             :     - Slot 13's parent is slot 11
     122             :     - Slot 14's parent is also slot 11
     123             : 
     124             :    Since the caller chooses the staging lane, the caller may choose
     125             :    between
     126             :              Lane 0: 10 --> 11 --> 13
     127             :              Lane 1: 14
     128             :    and
     129             :              Lane 0: 10 --> 11 --> 14
     130             :              Lane 1: 13.
     131             :    or any of the various combinations that consume more staging lanes.
     132             :    Note that the concept of staging lanes is a performance optimization,
     133             :    not a safety feature.  With the first arrangment, the caller cannot
     134             :    call get_next_ready on slot 13 in between slots 10 and 11, but
     135             :    there's no issue with calling it on slot 14 then, which would
     136             :    obiously result in an incorrect replay.  It's ultimately the callers
     137             :    responsibility to ensure correct replay. */
     138             : 
     139           0 : #define FD_RDISP_MAX_DEPTH       0x7FFFFFUL /* 23 bit numbers, approx 8M */
     140     2162664 : #define FD_RDISP_MAX_BLOCK_DEPTH 0xFFFFUL   /* 16 bits */
     141      405246 : #define FD_RDISP_UNSTAGED        ULONG_MAX
     142             : 
     143             : struct fd_rdisp;
     144             : typedef struct fd_rdisp fd_rdisp_t;
     145             : 
     146             : /* fd_rdisp is set up so that the tag of a block can be adjusted to
     147             :    account for differences in handling duplicate blocks/equivocation. */
     148             : #define FD_RDISP_BLOCK_TAG_T ulong
     149             : 
     150             : FD_PROTOTYPES_BEGIN
     151             : 
     152             : /* fd_rdisp_{align,footprint} return the required alignment and
     153             :    footprint in bytes for a region of memory to be used as a dispatcher.
     154             :    depth is the maximum number of transaction indices that can be
     155             :    tracked at a time.  Depth must be at least 2 and cannot exceed
     156             :    FD_RDISP_MAX_DEPTH.  block_depth is the maximum number of blocks that
     157             :    this dispatcher can track.  block_depth must be at least 4 and cannot
     158             :    exceed FD_RDISP_MAX_BLOCK_DEPTH. */
     159             : ulong fd_rdisp_align    ( void        );
     160             : ulong fd_rdisp_footprint( ulong depth, ulong block_depth );
     161             : 
     162             : 
     163             : /* fd_rdisp_new formats a region of memory that satisfies the required
     164             :    footprint and alignment for use as a dispatcher.  depth and
     165             :    block_depth are as explained in fd_rdisp_footprint.  mem is a pointer
     166             :    to the first byte of a region of memory with the required alignment
     167             :    and footprint.  seed is an arbitrary ulong that is used to determine
     168             :    a seed of various internal hash tables.  On return, the caller will
     169             :    not be joined.
     170             : 
     171             :    fd_rdisp_join joins the caller to the dispatcher, enabling it for
     172             :    use. */
     173             : void *
     174             : fd_rdisp_new( void * mem,
     175             :               ulong  depth,
     176             :               ulong  block_depth,
     177             :               ulong  seed );
     178             : 
     179             : fd_rdisp_t *
     180             : fd_rdisp_join( void * mem );
     181             : 
     182             : /* fd_rdisp_suggest_staging_lane recommends a staging lane to use for a
     183             :    potential new block that has a parent block with block tag
     184             :    parent_block.  duplicate is non-zero if this is not the first block
     185             :    we've seen for its slot.
     186             : 
     187             :    This function uses the following logic:
     188             :    1. If it's a duplicate, suggest FD_RDISP_UNSTAGED
     189             :    2. If parent is the last block in any existing staging lane, suggest
     190             :       that lane
     191             :    3. If there is at least one free lane, suggest a free lane
     192             :    4. Else, suggest FD_RDISP_UNSTAGED
     193             :    Note that this function does not add the block (use add_block) for
     194             :    that, and does not modify the state of the dispatcher.  The caller
     195             :    should feel free to use or not use the suggested staging lane. */
     196             : ulong
     197             : fd_rdisp_suggest_staging_lane( fd_rdisp_t const *   disp,
     198             :                                FD_RDISP_BLOCK_TAG_T parent_block,
     199             :                                int                  duplicate );
     200             : 
     201             : 
     202             : /* fd_rdisp_add_block allocates a new block with the tag new_block from
     203             :    disp's internal pool.  new_block must not be the invalid block tag
     204             :    value, and it must be distinct from all other values passed as
     205             :    new_block in all prior calls to fd_rdisp_add_block.
     206             : 
     207             :    staging_lane must be either [0,4) or FD_RDISP_UNSTAGED.  If
     208             :    staging_lane is FD_RDISP_UNSTAGED, the block will be UNSTAGED (see
     209             :    the long comment at the beginning of this header), schedule-ready,
     210             :    and insert-ready.
     211             :    If staging_lane is in [0, 4), the block will be STAGED, and it will
     212             :    be insert-ready.  If the specified staging lane contained any blocks
     213             :    at the time of the call, the last one will no longer be insert-ready,
     214             :    making this the only insert-ready block in the lane.  If the
     215             :    specified staging lane did not contain any blocks at the time of the
     216             :    call, then the newly added block will also be schedule-ready.
     217             : 
     218             :    On successful return, the tag new_block will be usable for other
     219             :    functions that take a block tag block.
     220             : 
     221             :    Returns 0 on success, and -1 on error, which can only happen if out
     222             :    of resources (the number of unremoved blocks is greater than or equal
     223             :    to the block_depth) or if new_block was already known. */
     224             : int
     225             : fd_rdisp_add_block( fd_rdisp_t *          disp,
     226             :                     FD_RDISP_BLOCK_TAG_T  new_block,
     227             :                     ulong                 staging_lane );
     228             : 
     229             : /* fd_rdisp_remove_block deallocates a previously-allocated block with
     230             :    the block tag block, freeing all resources associated with it.  block
     231             :    must be empty (not contain any transactions in the PENDING, READY,
     232             :    DISPATCHED, or ZOMBIE states), and schedule-ready.
     233             :    Returns 0 on success, and -1 if block is not known.  After a
     234             :    successful return, the block tag block will not be known. */
     235             : int
     236             : fd_rdisp_remove_block( fd_rdisp_t *          disp,
     237             :                        FD_RDISP_BLOCK_TAG_T  block );
     238             : 
     239             : 
     240             : /* fd_rdisp_abandon_block is similar to remove_block, but works when the
     241             :    block contains transactions.  It immediately transitions all
     242             :    transactions part of the block to FREE, and then removes the block as
     243             :    in fd_rdisp_remove_block.  Note that if a transaction is DISPATCHED
     244             :    at the time of the call complete_txn should NOT be called on that
     245             :    transaction index when it completes.  The specified block must be
     246             :    schedule-ready.
     247             : 
     248             :    In V1 of the dispatcher, this only works if there are no DISPATCHED
     249             :    transactions.
     250             : 
     251             :    Returns 0 on success, and -1 if block is not known.  After a
     252             :    successful return, the block tag block will not be known. */
     253             : int
     254             : fd_rdisp_abandon_block( fd_rdisp_t          * disp,
     255             :                         FD_RDISP_BLOCK_TAG_T  block );
     256             : 
     257             : 
     258             : /* fd_rdisp_{promote,demote}_block modify whether a block is STAGED or
     259             :    UNSTAGED.  Specifically, promote_block promotes the specified block
     260             :    from UNSTAGED to STAGED, using the specified staging_lane.
     261             :    demote_block demotes the specified block from STAGED to UNSTAGED.
     262             :    disp must be a valid local join.  If the block tag block is not
     263             :    known, or is not in the requisite state (UNSTAGED from promote,
     264             :    STAGED for demote), returns -1.
     265             : 
     266             :    When promote_block promotes the specified block, it is placed at the
     267             :    end of the linear chain in the specified staging_lane.  That means
     268             :    the operation is as if abandon_block were called on the specified
     269             :    block, then add_block with the specified staging_lane, and then all
     270             :    transactions in the PENDING and READY states in this block were
     271             :    re-added in the same order they were originally added.  It is
     272             :    undefined behavior if the specified block contains any transactions
     273             :    in the DISPATCHED stage.  As in add_block, upon successful return,
     274             :    the specified block will be insert-ready, but will only be
     275             :    schedule-ready if the specified staging lane was empty at the time of
     276             :    the call.
     277             : 
     278             :    demote_block has the additional requirement that the specified block
     279             :    must be schedule-ready and empty, that is, not containing any
     280             :    transactions in the PENDING, READY, or DISPATCHED states. */
     281             : 
     282             : int
     283             : fd_rdisp_promote_block( fd_rdisp_t *          disp,
     284             :                         FD_RDISP_BLOCK_TAG_T  block,
     285             :                         ulong                 staging_lane );
     286             : int
     287             : fd_rdisp_demote_block( fd_rdisp_t *          disp,
     288             :                        FD_RDISP_BLOCK_TAG_T  block );
     289             : 
     290             : 
     291             : /* fd_rdisp_rekey_block renames the block with tag old_tag so that it
     292             :    has tag new_tag instead.  The block retains all transactions, it's
     293             :    STAGED/UNSTAGED state, etc.  On successful return, tag old_tag will
     294             :    know longer be a known tag, and new_tag must be used in any future
     295             :    calls to refer to the block previously known as old_tag.
     296             : 
     297             :    disp must be a valid local join.  new_tag must not be a known tag,
     298             :    but old_tag must be a known tag.
     299             : 
     300             :    Return 0 on success and -1 on error.  The only error cases are if
     301             :    new_tag is already a known tag or old_tag is not a known tag. */
     302             : int
     303             : fd_rdisp_rekey_block( fd_rdisp_t *           disp,
     304             :                       FD_RDISP_BLOCK_TAG_T   new_tag,
     305             :                       FD_RDISP_BLOCK_TAG_T   old_tag );
     306             : 
     307             : /* fd_rdisp_add_txn adds a transaction to the block with tag
     308             :    insert_block in serial order.  That means that this dispatcher will
     309             :    ensure this transaction appears to execute after each transaction
     310             :    added to this block in a prior call.
     311             : 
     312             :    insert_block must be a known block that is insert-ready.  txn,
     313             :    payload, and alts describe the transaction to be added.  txn must be
     314             :    the result of parsing payload, and alts contains the expansion and
     315             :    selection of the address lookup tables mentioned in the transaction
     316             :    (i.e. all the writable accounts followed by all the read-only
     317             :    accounts).  alts may be NULL, even if the transaction specifies that
     318             :    it loads accounts from an address lookup table; in this case,
     319             :    addresses from ALTs are ignored.
     320             : 
     321             :    Shockingly, this dispatcher does not retain any read interest (much
     322             :    less write interest) in the transaction (txn, payload, or alts).  On
     323             :    success, it returns a transaction index that was previously in the
     324             :    FREE state.  This API is designed to facilitate a model of use where
     325             :    the replay tile copies the incoming transaction to a region of
     326             :    private memory, adds it to this dispatcher, and then copies it (using
     327             :    non-temporal stores) to the output dcache at a location determined by
     328             :    the returned index.  Although there are two memcpys in this approach,
     329             :    it should result in fewer cache misses.
     330             : 
     331             :    If serializing is non-zero, this transaction will be a serialization
     332             :    point: all transactions added prior to this one must complete before
     333             :    this transaction can be scheduled, and this transaction must complete
     334             :    before any subsequently added transactions can be scheduled.  This is
     335             :    not good for performance, but is useful for the rare case when
     336             :    repair/turbine is several slots ahead of replay, and a transaction
     337             :    loads some accounts from an address lookup table, but we haven't
     338             :    executed the transaction to populate that part of the address lookup
     339             :    table yet.  This is the primary use for alts==NULL.
     340             : 
     341             :    Returns 0 and does not add the transaction on failure.  Fails if
     342             :    there were no free transaction indices, if the block with tag
     343             :    insert_block did not exist, or if it was not schedule-ready.
     344             : 
     345             :    At the time this function returns, the returned transaction index
     346             :    will be in the PENDING or READY state, depending on whether it
     347             :    conflicts with something previously inserted. */
     348             : ulong
     349             : fd_rdisp_add_txn( fd_rdisp_t          *  disp,
     350             :                   FD_RDISP_BLOCK_TAG_T   insert_block,
     351             :                   fd_txn_t const       * txn,
     352             :                   uchar const          * payload,
     353             :                   fd_acct_addr_t const * alts,
     354             :                   int                    serializing );
     355             : 
     356             : /* fd_rdisp_get_next_ready returns the transaction index of a READY
     357             :    transaction that was inserted with block tag schedule_block if one
     358             :    exists, and 0 otherwise.  The block with the tag schedule_block must
     359             :    be schedule-ready.
     360             : 
     361             :    If there are multiple READY transactions, which exact one is returned
     362             :    is arbitrary.  That said, this function does make some effort to pick
     363             :    one that (upon completion) will unlock more parallelism.  disp must
     364             :    be a valid local join.  At the time this function returns, the
     365             :    returned transaction index (if nonzero) will transition to the
     366             :    DISPATCHED state. */
     367             : ulong
     368             : fd_rdisp_get_next_ready( fd_rdisp_t           * disp,
     369             :                          FD_RDISP_BLOCK_TAG_T   schedule_block );
     370             : 
     371             : /* fd_rdisp_complete_txn notifies the dispatcher that the specified
     372             :    transaction (which must have been in the DISPATCHED state) has
     373             :    completed.  Logs warning and returns on error (invalid txn_idx, not
     374             :    in DISPATCHED state).  This function may cause other transactions to
     375             :    transition from PENDING to READY.
     376             : 
     377             :    At the time this function returns, the specified transaction index
     378             :    will be in the FREE state, if reclaim!=0.  Otherwise, if reclaim==0,
     379             :    the specified transaction index will be in the ZOMBIE state.  A
     380             :    ZOMBIE transaction has the exact same effect as a FREE transaction on
     381             :    causing other transactions to transition from PENDING to READY.
     382             :    However, the dispatcher will not reclaim the specified transaction
     383             :    index, until a future invocation of fd_rdisp_complete_txn where
     384             :    reclaim!=0.  This is useful when there is non-execution work to be
     385             :    done asynchronously for the transaction, but the caller would like to
     386             :    unblock the execution of transactions that depend on this one. */
     387             : void
     388             : fd_rdisp_complete_txn( fd_rdisp_t * disp,
     389             :                        ulong        txn_idx,
     390             :                        int          reclaim );
     391             : 
     392             : 
     393             : typedef struct {
     394             :   FD_RDISP_BLOCK_TAG_T  schedule_ready_block;
     395             :   FD_RDISP_BLOCK_TAG_T  insert_ready_block;
     396             : } fd_rdisp_staging_lane_info_t;
     397             : 
     398             : /* fd_rdisp_staging_lane_info copies the current staging lane info to
     399             :    out.  Returns a 4-bit bitset, where bit i being set means that
     400             :    staging lane i is occupied.  If staging lane i is occupied, then
     401             :    out_sched[i] is populated. */
     402             : ulong
     403             : fd_rdisp_staging_lane_info( fd_rdisp_t           const * disp,
     404             :                             fd_rdisp_staging_lane_info_t out_sched[ static 4 ] );
     405             : 
     406             : /* fd_rdisp_verify does some light verification and internal consistency
     407             :    checks of some internal data structures.  Aborts with an error
     408             :    message if anything fails verification.  disp is a pointer to a valid
     409             :    local join, and scratch is a pointer to a region of scratch memory
     410             :    with at least depth+1 elements that will be clobbered (its contents
     411             :    at the time of the function call are ignored). */
     412             : void
     413             : fd_rdisp_verify( fd_rdisp_t const * disp,
     414             :                  uint             * scratch );
     415             : 
     416             : void *
     417             : fd_rdisp_leave( fd_rdisp_t * disp );
     418             : 
     419             : void *
     420             : fd_rdisp_delete( void * mem );
     421             : 
     422             : FD_PROTOTYPES_END
     423             : 
     424             : #endif /* HEADER_fd_src_discof_replay_fd_rdisp_h */

Generated by: LCOV version 1.14