LCOV - code coverage report
Current view: top level - discof/replay - fd_rdisp.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 1 2 50.0 %
Date: 2025-09-19 04:41:14 Functions: 0 0 -

          Line data    Source code
       1             : #ifndef HEADER_fd_src_discof_replay_fd_rdisp_h
       2             : #define HEADER_fd_src_discof_replay_fd_rdisp_h
       3             : 
       4             : #include "../../disco/fd_disco_base.h"
       5             : 
       6             : /* fd_rdisp defines methods for building a DAG (directed acyclic graph)
       7             :    of transactions, and executing them in the appropriate order with the
       8             :    maximum amount of parallelism.
       9             : 
      10             :    Transactions must appear to execute in the order in which they occur
      11             :    in the block (the "serial fiction").  However, when two transactions
      12             :    are independent, they can actually be scheduled in either order, or
      13             :    in parallel.  Two transactions are independent unless one writes to
      14             :    an account that the other reads or writes.  If two transactions are
      15             :    not independent, the one that comes earlier in the block is a
      16             :    predecessor transaction of the one that comes later in the block.  In
      17             :    order to ensure correct execution, a transaction cannot be executed
      18             :    until all its predecessor transactions have completed.  In this file,
      19             :    a transaction completing means that the results of its account writes
      20             :    will be visible to a subsequent transaction executing on any core.
      21             : 
      22             :    Shockingly, this dispatcher does not need to keep a copy of
      23             :    transactions (more on this later), which means it operates almost
      24             :    entirely on transaction indices.  An index is a positive integer
      25             :    (uint) up to some maximum.  The 0 index is a sentinel, and doesn't
      26             :    correspond to a real transaction.  It's up to the caller to maintain
      27             :    the mapping between each transaction index and the transaction itself.
      28             : 
      29             :    A transaction index is in one of the following states:
      30             :      * FREE, which means it doesn't correspond to a transaction
      31             :      * PENDING, which means it corresponds to a transaction that can't
      32             :        be scheduled yet because it must come after transactions that
      33             :        have not completed yet.
      34             :      * READY, which means all predecessor transactions have completed,
      35             :        but this transaction index has not been returned by
      36             :        get_next_ready yet
      37             :      * DISPATCHED, which means this transaction index was returned by
      38             :        get_next_ready but has not been completed yet.
      39             : 
      40             : 
      41             :                         --------------> PENDING
      42             :                 add_txn |                  |
      43             :          FREE  ---------|                  |
      44             :           ^             |                  V    get_next_ready
      45             :           |             -------------->  READY ----------------
      46             :           |                                                   |
      47             :           |                                                   |
      48             :           |                                                   |
      49             :           |                  complete_txn                     V
      50             :           ---------------------------------------------- DISPATCHED
      51             : 
      52             : 
      53             :    Additionally, fd_rdisp is block-aware, and even somewhat fork-aware.
      54             :    Prior to inserting a transaction, a block must be added.  Then, when
      55             :    a transaction is inserted, the caller must specify the block it is
      56             :    part of.
      57             : 
      58             :    At all times, each block is either STAGED or UNSTAGED.  Blocks that
      59             :    are STAGED benefit from the full parallelization-maximizing dispatch,
      60             :    but at most four linear chains of blocks can be STAGED at any time.
      61             :    Assuming the limit of four is still satisfied, when a block is added,
      62             :    it may be added as STAGED.  Otherwise, if it's added as UNSTAGED, it
      63             :    may be promoted to STAGED later, though this is worse for performance
      64             :    than adding it as STAGED initially.  Blocks may be demoted from
      65             :    STAGED to UNSTAGED only if the linear chain doesn't contain any
      66             :    PENDING, READY, or DISPATCHED transactions.
      67             : 
      68             :    Prior to properly introducing staging lanes, we need to introduce two
      69             :    more concepts.  A block can be insert-ready, schedule-ready, neither,
      70             :    or both.  A block must be insert-ready to call fd_rdisp_add_txn on
      71             :    it; a block must be schedule-ready to call get_next_ready on it.
      72             :    Various functions either require or modify these properties of a
      73             :    block.  These properties are necessary but not sufficient for the
      74             :    respective functions to succeed; for example, a block may be
      75             :    schedule-ready but empty, in which case, scheduling will still not
      76             :    succeed.  An UNSTAGED block is always both insert-ready and
      77             :    schedule-ready.
      78             : 
      79             :    A staging lane can contain a single block or a sequence of blocks.
      80             :    When a staging lane contains more than one block, some restrictions
      81             :    apply, namely, only the first block in the sequence is
      82             :    schedule-ready, and only the last block in the sequence is
      83             :    insert-ready, where first and last are determined by the order in
      84             :    which the block is staged.  Although this restriction makes the
      85             :    interface a bit more complicated, it's driven by performance
      86             :    requirements and common use cases.  Basically, there's no use for
      87             :    being able to replay a block before we've replayed its parent block.
      88             : 
      89             :    Basically, rdisp is designed to handle three situations well:
      90             :     * The normal case, where replay is pretty much caught up, and there
      91             :       are only one or two forks, containing only one or two un-replayed
      92             :       blocks each.
      93             :     * The startup case, where repair is far ahead of replay, but there's
      94             :       only a single fork, or very few forks
      95             :     * The DoS case, where there are many forks, perhaps duplicate
      96             :       blocks, and we don't know which to prioritize yet, but we will
      97             :       execute some, and prune the rest later.  This includes the case
      98             :       where we stop receiving transactions from some of the forks but
      99             :       don't know that the block has ended.
     100             : 
     101             :    In the normal case, there are few enough unreplayed blocks that it
     102             :    doesn't really matter how the staging lanes are used.  For the
     103             :    startup case, the long linear chains of blocks can all be STAGED using
     104             :    the same lane with no performance degradation.  In the DoS case, most
     105             :    of the forks will be UNSTAGED, and using some combination of
     106             :    replaying, cancelling, and demoting blocks, staging lanes can be freed
     107             :    up so that the canonical chain can emerge.
     108             : 
     109             :    Consider the following example of storing a fork tree in staging
     110             :    lanes:
     111             :     - Slot 10's parent is not specified
     112             :     - Slot 11's parent is slot 10
     113             :     - Slot 13's parent is slot 11
     114             :     - Slot 14's parent is also slot 11
     115             : 
     116             :    Since the caller chooses the staging lane, the caller may choose
     117             :    between
     118             :              Lane 0: 10 --> 11 --> 13
     119             :              Lane 1: 14
     120             :    and
     121             :              Lane 0: 10 --> 11 --> 14
     122             :              Lane 1: 13.
     123             :    or any of the various combinations that consume more staging lanes.
     124             :    Note that the concept of staging lanes is a performance optimization,
     125             :    not a safety feature.  With the first arrangment, the caller cannot
     126             :    call get_next_ready on slot 13 in between slots 10 and 11, but
     127             :    there's no issue with calling it on slot 14 then, which would
     128             :    obiously result in an incorrect replay.  It's ultimately the callers
     129             :    responsibility to ensure correct replay. */
     130             : 
     131           0 : #define FD_RDISP_MAX_DEPTH       0x7FFFFFUL /* 23 bit numbers, approx 8M */
     132             : #define FD_RDISP_MAX_BLOCK_DEPTH 0xFFFFUL   /* 16 bits */
     133        9033 : #define FD_RDISP_UNSTAGED        ULONG_MAX
     134             : 
     135             : struct fd_rdisp;
     136             : typedef struct fd_rdisp fd_rdisp_t;
     137             : 
     138             : /* fd_rdisp is set up so that the tag of a block can be adjusted to
     139             :    account for differences in handling duplicate blocks/equivocation. */
     140             : #define FD_RDISP_BLOCK_TAG_T ulong
     141             : 
     142             : FD_PROTOTYPES_BEGIN
     143             : 
     144             : /* fd_rdisp_{align,footprint} return the required alignment and
     145             :    footprint in bytes for a region of memory to be used as a dispatcher.
     146             :    depth is the maximum number of transaction indices that can be
     147             :    tracked at a time.  Depth must be at least 2 and cannot exceed
     148             :    FD_RDISP_MAX_DEPTH.  block_depth is the maximum number of blocks that
     149             :    this dispatcher can track.  block_depth must be at least 4 and cannot
     150             :    exceed FD_RDISP_MAX_BLOCK_DEPTH. */
     151             : ulong fd_rdisp_align    ( void        );
     152             : ulong fd_rdisp_footprint( ulong depth, ulong block_depth );
     153             : 
     154             : 
     155             : /* fd_rdisp_new formats a region of memory that satisfies the required
     156             :    footprint and alignment for use as a dispatcher.  depth and
     157             :    block_depth are as explained in fd_rdisp_footprint.  mem is a pointer
     158             :    to the first byte of a region of memory with the required alignment
     159             :    and footprint.  seed is an arbitrary ulong that is used to determine
     160             :    a seed of various internal hash tables.  On return, the caller will
     161             :    not be joined.
     162             : 
     163             :    fd_rdisp_join joins the caller to the dispatcher, enabling it for
     164             :    use. */
     165             : void *
     166             : fd_rdisp_new( void * mem,
     167             :               ulong  depth,
     168             :               ulong  block_depth,
     169             :               ulong  seed );
     170             : 
     171             : fd_rdisp_t *
     172             : fd_rdisp_join( void * mem );
     173             : 
     174             : /* fd_rdisp_suggest_staging_lane recommends a staging lane to use for a
     175             :    potential new block that has a parent block with block tag
     176             :    parent_block.  duplicate is non-zero if this is not the first block
     177             :    we've seen for its slot.
     178             : 
     179             :    This function uses the following logic:
     180             :    1. If it's a duplicate, suggest FD_RDISP_UNSTAGED
     181             :    2. If parent is the last block in any existing staging lane, suggest
     182             :       that lane
     183             :    3. If there is at least one free lane, suggest a free lane
     184             :    4. Else, suggest FD_RDISP_UNSTAGED
     185             :    Note that this function does not add the block (use add_block) for
     186             :    that, and does not modify the state of the dispatcher.  The caller
     187             :    should feel free to use or not use the suggested staging lane. */
     188             : ulong
     189             : fd_rdisp_suggest_staging_lane( fd_rdisp_t const *   disp,
     190             :                                FD_RDISP_BLOCK_TAG_T parent_block,
     191             :                                int                  duplicate );
     192             : 
     193             : 
     194             : /* fd_rdisp_add_block allocates a new block with the tag new_block from
     195             :    disp's internal pool.  new_block must not be the invalid block tag
     196             :    value, and it must be distinct from all other values passed as
     197             :    new_block in all prior calls to fd_rdisp_add_block.
     198             : 
     199             :    staging_lane must be either [0,4) or FD_RDISP_UNSTAGED.  If
     200             :    staging_lane is FD_RDISP_UNSTAGED, the block will be UNSTAGED (see
     201             :    the long comment at the beginning of this header), schedule-ready,
     202             :    and insert-ready.
     203             :    If staging_lane is in [0, 4), the block will be STAGED, and it will
     204             :    be insert-ready.  If the specified staging lane contained any blocks
     205             :    at the time of the call, the last one will no longer be insert-ready,
     206             :    making this the only insert-ready block in the lane.  If the
     207             :    specified staging lane did not contain any blocks at the time of the
     208             :    call, then the newly added block will also be schedule-ready.
     209             : 
     210             :    On successful return, the tag new_block will be usable for other
     211             :    functions that take a block tag block.
     212             : 
     213             :    Returns 0 on success, and -1 on error, which can only happen if out
     214             :    of resources (the number of unremoved blocks is greater than or equal
     215             :    to the block_depth) or if new_block was already known. */
     216             : int
     217             : fd_rdisp_add_block( fd_rdisp_t *          disp,
     218             :                     FD_RDISP_BLOCK_TAG_T  new_block,
     219             :                     ulong                 staging_lane );
     220             : 
     221             : /* fd_rdisp_remove_block deallocates a previously-allocated block with
     222             :    the block tag block, freeing all resources associated with it.  block
     223             :    must be empty (not contain any transactions in the PENDING, READY, or
     224             :    DISPATCHED states), and schedule-ready.
     225             :    Returns 0 on success, and -1 if block is not known.  After a
     226             :    successful return, the block tag block will not be known. */
     227             : int
     228             : fd_rdisp_remove_block( fd_rdisp_t *          disp,
     229             :                        FD_RDISP_BLOCK_TAG_T  block );
     230             : 
     231             : 
     232             : /* fd_rdisp_abandon_block is similar to remove_block, but works when the
     233             :    block contains transactions.  It immediately transitions all
     234             :    transactions part of the block to FREE, and then removes the block as
     235             :    in fd_rdisp_remove_block.  Note that if a transaction is DISPATCHED
     236             :    at the time of the call complete_txn should NOT be called on that
     237             :    transaction index when it completes.  The specified block must be
     238             :    schedule-ready.
     239             : 
     240             :    In V1 of the dispatcher, this only works if there are no DISPATCHED
     241             :    transactions.
     242             : 
     243             :    Returns 0 on success, and -1 if block is not known.  After a
     244             :    successful return, the block tag block will not be known. */
     245             : int
     246             : fd_rdisp_abandon_block( fd_rdisp_t          * disp,
     247             :                         FD_RDISP_BLOCK_TAG_T  block );
     248             : 
     249             : 
     250             : /* fd_rdisp_{promote,demote}_block modify whether a block is STAGED or
     251             :    UNSTAGED.  Specifically, promote_block promotes the specified block
     252             :    from UNSTAGED to STAGED, using the specified staging_lane.
     253             :    demote_block demotes the specified block from STAGED to UNSTAGED.
     254             :    disp must be a valid local join.  If the block tag block is not
     255             :    known, or is not in the requisite state (UNSTAGED from promote,
     256             :    STAGED for demote), returns -1.
     257             : 
     258             :    When promote_block promotes the specified block, it is placed at the
     259             :    end of the linear chain in the specified staging_lane.  That means
     260             :    the operation is as if abandon_block were called on the specified
     261             :    block, then add_block with the specified staging_lane, and then all
     262             :    transactions in the PENDING and READY states in this block were
     263             :    re-added in the same order they were originally added.  It is
     264             :    undefined behavior if the specified block contains any transactions
     265             :    in the DISPATCHED stage.  As in add_block, upon successful return,
     266             :    the specified block will be insert-ready, but will only be
     267             :    schedule-ready if the specified staging lane was empty at the time of
     268             :    the call.
     269             : 
     270             :    demote_block has the additional requirement that the specified block
     271             :    must be schedule-ready and empty, that is, not containing any
     272             :    transactions in the PENDING, READY, or DISPATCHED states. */
     273             : 
     274             : int
     275             : fd_rdisp_promote_block( fd_rdisp_t *          disp,
     276             :                         FD_RDISP_BLOCK_TAG_T  block,
     277             :                         ulong                 staging_lane );
     278             : int
     279             : fd_rdisp_demote_block( fd_rdisp_t *          disp,
     280             :                        FD_RDISP_BLOCK_TAG_T  block );
     281             : 
     282             : 
     283             : /* fd_rdisp_rekey_block renames the block with tag old_tag so that it
     284             :    has tag new_tag instead.  The block retains all transactions, it's
     285             :    STAGED/UNSTAGED state, etc.  On successful return, tag old_tag will
     286             :    know longer be a known tag, and new_tag must be used in any future
     287             :    calls to refer to the block previously known as old_tag.
     288             : 
     289             :    disp must be a valid local join.  new_tag must not be a known tag,
     290             :    but old_tag must be a known tag.
     291             : 
     292             :    Return 0 on success and -1 on error.  The only error cases are if
     293             :    new_tag is already a known tag or old_tag is not a known tag. */
     294             : int
     295             : fd_rdisp_rekey_block( fd_rdisp_t *           disp,
     296             :                       FD_RDISP_BLOCK_TAG_T   new_tag,
     297             :                       FD_RDISP_BLOCK_TAG_T   old_tag );
     298             : 
     299             : /* fd_rdisp_add_txn adds a transaction to the block with tag
     300             :    insert_block in serial order.  That means that this dispatcher will
     301             :    ensure this transaction appears to execute after each transaction
     302             :    added to this block in a prior call.
     303             : 
     304             :    insert_block must be a known block that is schedule-ready.  txn,
     305             :    payload, and alts describe the transaction to be added.  txn must be
     306             :    the result of parsing payload, and alts contains the expansion and
     307             :    selection of the address lookup tables mentioned in the transaction
     308             :    (i.e. all the writable accounts followed by all the read-only
     309             :    accounts).  alts may be NULL, even if the transaction specifies that
     310             :    it loads accounts from an address lookup table; in this case,
     311             :    addresses from ALTs are ignored.
     312             : 
     313             :    Shockingly, this dispatcher does not retain any read interest (much
     314             :    less write interest) in the transaction (txn, payload, or alts).  On
     315             :    success, it returns a transaction index that was previously in the
     316             :    FREE state.  This API is designed to facilitate a model of use where
     317             :    the replay tile copies the incoming transaction to a region of
     318             :    private memory, adds it to this dispatcher, and then copies it (using
     319             :    non-temporal stores) to the output dcache at a location determined by
     320             :    the returned index.  Although there are two memcpys in this approach,
     321             :    it should result in fewer cache misses.
     322             : 
     323             :    If serializing is non-zero, this transaction will be a serialization
     324             :    point: all transactions added prior to this one must complete before
     325             :    this transaction can be scheduled, and this transaction must complete
     326             :    before any subsequently added transactions can be scheduled.  This is
     327             :    not good for performance, but is useful for the rare case when
     328             :    repair/turbine is several slots ahead of replay, and a transaction
     329             :    loads some accounts from an address lookup table, but we haven't
     330             :    executed the transaction to populate that part of the address lookup
     331             :    table yet.  This is the primary use for alts==NULL.
     332             : 
     333             :    Returns 0 and does not add the transaction on failure.  Fails if
     334             :    there were no free transaction indices, if the block with tag
     335             :    insert_block did not exist, or if it was not schedule-ready.
     336             : 
     337             :    At the time this function returns, the returned transaction index
     338             :    will be in the PENDING or READY state, depending on whether it
     339             :    conflicts with something previously inserted. */
     340             : ulong
     341             : fd_rdisp_add_txn( fd_rdisp_t          *  disp,
     342             :                   FD_RDISP_BLOCK_TAG_T   insert_block,
     343             :                   fd_txn_t const       * txn,
     344             :                   uchar const          * payload,
     345             :                   fd_acct_addr_t const * alts,
     346             :                   int                    serializing );
     347             : 
     348             : /* fd_rdisp_get_next_ready returns the transaction index of a READY
     349             :    transaction that was inserted with block tag schedule_block if one
     350             :    exists, and 0 otherwise.  The block with the tag schedule_block must
     351             :    be schedule-ready.
     352             : 
     353             :    If there are multiple READY transactions, which exact one is returned
     354             :    is arbitrary.  That said, this function does make some effort to pick
     355             :    one that (upon completion) will unlock more parallelism.  disp must
     356             :    be a valid local join.  At the time this function returns, the
     357             :    returned transaction index (if nonzero) will transition to the
     358             :    DISPATCHED state. */
     359             : ulong
     360             : fd_rdisp_get_next_ready( fd_rdisp_t           * disp,
     361             :                          FD_RDISP_BLOCK_TAG_T   schedule_block );
     362             : 
     363             : /* fd_rdisp_complete_txn notifies the dispatcher that the
     364             :    specified transaction (which must have been in the DISPATCHED state)
     365             :    has completed.  Logs warning and returns on error (invalid txn_idx,
     366             :    not in DISPATCHED state).  At the time this function returns, the
     367             :    specified transaction index will be in the FREE state.  This function
     368             :    may cause other transaction to transition from PENDING to READY. */
     369             : void
     370             : fd_rdisp_complete_txn( fd_rdisp_t * disp,
     371             :                        ulong        txn_idx );
     372             : 
     373             : 
     374             : typedef struct {
     375             :   FD_RDISP_BLOCK_TAG_T  schedule_ready_block;
     376             :   FD_RDISP_BLOCK_TAG_T  insert_ready_block;
     377             : } fd_rdisp_staging_lane_info_t;
     378             : 
     379             : /* fd_rdisp_staging_lane_info copies the current staging lane info to
     380             :    out.  Returns a 4-bit bitset, where bit i being set means that
     381             :    staging lane i is occupied.  If staging lane i is occupied, then
     382             :    out_sched[i] is populated. */
     383             : ulong
     384             : fd_rdisp_staging_lane_info( fd_rdisp_t           const * disp,
     385             :                             fd_rdisp_staging_lane_info_t out_sched[ static 4 ] );
     386             : 
     387             : void
     388             : fd_rdisp_verify( fd_rdisp_t const * disp );
     389             : 
     390             : void *
     391             : fd_rdisp_leave( fd_rdisp_t * disp );
     392             : 
     393             : void *
     394             : fd_rdisp_delete( void * mem );
     395             : 
     396             : FD_PROTOTYPES_END
     397             : 
     398             : #endif /* HEADER_fd_src_discof_replay_fd_rdisp_h */

Generated by: LCOV version 1.14