Line data Source code
1 : #ifndef HEADER_fd_src_discof_tower_fd_hfork_h 2 : #define HEADER_fd_src_discof_tower_fd_hfork_h 3 : 4 : /* fd_hfork presents an API for detecting hard forks by monitoring 5 : votes from validators and comparing their bank hashes against ours. 6 : 7 : The Solana chain occasionally forks for what we will call soft and 8 : hard reasons. 9 : 10 : - Soft forks occur due to network latency and the distributed 11 : nature of the network. Different validators may see different 12 : blocks at different times, and so may disagree on ledger state 13 : temporarily until the network converges. This is expected and 14 : normal. 15 : 16 : - Hard forks occur when validators have a disagreement about the 17 : rules of the protocol and come to different conclusions about the 18 : permanent state of the ledger. This is not expected or normal. 19 : 20 : For Firedancer, it is useful to be able to detect when any hard fork 21 : has occurred, because it means there was likely some consensus bug 22 : that caused us to diverge from Agave. What we check is actually 23 : broader: it is possible that we disagreed about the result of some 24 : block, but did not diverge from Agave as the block we disagreed on 25 : did not become canonical. We still want to detect this case, as it 26 : indicates a consensus bug even if we did not ultimately diverge. 27 : 28 : We detect hard forked blocks by monitoring votes from validators. The 29 : specific criteria are: 30 : 31 : - 52% of stake has voted on a block_id and agreed on a bank_hash 32 : which is different than the bank_hash we have for that block_id. 33 : 34 : - 52% of stake has voted on a block_id and agreed on a bank_hash 35 : when we marked the block dead because it failed to execute. 36 : 37 : Interestingly we do not need to handle the case where we thought a 38 : block succeeded, but the rest of the cluster failed it, because this 39 : does not cause a hard fork. 
We simply switch to the canonical fork 40 : the rest of the cluster will converge on. It is not really possible 41 : to detect this case, because other nodes do not report that the block 42 : failed; they simply vote somewhere else. 43 : 44 : We are essentially receiving a 45 : 46 : Stream<Either<(VoteAccountPubkey, BlockId, BankHash), EpochStakes>> 47 : 48 : The first variant is a vote from some validator for a certain 49 : block_id and bank_hash. This could come from either a replayed 50 : block or gossip. It does not matter where it comes from, or even if 51 : the source is valid (e.g. it could come from a block which gets 52 : marked dead because it did not validate); all that matters is that the 53 : vote is validly signed by the vote account. 54 : 55 : Internally, we maintain a Map<BlockId, Map<BankHash, StakeWeight>>. 56 : Any time we receive a vote, if it causes an entry in this map to 57 : exceed 52% of the total stake weight, or it is from our own validator 58 : identity, we check if our vote is different, and potentially raise a 59 : warning or error. 60 : 61 : The last max_live_slots votes for each vote account are kept in a 62 : ring buffer and replaced as new votes come in, to prevent unbounded 63 : memory usage. This makes the structure somewhat heuristic: it might 64 : be that if some nodes are very far ahead, and some behind, we might 65 : evict old votes for those ahead and never see a fork exceed 52% in 66 : our window. This is unlikely to happen in practice, and even if it 67 : does, it only means we might miss detecting a hard fork, which is not 68 : catastrophic. The default behavior anyway should be to continue 69 : running on the forked chain. 
*/ 70 : /* NOTE(review): the include guard is named for src/discof/tower, while the relative includes below reach tower/ through the parent directory, which suggests this header lives elsewhere -- confirm the guard matches the actual path. */ 71 : #include "../fd_choreo_base.h" 72 : #include "../tower/fd_tower_voters.h" 73 : /* fd_hfork_t is only forward declared here; its layout is not visible to users of this header. */ 74 : struct fd_hfork; 75 : typedef struct fd_hfork fd_hfork_t; 76 : /* fd_hfork_blk_t is the record kept per block id. It holds our own replay outcome (our_bank_hash, replayed, dead), the match / mismatch state (flag), and the bank hashes that have been reported for this block id (bhm_cnt, bhm_dlist). prev and next are linkage fields for blk_map and the pool. fd_hfork_record_our_bank_hash returns a pointer to one of these. */ 77 : struct fd_hfork_blk { 78 : fd_hash_t block_id; /* blk_map key */ 79 : ulong prev; /* blk_map prev */ 80 : ulong next; /* pool next / blk_map next */ 81 : fd_hash_t our_bank_hash; /* our bank hash for this block id */ 82 : int replayed; /* whether we've replayed this block */ 83 : int dead; /* whether we marked this block as dead */ 84 : int flag; /* -1: mismatch, 0: not compared yet, 1: match */ 85 : ulong bhm_cnt; /* number of competing bank hashes for this block id */ 86 : void * bhm_dlist; /* dlist of bank hash objects for this block id */ 87 : }; 88 : typedef struct fd_hfork_blk fd_hfork_blk_t; 89 : 90 : FD_PROTOTYPES_BEGIN 91 : 92 : /* fd_hfork_{align,footprint} return the required alignment and 93 : footprint of a memory region suitable for use as a hfork. footprint 94 : takes per_vtr_max and vtr_max (presumably the number of votes retained per voter and the maximum number of tracked voters -- TODO confirm against the implementation). NOTE(review): this comment previously said the functions return fd_hfork_ALIGN and fd_hfork_FOOTPRINT; neither macro is declared in this header, and footprint is parameterized. */ 95 : 96 : FD_FN_CONST ulong 97 : fd_hfork_align( void ); 98 : 99 : FD_FN_CONST ulong 100 : fd_hfork_footprint( ulong per_vtr_max, 101 : ulong vtr_max ); 102 : 103 : /* fd_hfork_new formats an unused memory region for use as a hfork. mem 104 : is a non-NULL pointer to this region in the local address space with 105 : the required footprint and alignment. per_vtr_max and vtr_max are the sizing parameters also taken by fd_hfork_footprint (presumably they must match the values used to size mem -- TODO confirm). seed is presumably a hash seed for the internal maps -- TODO confirm. Returns a void *; by analogy with fd_hfork_delete this is presumably mem on success and NULL on failure -- TODO confirm. */ 106 : 107 : void * 108 : fd_hfork_new( void * mem, 109 : ulong per_vtr_max, 110 : ulong vtr_max, 111 : ulong seed ); 112 : 113 : /* fd_hfork_join joins the caller to the hfork. hfork points to the 114 : first byte of the memory region backing the hfork in the caller's 115 : address space. 116 : 117 : Returns a pointer in the local address space to hfork on success. */ 118 : 119 : fd_hfork_t * 120 : fd_hfork_join( void * hfork ); 121 : 122 : /* fd_hfork_leave leaves a current local join. Returns a pointer to the 123 : underlying shared memory region on success and NULL on failure (logs 124 : details). 
Reasons for failure include hfork is NULL. */ 125 : 126 : void * 127 : fd_hfork_leave( fd_hfork_t const * hfork ); 128 : 129 : /* fd_hfork_delete unformats a memory region used as a hfork. Assumes 130 : only the local process is joined to the region. Returns a pointer to 131 : the underlying shared memory region or NULL if used obviously in 132 : error (e.g. hfork is obviously not a hfork ... logs details). The 133 : ownership of the memory region is transferred to the caller. */ 134 : 135 : void * 136 : fd_hfork_delete( void * hfork ); 137 : 138 : /* fd_hfork_count_vote return codes. Per the comment on each code below, the vote is counted for FD_HFORK_SUCCESS_MATCHED, FD_HFORK_SUCCESS and FD_HFORK_ERR_MISMATCHED, and is not counted for the remaining FD_HFORK_ERR_* codes. A negative return therefore does not by itself mean the vote was dropped. */ 139 : 140 0 : #define FD_HFORK_SUCCESS_MATCHED ( 1) /* vote counted, bank hash matches ours */ 141 0 : #define FD_HFORK_SUCCESS ( 0) /* vote counted, bank hash not yet compared against ours */ 142 0 : #define FD_HFORK_ERR_MISMATCHED (-1) /* vote counted, bank hash does not match ours */ 143 0 : #define FD_HFORK_ERR_UNKNOWN_VTR (-2) /* voter not in vtr_map */ 144 3 : #define FD_HFORK_ERR_ALREADY_VOTED (-3) /* voter already voted for this block_id */ 145 0 : #define FD_HFORK_ERR_VOTE_TOO_OLD (-4) /* vote slot not newer than previous */ 146 : 147 : /* fd_hfork_count_vote updates the hard fork detector with a newly 148 : observed vote. Returns FD_HFORK_SUCCESS_MATCHED if the bank hash 149 : matches ours, FD_HFORK_SUCCESS if counted but not yet compared, 150 : FD_HFORK_ERR_MISMATCHED if the bank hash does not match ours (the vote is still counted in this case), or one of the other 151 : negative FD_HFORK_ERR_* codes if the vote was not counted. vote_acc identifies the voter; block_id and bank_hash are what the vote attests to; slot is the vote's slot (FD_HFORK_ERR_VOTE_TOO_OLD is returned when it is not newer than the previous one). stake and total_stake are presumably the voter's stake and the total stake against which the 52% threshold is evaluated -- TODO confirm against the caller. */ 152 : 153 : int 154 : fd_hfork_count_vote( fd_hfork_t * hfork, 155 : fd_pubkey_t const * vote_acc, 156 : fd_hash_t const * block_id, 157 : fd_hash_t const * bank_hash, 158 : ulong slot, 159 : ulong stake, 160 : ulong total_stake ); 161 : 162 : /* fd_hfork_record_our_bank_hash updates the hard fork detector with our 163 : bank hash (computed on replay) for a given block ID. If bank_hash is 164 : NULL, this indicates the block was marked dead during replay and we 165 : did not think it was valid. 
Always returns a pointer to the 166 : associated fd_hfork_blk_t. The caller should inspect blk->flag to 167 : determine the outcome: 1 (match), -1 (mismatch), or 0 (not yet 168 : compared). */ 169 : 170 : fd_hfork_blk_t * 171 : fd_hfork_record_our_bank_hash( fd_hfork_t * hfork, 172 : fd_hash_t const * block_id, 173 : fd_hash_t const * bank_hash, 174 : ulong total_stake ); 175 : /* NOTE(review): total_stake is not described in the comment on fd_hfork_record_our_bank_hash; presumably it is the same total used for the 52% threshold in fd_hfork_count_vote -- confirm. */ 176 : /* fd_hfork_update_voters updates the set of voters tracked by the hard 177 : fork detector. Voters not in tower_voters are removed along with all 178 : their vote entries. New voters are added. This should be called on 179 : each epoch boundary when the stake-weighted voter set changes. tower_voters is const-qualified and so is only read. Presumably votes from accounts outside this set are what fd_hfork_count_vote rejects with FD_HFORK_ERR_UNKNOWN_VTR -- TODO confirm. */ 180 : 181 : void 182 : fd_hfork_update_voters( fd_hfork_t * hfork, 183 : fd_tower_voters_t const * tower_voters ); 184 : 185 : FD_PROTOTYPES_END 186 : 187 : #endif /* HEADER_fd_src_discof_tower_fd_hfork_h */