Line data Source code
1 : #ifndef HEADER_fd_src_discof_replay_fd_replay_tile_private_h
2 : #define HEADER_fd_src_discof_replay_fd_replay_tile_private_h
3 :
4 : #include "fd_replay_tile.h"
5 : #include "fd_vote_tracker.h"
6 : #include "../../disco/topo/fd_wksp_mon.h"
7 : #include "../../disco/store/fd_store.h"
8 : #include "../../disco/bundle/fd_bundle_crank.h"
9 : #include "../../disco/keyguard/fd_keyswitch.h"
10 : #include "../../discof/reasm/fd_reasm.h"
11 : #include "../../discof/replay/fd_sched.h"
12 : #include "../../flamenco/accdb/fd_accdb_admin.h"
13 : #include "../../flamenco/capture/fd_capture_ctx.h"
14 : #include "../../flamenco/genesis/fd_genesis_parse.h"
15 : #include "../../flamenco/leaders/fd_multi_epoch_leaders.h"
16 : #include "../../flamenco/progcache/fd_progcache.h"
17 : #include "../../flamenco/runtime/fd_bank.h"
18 : #include "../../flamenco/runtime/fd_txncache.h"
19 : #include "../../flamenco/runtime/tests/fd_dump_pb.h"
20 : #include <stdio.h>
21 :
/* fd_replay_in_link describes one incoming link consumed by the replay
   tile.  NOTE(review): fields follow the standard tile in-link dcache
   convention — confirm against the topology setup code. */
struct fd_replay_in_link {
  fd_wksp_t * mem;    /* workspace holding the link's dcache */
  ulong chunk0;       /* first valid chunk index into mem */
  ulong wmark;        /* last valid chunk index (watermark) */
  ulong mtu;          /* max fragment payload size, bytes */
};

typedef struct fd_replay_in_link fd_replay_in_link_t;
30 :
/* fd_replay_out_link describes one outgoing link produced by the
   replay tile.  NOTE(review): fields follow the standard tile out-link
   dcache convention — confirm against the topology setup code. */
struct fd_replay_out_link {
  ulong idx;          /* index of this link in the tile's out link set */
  fd_wksp_t * mem;    /* workspace holding the link's dcache */
  ulong chunk0;       /* first valid chunk index into mem */
  ulong wmark;        /* last valid chunk index (watermark) */
  ulong chunk;        /* next chunk to publish into */
};

typedef struct fd_replay_out_link fd_replay_out_link_t;
40 :
41 : /* fd_block_id_map is a simple map of block-ids to bank indices. The
42 : map sits on top of an array of fd_block_id_ele_t. This serves as a
43 : translation layer between block ids to bank indices. The data
44 : array is indexed by bank index and the latest observed merkle root
45 : for the bank index is stored in the array. Once the block id has
46 : been observed, the entry is keyed by the latest merkle root (aka the
47 : block id). */
48 :
/* One element of the block-id <-> bank-index translation array.  The
   array is indexed by bank index; once the block id is known the
   element is also reachable by key lookup (keyed on latest_mr). */
struct fd_block_id_ele {
  fd_hash_t latest_mr;       /* latest observed merkle root for this bank index; equals the
                                block id once the full block has been observed; map key */
  uint      latest_fec_idx;  /* index of the latest FEC set observed for this bank index
                                — TODO confirm exact semantics against the insert path */
  int       block_id_seen;   /* non-zero once the final block id has been observed */
  ulong     slot;            /* slot number associated with this bank index */
  ulong     next_;           /* internal chaining field used by fd_block_id_map; do not touch */
};
typedef struct fd_block_id_ele fd_block_id_ele_t;
57 :
/* Instantiate fd_block_id_map: a chained hash map over the caller's
   fd_block_id_ele_t array, keyed by latest_mr (the block id).  Keys
   compare by raw 32-byte equality and hash with fd_hash over the full
   key, salted with the map seed. */
#define MAP_NAME  fd_block_id_map
#define MAP_ELE_T fd_block_id_ele_t
#define MAP_KEY_T fd_hash_t
#define MAP_KEY   latest_mr
#define MAP_NEXT  next_
#define MAP_KEY_EQ(k0,k1) (!memcmp((k0),(k1), sizeof(fd_hash_t)))
#define MAP_KEY_HASH(key,seed) (fd_hash((seed),(key),sizeof(fd_hash_t)))
#include "../../util/tmpl/fd_map_chain.c"
66 :
/* fd_replay_tile_t is the full private state of the replay tile: the
   tile that replays blocks against the bank/accounts database, tracks
   the fork tree, handles leader transitions, and publishes roots.
   Field order is layout-significant; do not reorder. */
struct fd_replay_tile {
  fd_wksp_t * wksp;                /* workspace backing this tile's memory */

  uint rng_seed;                   /* seed used to initialize rng */
  fd_rng_t rng[ 1 ];               /* tile-local PRNG */

  fd_accdb_admin_t accdb_admin[1]; /* accounts database admin handle */
  fd_accdb_user_t accdb[1];        /* accounts database user handle */
  fd_progcache_join_t progcache[1];        /* program cache join */
  fd_wksp_mon_t progcache_wksp_mon[1];     /* usage monitor for the progcache workspace */
  fd_wksp_mon_t accdb_cache_wksp_mon[1];   /* usage monitor for the accdb cache workspace */

  fd_txncache_t * txncache;        /* transaction status cache (fork-aware) */
  fd_store_t * store;              /* shred/FEC store (fork-aware) */
  fd_banks_t * banks;              /* bank pool / fork tree of banks */
  ulong frontier_indices[ FD_BANKS_MAX_BANKS ]; /* scratch list of frontier bank indices */
  ulong frontier_cnt;              /* number of valid entries in frontier_indices */

  /* This flag is 1 if we have seen a vote signature that our node has
     sent out get rooted at least one time.  The value is 0 otherwise.
     We can't become leader and pack blocks until this flag has been
     set.  This parallels the Agave 'has_new_vote_been_rooted'. */
  int identity_vote_rooted;
  int wait_for_vote_to_start_leader; /* config: gate leadership on identity_vote_rooted */

  /* wfs_enabled is 1 if the validator is booted in
     wait_for_supermajority mode.  In this mode replay (and, by extension,
     downstream consumers) is not allowed to make progress until 80% of
     the cluster has published their ContactInfo in Gossip with a
     shred version matching expected_shred_version.  When this happens,
     wfs_complete will be set to 1. */
  int wfs_enabled;
  int wfs_complete;

  fd_hash_t expected_bank_hash;    /* expected bank hash at boot — TODO confirm: presumably
                                      used with wait_for_supermajority to verify the snapshot slot */

  ulong reasm_seed;                /* seed for the FEC reassembler */
  fd_reasm_t * reasm;              /* FEC set reassembler */
  fd_reasm_fec_t * reasm_evicted;  /* evicted FEC by reasm_insert must be stored in returnable_frag, and then drained in after_credit */

  fd_sched_t * sched;              /* transaction scheduler / dispatcher */
  ulong in_cnt;                    /* number of in links actually in use */
  ulong execrp_idle_cnt;           /* count of idle exec pipeline consumers — TODO confirm */

  ulong vote_tracker_seed;         /* seed for vote_tracker */
  fd_vote_tracker_t * vote_tracker; /* tracks vote signatures we have sent (see identity_vote_rooted) */

  /* Genesis configuration, used when booting from genesis rather than
     from a snapshot. */
  int has_genesis_hash;            /* non-zero if genesis_hash is valid */
  char genesis_path[ PATH_MAX ];   /* filesystem path to the genesis file */
  fd_hash_t genesis_hash[1];       /* hash of the genesis config */
  fd_genesis_t genesis[1];         /* parsed genesis config */
  ulong cluster_type;              /* cluster type derived from genesis — TODO confirm source */

  int has_genesis_timestamp;       /* non-zero if genesis_timestamp is valid */
  ulong genesis_timestamp;
  int has_expected_genesis_timestamp; /* non-zero if expected_genesis_timestamp is valid */
  ulong expected_genesis_timestamp;

  ulong hard_fork_cnt;             /* number of valid entries in hard_forks */
  fd_hard_fork_t hard_forks[ FD_HARD_FORKS_MAX ];

  ushort expected_shred_version;   /* shred version we expect the cluster to run */
  ushort ipecho_shred_version;     /* shred version learned via ipecho — TODO confirm */

  /* A note on publishing ...

     The watermarks are used to publish our fork-aware structures.  For
     example, store, banks, and txncache need to be published to release
     resources occupied by rooted or dead blocks.  In general,
     publishing has the effect of pruning forks in those structures,
     indicating that it is ok to release the memory being occupied by
     the blocks on said forks.  Tower is responsible for informing us of
     the latest block on the consensus rooted fork.  As soon as we can,
     we should move the published root as close as possible to the
     latest consensus root, publishing/pruning everything on the fork
     tree along the way.  That is, all the blocks that directly descend
     from the current published root (inclusive) to the new published
     root (exclusive) on the rooted fork, as well as all the minority
     forks that branch from said blocks.

     Ideally, we'd move the published root to the consensus root
     immediately upon receiving a new consensus root.  However, that's
     not always safe to do.  One thing we need to be careful about is
     making sure that there are no more users/consumers of
     soon-to-be-pruned blocks, lest a use-after-free occurs.  This can
     be done by using a reference counter for each block.  Any
     concurrent activity, such as transaction execution in the exec
     tiles, should retain a refcnt on the block for as
     long as it needs access to the shared fork-aware structures related
     to that block.  Eventually, refcnt on a given block will drop down
     to 0 as the block either finishes replaying or gets marked as dead,
     and any other tile that has retained a refcnt on the block releases
     it.  At that point, it becomes a candidate for pruning.  The key to
     safe publishing then becomes figuring out how far we could advance
     the published root, such that every minority fork branching off of
     blocks in between the current published root (inclusive) and the
     new published root (exclusive) is safe to be pruned.  This is a
     straightforward tree traversal, where if a block B on the rooted
     fork has refcnt 0, and all minority forks branching off of B also
     have refcnt 0, then B is safe to be pruned.  We advance the
     published root to the farthest consecutively prunable block on the
     rooted fork.  Note that reasm presents the replay tile with a clean
     view of the world where every block is chained off of a parent
     block.  So there are no orphaned/dangling tree nodes to worry
     about.  The world is a nice single tree as far as replay is
     concerned.

     In the following fork tree, every node is a block and the number in
     parentheses is the refcnt on the block.  The chain marked with
     double slashes is the rooted fork.  Suppose the published root is
     at block P, and consensus root is at block T.  We can't publish
     past block P because Q has refcnt 1.


                        P(0)
                       /    \\
                   Q(1)      A(0)
                  /         ||   \
               X(0)       B(0)    C(0)
              /          ||    \
           Y(0)        M(0)     R(0)
                      /    ||    /   \
                   D(2)  T(0)  J(0)   L(0)
                           ||
                           ..
                           ..
                           ..
                           ||
             blocks we might be actively replaying


     When refcnt on Q drops to 0, we would be able to advance the
     published root to block M, because blocks P, A, and B, as well as
     all subtrees branching off of them, have refcnt 0, and therefore
     can be pruned.  Block M itself cannot be pruned yet because its
     child block D has refcnt 2.  After publishing/pruning, the fork
     tree would be:


                   M(0)
                  /    ||
               D(2)   T(0)
                       ||
                       ..
                       ..
                       ..
                       ||
             blocks we might be actively replaying


     As a result, the shared fork-aware structures can free resources
     for blocks P, A, B, and all subtrees branching off of them.

     For the reference counting part, the replay tile is the sole entity
     that can update the refcnt.  This ensures that all refcnt increment
     and decrement attempts are serialized at the replay tile, and that
     there are no racy resurrection of a soon-to-be-pruned block.  If a
     refcnt increment request arrives after a block has been pruned,
     replay simply rejects the request.

     A note on the implementation of the above ...

     Upon receiving a new consensus root, we descend down the rooted
     fork from the current published root to the new consensus root.  On
     each node/block of the rooted fork, we do a summation of the refcnt
     on the block and all the minority fork blocks branching from the
     block.  If the summation is 0, the block is safe for pruning.  We
     advance the published root to the far end of the consecutive run of
     0 refcnt sums originating from the current published root.  On our
     descent down the minority forks, we also mark any block that hasn't
     finished replaying as dead, so we don't waste time executing them.
     No more transactions shall be dispatched for execution from dead
     blocks.

     Blocks start out with a refcnt of 0.  Other tiles may send a
     request to the replay tile for a reference on a block.  The
     transaction dispatcher is another source of refcnt updates.  On
     every dispatch of a transaction for block B, we increment the
     refcnt for B.  And on every transaction finalization, we decrement
     the refcnt for B.  This means that whenever the refcnt on a block
     is 0, there is no more reference on that block from the execution
     pipeline.  While it might be tempting to simply increment the
     refcnt once when we start replaying a block, and decrement the
     refcnt once when we finish a block, this more fine-grained refcnt
     update strategy allows for aborting and potentially immediate
     pruning of blocks under interleaved block replay.  Upon receiving a
     new consensus root, we can simply look at the refcnt on minority
     fork blocks, and a refcnt of 0 would imply that the block is safe
     for pruning, even if we haven't finished replaying it.  Without the
     fine-grained refcnt, we would need to first stop dispatching from
     the aborted block, and then wait for a full drain of the execution
     pipeline to know for sure that there are no more in-flight
     transactions executing on the aborted block.  Note that this will
     allow the refcnt on any block to transiently drop down to 0.  We
     will not mistakenly prune an actively replaying block, aka a leaf
     node, that is chaining off of the rooted fork, because the
     consensus root is always an ancestor of the actively replaying tip.
     */
  fd_hash_t consensus_root;          /* The most recent block to have reached max lockout in the tower. */
  ulong     consensus_root_slot;     /* slot number of the above. */
  ulong     consensus_root_bank_idx; /* bank index of the above. */
  ulong     published_root_slot;     /* slot number of the published root. */
  ulong     published_root_bank_idx; /* bank index of the published root. */

  /* Randomly generated block id for the initial genesis/snapshot slot.
     To be replaced with block id in the snapshot manifest when SIMD-333
     is activated. */

  fd_hash_t initial_block_id;

  /* We need to maintain a tile-local mapping of block-ids to bank index
     and vice versa.  This translation layer is needed for conversion
     since tower operates on block-ids and downstream consumers of FEC
     sets operate on bank indices.  This mapping must happen both ways:
     1. tower sends us block ids and we must map them to bank indices.
     2. when a block is completed, we must map the bank index to a block
        id to send a slot complete message to tower. */
  ulong               block_id_len;      /* capacity of block_id_arr */
  fd_block_id_ele_t * block_id_arr;      /* indexed by bank index */
  ulong               block_id_map_seed; /* hash seed for block_id_map */
  fd_block_id_map_t * block_id_map;      /* block id -> element (see fd_block_id_map above) */

  /* Capture-related configs */
  fd_capture_ctx_t * capture_ctx;
  FILE *             capture_file;
  fd_capture_link_buf_t cap_repl_out[1];

  /* Protobuf dumping context for debugging runtime execution and
     collecting seed corpora. */
  fd_dump_proto_ctx_t * dump_proto_ctx;

  /* Whether the runtime has been booted either from snapshot loading
     or from genesis. */
  int is_booted;

  /* Multi-epoch leader schedule used to look up upcoming leader slots.
     NOTE(review): the previous comment here ("Buffer to store vote
     towers that need to be published to the Tower tile") appears stale
     and unrelated to this field — verify and remove/relocate. */

  fd_multi_epoch_leaders_t * mleaders;

  int larger_max_cost_per_block;   /* config flag — TODO confirm: presumably raises the
                                      block cost limit when set */

  /* When we transition to becoming leader, we can only unbecome the
     leader if we have received a block id from the FEC reassembler, and
     a message from PoH that the leader slot has ended.  After both of
     these conditions are met, then we are free to unbecome the leader.
     */
  uint is_leader : 1;              /* currently leader */
  uint supports_leader : 1;        /* this node can be a leader — TODO confirm */
  int recv_poh;                    /* non-zero once PoH signaled the leader slot ended */
  ulong next_leader_slot;          /* next slot in which we are leader */
  long next_leader_tickcount;      /* tickcount at which next_leader_slot begins */
  ulong highwater_leader_slot;     /* highest leader slot we have started — TODO confirm */
  ulong reset_slot;                /* slot of the bank PoH/leader pipeline is reset onto */
  fd_bank_t * reset_bank;          /* bank corresponding to reset_slot */
  fd_hash_t reset_block_id;        /* block id corresponding to reset_slot */
  long reset_timestamp_nanos;      /* wallclock time of the last reset */
  double slot_duration_nanos;      /* target slot duration, nanoseconds */
  double slot_duration_ticks;      /* target slot duration, tickcounts */
  fd_bank_t * leader_bank;         /* bank for our in-progress leader slot, if any */

  fd_pubkey_t identity_pubkey[1];  /* this validator's identity key */
  ulong identity_idx;              /* bank index associated with identity — TODO confirm */

  fd_keyswitch_t * keyswitch;      /* hot identity-key switchover state */
  int halt_leader;                 /* request to stop leading — TODO confirm semantics */

  ulong resolv_tile_cnt;           /* number of resolv tiles in the topology */

  int                 in_kind[ 128 ]; /* per-in-link kind discriminator; parallel to in[] */
  fd_replay_in_link_t in[ 128 ];      /* per-in-link dcache bounds */

  fd_replay_out_link_t exec_out[ 1 ];  /* link to the exec tiles */

  fd_replay_out_link_t replay_out[1];  /* main replay output link */

  fd_replay_out_link_t epoch_out[1];   /* epoch boundary output link */

  /* The rpc tile needs to occasionally own a reference to a live bank.
     Replay needs to know if the rpc as a consumer is enabled so it can
     increment the bank's refcnt before publishing bank_idx. */
  int rpc_enabled;

  /* For dumping blocks to protobuf.  For backtest only. */
  fd_block_dump_ctx_t * block_dump_ctx;

  /* We need a few pieces of information to compute the right addresses
     for bundle crank information that we need to send to pack. */
  struct {
    int                   enabled;      /* bundles enabled in config */
    fd_pubkey_t           vote_account; /* this validator's vote account */
    fd_bundle_crank_gen_t gen[1];       /* bundle crank address generator */
  } bundle;

  /* Tile-local metrics, flushed to the metrics system. */
  struct {
    ulong store_query_acquire;        /* store lock acquire count — TODO confirm */
    ulong store_query_release;        /* store lock release count — TODO confirm */
    fd_histf_t store_query_wait[1];   /* histogram: time waiting on store queries */
    fd_histf_t store_query_work[1];   /* histogram: time doing store query work */
    ulong store_query_cnt;            /* total store queries issued */
    ulong store_query_missing_cnt;    /* store queries that found nothing */
    ulong store_query_mr;             /* merkle-root store queries — TODO confirm */
    ulong store_query_missing_mr;     /* merkle-root store queries that found nothing */

    ulong slots_total;                /* slots replayed since boot */
    ulong transactions_total;         /* transactions replayed since boot */

    ulong reasm_latest_slot;          /* latest slot seen from reasm */
    ulong reasm_latest_fec_idx;       /* latest FEC index seen from reasm */

    /* Stall-reason counters for the replay loop. */
    ulong sched_full;
    ulong reasm_empty;
    ulong leader_bid_wait;
    ulong banks_full;
    ulong storage_root_behind;

    fd_histf_t root_slot_dur[1];      /* histogram: time to advance the root by a slot */
    fd_histf_t root_account_dur[1];   /* histogram: per-account root work — TODO confirm */
  } metrics;

  uchar __attribute__((aligned(FD_MULTI_EPOCH_LEADERS_ALIGN))) mleaders_mem[ FD_MULTI_EPOCH_LEADERS_FOOTPRINT ]; /* backing storage for mleaders */

  ulong runtime_stack_seed;          /* seed for runtime_stack */
  fd_runtime_stack_t * runtime_stack; /* runtime execution stack state */
};

typedef struct fd_replay_tile fd_replay_tile_t;
394 :
395 : #endif /* HEADER_fd_src_discof_replay_fd_replay_tile_private_h */
|