Line data Source code
1 : #ifndef HEADER_fd_src_discof_replay_fd_sched_h
2 : #define HEADER_fd_src_discof_replay_fd_sched_h
3 :
4 : #include "fd_rdisp.h"
5 : #include "../../disco/store/fd_store.h" /* for fd_store_fec_t */
6 : #include "../../disco/pack/fd_microblock.h" /* for fd_txn_p_t */
7 :
8 : #include "../../flamenco/accdb/fd_accdb_user.h"
9 : #include "../../util/spad/fd_spad.h" /* for ALUTs */
10 :
11 : /* fd_sched wraps all the smarts and mechanical chores around scheduling
12 : transactions for replay execution. It is built on top of the
13 : dispatcher fd_rdisp. The dispatcher is responsible for high
14 : performance lane-based scheduling of transactions. On top of that,
15 : we add fork-aware management of lanes, and policies regarding which
16 : lanes to prioritize for execution.
17 :
18 : Conceptually, transactions in a block form a DAG. We would like to
19 : make our way through a block with a sufficient degree of parallelism,
20 : such that the execution time of the critical path of the DAG is the
21 : limiting factor. The dispatcher does a good job of surfacing the
22 : critical path of the DAG on the fly. Blocks are tracked by the
23 : dispatcher either as a block staged on a lane, or as an unstaged
24 : block. When a block is staged, it will enjoy the most intelligent
25 : online scheduling that the dispatcher has to offer. Lanes have to
26 : consist of linear chains of blocks down a fork. So to map a fork
27 : tree to lanes, we will need multiple lanes. Ideally, every branch in
28 : the fork tree sits on some lane. However, memory footprint limits us
29 : to a small number of lanes.
30 :
31 : This module implements a state machine for ensuring that blocks enter
32 : into and exit out of lanes in an orderly fashion. The public APIs of
33 : this module are invoked to drive state transitions on a small number
34 : of events, such as new transactions arriving, or transactions
35 : completing, or a block being aborted/abandoned. We also implement
36 : policies for deciding which blocks get staged onto lanes, or evicted
37 : from lanes, as well as which lanes to prioritize for execution.
38 :
39 :
40 : The general order in which calls happen under the normal case is:
41 :
42 : fd_sched_fec_ingest()* ... fd_sched_task_next_ready()* ... fd_sched_task_done()* ...
43 : more ingest, more ready, more done ...
44 : ...
45 : fd_sched_task_next_ready() indicates that the last transaction in the block is being scheduled
46 : fd_sched_task_done()*
47 : fd_sched_block_is_done()
48 : end-of-block processing in caller
49 : fd_sched_task_next_ready() starts returning transactions from the next block
50 : more ingest, more ready, more done ...
51 : ... */
52 :
53 : #define FD_SCHED_MIN_DEPTH 478
54 : #define FD_SCHED_MAX_DEPTH FD_RDISP_MAX_DEPTH
55 :
56 : struct fd_sched;
57 : typedef struct fd_sched fd_sched_t;
58 :
/* Context needed to resolve address lookup tables (ALUTs) when
   expanding transactions.  NOTE(review): exact lookup semantics live in
   the implementation -- confirm there. */
59 : struct fd_sched_alut_ctx {
60 : fd_accdb_user_t accdb[1]; /* Account database handle used for lookups. */
61 : fd_funk_txn_xid_t xid[1]; /* Funk transaction id to perform lookups against -- TODO confirm. */
62 : ulong els; /* Effective lookup slot. */
63 : };
64 : typedef struct fd_sched_alut_ctx fd_sched_alut_ctx_t;
65 :
/* fd_sched_fec_t describes one FEC set delivered to the scheduler via
   fd_sched_fec_ingest(), along with the block/fork coordinates needed
   to place it in the fork tree and the ALUT context needed to expand
   its transactions. */
66 : struct fd_sched_fec {
67 : ulong bank_idx; /* Index of the block. Assumed to be in [0, block_cnt_max). Caller
68 : is responsible for ensuring that bank idx is in bounds and unique
69 : across equivocated blocks. */
70 : ulong parent_bank_idx; /* Index of the parent block. Assumed to be in [0, block_cnt_max).
71 : Caller is responsible for ensuring that parent bank idx is in
72 : bounds and unique across equivocated blocks. */
73 : ulong slot; /* Slot number of the block. */
74 : ulong parent_slot; /* Slot number of the parent block. */
75 : fd_store_fec_t * fec; /* FEC set data. */
76 : uint shred_cnt; /* Number of shreds in the FEC set. */
77 : uint is_last_in_batch:1; /* Set if this is the last FEC set in the batch; relevant because the
78 : parser should ignore trailing bytes at the end of a batch. */
79 : uint is_last_in_block:1; /* Set if this is the last FEC set in the block. */
80 : uint is_first_in_block:1; /* Set if this is the first FEC set in the block. Bank should increment refcnt for sched if such a FEC set has been ingested by sched. */
81 :
82 : fd_sched_alut_ctx_t alut_ctx[ 1 ]; /* ALUT resolution context for transactions in this FEC set. */
83 : };
84 : typedef struct fd_sched_fec fd_sched_fec_t;
85 :
86 : /* The state of a transaction. Non mutually exclusive. */
87 0 : #define FD_SCHED_TXN_EXEC_DONE (0x0001UL)
88 0 : #define FD_SCHED_TXN_SIGVERIFY_DONE (0x0002UL)
89 0 : #define FD_SCHED_TXN_IS_COMMITTABLE (0x0004UL)
90 0 : #define FD_SCHED_TXN_IS_FEES_ONLY (0x0008UL)
91 : #define FD_SCHED_TXN_REPLAY_DONE (FD_SCHED_TXN_EXEC_DONE|FD_SCHED_TXN_SIGVERIFY_DONE)
92 :
/* Per-transaction replay state and timing telemetry.  Tick fields are
   presumably fd_tickcount()-style timestamps -- TODO confirm. */
93 : struct fd_sched_txn_info {
94 : ulong flags; /* Bitwise OR of the FD_SCHED_TXN_* flags above. */
95 : int txn_err; /* Transaction execution result code -- presumably set when exec completes; confirm in impl. */
96 : long tick_parsed; /* Tick at which the txn was parsed out of FEC set data. */
97 : long tick_sigverify_disp; /* Tick at which sigverify was dispatched. */
98 : long tick_sigverify_done; /* Tick at which sigverify completed. */
99 : long tick_exec_disp; /* Tick at which execution was dispatched. */
100 : long tick_exec_done; /* Tick at which execution completed. */
101 : };
102 : typedef struct fd_sched_txn_info fd_sched_txn_info_t;
103 :
104 : /* The scheduler may return one of the following types of tasks for the
105 : replay tile.
106 :
107 : e - passed down to exec tiles.
108 : i - replay completes the task immediately.
109 : q - replay may either do it immediately or queue the task up. */
110 0 : #define FD_SCHED_TT_NULL (0UL)
111 0 : #define FD_SCHED_TT_BLOCK_START (1UL) /* (i) Start-of-block processing. */
112 0 : #define FD_SCHED_TT_BLOCK_END (2UL) /* (q) End-of-block processing. */
113 0 : #define FD_SCHED_TT_TXN_EXEC (3UL) /* (e) Transaction execution. */
114 0 : #define FD_SCHED_TT_TXN_SIGVERIFY (4UL) /* (e) Transaction sigverify. */
115 : #define FD_SCHED_TT_LTHASH (5UL) /* (e) Account lthash. */
116 0 : #define FD_SCHED_TT_POH_HASH (6UL) /* (e) PoH hashing. */
117 0 : #define FD_SCHED_TT_MARK_DEAD (7UL) /* (i) Mark the block dead. */
118 :
/* Payload for FD_SCHED_TT_BLOCK_START: start-of-block processing,
   completed immediately by the replay tile. */
119 : struct fd_sched_block_start {
120 : ulong bank_idx; /* Same as in fd_sched_fec_t. */
121 : ulong parent_bank_idx; /* Same as in fd_sched_fec_t. */
122 : ulong slot; /* Slot number of the block. */
123 : };
124 : typedef struct fd_sched_block_start fd_sched_block_start_t;
125 :
/* Payload for FD_SCHED_TT_BLOCK_END: end-of-block processing, done
   immediately or queued by the replay tile. */
126 : struct fd_sched_block_end {
127 : ulong bank_idx; /* Index of the block to finalize; same as in fd_sched_fec_t. */
128 : };
129 : typedef struct fd_sched_block_end fd_sched_block_end_t;
130 :
/* Payload for FD_SCHED_TT_TXN_EXEC: execute one transaction on an exec
   tile. */
131 : struct fd_sched_txn_exec {
132 : ulong bank_idx; /* Same as in fd_sched_fec_t. */
133 : ulong slot; /* Slot number of the block. */
134 : ulong txn_idx; /* Scheduler transaction index; resolvable via fd_sched_get_txn()/fd_sched_get_txn_info(). */
135 : ulong exec_idx; /* Index of the exec tile the task is destined for -- TODO confirm. */
136 : };
137 : typedef struct fd_sched_txn_exec fd_sched_txn_exec_t;
138 :
/* Payload for FD_SCHED_TT_TXN_SIGVERIFY: verify one transaction's
   signatures on an exec tile; off the critical path of execution. */
139 : struct fd_sched_txn_sigverify {
140 : ulong bank_idx; /* Same as in fd_sched_fec_t. */
141 : ulong txn_idx; /* Scheduler transaction index; resolvable via fd_sched_get_txn(). */
142 : ulong exec_idx; /* Index of the exec tile the task is destined for -- TODO confirm. */
143 : };
144 : typedef struct fd_sched_txn_sigverify fd_sched_txn_sigverify_t;
145 :
/* Payload for FD_SCHED_TT_POH_HASH: PoH hashing work dispatched to an
   exec tile. */
146 : struct fd_sched_poh_hash {
147 : ulong bank_idx; /* Same as in fd_sched_fec_t. */
148 : ulong mblk_idx; /* Presumably the microblock index within the block -- confirm in impl. */
149 : ulong exec_idx; /* Index of the exec tile the task is destined for -- TODO confirm. */
150 : ulong hashcnt; /* Presumably the number of hashes to perform -- confirm in impl. */
151 : fd_hash_t hash[ 1 ]; /* Hash state for this task -- NOTE(review): confirm in/out semantics. */
152 : };
153 : typedef struct fd_sched_poh_hash fd_sched_poh_hash_t;
154 :
/* Payload for FD_SCHED_TT_MARK_DEAD: mark the block dead, completed
   immediately by the replay tile. */
155 : struct fd_sched_mark_dead {
156 : ulong bank_idx; /* Index of the block to mark dead. */
157 : };
158 : typedef struct fd_sched_mark_dead fd_sched_mark_dead_t;
159 :
/* fd_sched_task_t is a tagged union describing one unit of work
   returned by fd_sched_task_next_ready().  task_type selects which
   union member is active. */
160 : struct fd_sched_task {
161 : ulong task_type; /* Set to one of the task types defined above. */
162 : union {
163 : fd_sched_block_start_t block_start[ 1 ]; /* FD_SCHED_TT_BLOCK_START */
164 : fd_sched_block_end_t block_end[ 1 ]; /* FD_SCHED_TT_BLOCK_END */
165 : fd_sched_txn_exec_t txn_exec[ 1 ]; /* FD_SCHED_TT_TXN_EXEC */
166 : fd_sched_txn_sigverify_t txn_sigverify[ 1 ]; /* FD_SCHED_TT_TXN_SIGVERIFY */
167 : fd_sched_poh_hash_t poh_hash[ 1 ]; /* FD_SCHED_TT_POH_HASH */
168 : fd_sched_mark_dead_t mark_dead[ 1 ]; /* FD_SCHED_TT_MARK_DEAD */
169 : };
170 : };
171 : typedef struct fd_sched_task fd_sched_task_t;
172 :
173 : FD_PROTOTYPES_BEGIN
174 :
175 : /* fd_sched_{align,footprint} return the required alignment and
176 : footprint in bytes for a region of memory to be used as a scheduler.
177 : footprint silently returns 0 if params are invalid (thus convenient
178 : to validate params).
179 :
180 : depth controls the reorder buffer transaction count (~1 million
181 : recommended for live replay, ~10k recommended for async replay).
182 : block_cnt_max is the maximum number of blocks that will be tracked by
183 : the scheduler. */
184 :
185 : ulong
186 : fd_sched_align( void );
187 :
188 : ulong
189 : fd_sched_footprint( ulong depth, /* in [FD_SCHED_MIN_DEPTH,FD_SCHED_MAX_DEPTH] */
190 : ulong block_cnt_max ); /* >= 1 */
191 :
192 : /* fd_sched_new creates a sched object backed by the given memory region
193 : (conforming to align() and footprint()). rng and exec_cnt are
194 : presumably the RNG consumed by scheduling policy and the number of
195 : exec tiles that will service tasks -- confirm against the
196 : implementation. Returns NULL if any parameter is invalid. */
195 :
196 : void *
197 : fd_sched_new( void * mem,
198 : fd_rng_t * rng,
199 : ulong depth,
200 : ulong block_cnt_max,
201 : ulong exec_cnt );
202 :
203 : fd_sched_t *
204 : fd_sched_join( void * mem );
205 :
206 : /* Add the data in the FEC set to the scheduler. If is_last_in_block is 1,
207 : then this is the last FEC set in the block. Transactions may span
208 : FEC set boundaries. The scheduler is responsible for incrementally
209 : parsing transactions from concatenated FEC set data. Assumes that
210 : FEC sets are delivered in replay order. That is, forks form a
211 : partial ordering over FEC sets: in-order per fork, but arbitrary
212 : ordering across forks. The fork tree is implied by the stream of
213 : parent-child relationships delivered in FEC sets. Also assumes that
214 : there is enough space in the scheduler to ingest the FEC set. The
215 : caller should generally call fd_sched_fec_can_ingest() first.
216 :
217 : Returns 1 on success, 0 if the block is bad and should be marked
218 : dead. */
219 : FD_WARN_UNUSED int
220 : fd_sched_fec_ingest( fd_sched_t * sched, fd_sched_fec_t * fec );
221 :
222 : /* Check if there is enough space in the scheduler to ingest the data in
223 : the FEC set. Returns 1 if there is, 0 otherwise. This is a cheap
224 : and conservative check. */
225 : int
226 : fd_sched_fec_can_ingest( fd_sched_t * sched, fd_sched_fec_t * fec );
227 :
228 : /* Returns the number of worst-case FEC sets sched can ingest. This is a
229 : cheap and conservative check. */
230 : ulong
231 : fd_sched_can_ingest_cnt( fd_sched_t * sched );
232 :
233 : /* Returns 1 if sched is drained, 0 otherwise. A drained scheduler will
234 : not return more work. Otherwise, next_ready will return more work,
235 : so long as there are exec tiles available. */
236 : int
237 : fd_sched_is_drained( fd_sched_t * sched );
238 :
239 : /* Obtain a transaction eligible for execution. This implies that all
240 : prior transactions with w-r or w-w conflicts have completed.
241 : Information regarding the scheduled transaction is written to the out
242 : pointer. Returns 1 on success, 0 on failure. Failures are generally
243 : transient and non-fatal, and are simply an indication that no
244 : transaction is ready for execution yet. When in-flight transactions
245 : retire or when more FEC sets are ingested, more transactions may
246 : become ready for execution.
247 :
248 : Transactions on the same fork will be returned in a way that
249 : maintains the serial fiction. That is, reordering can happen, but
250 : only within the constraint that transactions appear to be ready in
251 : the order in which they occur in the block. Transactions from
252 : different forks may interleave, and the caller should be prepared to
253 : switch execution context in response to interleavings. The scheduler
254 : will barrier on block boundaries, in the sense that transactions from
255 : a subsequent block will not be returned for execution until all
256 : transactions from the previous block have completed. This gives the
257 : caller a chance to perform end-of-block processing before
258 : transactions from a subsequent block start executing. In general,
259 : the caller should check if the last transaction in the current block
260 : is done, and if so, do end-of-block processing before calling this
261 : function to start the next block.
262 :
263 : In addition to returning transactions for execution, this function
264 : may also return a sigverify task. Sigverify can be completed
265 : asynchronously outside the critical path of transaction execution, as
266 : long as every transaction in a block passes sigverify before we
267 : commit the block. The scheduler prioritizes actual execution of
268 : transactions over sigverify, and in general sigverify tasks are only
269 : returned when no real transaction can be dispatched. In other words,
270 : the scheduler tries to exploit idle cycles in the exec tiles during
271 : times of low parallelism critical path progression.
272 :
273 : This function may also return a PoH hashing task. These tasks are
274 : lower priority than transaction execution, but higher priority than
275 : sigverify. This is because sigverify tasks are generally bite-sized,
276 : whereas PoH hashing can be longer, so we would like to get started on
277 : hashing sooner rather than later. */
278 : ulong
279 : fd_sched_task_next_ready( fd_sched_t * sched, fd_sched_task_t * out );
280 :
281 : /* Mark a task as complete. For transaction execution, this means that
282 : the effects of the execution are now visible on any core that could
283 : execute a subsequent transaction. Returns 0 on success, -1 if given
284 : the result of the task, the block turns out to be bad. -1 is only
285 : returned from PoH tasks.
286 :
287 : If a block has been abandoned or marked dead for any reason, it'll be
288 : pruned the moment in-flight task count hits 0 due to the last task
289 : completing. Then, in the immediate ensuing stem run loop,
290 : sched_pruned_next() will return the index for the corresponding bank
291 : so the refcnt can be decremented for sched.
292 :
293 : The transaction at the given index may be freed upon return from this
294 : function. Nonetheless, as long as there is no intervening FEC
295 : ingestion, it would still be safe to query the transaction using
296 : get_txn(). */
297 : int
298 : fd_sched_task_done( fd_sched_t * sched, ulong task_type, ulong txn_idx, ulong exec_idx, void * data );
299 :
300 : /* Abandon a block. This means that we are no longer interested in
301 : executing the block. This also implies that any block which chains
302 : off of the provided block shall be abandoned. This is mainly used
303 : when a block is aborted because we decided that it would be a
304 : dead/invalid block, and so there's no point in spending resources
305 : executing it. The scheduler will no longer return transactions from
306 : abandoned blocks for execution. This should only be invoked on an
307 : actively replayed block, and should only be invoked once on it.
308 :
309 : An abandoned block will be pruned from sched as soon as, and only if,
310 : the block has no more in-flight tasks associated with it. No sooner,
311 : no later. In the immediate ensuing stem run loop,
312 : fd_sched_pruned_block_next() will return the index for the corresponding bank
313 : so the refcnt can be decremented for sched. After that point, the
314 : bank_idx may be recycled for another block. */
315 : void
316 : fd_sched_block_abandon( fd_sched_t * sched, ulong bank_idx );
317 :
318 : /* Add a block as immediately done to the scheduler. This is useful for
319 : installing the snapshot slot, or for informing the scheduler of a
320 : packed leader block. Parent block should be ULONG_MAX for the
321 : snapshot slot, and otherwise a block that hasn't been pruned. */
322 : void
323 : fd_sched_block_add_done( fd_sched_t * sched, ulong bank_idx, ulong parent_bank_idx, ulong slot );
324 :
325 : /* Advance the root, pruning all blocks across forks that do not descend
326 : from the new root. Assumes the new root is in the fork tree and
327 : connected to the current root. Also assumes that there are no more
328 : in-flight transactions from the soon-to-be-pruned blocks. This
329 : should be called after root_notify() and the caller is responsible
330 : for figuring out the new root to safely prune to. */
331 : void
332 : fd_sched_advance_root( fd_sched_t * sched, ulong root_idx );
333 :
334 : /* Notify the scheduler of a new root. This has the effect of calling
335 : abandon() on all minority forks that do not descend from the new
336 : root. Shortly after a call to this function, in-flight transactions
337 : from these abandoned blocks should retire from the execution
338 : pipeline, and the new root will be safe for pruning. */
339 : void
340 : fd_sched_root_notify( fd_sched_t * sched, ulong root_idx );
341 :
342 : /* Returns the index of a bank whose refcnt should be decremented for
343 : sched. This function should be called in a loop to drain all
344 : outstanding refcnt decrements before any other sched API is called in
345 : a stem run loop. Returns ULONG_MAX when there are no more
346 : outstanding references from sched and the loop should break. */
347 : ulong
348 : fd_sched_pruned_block_next( fd_sched_t * sched );
349 :
350 : void
351 : fd_sched_set_poh_params( fd_sched_t * sched, ulong bank_idx, ulong tick_height, ulong max_tick_height, ulong hashes_per_tick, fd_hash_t const * start_poh );
352 :
353 : fd_txn_p_t *
354 : fd_sched_get_txn( fd_sched_t * sched, ulong txn_idx );
355 :
356 : fd_sched_txn_info_t *
357 : fd_sched_get_txn_info( fd_sched_t * sched, ulong txn_idx );
358 :
359 : fd_hash_t *
360 : fd_sched_get_poh( fd_sched_t * sched, ulong bank_idx );
361 :
362 : uint
363 : fd_sched_get_shred_cnt( fd_sched_t * sched, ulong bank_idx );
364 :
365 : void
366 : fd_sched_metrics_write( fd_sched_t * sched );
367 :
368 : /* Serialize the current state as a cstr to the returned buffer. Caller
369 : may read from the buffer until the next invocation of any fd_sched
370 : function. */
371 : char *
372 : fd_sched_get_state_cstr( fd_sched_t * sched );
373 :
374 : void *
375 : fd_sched_leave( fd_sched_t * sched );
376 :
377 : void *
378 : fd_sched_delete( void * mem );
379 :
380 : FD_PROTOTYPES_END
381 :
382 : #endif /* HEADER_fd_src_discof_replay_fd_sched_h */
|