Line data Source code
1 : #ifndef HEADER_fd_src_discof_replay_fd_sched_h
2 : #define HEADER_fd_src_discof_replay_fd_sched_h
3 :
4 : #include "fd_rdisp.h"
5 : #include "../../disco/store/fd_store.h" /* for fd_store_fec_t */
6 : #include "../../disco/pack/fd_microblock.h" /* for fd_txn_p_t */
7 :
8 : #include "../../flamenco/accdb/fd_accdb_user.h"
9 : #include "../../util/spad/fd_spad.h" /* for ALUTs */
10 :
11 : /* fd_sched wraps all the smarts and mechanical chores around scheduling
12 : transactions for replay execution. It is built on top of the
13 : dispatcher fd_rdisp. The dispatcher is responsible for high
14 : performance lane-based scheduling of transactions. On top of that,
15 : we add fork-aware management of lanes, and policies regarding which
16 : lanes to prioritize for execution.
17 :
18 : Conceptually, transactions in a block form a DAG. We would like to
19 : make our way through a block with a sufficient degree of parallelism,
20 : such that the execution time of the critical path of the DAG is the
   limiting factor. The dispatcher does a good job of surfacing the
22 : critical path of the DAG on the fly. Blocks are tracked by the
23 : dispatcher either as a block staged on a lane, or as an unstaged
24 : block. When a block is staged, it will enjoy the most intelligent
25 : online scheduling that the dispatcher has to offer. Lanes have to
26 : consist of linear chains of blocks down a fork. So to map a fork
27 : tree to lanes, we will need multiple lanes. Ideally, every branch in
28 : the fork tree sits on some lane. However, memory footprint limits us
   to a small number of lanes.
30 :
31 : This module implements a state machine for ensuring that blocks enter
   into and exit out of lanes in an orderly fashion. The public APIs of
33 : this module are invoked to drive state transitions on a small number
34 : of events, such as new transactions arriving, or transactions
35 : completing, or a block being aborted/abandoned. We also implement
36 : policies for deciding which blocks get staged onto lanes, or evicted
37 : from lanes, as well as which lanes to prioritize for execution.
38 :
39 :
40 : The general order in which calls happen under the normal case is:
41 :
   fd_sched_fec_ingest()* ... fd_sched_task_next_ready()* ... fd_sched_task_done()* ...
     more ingest, more ready, more done ...
     ...
   fd_sched_task_next_ready() indicates that the last transaction in the block is being scheduled
   fd_sched_task_done()*
   fd_sched_task_next_ready() returns a FD_SCHED_TT_BLOCK_END task
     end-of-block processing in caller
   fd_sched_task_next_ready() starts returning transactions from the next block
50 : more ingest, more ready, more done ...
51 : ... */
52 :
53 : #define FD_SCHED_MIN_DEPTH 478
54 : #define FD_SCHED_MAX_DEPTH FD_RDISP_MAX_DEPTH
55 :
56 : struct fd_sched;
57 : typedef struct fd_sched fd_sched_t;
58 :
59 :
60 : struct fd_sched_alut_ctx {
61 : fd_accdb_user_t accdb[1];
62 : fd_funk_txn_xid_t xid[1];
63 : ulong els; /* Effective lookup slot. */
64 : };
65 : typedef struct fd_sched_alut_ctx fd_sched_alut_ctx_t;
66 :
67 : struct fd_sched_fec {
68 : ulong bank_idx; /* Index of the block. Assumed to be in [0, block_cnt_max). Caller
69 : is responsible for ensuring that bank idx is in bounds and unique
70 : across equivocated blocks. */
71 : ulong parent_bank_idx; /* Index of the parent block. Assumed to be in [0, block_cnt_max).
72 : Caller is responsible for ensuring that parent bank idx is in
73 : bounds and unique across equivocated blocks. */
74 : ulong slot; /* Slot number of the block. */
75 : ulong parent_slot; /* Slot number of the parent block. */
76 : fd_store_fec_t * fec; /* FEC set data. */
77 : uint shred_cnt; /* Number of shreds in the FEC set. */
78 : uint is_last_in_batch:1; /* Set if this is the last FEC set in the batch; relevant because the
79 : parser should ignore trailing bytes at the end of a batch. */
80 : uint is_last_in_block:1; /* Set if this is the last FEC set in the block. */
81 : uint is_first_in_block:1; /* Set if this is the first FEC set in the block. Bank should increment refcnt for sched if such a FEC set has been ingested by sched. */
82 :
83 : fd_sched_alut_ctx_t alut_ctx[ 1 ];
84 : };
85 : typedef struct fd_sched_fec fd_sched_fec_t;
86 :
87 : /* The scheduler may return one of the following types of tasks for the
88 : replay tile.
89 :
90 : e - passed down to exec tiles.
91 : i - replay completes the task immediately.
92 : q - replay may either do it immediately or queue the task up. */
93 0 : #define FD_SCHED_TT_NULL (0UL)
94 0 : #define FD_SCHED_TT_BLOCK_START (1UL) /* (i) Start-of-block processing. */
95 0 : #define FD_SCHED_TT_BLOCK_END (2UL) /* (q) End-of-block processing. */
96 0 : #define FD_SCHED_TT_TXN_EXEC (3UL) /* (e) Transaction execution. */
97 0 : #define FD_SCHED_TT_TXN_SIGVERIFY (4UL) /* (e) Transaction sigverify. */
98 : #define FD_SCHED_TT_LTHASH (5UL) /* (e) Account lthash. */
99 0 : #define FD_SCHED_TT_POH_HASH (6UL) /* (e) PoH hashing. */
100 0 : #define FD_SCHED_TT_MARK_DEAD (7UL) /* (i) Mark the block dead. */
101 :
102 : struct fd_sched_block_start {
103 : ulong bank_idx; /* Same as in fd_sched_fec_t. */
104 : ulong parent_bank_idx; /* Same as in fd_sched_fec_t. */
105 : ulong slot; /* Slot number of the block. */
106 : };
107 : typedef struct fd_sched_block_start fd_sched_block_start_t;
108 :
109 : struct fd_sched_block_end {
110 : ulong bank_idx;
111 : };
112 : typedef struct fd_sched_block_end fd_sched_block_end_t;
113 :
114 : struct fd_sched_txn_exec {
115 : ulong bank_idx;
116 : ulong slot;
117 : ulong txn_idx;
118 : ulong exec_idx;
119 : };
120 : typedef struct fd_sched_txn_exec fd_sched_txn_exec_t;
121 :
122 : struct fd_sched_txn_sigverify {
123 : ulong bank_idx;
124 : ulong txn_idx;
125 : ulong exec_idx;
126 : };
127 : typedef struct fd_sched_txn_sigverify fd_sched_txn_sigverify_t;
128 :
129 : struct fd_sched_poh_hash {
130 : ulong bank_idx;
131 : ulong mblk_idx;
132 : ulong exec_idx;
133 : ulong hashcnt;
134 : fd_hash_t hash[ 1 ];
135 : };
136 : typedef struct fd_sched_poh_hash fd_sched_poh_hash_t;
137 :
138 : struct fd_sched_mark_dead {
139 : ulong bank_idx;
140 : };
141 : typedef struct fd_sched_mark_dead fd_sched_mark_dead_t;
142 :
143 : struct fd_sched_task {
144 : ulong task_type; /* Set to one of the task types defined above. */
145 : union {
146 : fd_sched_block_start_t block_start[ 1 ];
147 : fd_sched_block_end_t block_end[ 1 ];
148 : fd_sched_txn_exec_t txn_exec[ 1 ];
149 : fd_sched_txn_sigverify_t txn_sigverify[ 1 ];
150 : fd_sched_poh_hash_t poh_hash[ 1 ];
151 : fd_sched_mark_dead_t mark_dead[ 1 ];
152 : };
153 : };
154 : typedef struct fd_sched_task fd_sched_task_t;
155 :
156 : FD_PROTOTYPES_BEGIN
157 :
158 : /* fd_sched_{align,footprint} return the required alignment and
159 : footprint in bytes for a region of memory to be used as a scheduler.
160 : footprint silently returns 0 if params are invalid (thus convenient
161 : to validate params).
162 :
163 : depth controls the reorder buffer transaction count (~1 million
164 : recommended for live replay, ~10k recommended for async replay).
165 : block_cnt_max is the maximum number of blocks that will be tracked by
166 : the scheduler. */
167 :
168 : ulong
169 : fd_sched_align( void );
170 :
171 : ulong
172 : fd_sched_footprint( ulong depth, /* in [FD_SCHED_MIN_DEPTH,FD_SCHED_MAX_DEPTH] */
173 : ulong block_cnt_max ); /* >= 1 */
174 :
175 : /* fd_sched_new creates a sched object backed by the given memory region
176 : (conforming to align() and footprint()). Returns NULL if any
177 : parameter is invalid. */
178 :
179 : void *
180 : fd_sched_new( void * mem,
181 : ulong depth,
182 : ulong block_cnt_max,
183 : ulong exec_cnt );
184 :
185 : fd_sched_t *
186 : fd_sched_join( void * mem );
187 :
188 : /* Add the data in the FEC set to the scheduler. If is_last_fec is 1,
189 : then this is the last FEC set in the block. Transactions may span
190 : FEC set boundaries. The scheduler is responsible for incrementally
191 : parsing transactions from concatenated FEC set data. Assumes that
192 : FEC sets are delivered in replay order. That is, forks form a
193 : partial ordering over FEC sets: in-order per fork, but arbitrary
194 : ordering across forks. The fork tree is implied by the stream of
195 : parent-child relationships delivered in FEC sets. Also assumes that
196 : there is enough space in the scheduler to ingest the FEC set. The
197 : caller should generally call fd_sched_fec_can_ingest() first.
198 :
199 : Returns 1 on success, 0 if the block is bad and should be marked
200 : dead. */
201 : FD_WARN_UNUSED int
202 : fd_sched_fec_ingest( fd_sched_t * sched, fd_sched_fec_t * fec );
203 :
204 : /* Check if there is enough space in the scheduler to ingest the data in
205 : the FEC set. Returns 1 if there is, 0 otherwise. This is a cheap
206 : and conservative check. */
207 : int
208 : fd_sched_fec_can_ingest( fd_sched_t * sched, fd_sched_fec_t * fec );
209 :
210 : /* Check if there is enough space in the scheduler to ingest fec_cnt
211 : worst-case FEC sets. Returns 1 if there is, 0 otherwise. This is a
212 : cheap and conservative check, and has less precision than
213 : fd_sched_fec_can_ingest(). */
214 : int
215 : fd_sched_can_ingest( fd_sched_t * sched, ulong fec_cnt );
216 :
217 : /* Obtain a transaction eligible for execution. This implies that all
218 : prior transactions with w-r or w-w conflicts have completed.
219 : Information regarding the scheduled transaction is written to the out
220 : pointer. Returns 1 on success, 0 on failure. Failures are generally
221 : transient and non-fatal, and are simply an indication that no
222 : transaction is ready for execution yet. When in-flight transactions
223 : retire or when more FEC sets are ingested, more transactions may
224 : become ready for execution.
225 :
226 : Transactions on the same fork will be returned in a way that
227 : maintains the serial fiction. That is, reordering can happen, but
228 : only within the constraint that transactions appear to be ready in
229 : the order in which they occur in the block. Transactions from
230 : different forks may interleave, and the caller should be prepared to
231 : switch execution context in response to interleavings. The scheduler
232 : will barrier on block boundaries, in the sense that transactions from
233 : a subsequent block will not be returned for execution until all
234 : transactions from the previous block have completed. This gives the
235 : caller a chance to perform end-of-block processing before
236 : transactions from a subsequent block start executing. In general,
237 : the caller should check if the last transaction in the current block
238 : is done, and if so, do end-of-block processing before calling this
239 : function to start the next block.
240 :
241 : In addition to returning transactions for execution, this function
242 : may also return a sigverify task. Sigverify can be completed
243 : aynschronously outside the critical path of transaction execution, as
244 : long as every transaction in a block passes sigverify before we
245 : commit the block. The scheduler prioritizes actual execution of
246 : transactions over sigverify, and in general sigverify tasks are only
247 : returned when no real transaction can be dispatched. In other words,
248 : the scheduler tries to exploit idle cycles in the exec tiles during
249 : times of low parallelism critical path progression.
250 :
251 : This function may also return a PoH hashing task. These tasks are
252 : lower priority than transaction execution, but higher priority than
253 : sigverify. This is because sigverify tasks are generally bite-sized,
254 : whereas PoH hashing can be longer, so we would like to get started on
255 : hashing sooner rather than later. */
256 : ulong
257 : fd_sched_task_next_ready( fd_sched_t * sched, fd_sched_task_t * out );
258 :
259 : /* Mark a task as complete. For transaction execution, this means that
260 : the effects of the execution are now visible on any core that could
261 : execute a subsequent transaction. Returns 0 on success, -1 if given
262 : the result of the task, the block turns out to be bad. -1 is only
263 : returned from PoH tasks.
264 :
265 : If a block has been abandoned or marked dead for any reason, it'll be
266 : pruned the moment in-flight task count hits 0 due to the last task
267 : completing. Then, in the immediate ensuing stem run loop,
268 : sched_pruned_next() will return the index for the corresponding bank
269 : so the refcnt can be decremented for sched. */
270 : int
271 : fd_sched_task_done( fd_sched_t * sched, ulong task_type, ulong txn_idx, ulong exec_idx, void * data );
272 :
273 : /* Abandon a block. This means that we are no longer interested in
274 : executing the block. This also implies that any block which chains
275 : off of the provided block shall be abandoned. This is mainly used
276 : when a block is aborted because we decided that it would be a
277 : dead/invalid block, and so there's no point in spending resources
278 : executing it. The scheduler will no longer return transactions from
279 : abandoned blocks for execution. This should only be invoked on an
280 : actively replayed block, and should only be invoked once on it.
281 :
282 : An abandoned block will be pruned from sched as soon as, and only if,
283 : the block has no more in-flight tasks associated with it. No sooner,
284 : no later. In the immediate ensuing stem run loop,
285 : sched_pruned_next() will return the index for the corresponding bank
286 : so the refcnt can be decremented for sched. After that point, the
287 : bank_idx may be recycled for another block. */
288 : void
289 : fd_sched_block_abandon( fd_sched_t * sched, ulong bank_idx );
290 :
291 : /* Add a block as immediately done to the scheduler. This is useful for
292 : installing the snapshot slot, or for informing the scheduler of a
293 : packed leader block. Parent block should be ULONG_MAX for the
294 : snapshot slot, and otherwise a block that hasn't been pruned. */
295 : void
296 : fd_sched_block_add_done( fd_sched_t * sched, ulong bank_idx, ulong parent_bank_idx, ulong slot );
297 :
298 : /* Advance the root, pruning all blocks across forks that do not descend
299 : from the new root. Assumes the new root is in the fork tree and
300 : connected to the current root. Also assumes that there are no more
301 : in-flight transactions from the soon-to-be-pruned blocks. This
302 : should be called after root_notify() and the caller is responsible
303 : for figuring out the new root to safely prune to. */
304 : void
305 : fd_sched_advance_root( fd_sched_t * sched, ulong root_idx );
306 :
307 : /* Notify the scheduler of a new root. This has the effect of calling
308 : abandon() on all minority forks that do not descend from the new
309 : root. Shortly after a call to this function, in-flight transactions
310 : from these abandoned blocks should retire from the execution
311 : pipeline, and the new root will be safe for pruning. */
312 : void
313 : fd_sched_root_notify( fd_sched_t * sched, ulong root_idx );
314 :
315 : /* Returns the index of a bank whose refcnt should be decremented for
316 : sched. This function should be called in a loop to drain all
317 : outstanding refcnt decrements before any other sched API is called in
318 : a stem run loop. Returns ULONG_MAX when there are no more
319 : outstanding refrences from sched and the loop should break. */
320 : ulong
321 : fd_sched_pruned_block_next( fd_sched_t * sched );
322 :
323 : void
324 : fd_sched_set_poh_params( fd_sched_t * sched, ulong bank_idx, ulong tick_height, ulong max_tick_height, ulong hashes_per_tick, fd_hash_t const * start_poh );
325 :
326 : fd_txn_p_t *
327 : fd_sched_get_txn( fd_sched_t * sched, ulong txn_idx );
328 :
329 : fd_hash_t *
330 : fd_sched_get_poh( fd_sched_t * sched, ulong bank_idx );
331 :
332 : uint
333 : fd_sched_get_shred_cnt( fd_sched_t * sched, ulong bank_idx );
334 :
335 : /* Serialize the current state as a cstr to the returned buffer. Caller
336 : may read from the buffer until the next invocation of any fd_sched
337 : function. */
338 : char *
339 : fd_sched_get_state_cstr( fd_sched_t * sched );
340 :
341 : void *
342 : fd_sched_leave( fd_sched_t * sched );
343 :
344 : void *
345 : fd_sched_delete( void * mem );
346 :
347 : FD_PROTOTYPES_END
348 :
349 : #endif /* HEADER_fd_src_discof_replay_fd_sched_h */
|