Line data Source code
1 : #ifndef HEADER_fd_src_flamenco_accdb_fd_accdb_h
2 : #define HEADER_fd_src_flamenco_accdb_fd_accdb_h
3 :
4 : #include "fd_accdb_shmem.h"
5 : #include "../../util/bits/fd_bits.h"
6 :
7 : /* The accdb is a fork-aware database that can be queried to get the
8 : current state of any account as-of a given fork, and to update it. */
9 :
/* FD_ACCDB_ALIGN is the alignment constant exported by this header.
   NOTE(review): presumably the byte alignment returned by
   fd_accdb_align() and required of the ljoin region handed to
   fd_accdb_new -- confirm against the implementation. */
10 11523 : #define FD_ACCDB_ALIGN (128UL)
11 :
12 : /* Well-known file descriptor numbers for the accounts database backing
13 : file. Tiles inherit these from the parent process which dups the
14 : accounts file to these fds before fork+exec, so seccomp filters can
15 : pin syscalls to a fixed fd. fd 123462 is reserved by XDP. */
16 :
17 0 : #define FD_ACCDB_FD_RW (123461)
18 0 : #define FD_ACCDB_FD_RO (123460)
19 :
/* fd_accdb_t is the handle passed to every fd_accdb_* call below.  The
   struct is only forward-declared in this header, so its layout is
   opaque to users of the API. */
20 : struct fd_accdb_private;
21 : typedef struct fd_accdb_private fd_accdb_t;
22 :
/* fd_accdb_fork_id_t identifies a fork inside the accdb.  It wraps a
   ushort in a struct so it is a distinct type from plain integers.
   val==USHORT_MAX is used as a sentinel by this API: it requests
   creation of the initial root fork in fd_accdb_attach_child and
   selects full-snapshot mode in fd_accdb_snapshot_write_{one,batch}. */
23 : struct fd_accdb_fork_id { ushort val; };
24 : typedef struct fd_accdb_fork_id fd_accdb_fork_id_t;
25 :
26 : struct fd_accdb_entry { /* Per-account handle (typedef'd to fd_acc_t below)
   exchanged with fd_accdb_acquire / fd_accdb_release and the
   read_one / write_one helpers.  NOTE(review): the fields with a
   leading underscore look like accdb-internal bookkeeping that callers
   should not modify -- confirm against the implementation. */
27 : uchar pubkey[ 32UL ]; /* 32-byte pubkey of the account */
28 : uchar owner[ 32UL ]; /* filled by acquire when the account exists */
29 : ulong lamports; /* filled by acquire; zero means the account does not exist and the other fields are undefined */
30 : int executable; /* NOTE(review): presumably the account's executable flag -- confirm */
31 :
32 : ulong data_len; /* filled by acquire when the account exists */
33 : uchar * data; /* filled by acquire; NOTE(review): for writable acquires this presumably points at the 10MiB staging buffer described on fd_accdb_acquire -- confirm */
34 :
35 : uchar prior_owner[ 32UL ]; /* NOTE(review): prior_* presumably hold the account state as it was before this acc's modifications -- confirm */
36 : ulong prior_lamports;
37 : int prior_executable;
38 : ulong prior_data_len;
39 : uchar * prior_data;
40 :
41 : int commit; /* writable accs only: set non-zero before release to write changes back (required even if only metadata changed); leave zero to discard */
42 :
43 : int _writable; /* NOTE(review): presumably records whether this acc was acquired for write -- confirm */
44 : int _overwrite; /* NOTE(review): presumably relates to the same-fork "overwrite" commit path described on fd_accdb_acquire -- confirm */
45 :
46 : ushort _fork_id; /* NOTE(review): presumably the val of the fork this acc was acquired on -- confirm */
47 : uint _generation;
48 : ulong _acc_map_idx;
49 :
50 : ulong _original_size_class;
51 : ulong _original_cache_idx;
52 :
53 : struct {
54 : ulong destination_cache_idx[ 8UL ]; /* NOTE(review): meaning of the 8 entries is not visible in this header -- confirm */
55 : } _write;
56 : };
57 :
58 : typedef struct fd_accdb_entry fd_acc_t;
59 :
60 : FD_PROTOTYPES_BEGIN
61 :
62 : #if FD_HAS_INT128
63 :
/* fd_xxh3_mul128_fold64 multiplies lhs by rhs at uint128 width and
   returns the product truncated to ulong XORed with the product
   shifted right by 64 bits (i.e. the low and high halves folded
   together when ulong is 64 bits wide).  Only compiled when
   FD_HAS_INT128 is set. */
64 : static inline ulong
65 483951606 : fd_xxh3_mul128_fold64( ulong lhs, ulong rhs ) {
66 483951606 : uint128 product = (uint128)lhs * (uint128)rhs; /* widen both operands first so the multiply is done at 128 bits */
67 483951606 : return (ulong)product ^ (ulong)( product>>64 );
68 483951606 : }
69 :
/* fd_xxh3_mix16b mixes two input words (i0,i1) with two secret words
   (s0,s1) and a seed: the seed is added to s0 and subtracted from s1
   (unsigned wrap-around arithmetic), each result is XORed with the
   matching input word, and the pair is combined with
   fd_xxh3_mul128_fold64. */
70 : static inline ulong
71 : fd_xxh3_mix16b( ulong i0, ulong i1,
72 : ulong s0, ulong s1,
73 483951606 : ulong seed ) {
74 483951606 : return fd_xxh3_mul128_fold64( i0 ^ (s0 + seed), i1 ^ (s1 - seed) );
75 483951606 : }
76 :
/* fd_accdb_hash (int128 variant) returns a seeded ulong hash of the
   32-byte key.  The key is read as four ulong words at byte offsets
   0, 8, 16 and 24; each pair of words is mixed with a pair of fixed
   constants via fd_xxh3_mix16b, the two results are added to an
   accumulator initialised from the key length (32), and the sum is
   finalised with a shift-xor / multiply / shift-xor sequence.
   NOTE(review): the constants and structure appear to follow the
   xxHash3 64-bit path for a 32-byte input -- confirm against the
   xxHash specification before relying on bit-compatibility. */
77 : FD_FN_PURE static inline ulong
78 : fd_accdb_hash( uchar const key[ 32 ],
79 241975803 : ulong seed ) {
80 241975803 : ulong k0 = FD_LOAD( ulong, key+ 0 );
81 241975803 : ulong k1 = FD_LOAD( ulong, key+ 8 );
82 241975803 : ulong k2 = FD_LOAD( ulong, key+16 );
83 241975803 : ulong k3 = FD_LOAD( ulong, key+24 );
84 241975803 : ulong acc = 32 * 0x9E3779B185EBCA87ULL; /* 32 is the key length in bytes */
85 241975803 : acc += fd_xxh3_mix16b( k0, k1, 0xbe4ba423396cfeb8UL, 0x1cad21f72c81017cUL, seed ); /* first 16 key bytes */
86 241975803 : acc += fd_xxh3_mix16b( k2, k3, 0xdb979083e96dd4deUL, 0x1f67b3b7a4a44072UL, seed ); /* last 16 key bytes */
87 241975803 : acc = acc ^ (acc >> 37); /* final scramble of the accumulator */
88 241975803 : acc *= 0x165667919E3779F9ULL;
89 241975803 : acc = acc ^ (acc >> 32);
90 241975803 : return acc;
91 241975803 : }
92 :
93 : #else
94 :
95 : /* If the target does not support 128-bit integers (FD_HAS_INT128 is
96 : unset), the xxHash3-style hash above is unavailable, so fall back to
the 'old' key hash function.
97 :
98 : FIXME This version is vulnerable to HashDoS */
99 :
/* fd_accdb_hash (fallback variant, used when FD_HAS_INT128 is unset)
   returns a seeded ulong hash of the 32-byte key.  Each of the four
   ulong words of the key (byte offsets 0, 8, 16, 24) is XORed with
   the seed and with a distinct single-bit constant (1UL<<0 .. 1UL<<3),
   passed through fd_ulong_hash, and the four results are XORed
   together. */
100 : FD_FN_PURE static inline ulong
101 : fd_accdb_hash( uchar const key[ 32 ],
102 : ulong seed ) {
103 : /* tons of ILP: the four fd_ulong_hash calls below do not depend on
   each other, only the final XORs combine them */
104 : return (fd_ulong_hash( seed ^ (1UL<<0) ^ FD_LOAD( ulong, key+ 0 ) ) ^
105 : fd_ulong_hash( seed ^ (1UL<<1) ^ FD_LOAD( ulong, key+ 8 ) ) ) ^
106 : (fd_ulong_hash( seed ^ (1UL<<2) ^ FD_LOAD( ulong, key+16 ) ) ^
107 : fd_ulong_hash( seed ^ (1UL<<3) ^ FD_LOAD( ulong, key+24 ) ) );
108 : }
109 :
110 : #endif /* FD_HAS_INT128 */
111 :
/* fd_accdb_align and fd_accdb_footprint are declared FD_FN_CONST and
   take no pointer arguments.  NOTE(review): presumably they return the
   alignment and the byte size required of the ljoin region passed to
   fd_accdb_new, with the footprint sized by max_live_slots -- confirm
   against the implementation, including what is returned for an
   invalid max_live_slots. */
112 : FD_FN_CONST ulong
113 : fd_accdb_align( void );
114 :
115 : FD_FN_CONST ulong
116 : fd_accdb_footprint( ulong max_live_slots );
117 :
118 : /* fd_accdb_new constructs the local joiner state for an accdb writer
119 : (or compaction tile). fd is an O_RDWR fd of the on-disk file.
120 :
121 : external_epoch_cnt and external_epoch_slots provide a list of
122 : additional epoch publish slots to scan during compaction's
123 : deferred-free reclamation. These point at memory owned by other
124 : processes (typically the per-tile fseq of read-only consumers like
125 : the rpc tile), mapped read-only into this joiner's address space.
126 : Each *external_epoch_slots[i] is updated by the owning RO joiner
127 : on each epoch-protected operation (and reset to ULONG_MAX when
128 : idle), and is used by this joiner's compaction scan to determine
129 : when on-disk partitions can be safely reclaimed.
130 :
131 : For joiners that do not need to track external RO consumers (i.e.
132 : any joiner that is not the compaction tile, or a writer-only
133 : topology), pass external_epoch_cnt=0 and external_epoch_slots=NULL.
134 : The pointer array is borrowed and must remain valid for the
135 : lifetime of the join. */
136 :
137 : void *
138 : fd_accdb_new( void * ljoin,
139 : fd_accdb_shmem_t * shmem,
140 : int fd,
141 : ulong external_epoch_cnt,
142 : ulong const ** external_epoch_slots );
143 :
/* fd_accdb_join is the writer-side join that pairs with fd_accdb_new
   (fd_accdb_join_readonly is documented as the read-only counterpart
   of fd_accdb_new + fd_accdb_join).  NOTE(review): shaccdb is
   presumably the pointer returned by fd_accdb_new; the failure
   behavior (e.g. NULL return) is not visible in this header --
   confirm against the implementation. */
144 : fd_accdb_t *
145 : fd_accdb_join( void * shaccdb );
146 :
147 : /* fd_accdb_join_readonly is the read-only counterpart of fd_accdb_new +
148 : fd_accdb_join. shmem_ro may point into a read-only mapping of the
149 : shmem region; the function will not write to it. my_epoch_slot_rw
150 : must point at a ulong owned by this joiner that it can write to
151 : (typically a private per-tile fseq that the accdb tile maps read-only
152 : and passes through external_epoch_slots[] in fd_accdb_new). fd_ro
153 : must be opened O_RDONLY on the same file the writer joiner opened RW.
154 :
155 : The joiner publishes its current epoch into *my_epoch_slot_rw on
156 : entry to each epoch-protected operation (and resets to ULONG_MAX on
157 : exit). The accdb tile's compaction scan observes this slot via its
158 : external_epoch_slots[] pointer and defers partition reclamation
159 : accordingly, the same way it does for in-shmem joiner_epochs[].
160 :
161 : Only fd_accdb_read_one_nocache, fd_accdb_exists, and
162 : fd_accdb_lamports are supported on a readonly join; any other API is
163 : undefined behavior. */
164 :
165 : fd_accdb_t *
166 : fd_accdb_join_readonly( void * ljoin,
167 : fd_accdb_shmem_t * shmem_ro,
168 : ulong * my_epoch_slot_rw,
169 : int fd_ro );
170 :
171 : /* fd_accdb_snapshot_load_{begin,end} toggle a mode on this writer
172 : joiner that causes layer-0 partition handoffs to backfill tiering
173 : for older snapshot-loaded partitions. Specifically, when a new
174 : partition P is opened at layer 0, the partition at P-2 is retiered
175 : to Warm (layer 1) and the partition at P-3 is retiered to Cold
176 : (layer 2). This compensates for the fact that snapshot-loaded
177 : accounts never get a second write and therefore never get promoted
178 : by normal compaction-driven tiering.
179 :
180 : Only the snapin tile is expected to use this. The flag is
181 : per-joiner and is not visible across processes. */
182 :
183 : void
184 : fd_accdb_snapshot_load_begin( fd_accdb_t * accdb );
185 :
186 : void
187 : fd_accdb_snapshot_load_end( fd_accdb_t * accdb );
188 :
189 : /* fd_accdb_snapshot_recovery_t captures layer-0 write head metadata.
190 : Used by fd_accdb_snapshot_{save,revert}_whead to save and restore
191 : accdb state across an incremental snapshot attempt. */
192 :
193 : struct fd_accdb_snapshot_recovery {
194 : ulong whead_val; /* whead[0].val */
195 : int has_partition; /* has_partition[0] */
196 : ulong partition_max; /* partition_max */
197 : ulong disk_current_bytes; /* disk_current_bytes metric */
198 : ulong savepoint_bytes_freed; /* bytes_freed of the save-point partition */
199 : };
200 :
201 : typedef struct fd_accdb_snapshot_recovery fd_accdb_snapshot_recovery_t;
202 :
203 : /* fd_accdb_snapshot_save_whead captures the current layer-0 write head,
204 : partition state, and disk_current_bytes metric into the provided
205 : recovery struct. Also captures the save-point partition's
206 : bytes_freed. */
207 :
208 : void
209 : fd_accdb_snapshot_save_whead( fd_accdb_t * accdb,
210 : fd_accdb_snapshot_recovery_t * out );
211 :
212 : /* fd_accdb_snapshot_revert_whead restores the layer-0 write head to a
213 : previously saved position.
214 :
215 : It internally waits for the pending background purge command to
216 : complete on T2 before releasing partitions, so the caller does not
217 : need to insert a separate wait_cmd barrier.
218 :
219 : Previously allocated partitions (with indices in the range
220 : [saved_partition_max, current partition_max)) are released back
221 : to the partition pool. disk_current_bytes is restored to the saved
222 : value rather than computed per-partition, and the save-point
223 : partition's bytes_freed and write_offset are reset. */
224 :
225 : void
226 : fd_accdb_snapshot_revert_whead( fd_accdb_t * accdb,
227 : fd_accdb_snapshot_recovery_t const * recover );
228 :
229 : /* fd_accdb_attach_child allocates a new fork as a child of
230 : parent_fork_id and returns the new fork's id. This must be done
231 : any time a new fork is being inserted into the accounts database,
232 : so that the accounts database can maintain ancestry information
233 : in order to support queries correctly.
234 :
235 : To create the initial root fork, pass a sentinel value with
236 : val==USHORT_MAX as parent_fork_id. This must be done exactly
237 : once, before any other fork operations.
238 :
239 : For non-root forks, parent_fork_id must refer to a fork that has
240 : already been attached. The ancestry must form a tree and it is
241 : undefined behavior to create cycles. */
242 :
243 : fd_accdb_fork_id_t
244 : fd_accdb_attach_child( fd_accdb_t * accdb,
245 : fd_accdb_fork_id_t parent_fork_id );
246 :
247 : /* fd_accdb_advance_root advances the root of the accounts database to
248 : the given fork_id. fork_id must be a direct child of the current
249 : root (i.e. fork->parent_id equals the current root_fork_id).
250 :
251 : Any competing sibling forks (and their entire subtrees) are removed.
252 : For accounts updated on the newly rooted fork, any older versions on
253 : ancestor forks are tombstoned for later compaction. After this call
254 : the old root fork slot is freed and fork_id becomes the new root.
255 :
256 : IMPORTANT: The caller must guarantee that all outstanding
257 : acquire/release pairs on every sibling of fork_id (and their entire
258 : subtrees) have completed before calling advance_root. advance_root
259 : implicitly purges those sibling subtrees, which frees their fork pool
260 : slots for recycling.
261 :
262 : Once a fork is rooted, its generation becomes the new
263 : root_generation. Concurrent acquires that observe the new root will
264 : use the generation fast path (generation <= root_generation) for all
265 : accounts from that fork and its ancestors, bypassing descends_set
266 : entirely. This is what makes fork pool slot recycling safe: by the
267 : time a slot is freed and reusable, no reader will ever consult
268 : descends_set for the old fork_id. */
269 :
270 : void
271 : fd_accdb_advance_root( fd_accdb_t * accdb,
272 : fd_accdb_fork_id_t fork_id );
273 :
274 : /* fd_accdb_purge removes the provided fork and all of its descendants
275 : from the accounts database. This is an extremely rare operation,
276 : used to handle cases where a leader equivocated and produced two
277 : competing blocks for the same slot.
278 :
279 : All accounts written on the purged fork and any child or
280 : grandchild forks are removed from the index, and their disk
281 : space is freed for compaction. The ancestry information for all
282 : purged forks is also removed.
283 :
284 : IMPORTANT: The caller must guarantee that all outstanding
285 : acquire/release pairs on the purged fork and every descendant
286 : have completed before calling purge. The same fork pool slot
287 : recycling hazard described for advance_root applies here. */
288 :
289 : void
290 : fd_accdb_purge( fd_accdb_t * accdb,
291 : fd_accdb_fork_id_t fork_id );
292 :
293 : /* fd_accdb_acquire brings all of the requested accounts as-of the given
294 : fork_id into the cache, and refcnts them in the cache so they cannot
295 : be evicted until later released.
296 :
297 : fork_id identifies the fork (from replay) to query as-of, and must exist
298 : for the entire duration of the acquire call, meaning, whoever is
299 : acquiring must have a refcnt on the bank corresponding to fork_id,
300 : and not release it until after the accounts are acquired. It is safe
301 : to release the bank after the acquire call returns, and this will not
302 : cause the acquired accounts to be evicted from the cache.
303 :
304 : pubkeys_cnt is the number of accounts to acquire, and pubkeys is an
305 : array of pointers to the 32-byte pubkeys of the accounts to acquire.
306 : writable is an array of flags indicating whether each corresponding
307 : account in pubkeys is being acquired for read (0) or write (1).
308 : Writes provide a temporary buffer of 10MiB in all cases, which the
309 : caller can use for staging changes to the data, and this allows
310 : account resizing, or cancelling of any data written (for example if a
311 : transaction fails) without needing to restore it. If an account is
312 : acquired for write, the caller must set the commit bit on the acc
313 : to non-zero to have the changes written back to the database on
314 : release, or leave it at zero to discard the changes. The commit bit
315 : must be set even if only the metadata has changed.
316 :
317 : IMPORTANT: The caller must guarantee that for any given (pubkey,
318 : fork) pair, there is no concurrent acquire that holds a writable
319 : acc while another acquire for the same account on the same fork is
320 : outstanding (whether readable or writable). Specifically:
321 :
322 : - Multiple concurrent read-only acquires of the same account on the
323 : same fork are permitted.
324 : - A writable acquire of an account on a given fork must not overlap
325 : with any other acquire (read or write) of that same account on
326 : that same fork.
327 : - Acquires of the same account on _different_ forks are always safe
328 : and may overlap freely, provided that all releases on an ancestor
329 : fork have completed before any acquire on a descendant fork
330 : begins. In particular, a fork must finish all of its transaction
331 : execution (including committing or cancelling every writable
332 : account) before a child fork is attached and begins acquiring.
333 : This is naturally guaranteed by the replay scheduler, which does
334 : not activate a child block until the parent block is fully done.
335 : Concurrent acquires across unrelated sibling forks have no
336 : ordering requirement.
337 :
338 : Violating this contract is undefined behavior and will likely crash
339 : with an assertion failure inside the cache refcount logic. In
340 : practice, these constraints are naturally satisfied by the Solana
341 : execution model: each transaction has exclusive write locks on its
342 : writable accounts within a slot, the scheduler ensures no two
343 : concurrent transactions write to the same account on the same fork,
344 : and the replay scheduler serializes parent block completion before
345 : child block activation on the same fork chain.
346 :
347 : When a writable account is committed as an "overwrite" (same
348 : fork), the acc pool element's metadata fields (size, lamports,
349 : offset) are mutated in place, and the cache line's owner field is
350 : updated. This is safe because these mutations
351 : only happen on the acc element whose generation matches the
352 : committing fork. A concurrent acquire on a different fork cannot
353 : observe an in-place mutation of the same acc element: for a child fork
354 : to even exist, the parent must be frozen and no longer undergoing
355 : modifications. All acc pool fields are effectively immutable from
356 : the perspective of any concurrent cross-fork reader.
357 :
358 : out_accs is an array of pubkeys_cnt cache accs to be filled in
359 : with the acquired accounts. The cache will fill the owner, lamports,
360 : data_len, and data fields of each acc if the acquire is successful,
361 : and the account exists. If the account does not exist, the lamports
362 : field will be set to zero and other fields are undefined. */
363 :
364 : void
365 : fd_accdb_acquire( fd_accdb_t * accdb,
366 : fd_accdb_fork_id_t fork_id,
367 : ulong pubkeys_cnt,
368 : uchar const * const * pubkeys,
369 : int * writable,
370 : fd_acc_t * out_accs );
371 :
/* fd_accdb_acquire_a takes exactly the same arguments as
   fd_accdb_acquire.  NOTE(review): its contract is not documented in
   this header; together with fd_accdb_acquire_b and
   fd_accdb_release_ab it presumably forms a split (two-step) variant
   of acquire/release -- confirm against the implementation before
   use. */
372 : void
373 : fd_accdb_acquire_a( fd_accdb_t * accdb,
374 : fd_accdb_fork_id_t fork_id,
375 : ulong pubkeys_cnt,
376 : uchar const * const * pubkeys,
377 : int * writable,
378 : fd_acc_t * out_accs );
379 :
/* fd_accdb_acquire_b takes the fd_accdb_acquire arguments plus
   reserved_cnt.  NOTE(review): the meaning of reserved_cnt and how
   this call relates to fd_accdb_acquire_a are not documented in this
   header -- confirm against the implementation. */
380 : void
381 : fd_accdb_acquire_b( fd_accdb_t * accdb,
382 : fd_accdb_fork_id_t fork_id,
383 : ulong reserved_cnt,
384 : ulong pubkeys_cnt,
385 : uchar const * const * pubkeys,
386 : int * writable,
387 : fd_acc_t * out_accs );
388 :
389 : /* fd_accdb_release releases previously acquired accounts back to the
390 : cache, and if any of the released writable accounts have their commit
391 : bit set, the cache will write the changes back to the database. The
392 : caller must guarantee that the accs being released were previously
393 : acquired and not yet released, and that the pubkeys in the accs
394 : match the pubkeys of the acquired accounts. The accs need not be
395 : a specific set that was acquired together, although this is
396 : recommended. The fork that each acc refers to must still exist
397 : (not yet purged or advanced past) at the time of release. This
398 : includes forks that would be implicitly purged by a concurrent
399 : advance_root on a sibling — the caller must ensure advance_root
400 : is not called until all releases on affected forks have completed.
401 : Releasing accounts for a fork that has been purged or recycled is
402 : undefined behavior. */
403 :
404 : void
405 : fd_accdb_release( fd_accdb_t * accdb,
406 : ulong accs_cnt,
407 : fd_acc_t * accs );
408 :
/* fd_accdb_release_ab takes two acc arrays: accs (accs_cnt entries)
   and execs (execs_cnt entries).  NOTE(review): presumably this is
   the release counterpart of fd_accdb_acquire_a / fd_accdb_acquire_b
   and obeys the same rules as fd_accdb_release; the exact role of
   execs is not documented in this header -- confirm against the
   implementation. */
409 : void
410 : fd_accdb_release_ab( fd_accdb_t * accdb,
411 : ulong accs_cnt,
412 : fd_acc_t * accs,
413 : ulong execs_cnt,
414 : fd_acc_t * execs );
415 :
/* Single-account helpers.  fd_accdb_read_one and fd_accdb_write_one
   take a fork_id and one pubkey and return an fd_acc_t by value;
   fd_accdb_unread_one and fd_accdb_unwrite_one take a pointer to such
   an acc.  NOTE(review): presumably these are one-account forms of
   fd_accdb_acquire (read-only / writable) and fd_accdb_release, so
   every read_one / write_one must be paired with the matching
   unread_one / unwrite_one and the fd_accdb_acquire concurrency rules
   apply -- confirm against the implementation. */
416 : fd_acc_t
417 : fd_accdb_read_one( fd_accdb_t * accdb,
418 : fd_accdb_fork_id_t fork_id,
419 : uchar const * pubkey );
420 :
421 : fd_acc_t
422 : fd_accdb_write_one( fd_accdb_t * accdb,
423 : fd_accdb_fork_id_t fork_id,
424 : uchar const * pubkey );
425 :
426 : void
427 : fd_accdb_unwrite_one( fd_accdb_t * accdb,
428 : fd_acc_t * acc );
429 :
430 : void
431 : fd_accdb_unread_one( fd_accdb_t * accdb,
432 : fd_acc_t * acc );
433 :
/* fd_accdb_exists is one of the calls supported on a readonly join
   (see fd_accdb_join_readonly).  NOTE(review): presumably returns
   non-zero iff the account identified by pubkey exists as-of fork_id
   -- confirm the exact return values against the implementation. */
434 : int
435 : fd_accdb_exists( fd_accdb_t * accdb,
436 : fd_accdb_fork_id_t fork_id,
437 : uchar const * pubkey );
438 :
439 : /* fd_accdb_read_one_nocache reads one account at fork_id into
440 : caller-provided output buffers. Suitable for processes that mmap the
441 : accdb data region read-only: it never mutates any cache line, index
442 : entry, or record. The only write it makes into accdb shmem is
443 : publishing this joiner's epoch (to hold off compaction for the
444 : duration of the read), and that is done through a separately-mmap'd
445 : writable page aliasing the joiner's own epoch slot, not the read-only
446 : region.
447 :
448 : out_owner must point at a 32-byte buffer. out_data must point at a
449 : buffer of at least FD_RUNTIME_ACC_SZ_MAX (10 MiB) bytes, the maximum
450 : account data size; the function does not bound-check against the
451 : account's actual length. On a cache hit the bytes are memcpy'd from
452 : the cache slot using a try-read-test (ABA) loop; on a miss the owner
453 : and data are preadv2'd from the disk fd passed at join time, scattered
454 : into out_owner and out_data via iovec (looping on short reads).
455 :
456 : If the account does not exist, *out_lamports is set to zero and the
457 : other outputs are undefined; otherwise *out_lamports is non-zero and
458 : out_executable, out_owner, out_data, and out_data_len are all filled
459 : in.
460 :
461 : The function takes no reference; nothing needs to be released. */
462 :
463 : void
464 : fd_accdb_read_one_nocache( fd_accdb_t * accdb,
465 : fd_accdb_fork_id_t fork_id,
466 : uchar const * pubkey,
467 : ulong * out_lamports,
468 : int * out_executable,
469 : uchar * out_owner,
470 : uchar * out_data,
471 : ulong * out_data_len );
472 :
473 : /* fd_accdb_lamports returns the lamports of the account at fork_id, or
474 : zero if the account does not exist. */
475 :
476 : ulong
477 : fd_accdb_lamports( fd_accdb_t * accdb,
478 : fd_accdb_fork_id_t fork_id,
479 : uchar const * pubkey );
480 :
481 : /* fd_accdb_reset reinitializes the accdb to the state immediately after
482 : fd_accdb_new. All in-memory index state is cleared and all pool
483 : joins are re-established. The caller is responsible for truncating
484 : the on-disk file separately (e.g. via the snapwr tile).
485 :
486 : The caller must guarantee that no other thread is concurrently
487 : accessing the accdb (no outstanding acquires, no background work). */
488 :
489 : void
490 : fd_accdb_reset( fd_accdb_t * accdb );
491 :
492 : /* fd_accdb_snapshot_write_one inserts or replaces an account during
493 : snapshot loading. Returns -1 if the write was ignored (an existing
494 : acc has a higher slot), 1 if a new acc was inserted, 2 if an
495 : existing acc was replaced. When 2 is returned, *out_replaced_lamports
496 : is set to the lamports of the replaced acc. Otherwise it is set to
497 : 0. out_replaced_lamports must be non-NULL.
498 :
499 : slot must be <= UINT_MAX. The slot is held in a 32-bit scratch field
500 : during snapshot loading; the accdb format must be widened before
501 : Solana reaches slot 2^32. Passing a larger slot crashes the
502 : process.
503 :
504 : fork_id controls recovery behavior:
505 :
506 : USHORT_MAX, full-snapshot mode. Existing entries with the same
507 : pubkey are replaced in-place. No txn entries are
508 : created.
509 :
510 : other, incremental-snapshot mode. Cross-snapshot overrides
511 : (existing entry from a different fork) insert a NEW
512 : acc_pool entry alongside the old one and create a txn
513 : record on fork_id, so fd_accdb_purge can revert the
514 : incremental writes on failure. Intra-fork duplicates
515 : (same pubkey from the same fork) are still replaced
516 : in-place. */
517 :
518 : int
519 : fd_accdb_snapshot_write_one( fd_accdb_t * accdb,
520 : fd_accdb_fork_id_t fork_id,
521 : uchar const * pubkey,
522 : ulong slot,
523 : ulong lamports,
524 : ulong data_len,
525 : int executable,
526 : ulong * out_replaced_lamports );
527 :
528 : /* fd_accdb_snapshot_write_batch processes up to 8 accounts at once,
529 : using software prefetching to overlap hash chain memory latency with
530 : useful work. Each pubkeys[i] points to a 32-byte public key.
531 : *out_replaced_lamports is set to the sum of the lamports of all
532 : accounts replaced by this batch (i.e. the previous lamports value of
533 : each account whose acc was overwritten). *out_ignored_lamports is
534 : set to the sum of the lamports of all accounts ignored by this batch
535 : (i.e. the lamports of each input account whose write was dropped
536 : because an acc with a higher slot already exists). Returns 0 on
537 : success, -1 if the batch contained two entries with the same pubkey
538 : (a corrupt-snapshot signal — the caller should flag the snapshot
539 : malformed). Output counters are not meaningful when -1 is returned.
540 :
541 : Each slots[i] must be <= UINT_MAX (see fd_accdb_snapshot_write_one
542 : for the rationale). Passing a larger slot crashes the process.
543 :
544 : fork_id has the same semantics as in fd_accdb_snapshot_write_one:
545 : USHORT_MAX for full-snapshot mode, otherwise incremental mode with
546 : txn tracking on the specified fork.

NOTE(review): cnt is presumably the number of valid entries in the
pubkeys / slots / lamports / data_lens / executables arrays (at most
8, per the first sentence above). accounts_ignored,
accounts_replaced and accounts_loaded are presumably per-outcome
account counters matching the -1 / 2 / 1 results of
fd_accdb_snapshot_write_one; whether they are overwritten or
incremented is not stated here -- confirm against the
implementation. */
547 :
548 : int
549 : fd_accdb_snapshot_write_batch( fd_accdb_t * accdb,
550 : fd_accdb_fork_id_t fork_id,
551 : ulong cnt,
552 : uchar const * const pubkeys[],
553 : ulong const slots[],
554 : ulong const lamports[],
555 : ulong const data_lens[],
556 : int const executables[],
557 : ulong * accounts_ignored,
558 : ulong * accounts_replaced,
559 : ulong * accounts_loaded,
560 : ulong * out_replaced_lamports,
561 : ulong * out_ignored_lamports );
562 :
563 : /* fd_accdb_background performs one unit of background work.
564 :
565 : THREADING MODEL
566 :
567 : The accdb API is split across three thread roles:
568 :
569 : T1 (replay): calls attach_child, advance_root, purge, acquire, and
570 : release. attach_child runs inline on T1. advance_root and
571 : purge submit a command into a shared-memory slot and return
572 : immediately; the heavy work is deferred to T2.
573 :
574 : T2 (accdb tile / background): calls fd_accdb_background repeatedly.
575 : This is the only function T2 should call.
576 :
577 : T3 (executor tiles, 1..N): call acquire and release.
578 :
579 : acquire and release may be called concurrently from T1 and any number
580 : of T3 threads. They must never be called concurrently with
581 : advance_root or purge on the same fork.
582 :
583 : fd_accdb_background must be called from exactly one thread (T2). It
584 : must not be called concurrently with itself.
585 :
586 : BEHAVIOR
587 :
588 : First checks for a pending advance_root or purge command from T1; if
589 : one is present it executes the command, sets *charge_busy to 1, and
590 : returns immediately without doing compaction. Otherwise, attempts one
591 : step of compaction at each layer, setting *charge_busy if work was
592 : done. */
593 :
594 : void
595 : fd_accdb_background( fd_accdb_t * accdb,
596 : int * charge_busy );
597 :
598 : /* fd_accdb_shmetrics returns a pointer to the shared metrics counters
599 : for the given accdb instance. The returned pointer remains valid
600 : for the lifetime of the underlying shmem. */
601 :
602 : fd_accdb_shmem_metrics_t const *
603 : fd_accdb_shmetrics( fd_accdb_t * accdb );
604 :
605 : /* fd_accdb_metrics returns a pointer to the per-thread metrics counters
606 : for the given accdb instance. The returned pointer remains valid
607 : for the lifetime of the underlying shmem. */
608 :
609 : fd_accdb_metrics_t const *
610 : fd_accdb_metrics( fd_accdb_t * accdb );
611 :
612 : /* fd_accdb_cache_class_occupancy snapshots the current per-size-class
613 : cache occupancy and capacity into the caller-provided arrays, each
614 : of which must have FD_ACCDB_CACHE_CLASS_CNT entries. used[c] is the
615 : number of slots in class c that currently hold a cache acc (i.e.
616 : slots that have been allocated lazily and are not sitting in the
617 : free list). max[c] is the total slot capacity of class c. Reads
618 : are done with relaxed (volatile) loads and may be momentarily
619 : inconsistent with each other under contention.

NOTE(review): the reserved output is not described above; it is
presumably a third FD_ACCDB_CACHE_CLASS_CNT-entry array giving a
per-class count of reserved slots -- confirm its meaning and
required size against the implementation. */
620 :
621 : void
622 : fd_accdb_cache_class_occupancy( fd_accdb_t * accdb,
623 : ulong * used,
624 : ulong * max,
625 : ulong * reserved );
626 :
627 : /* fd_accdb_cache_class_thresholds returns the per-size-class preeviction
628 : thresholds, expressed as used-slot counts (so they're directly
629 : comparable to occupancy.used and occupancy.max). Each output array
630 : must have FD_ACCDB_CACHE_CLASS_CNT entries. target_used[c] is the
631 : used count the background preevict pass tries to drive towards (max -
632 : cache_free_target). low_water_used[c] is the used count at which the
633 : preevict pass starts firing (max - cache_free_low_water). Both are
634 : set once at init and are stable for the lifetime of the cache. */
635 :
636 : void
637 : fd_accdb_cache_class_thresholds( fd_accdb_t * accdb,
638 : ulong * target_used,
639 : ulong * low_water_used );
640 :
641 : /* FD_ACCDB_METRICS_WRITE publishes the per-joiner accdb runtime metrics
642 : for tile prefix TILE. TILE must be a tile that declares the
643 : AccdbAccountAcquired/... counters in metrics.xml (e.g. EXECLE,
644 : EXECRP, REPLAY, TOWER, ACCDB). m must be a fd_accdb_metrics_t const *
645 : for the joiner whose counters should be published.

The macro expands to a single do { ... } while(0) statement and
evaluates m exactly once, into the local _m. The *_per_class
members are published with FD_MCNT_ENUM_COPY and the scalar members
with FD_MCNT_SET. */
646 :
647 0 : #define FD_ACCDB_METRICS_WRITE( TILE, m ) do { \
648 0 : fd_accdb_metrics_t const * _m = (m); \
649 0 : FD_MCNT_ENUM_COPY( TILE, ACCDB_ACCOUNT_ACQUIRED, _m->accounts_acquired_per_class ); \
650 0 : FD_MCNT_ENUM_COPY( TILE, ACCDB_ACCOUNT_WRITABLE_ACQUIRED, _m->writable_accounts_acquired_per_class ); \
651 0 : FD_MCNT_ENUM_COPY( TILE, ACCDB_ACCOUNT_EVICTED, _m->accounts_evicted_per_class ); \
652 0 : FD_MCNT_ENUM_COPY( TILE, ACCDB_ACCOUNT_COMMITTED_NEW, _m->accounts_committed_new_per_class ); \
653 0 : FD_MCNT_ENUM_COPY( TILE, ACCDB_ACCOUNT_COMMITTED_OVERWRITE, _m->accounts_committed_overwrite_per_class ); \
654 0 : FD_MCNT_ENUM_COPY( TILE, ACCDB_ACCOUNT_NOT_FOUND, _m->accounts_not_found_per_class ); \
655 0 : FD_MCNT_SET( TILE, ACCDB_ACCOUNT_WAITED, _m->accounts_waited ); \
656 0 : FD_MCNT_SET( TILE, ACCDB_BATCH_ACQUIRED, _m->acquire_calls ); \
657 0 : FD_MCNT_SET( TILE, ACCDB_ACQUIRE_FAILED, _m->acquire_failed ); \
658 0 : FD_MCNT_SET( TILE, ACCDB_BYTES_READ, _m->bytes_read ); \
659 0 : FD_MCNT_SET( TILE, ACCDB_READ_OPERATION, _m->read_ops ); \
660 0 : FD_MCNT_SET( TILE, ACCDB_BYTES_WRITTEN, _m->bytes_written ); \
661 0 : FD_MCNT_SET( TILE, ACCDB_WRITE_OPERATION, _m->write_ops ); \
662 0 : FD_MCNT_SET( TILE, ACCDB_BYTES_COPIED, _m->bytes_copied ); \
663 0 : } while(0)
664 :
665 : /* FD_ACCDB_METRICS_WRITE_RO is the read-only joiner subset of
666 : FD_ACCDB_METRICS_WRITE. It only emits the counters that
667 : fd_accdb_read_one_nocache touches; tiles that join readonly
668 : (e.g. RPC) declare only this subset of counters in metrics.xml.

Relative to FD_ACCDB_METRICS_WRITE it omits
ACCDB_ACCOUNT_WRITABLE_ACQUIRED, ACCDB_ACCOUNT_EVICTED,
ACCDB_ACCOUNT_COMMITTED_NEW, ACCDB_ACCOUNT_COMMITTED_OVERWRITE,
ACCDB_ACQUIRE_FAILED, ACCDB_BYTES_WRITTEN and ACCDB_WRITE_OPERATION.
As with the full macro, m is evaluated exactly once (into _m) and
the expansion is a single do { ... } while(0) statement. */
669 :
670 0 : #define FD_ACCDB_METRICS_WRITE_RO( TILE, m ) do { \
671 0 : fd_accdb_metrics_t const * _m = (m); \
672 0 : FD_MCNT_ENUM_COPY( TILE, ACCDB_ACCOUNT_ACQUIRED, _m->accounts_acquired_per_class ); \
673 0 : FD_MCNT_ENUM_COPY( TILE, ACCDB_ACCOUNT_NOT_FOUND, _m->accounts_not_found_per_class ); \
674 0 : FD_MCNT_SET( TILE, ACCDB_ACCOUNT_WAITED, _m->accounts_waited ); \
675 0 : FD_MCNT_SET( TILE, ACCDB_BATCH_ACQUIRED, _m->acquire_calls ); \
676 0 : FD_MCNT_SET( TILE, ACCDB_BYTES_READ, _m->bytes_read ); \
677 0 : FD_MCNT_SET( TILE, ACCDB_READ_OPERATION, _m->read_ops ); \
678 0 : FD_MCNT_SET( TILE, ACCDB_BYTES_COPIED, _m->bytes_copied ); \
679 0 : } while(0)
680 :
681 : FD_PROTOTYPES_END
682 :
683 : #endif /* HEADER_fd_src_flamenco_accdb_fd_accdb_h */
|