#ifndef HEADER_fd_src_flamenco_runtime_fd_blockstore_h
#define HEADER_fd_src_flamenco_runtime_fd_blockstore_h

/* Blockstore is a high-performance database for in-memory indexing and
   durably storing blocks.

   `fd_blockstore` defines a number of useful types e.g. `fd_block_t`,
   `fd_block_shred`, etc.

   The blockstore alloc is used for allocating wksp resources for shred
   headers, microblock headers, and blocks. This is an fd_alloc.
   Allocations from this allocator will be tagged with wksp_tag and
   operations on this allocator will use concurrency group 0. */

#include "../../ballet/block/fd_microblock.h"
#include "../../ballet/shred/fd_deshredder.h"
#include "../../ballet/shred/fd_shred.h"
#include "../fd_flamenco_base.h"
#include "../types/fd_types.h"
#include "fd_rwseq_lock.h"
#include "stdbool.h"
#include <fcntl.h>

/* FD_BLOCKSTORE_ALIGN specifies the alignment needed for blockstore.
   ALIGN is double x86 cache line to mitigate various kinds of false
   sharing (eg. ACLPF adjacent cache line prefetch). */

#define FD_BLOCKSTORE_ALIGN (128UL)

/* FD_BLOCKSTORE_MAGIC defines a magic number for verifying the memory
   of blockstore is not corrupted. */

#define FD_BLOCKSTORE_MAGIC (0xf17eda2ce7b10c00UL) /* firedancer bloc version 0 */

/* DO NOT MODIFY. */
// #define FD_BUF_SHRED_MAP_MAX (1UL << 24UL) /* 16 million shreds can be buffered */
// #define FD_TXN_MAP_LG_MAX (24) /* 16 million txns can be stored in the txn map */

/* TODO this can be removed if we explicitly manage a memory pool for
   the fd_block_map_t entries */
#define FD_BLOCKSTORE_CHILD_SLOT_MAX (32UL) /* the maximum # of children a slot can have */
#define FD_BLOCKSTORE_ARCHIVE_MIN_SIZE (1UL << 26UL) /* 64MB := ceil(MAX_DATA_SHREDS_PER_SLOT*1228) */
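
/* A quick sanity check on the bound above (illustrative; assumes the
   1228 in the comment is the max shred footprint in bytes and that
   MAX_DATA_SHREDS_PER_SLOT is 32,768): 32,768 * 1,228 = 40,239,104
   bytes (~38 MiB), which rounds up to the next power of two,
   1UL << 26 = 67,108,864 bytes (64 MiB). */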

/* FD_SLICE_ALIGN specifies the alignment needed for a block slice.
   ALIGN is double x86 cache line to mitigate various kinds of false
   sharing (eg. ACLPF adjacent cache line prefetch). */

#define FD_SLICE_ALIGN (128UL)

/* FD_SLICE_MAX specifies the maximum size of an entry batch. This is
   equivalent to the maximum size of a block (ie. a block with a single
   entry batch). */

#define FD_SLICE_MAX (FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT)

/* 64 ticks per slot, and then one min size transaction per microblock
   for all the remaining microblocks.
   This bound should be used along with the transaction parser and tick
   verifier to enforce the assumptions.
   This is NOT a standalone conservative bound against malicious
   validators.
   A tighter bound could probably be derived if necessary. */

#define FD_MICROBLOCK_MAX_PER_SLOT ((FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT - 64UL*sizeof(fd_microblock_hdr_t)) / (sizeof(fd_microblock_hdr_t)+FD_TXN_MIN_SERIALIZED_SZ) + 64UL) /* 200,796 */
/* 64 ticks per slot, and a single gigantic microblock containing min
   size transactions. */
#define FD_TXN_MAX_PER_SLOT ((FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT - 65UL*sizeof(fd_microblock_hdr_t)) / (FD_TXN_MIN_SERIALIZED_SZ)) /* 272,635 */
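
/* Worked arithmetic for the two bounds above (illustrative; the exact
   figures assume FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT = 36,536,320,
   sizeof(fd_microblock_hdr_t) = 48 and FD_TXN_MIN_SERIALIZED_SZ = 134,
   which reproduce the quoted values):

     FD_MICROBLOCK_MAX_PER_SLOT = (36,536,320 - 64*48)/(48 + 134) + 64
                                = 36,533,248/182 + 64
                                = 200,732 + 64 = 200,796

     FD_TXN_MAX_PER_SLOT        = (36,536,320 - 65*48)/134
                                = 36,533,200/134 = 272,635 */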

// TODO centralize these
// https://github.com/firedancer-io/solana/blob/v1.17.5/sdk/program/src/clock.rs#L34
#define FD_MS_PER_TICK 6

// https://github.com/firedancer-io/solana/blob/v1.17.5/core/src/repair/repair_service.rs#L55
#define FD_REPAIR_TIMEOUT (200 / FD_MS_PER_TICK)

#define FD_BLOCKSTORE_SUCCESS 0
#define FD_BLOCKSTORE_SUCCESS_SLOT_COMPLETE 1
#define FD_BLOCKSTORE_ERR_INVAL (-1)
#define FD_BLOCKSTORE_ERR_AGAIN (-2)
#define FD_BLOCKSTORE_ERR_CORRUPT (-3)
#define FD_BLOCKSTORE_ERR_EMPTY (-4)
#define FD_BLOCKSTORE_ERR_FULL (-5)
#define FD_BLOCKSTORE_ERR_KEY (-6)
#define FD_BLOCKSTORE_ERR_SHRED_FULL -1 /* no space left for shreds */
#define FD_BLOCKSTORE_ERR_SLOT_FULL -2 /* no space left for slots */
#define FD_BLOCKSTORE_ERR_TXN_FULL -3 /* no space left for txns */
#define FD_BLOCKSTORE_ERR_SHRED_MISSING -4
#define FD_BLOCKSTORE_ERR_SLOT_MISSING -5
#define FD_BLOCKSTORE_ERR_TXN_MISSING -6
#define FD_BLOCKSTORE_ERR_SHRED_INVALID -7 /* shred was invalid */
#define FD_BLOCKSTORE_ERR_DESHRED_INVALID -8 /* deshredded block was invalid */
#define FD_BLOCKSTORE_ERR_NO_MEM -9 /* no mem */
#define FD_BLOCKSTORE_ERR_UNKNOWN -99

static inline char const * fd_blockstore_strerror( int err ) {
  switch( err ) {
  case FD_BLOCKSTORE_SUCCESS: return "success";
  case FD_BLOCKSTORE_ERR_INVAL: return "bad input";
  case FD_BLOCKSTORE_ERR_AGAIN: return "try again";
  case FD_BLOCKSTORE_ERR_CORRUPT: return "corruption detected";
  case FD_BLOCKSTORE_ERR_EMPTY: return "empty";
  case FD_BLOCKSTORE_ERR_FULL: return "full";
  case FD_BLOCKSTORE_ERR_KEY: return "key not found";
  default: break;
  }
  return "unknown";
}

struct fd_shred_key {
  ulong slot;
  uint idx;
};
typedef struct fd_shred_key fd_shred_key_t;

static const fd_shred_key_t fd_shred_key_null = { 0 };
#define FD_SHRED_KEY_NULL fd_shred_key_null
#define FD_SHRED_KEY_INVAL(key) (!((key).slot) & !((key).idx))
#define FD_SHRED_KEY_EQ(k0,k1) (!(((k0).slot) ^ ((k1).slot))) & !(((k0).idx) ^ (((k1).idx)))
#define FD_SHRED_KEY_HASH(key) ((uint)(((key).slot)<<15UL) | (((key).idx))) /* current max shred idx is 32K = 1 << 15 */
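
/* Example (illustrative sketch) of composing the key macros above when
   keying a freshly received data shred:

     fd_shred_key_t key = { .slot = shred->slot, .idx = shred->idx };
     if( FD_SHRED_KEY_INVAL( key ) ) ... // treat as a null/unset key
     ... FD_SHRED_KEY_HASH( key ) ...    // hashed with ^seed by the shred map below */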

/* fd_buf_shred is a thin wrapper around fd_shred_t that facilitates
   buffering data shreds before all the shreds for a slot have been
   received. After all shreds are received, these buffered shreds are
   released back into the memory pool and future queries for the shreds
   are offset into the block data directly.

   The blockstore is only aware of data shreds and all APIs involving
   shreds refer to data shreds.

   Shreds are buffered into a map as they are received:

   | 0 | 1 | 2 | x | x | 5 | x |
             ^           ^
             c           r

   c = "consumed" = contiguous window starting from index 0
   r = "received" = highest index received so far

   Shred memory layout while stored in the map:

   | shred hdr | shred payload |
*/
struct __attribute__((aligned(128UL))) fd_buf_shred {
  fd_shred_key_t key;
  ulong prev;
  ulong next;
  ulong memo;
  int eqvoc; /* we've seen an equivocating version of this
                shred (same key but different payload). */
  union {
    fd_shred_t hdr; /* shred header */
    uchar buf[FD_SHRED_MIN_SZ]; /* the entire shred buffer, both header and payload. */
  };
};
typedef struct fd_buf_shred fd_buf_shred_t;

#define POOL_NAME fd_buf_shred_pool
#define POOL_ELE_T fd_buf_shred_t
#include "../../util/tmpl/fd_pool_para.c"

#define MAP_NAME fd_buf_shred_map
#define MAP_ELE_T fd_buf_shred_t
#define MAP_KEY_T fd_shred_key_t
#define MAP_KEY_EQ(k0,k1) (FD_SHRED_KEY_EQ(*k0,*k1))
#define MAP_KEY_EQ_IS_SLOW 1
#define MAP_KEY_HASH(key,seed) (FD_SHRED_KEY_HASH(*key)^seed)
#include "../../util/tmpl/fd_map_chain_para.c"

#define DEQUE_NAME fd_slot_deque
#define DEQUE_T ulong
#include "../../util/tmpl/fd_deque_dynamic.c"

/* fd_block_shred_t is a shred that has been assembled into a block. The
   shred begins at `off` relative to the start of the block's data
   region. */
struct fd_block_shred {
  fd_shred_t hdr; /* the data shred header (stored by value) */
  ulong off; /* offset to the payload relative to the start of the block's data region */
};
typedef struct fd_block_shred fd_block_shred_t;

/*
 * fd_block_entry_batch_t is a microblock/entry batch within a block.
 * The offset is relative to the start of the block's data region,
 * and indicates where the batch ends. The (exclusive) end offset of
 * batch i is the (inclusive) start offset of batch i+1. The 0th batch
 * always starts at offset 0.
 * On the wire, the presence of one of the COMPLETE flags in a data
 * shred marks the end of a batch.
 * In other words, batch ends are aligned with shred ends, and batch
 * starts are aligned with shred starts. Usually a batch comprises
 * multiple shreds, and a block comprises multiple batches.
 * This information is useful because bincode deserialization needs to
 * be performed on a per-batch basis. Precisely a single array of
 * microblocks/entries is expected to be deserialized from a batch.
 * Trailing bytes in each batch are ignored by default.
 */
struct fd_block_entry_batch {
  ulong end_off; /* exclusive */
};
typedef struct fd_block_entry_batch fd_block_entry_batch_t;

/* fd_block_micro_t is a microblock ("entry" in Solana parlance) within
   a block. The microblock begins at `off` relative to the start of the
   block's data region. */
struct fd_block_micro {
  ulong off; /* offset into block data */
};
typedef struct fd_block_micro fd_block_micro_t;

/* fd_block_txn_t is a transaction that has been parsed and is part of a
   block. The transaction begins at `off` relative to the start of the
   block's data region. */
struct fd_block_txn {
  ulong txn_off; /* offset into block data of transaction */
  ulong id_off; /* offset into block data of transaction identifiers */
  ulong sz;
};
typedef struct fd_block_txn fd_block_txn_t;

/* If the 0th bit is set, this indicates the block is preparing, which
   means it might be partially executed e.g. a subset of the microblocks
   have been executed. It is not safe to remove, relocate, or modify
   the block in any way at this time.

   Callers holding a pointer to a block should always make sure to
   inspect this flag.

   Other flags mainly provide useful metadata for read-only callers, eg.
   RPC. */

#define FD_BLOCK_FLAG_RECEIVING 0 /* xxxxxxx1 still receiving shreds */
#define FD_BLOCK_FLAG_COMPLETED 1 /* xxxxxx1x received the block ie. all shreds (SLOT_COMPLETE) */
#define FD_BLOCK_FLAG_REPLAYING 2 /* xxxxx1xx replay in progress (DO NOT REMOVE) */
#define FD_BLOCK_FLAG_PROCESSED 3 /* xxxx1xxx successfully replayed the block */
#define FD_BLOCK_FLAG_EQVOCSAFE 4 /* xxx1xxxx 52% of cluster has voted on this (slot, bank hash) */
#define FD_BLOCK_FLAG_CONFIRMED 5 /* xx1xxxxx 2/3 of cluster has voted on this (slot, bank hash) */
#define FD_BLOCK_FLAG_FINALIZED 6 /* x1xxxxxx 2/3 of cluster has rooted this slot */
#define FD_BLOCK_FLAG_DEADBLOCK 7 /* 1xxxxxxx failed to replay the block */

/* Rewards assigned after block is executed */

struct fd_block_rewards {
  ulong collected_fees;
  fd_hash_t leader;
  ulong post_balance;
};
typedef struct fd_block_rewards fd_block_rewards_t;

/* All eight flag bits above are currently assigned; there are no
   remaining reserved bits.

   To avoid confusion, please use the `fd_bits.h` API when manipulating
   these flags, ie. `fd_uchar_set_bit`, `fd_uchar_extract_bit`. */
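
/* Example (illustrative sketch) of manipulating the flags with the
   fd_bits.h helpers, assuming block_info points at a valid
   fd_block_info_t (defined below):

     if( fd_uchar_extract_bit( block_info->flags, FD_BLOCK_FLAG_PROCESSED ) ) {
       ... // block was successfully replayed
     }
     block_info->flags = fd_uchar_set_bit( block_info->flags, FD_BLOCK_FLAG_CONFIRMED ); */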

#define SET_NAME fd_block_set
#define SET_MAX FD_SHRED_BLK_MAX
#include "../../util/tmpl/fd_set.c"

struct fd_block_info {
  ulong slot; /* map key */
  ulong next; /* reserved for use by fd_map_giant.c */

  /* Ancestry */

  ulong parent_slot;
  ulong child_slots[FD_BLOCKSTORE_CHILD_SLOT_MAX];
  ulong child_slot_cnt;

  /* Metadata */

  ulong block_height;
  fd_hash_t block_hash;
  fd_hash_t bank_hash;
  fd_hash_t merkle_hash; /* the last FEC set's merkle hash */
  ulong fec_cnt; /* the number of FEC sets in the slot */
  uchar flags;
  long ts; /* the wallclock time when we finished receiving the block. */

  /* Windowing

     Shreds are buffered into a map as they are received:

     | 0 | 1 | 2 | x | x | 5 | x |
       ^       ^           ^
       c       b           r

     c = "consumed" = contiguous shred idxs that have been consumed.
                      the "consumer" is replay and the idx is
                      incremented after replaying each block slice.
     b = "buffered" = contiguous shred idxs that have been buffered.
                      when buffered == block_slice_end the next slice of
                      a block is ready for replay.
     r = "received" = highest shred idx received so far. used to detect
                      when repair is needed.
  */

  uint consumed_idx; /* the highest shred idx we've contiguously consumed (consecutive from 0). */
  uint buffered_idx; /* the highest shred idx we've contiguously buffered (consecutive from 0). */
  uint received_idx; /* the highest shred idx we've received (can be out-of-order). */

  uint data_complete_idx; /* the highest shred idx wrt contiguous entry batches (inclusive). */
  uint slot_complete_idx; /* the highest shred idx for the entire slot (inclusive). */

  /* This is a bit vec (fd_set) that tracks every shred idx marked with
     FD_SHRED_DATA_FLAG_DATA_COMPLETE. The bit position in the fd_set
     corresponds to the shred's index. Note shreds can be received
     out-of-order so higher bits might be set before lower bits. */

  fd_block_set_t data_complete_idxs[FD_SHRED_BLK_MAX / sizeof(ulong)];

  /* Helpers for batching tick verification */

  ulong ticks_consumed;
  ulong tick_hash_count_accum;
  fd_hash_t in_poh_hash; /* TODO: might not be best place to hold this */

  /* Block */

  ulong block_gaddr; /* global address to the start of the allocated fd_block_t */
};
typedef struct fd_block_info fd_block_info_t;

#define MAP_NAME fd_block_map
#define MAP_ELE_T fd_block_info_t
#define MAP_KEY slot
#define MAP_ELE_IS_FREE(ctx, ele) ((ele)->slot == ULONG_MAX)
#define MAP_ELE_FREE(ctx, ele) ((ele)->slot = ULONG_MAX)
#define MAP_ELE_MOVE(ctx,dst,src) do { MAP_ELE_T * _src = (src); (*(dst)) = *_src; _src->MAP_KEY = (MAP_KEY_T)ULONG_MAX; } while(0)
#define MAP_KEY_HASH(key, seed) (void)(seed), (*(key))
#include "../../util/tmpl/fd_map_slot_para.c"

#define BLOCK_INFO_LOCK_CNT 1024UL
#define BLOCK_INFO_PROBE_CNT 2UL
/*
   Rationale for block_map parameters:
   - each lock manages block_max / lock_cnt elements, so with block_max
     at 4096, each lock would manage 4 contiguous elements.
   - Since keys are unique and increment by 1, we can index key to map
     bucket by taking key % ele_max directly. This way in theory we
     have perfect hashing and never need to probe.
   - This breaks when we store more than 4096 contiguous slots,
     i.e.: slot 0 collides with slot 4096, but this is at heart an
     OOM issue.
   - Causes possible contention - consider if we execute n, but are
     storing shreds for n+1 -- these are managed by the same lock.
     Perhaps opportunity for optimization.
*/

/* fd_block_idx is an in-memory index of finalized blocks that have been
   archived to disk. It records the slot together with the byte offset
   relative to the start of the file. */

struct fd_block_idx {
  ulong slot;
  ulong next;
  uint hash;
  ulong off;
  fd_hash_t block_hash;
  fd_hash_t bank_hash;
};
typedef struct fd_block_idx fd_block_idx_t;

#define MAP_NAME fd_block_idx
#define MAP_T fd_block_idx_t
#define MAP_KEY slot
#define MAP_KEY_HASH(key) ((uint)(key)) /* finalized slots are guaranteed to be unique so perfect hashing */
#define MAP_KEY_INVAL(k) (k == ULONG_MAX)
#include "../../util/tmpl/fd_map_dynamic.c"

struct fd_txn_key {
  ulong v[FD_ED25519_SIG_SZ / sizeof( ulong )];
};
typedef struct fd_txn_key fd_txn_key_t;

struct fd_txn_map {
  fd_txn_key_t sig;
  ulong next;
  ulong slot;
  ulong offset;
  ulong sz;
  ulong meta_gaddr; /* ptr to the transaction metadata */
  ulong meta_sz; /* metadata size */
};
typedef struct fd_txn_map fd_txn_map_t;

FD_FN_PURE int fd_txn_key_equal(fd_txn_key_t const * k0, fd_txn_key_t const * k1);
FD_FN_PURE ulong fd_txn_key_hash(fd_txn_key_t const * k, ulong seed);

#define MAP_NAME fd_txn_map
#define MAP_T fd_txn_map_t
#define MAP_KEY sig
#define MAP_KEY_T fd_txn_key_t
#define MAP_KEY_EQ(k0,k1) fd_txn_key_equal(k0,k1)
#define MAP_KEY_HASH(k,seed) fd_txn_key_hash(k, seed)
#include "../../util/tmpl/fd_map_giant.c"

/* fd_blockstore_archiver outlines the format of metadata at the start
   of an archive file - needed so that archive files can be read back
   on initialization. */

struct fd_blockstore_archiver {
  ulong fd_size_max; /* maximum size of the archival file */
  ulong num_blocks; /* number of blocks in the archival file. needed for reading back */
  ulong head; /* location of least recently written block */
  ulong tail; /* location after most recently written block */
};
typedef struct fd_blockstore_archiver fd_blockstore_archiver_t;
#define FD_BLOCKSTORE_ARCHIVE_START sizeof(fd_blockstore_archiver_t)

/* CONCURRENCY NOTES FOR BLOCKSTORE USERS:

   With the parallelization of the shred map and block map, parts of the
   blockstore are concurrent, and parts are not. The block map and shred
   map have their own locks, which are managed through the
   query_try/query_test APIs. When accessing buf_shred_t and
   block_info_t items, the caller therefore does not need to use
   blockstore_start/end_read/write. However, blockstore_start/end_read/write
   still protects the blockstore_shmem_t object. If you are reading or
   writing any blockstore_shmem fields and at the same time accessing a
   block_info_t or buf_shred_t, you should call both the
   blockstore_start/end_read/write APIs AND the map query_try/test APIs.
   These are locks of separate concerns and will not deadlock with each
   other. TODO update docs when we switch to fenced read/write for
   primitive fields in shmem_t. */
struct __attribute__((aligned(FD_BLOCKSTORE_ALIGN))) fd_blockstore_shmem {

  /* Metadata */

  ulong magic;
  ulong blockstore_gaddr;
  ulong wksp_tag;
  ulong seed;

  /* Persistence */

  fd_blockstore_archiver_t archiver;
  ulong mrw_slot; /* most recently written slot */

  /* Slot metadata */

  ulong lps; /* latest processed slot */
  ulong hcs; /* highest confirmed slot */
  ulong wmk; /* watermark. DO NOT MODIFY DIRECTLY. */

  /* Config limits */

  ulong shred_max; /* maximum # of shreds that can be held in memory */
  ulong block_max; /* maximum # of blocks that can be held in memory */
  ulong idx_max; /* maximum # of blocks that can be indexed from the archival file */
  ulong txn_max; /* maximum # of transactions that can be indexed from blocks */
  ulong alloc_max; /* maximum bytes that can be allocated */

  //ulong block_map_gaddr; /* map of slot->(slot_meta, block) */
  ulong block_idx_gaddr; /* map of slot->byte offset in archival file */
  ulong slot_deque_gaddr; /* deque of slot numbers */

  /* IMPORTANT: the txn_map is not safe to write to from multiple threads. */
  ulong txn_map_gaddr;
  ulong alloc_gaddr;
};
typedef struct fd_blockstore_shmem fd_blockstore_shmem_t;

/* fd_blockstore_t is a local join to the blockstore. This is specific
   to the local address space and should not be shared across tiles. */

struct fd_blockstore {

  /* shared memory region */

  fd_blockstore_shmem_t * shmem; /* reads/writes to shmem must be protected by fd_blockstore_start_read()/write() */

  /* local join handles */

  fd_buf_shred_pool_t shred_pool[1];
  fd_buf_shred_map_t shred_map[1];
  fd_block_map_t block_map[1];
};
typedef struct fd_blockstore fd_blockstore_t;

FD_PROTOTYPES_BEGIN

/* Construction API */

FD_FN_CONST static inline ulong
fd_blockstore_align( void ) {
  return FD_BLOCKSTORE_ALIGN;
}

/* fd_blockstore_footprint returns the footprint of the entire
   blockstore shared memory region occupied by `fd_blockstore_shmem_t`
   including data structures. */

FD_FN_CONST static inline ulong
fd_blockstore_footprint( ulong shred_max, ulong block_max, ulong idx_max, ulong txn_max ) {
  /* TODO -- when removing, make change in fd_blockstore_new as well */
  block_max = fd_ulong_pow2_up( block_max );
  ulong lock_cnt = fd_ulong_min( block_max, BLOCK_INFO_LOCK_CNT );

  int lg_idx_max = fd_ulong_find_msb( fd_ulong_pow2_up( idx_max ) );
  return FD_LAYOUT_FINI(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_INIT,
      alignof(fd_blockstore_shmem_t), sizeof(fd_blockstore_shmem_t) ),
      alignof(fd_buf_shred_t), sizeof(fd_buf_shred_t) * shred_max ),
      fd_buf_shred_pool_align(), fd_buf_shred_pool_footprint() ),
      fd_buf_shred_map_align(), fd_buf_shred_map_footprint( shred_max ) ),
      alignof(fd_block_info_t), sizeof(fd_block_info_t) * block_max ),
      fd_block_map_align(), fd_block_map_footprint( block_max, lock_cnt, BLOCK_INFO_PROBE_CNT ) ),
      fd_block_idx_align(), fd_block_idx_footprint( lg_idx_max ) ),
      fd_slot_deque_align(), fd_slot_deque_footprint( block_max ) ),
      fd_txn_map_align(), fd_txn_map_footprint( txn_max ) ),
      fd_alloc_align(), fd_alloc_footprint() ),
    fd_blockstore_align() );
}

/* fd_blockstore_new formats a memory region with the appropriate
   alignment and footprint into a blockstore. shmem points in the
   caller's address space to the memory region to format. Returns shmem
   on success (blockstore has ownership of the memory region) and NULL
   on failure (no changes, logs details). Caller is not joined on
   return. The blockstore will be empty and unlocked. */

void *
fd_blockstore_new( void * shmem,
                   ulong  wksp_tag,
                   ulong  seed,
                   ulong  shred_max,
                   ulong  block_max,
                   ulong  idx_max,
                   ulong  txn_max );

/* fd_blockstore_join joins a blockstore. ljoin points to a
   fd_blockstore_t compatible memory region in the caller's address
   space used to hold info about the local join, shblockstore points in
   the caller's address space to the memory region containing the
   blockstore. Returns a handle to the caller's local join on success
   (join has ownership of the ljoin region) and NULL on failure (no
   changes, logs details). */

fd_blockstore_t *
fd_blockstore_join( void * ljoin, void * shblockstore );
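
/* Example (illustrative sketch) of creating and joining a blockstore in
   a workspace. The wksp, wksp_tag, seed and sizing values here are
   hypothetical placeholders chosen by the caller:

     ulong footprint = fd_blockstore_footprint( shred_max, block_max, idx_max, txn_max );
     void * shmem    = fd_wksp_alloc_laddr( wksp, fd_blockstore_align(), footprint, wksp_tag );
     void * shblock  = fd_blockstore_new( shmem, wksp_tag, seed, shred_max, block_max, idx_max, txn_max );

     fd_blockstore_t   ljoin;
     fd_blockstore_t * blockstore = fd_blockstore_join( &ljoin, shblock );
     if( FD_UNLIKELY( !blockstore ) ) FD_LOG_ERR(( "fd_blockstore_join failed" )); */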

void *
fd_blockstore_leave( fd_blockstore_t * blockstore );

void *
fd_blockstore_delete( void * shblockstore );

/* fd_blockstore_init initializes a blockstore with the given anchor
   `slot`. This should be the slot of the bank upon finishing a
   snapshot load if booting from a snapshot, or the genesis slot
   otherwise. It is used for initializing fields (SMR, etc.) and to
   "fake" the snapshot block as if that block's data were available.
   The metadata for this block's slot will be populated
   (fd_block_info_t) but the actual block data (fd_block_t) won't
   exist. This is done to bootstrap the various components for live
   replay (turbine, repair, etc.)

   `fd` is a file descriptor for the blockstore archival file and
   `fd_size_max` is its maximum size. As part of `init`, blockstore
   rebuilds an in-memory index of the archival file. */

fd_blockstore_t *
fd_blockstore_init( fd_blockstore_t * blockstore,
                    int               fd,
                    ulong             fd_size_max,
                    ulong             slot );

/* fd_blockstore_fini finalizes a blockstore.

   IMPORTANT! Caller MUST hold the read lock when calling this
   function. */

void
fd_blockstore_fini( fd_blockstore_t * blockstore );

/* Accessors */

/* fd_blockstore_wksp returns the local join to the wksp backing the
   blockstore. The lifetime of the returned pointer is at least as long
   as the lifetime of the local join. Assumes blockstore is a current
   local join. */

FD_FN_PURE static inline fd_wksp_t *
fd_blockstore_wksp( fd_blockstore_t * blockstore ) {
  return (fd_wksp_t *)( ( (ulong)blockstore->shmem ) - blockstore->shmem->blockstore_gaddr );
}

/* fd_blockstore_wksp_tag returns the workspace allocation tag used by
   the blockstore for its wksp allocations. Will be positive. Assumes
   blockstore is a current local join. */

FD_FN_PURE static inline ulong
fd_blockstore_wksp_tag( fd_blockstore_t const * blockstore ) {
  return blockstore->shmem->wksp_tag;
}

/* fd_blockstore_seed returns the hash seed used by the blockstore for various hash
   functions. Arbitrary value. Assumes blockstore is a current local join.
   TODO: consider renaming hash_seed? */
FD_FN_PURE static inline ulong
fd_blockstore_seed( fd_blockstore_t const * blockstore ) {
  return blockstore->shmem->seed;
}

/* fd_blockstore_block_idx returns a pointer in the caller's address
   space to the fd_block_idx_t in the blockstore wksp. Assumes
   blockstore is a local join. Lifetime of the returned pointer is that
   of the local join. */

FD_FN_PURE static inline fd_block_idx_t *
fd_blockstore_block_idx( fd_blockstore_t * blockstore ) {
  return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore ), blockstore->shmem->block_idx_gaddr );
}

/* fd_blockstore_slot_deque returns a pointer in the caller's address
   space to the slot deque in the blockstore wksp. Assumes blockstore
   is a local join. Lifetime of the returned pointer is that of the
   local join. */

FD_FN_PURE static inline ulong *
fd_blockstore_slot_deque( fd_blockstore_t * blockstore ) {
  return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore ), blockstore->shmem->slot_deque_gaddr );
}

/* fd_blockstore_txn_map returns a pointer in the caller's address space
   to the blockstore's txn map. Assumes blockstore is a local join.
   Lifetime of the returned pointer is that of the local join. */

FD_FN_PURE static inline fd_txn_map_t *
fd_blockstore_txn_map( fd_blockstore_t * blockstore ) {
  return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore ), blockstore->shmem->txn_map_gaddr );
}

/* fd_blockstore_alloc returns a pointer in the caller's address space to
   the blockstore's allocator. */

FD_FN_PURE static inline fd_alloc_t * /* Lifetime is that of the local join */
fd_blockstore_alloc( fd_blockstore_t * blockstore ) {
  return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore ), blockstore->shmem->alloc_gaddr );
}

/* fd_blockstore_shred_test returns 1 if a shred keyed by (slot, idx) is
   already in the blockstore and 0 otherwise. */

int
fd_blockstore_shred_test( fd_blockstore_t * blockstore, ulong slot, uint idx );

/* fd_buf_shred_query_copy_data queries the blockstore for shred at
   slot, shred_idx. Copies the shred data to the given buffer and
   returns the data size. Returns -1 on failure.

   IMPORTANT! Caller MUST hold the read lock when calling this
   function. */

long
fd_buf_shred_query_copy_data( fd_blockstore_t * blockstore,
                              ulong             slot,
                              uint              shred_idx,
                              void *            buf,
                              ulong             buf_max );
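
/* Example (illustrative sketch) of copying a buffered shred out of the
   blockstore; the caller must hold the read lock as noted above:

     uchar buf[ FD_SHRED_MIN_SZ ];
     long  sz = fd_buf_shred_query_copy_data( blockstore, slot, shred_idx, buf, sizeof(buf) );
     if( FD_UNLIKELY( sz < 0L ) ) ... // shred not found
     else                         ... // buf holds sz bytes of shred data */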

/* fd_blockstore_block_hash_query performs a blocking query (concurrent
   writers are not blocked) for the block hash of slot. Returns
   FD_BLOCKSTORE_SUCCESS on success and FD_BLOCKSTORE_ERR_KEY if slot is
   not in the blockstore; no other error codes are returned. On
   success, a copy of the block hash will be populated in `block_hash`.
   Retains no interest in `slot` or `block_hash`.

   The block hash is the final poh hash for a slot and available on the
   last microblock header. */

int
fd_blockstore_block_hash_query( fd_blockstore_t * blockstore, ulong slot, fd_hash_t * block_hash );
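
/* Example (illustrative sketch) of querying the block hash:

     fd_hash_t block_hash;
     int err = fd_blockstore_block_hash_query( blockstore, slot, &block_hash );
     if( FD_UNLIKELY( err == FD_BLOCKSTORE_ERR_KEY ) ) ... // slot not in blockstore
     else                                              ... // block_hash is populated */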

/* fd_blockstore_bank_hash_query performs a blocking query (concurrent
   writers are not blocked) for the bank hash of slot. Returns
   FD_BLOCKSTORE_SUCCESS on success and FD_BLOCKSTORE_ERR_KEY if slot is
   not in the blockstore; no other error codes are returned. On
   success, a copy of the bank hash will be populated in `bank_hash`.
   Retains no interest in `slot` or `bank_hash`.

   The bank hash is a hash of the execution state (the "bank") after
   executing the block for a given slot. */

int
fd_blockstore_bank_hash_query( fd_blockstore_t * blockstore, ulong slot, fd_hash_t * bank_hash );

/* fd_blockstore_block_map_query queries the blockstore for the block
   map entry at slot. Returns a pointer to the slot meta or NULL if not
   in blockstore.

   IMPORTANT! This should only be used for single-threaded / offline
   use-cases as it does not test the query. Read the notes below on
   block_map usage in live (concurrent) use-cases. */

fd_block_info_t *
fd_blockstore_block_map_query( fd_blockstore_t * blockstore, ulong slot );

/* IMPORTANT! NOTES FOR block_map USAGE:

   The block_info entries must be queried using the query_try/query_test
   pattern. This will frequently look like:

   int err = FD_MAP_ERR_AGAIN;
   loop while( err == FD_MAP_ERR_AGAIN )
     block_map_query_t query;
     err = fd_block_map_query_try( nonblocking );
     block_info_t * ele = fd_block_map_query_ele(query);
     if ERROR is FD_MAP_ERR_KEY, then the slot is not found.
     if ERROR is FD_MAP_ERR_AGAIN, then immediately continue.
     // important to handle ALL possible return err codes *before*
     // accessing the ele, as the ele will be the sentinel (usually NULL)
     speculatively execute <stuff>
       - no side effects
       - no early return
     err = fd_block_map_query_test(query)
   end loop

   Some accessors are provided that already implement this pattern and
   handle the query loop, for example block_hash_copy and
   parent_slot_query. However, for most caller use cases, it would be
   much more efficient to use the query_try/query_test pattern directly.

   Example: if you are copying m->parent_slot of a block_info_t m to
   blockstore->shmem->smr, then you will need to start_write on the
   blockstore, query_try for the block_info_t object, set
   shmem->smr = m->parent_slot, and then query_test, AND call
   blockstore_end_write. In the case that there's block_info contention,
   i.e. another thread is removing the block_info_t object of interest
   as we are trying to access it, query_test will return ERR_AGAIN, we
   will loop back and try again, hit the FD_MAP_ERR_KEY condition
   (and exit the loop gracefully), and we will have an incorrectly set
   shmem->smr.

   So depending on the complexity of what's being executed, it's easiest
   to directly copy what you need from the block_info_t into a variable
   outside the context of the loop, and use it further below, ex:

   ulong map_item = NULL_ITEM;
   loop {
     query_try
     map_item = ele->map_item; // like parent_slot
     query_test
   }
   check if map_item is NULL_ITEM
   fd_blockstore_start_write
   use map_item
   fd_blockstore_end_write

   Writes and updates (blocking). The pattern is:
   int err = fd_block_map_prepare( &slot, query, blocking );
   block_info_t * ele = fd_block_map_query_ele(query);

   IF slot was an existing key, then ele->slot == slot, and you are MODIFYING
     <modify ele>
   If slot was not an existing key, then ele->slot == 0, and you are INSERTING
     ele->slot = slot;
     <initialize ele>

   fd_block_map_publish(query); // will always succeed */
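
/* Concrete sketch of the speculative read pattern described above,
   copying parent_slot out of the block map. Function argument lists
   are abbreviated / indicative only; see fd_map_slot_para.c for the
   exact generated signatures:

     ulong parent_slot = ULONG_MAX;
     int   err         = FD_MAP_ERR_AGAIN;
     while( err == FD_MAP_ERR_AGAIN ) {
       fd_block_map_query_t query[1];
       err = fd_block_map_query_try( blockstore->block_map, &slot, NULL, query, 0 );
       fd_block_info_t * ele = fd_block_map_query_ele( query );
       if( err == FD_MAP_ERR_KEY   ) break;    // slot is not in the map
       if( err == FD_MAP_ERR_AGAIN ) continue; // contention, retry
       parent_slot = ele->parent_slot;         // speculative read: no side effects, no early return
       err = fd_block_map_query_test( query ); // FD_MAP_ERR_AGAIN if we were overrun
     }
     if( parent_slot != ULONG_MAX ) {
       fd_blockstore_start_write( blockstore );
       ... // update any blockstore_shmem fields that depend on parent_slot
       fd_blockstore_end_write( blockstore );
     } */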

/* fd_blockstore_parent_slot_query queries the parent slot of slot.

   This is non-blocking. */
ulong
fd_blockstore_parent_slot_query( fd_blockstore_t * blockstore, ulong slot );

/* fd_blockstore_block_map_query_volatile is the same as above except it
   only copies out the block metadata (fd_block_info_t). Returns
   FD_BLOCKSTORE_ERR_SLOT_MISSING if slot is missing, otherwise
   FD_BLOCKSTORE_SUCCESS. */

int
fd_blockstore_block_map_query_volatile( fd_blockstore_t * blockstore,
                                        int               fd,
                                        ulong             slot,
                                        fd_block_info_t * block_info_out );

/* fd_blockstore_txn_query queries the transaction data for the given
   signature.

   IMPORTANT! Caller MUST hold the read lock when calling this
   function. */

fd_txn_map_t *
fd_blockstore_txn_query( fd_blockstore_t * blockstore, uchar const sig[static FD_ED25519_SIG_SZ] );

/* Query the transaction data for the given signature in a thread
   safe manner. The transaction data is copied out. txn_data_out can
   be NULL if you are only interested in the transaction metadata. */
int
fd_blockstore_txn_query_volatile( fd_blockstore_t * blockstore,
                                  int               fd,
                                  uchar const       sig[static FD_ED25519_SIG_SZ],
                                  fd_txn_map_t *    txn_out,
                                  long *            blk_ts,
                                  uchar *           blk_flags,
                                  uchar             txn_data_out[FD_TXN_MTU] );
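
/* Example (illustrative sketch) of a thread safe transaction lookup,
   with `fd` being the archival file descriptor and `sig` the first
   signature of the transaction:

     fd_txn_map_t txn_map_entry;
     long         blk_ts;
     uchar        blk_flags;
     uchar        txn_data[ FD_TXN_MTU ];
     int err = fd_blockstore_txn_query_volatile( blockstore, fd, sig, &txn_map_entry, &blk_ts, &blk_flags, txn_data );
     if( FD_UNLIKELY( err != FD_BLOCKSTORE_SUCCESS ) ) ... // txn not found
     else ... // txn_map_entry.slot, txn_data, etc. are populated */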

/* fd_blockstore_block_info_test tests if a block meta entry exists for
   the given slot. Returns 1 if the entry exists and 0 otherwise.

   IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
   this function. */
int
fd_blockstore_block_info_test( fd_blockstore_t * blockstore, ulong slot );

/* fd_blockstore_block_info_remove removes a block meta entry for
   the given slot. Returns SUCCESS if the entry exists and an
   error code otherwise.

   IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
   this function. */
int
fd_blockstore_block_info_remove( fd_blockstore_t * blockstore, ulong slot );

/* fd_blockstore_slot_remove removes slot from blockstore, including all
   relevant internal structures.

   IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
   this function. */
void
fd_blockstore_slot_remove( fd_blockstore_t * blockstore, ulong slot );

/* Operations */

/* fd_blockstore_shred_insert inserts shred into the blockstore, fast
   O(1). The insert may be dropped, eg. if this shred is already in the
   blockstore or the blockstore is full.

   fd_blockstore_shred_insert will manage locking, so the caller
   should NOT be acquiring the blockstore read/write lock before
   calling this function. */

void
fd_blockstore_shred_insert( fd_blockstore_t * blockstore, fd_shred_t const * shred );

/* fd_blockstore_shred_remove removes the buffered shred keyed by
   (slot, idx) from the blockstore. */
void
fd_blockstore_shred_remove( fd_blockstore_t * blockstore, ulong slot, uint idx );

/* fd_blockstore_slice_query queries for the block slice beginning from
   shred `start_idx`, ending at `end_idx`, inclusive. Validates start
   and end_idx as valid batch boundaries. Copies at most `max` bytes of
   the shred payloads, and returns FD_BLOCKSTORE_ERR_NO_MEM if the
   buffer is too small.

   Returns FD_BLOCKSTORE_SUCCESS (0) on success and an FD_MAP_ERR
   (negative) on failure. On success, `buf` will be populated with the
   copied slice and `buf_sz` will contain the number of bytes copied.
   Caller must ignore the values of `buf` and `buf_sz` on failure.

   Implementation is lockfree and safe with concurrent operations on
   blockstore. */

int
fd_blockstore_slice_query( fd_blockstore_t * blockstore,
                           ulong             slot,
                           uint              start_idx,
                           uint              end_idx,
                           ulong             max,
                           uchar *           buf,
                           ulong *           buf_sz );

/* fd_blockstore_shreds_complete should be used as a replacement
   anywhere that queries for an fd_block_t * only to test for existence,
   without actually using the block data. Semantically equivalent to
   query_block( slot ) != NULL.

   Implementation is lockfree and safe with concurrent operations on
   blockstore. */
int
fd_blockstore_shreds_complete( fd_blockstore_t * blockstore, ulong slot );
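
/* Example (illustrative sketch) of copying out a slot's shred payloads
   once they have all arrived. slice_buf is a caller-provided buffer of
   at least FD_SLICE_MAX bytes and start_idx/end_idx are assumed to be
   valid batch boundaries (eg. derived from data_complete_idxs):

     if( fd_blockstore_shreds_complete( blockstore, slot ) ) {
       ulong slice_sz;
       int err = fd_blockstore_slice_query( blockstore, slot, start_idx, end_idx, FD_SLICE_MAX, slice_buf, &slice_sz );
       if( FD_LIKELY( err == FD_BLOCKSTORE_SUCCESS ) ) {
         ... // slice_buf holds slice_sz bytes of contiguous shred payloads
       }
     } */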

/* fd_blockstore_block_height_update sets the block height.

   IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
   this function. */
void
fd_blockstore_block_height_update( fd_blockstore_t * blockstore, ulong slot, ulong block_height );

ulong
fd_blockstore_block_height_query( fd_blockstore_t * blockstore, ulong slot );

/* fd_blockstore_publish publishes all blocks up to the given watermark
   `wmk` (the smr). Publishing entails 1. pruning and 2. archiving.
   Pruning removes any blocks that are not part of the same fork as the
   smr (hence the name pruning, like pruning the branches of a tree).
   Archiving removes from memory any slots < smr that are on the same
   fork, but writes those blocks out to disk using the provided file
   descriptor to the archival file `fd`.

   Note that slots < smr are ancestors of the smr, and are therefore
   finalized slots which is why they are archived. Blocks removed as a
   result of pruning are not finalized, and therefore not archived.

   IMPORTANT! Caller MUST hold the write lock when calling this
   function. */

void
fd_blockstore_publish( fd_blockstore_t * blockstore, int fd, ulong wmk );

void
fd_blockstore_log_block_status( fd_blockstore_t * blockstore, ulong around_slot );

/* fd_blockstore_log_mem_usage logs the memory usage of blockstore in a
   human-readable format. Caller MUST hold the read lock. */

void
fd_blockstore_log_mem_usage( fd_blockstore_t * blockstore );

FD_PROTOTYPES_END

#ifndef BLOCK_ARCHIVING
#define BLOCK_ARCHIVING 0
#endif

#endif /* HEADER_fd_src_flamenco_runtime_fd_blockstore_h */