Line data Source code
1 : #ifndef HEADER_fd_src_flamenco_runtime_fd_blockstore_h
2 : #define HEADER_fd_src_flamenco_runtime_fd_blockstore_h
3 :
4 : /* Blockstore is a high-performance database for in-memory indexing and
5 : durably storing blocks.
6 :
7 : `fd_blockstore` defines a number of useful types e.g. `fd_block_t`,
8 : `fd_block_shred`, etc.
9 :
10 : The blockstore alloc is used for allocating wksp resources for shred
11 : headers, microblock headers, and blocks. This is an fd_alloc.
12 : Allocations from this allocator will be tagged with wksp_tag and
13 : operations on this allocator will use concurrency group 0. */
14 :
15 : #include "../../ballet/block/fd_microblock.h"
16 : #include "../../ballet/shred/fd_deshredder.h"
17 : #include "../../ballet/shred/fd_shred.h"
18 : #include "../fd_flamenco_base.h"
19 : #include "../types/fd_types.h"
20 : #include "fd_rwseq_lock.h"
21 : #include "stdbool.h"
22 : #include <fcntl.h>
23 :
24 : /* FD_BLOCKSTORE_ALIGN specifies the alignment needed for blockstore.
25 : ALIGN is double x86 cache line to mitigate various kinds of false
26 : sharing (eg. ACLPF adjacent cache line prefetch). */
27 :
28 0 : #define FD_BLOCKSTORE_ALIGN (128UL)
29 :
30 : /* FD_BLOCKSTORE_MAGIC defines a magic number for verifying the memory
31 : of blockstore is not corrupted. */
32 :
33 0 : #define FD_BLOCKSTORE_MAGIC (0xf17eda2ce7b10c00UL) /* firedancer bloc version 0 */
34 :
35 : /* DO NOT MODIFY. */
36 : // #define FD_BUF_SHRED_MAP_MAX (1UL << 24UL) /* 16 million shreds can be buffered */
37 :
38 : /* TODO this can be removed if we explicitly manage a memory pool for
39 : the fd_block_map_t entries */
40 0 : #define FD_BLOCKSTORE_CHILD_SLOT_MAX (32UL) /* the maximum # of children a slot can have */
41 0 : #define FD_BLOCKSTORE_ARCHIVE_MIN_SIZE (1UL << 26UL) /* 64MB := ceil(MAX_DATA_SHREDS_PER_SLOT*1228) */
42 :
43 : /* FD_SLICE_ALIGN specifies the alignment needed for a block slice.
44 : ALIGN is double x86 cache line to mitigate various kinds of false
45 : sharing (eg. ACLPF adjacent cache line prefetch). */
46 :
47 : #define FD_SLICE_ALIGN (128UL)
48 :
49 : /* FD_SLICE_MAX specifies the maximum size of an entry batch. This is
50 : equivalent to the maximum size of a block (ie. a block with a single
51 : entry batch). */
52 :
53 0 : #define FD_SLICE_MAX (FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT)
54 :
55 : /* FD_SLICE_MAX_WITH_HEADERS specifies the maximum size of all of the
56 : shreds that can be in an entry batch. This is equivalent to max
57 : number of shreds (including header and payload) that can be in a
58 : single slot. */
59 :
60 0 : #define FD_SLICE_MAX_WITH_HEADERS (FD_SHRED_DATA_HEADER_MAX_PER_SLOT + FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT)
61 :
62 : /* 64 ticks per slot, and then one min size transaction per microblock
63 : for all the remaining microblocks.
64 : This bound should be used along with the transaction parser and tick
65 : verifier to enforce the assumptions.
66 : This is NOT a standalone conservative bound against malicious
67 : validators.
68 : A tighter bound could probably be derived if necessary. */
69 :
70 0 : #define FD_MICROBLOCK_MAX_PER_SLOT ((FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT - 64UL*sizeof(fd_microblock_hdr_t)) / (sizeof(fd_microblock_hdr_t)+FD_TXN_MIN_SERIALIZED_SZ) + 64UL) /* 200,796 */
71 : /* 64 ticks per slot, and a single gigantic microblock containing min
72 : size transactions. */
73 : #define FD_TXN_MAX_PER_SLOT ((FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT - 65UL*sizeof(fd_microblock_hdr_t)) / (FD_TXN_MIN_SERIALIZED_SZ)) /* 272,635 */
74 :
75 : // TODO centralize these
76 : // https://github.com/firedancer-io/solana/blob/v1.17.5/sdk/program/src/clock.rs#L34
77 : #define FD_MS_PER_TICK 6
78 :
79 : // https://github.com/firedancer-io/solana/blob/v1.17.5/core/src/repair/repair_service.rs#L55
80 : #define FD_REPAIR_TIMEOUT (200 / FD_MS_PER_TICK)
81 :
/* Blockstore status / error codes.

   NOTE(review): two overlapping families of codes coexist below.  The
   generic family (SUCCESS, ERR_INVAL .. ERR_KEY) and the legacy
   shred/slot-specific family share numeric values, eg.
   FD_BLOCKSTORE_ERR_SHRED_FULL == FD_BLOCKSTORE_ERR_INVAL == -1 and
   FD_BLOCKSTORE_ERR_SHRED_MISSING == FD_BLOCKSTORE_ERR_EMPTY == -4.
   Callers must not mix families when comparing return values.
   TODO consolidate into a single family. */

#define FD_BLOCKSTORE_SUCCESS 0
#define FD_BLOCKSTORE_SUCCESS_SLOT_COMPLETE 1
#define FD_BLOCKSTORE_ERR_INVAL (-1)
#define FD_BLOCKSTORE_ERR_AGAIN (-2)
#define FD_BLOCKSTORE_ERR_CORRUPT (-3)
#define FD_BLOCKSTORE_ERR_EMPTY (-4)
#define FD_BLOCKSTORE_ERR_FULL (-5)
#define FD_BLOCKSTORE_ERR_KEY (-6)
#define FD_BLOCKSTORE_ERR_SHRED_FULL -1 /* no space left for shreds */
#define FD_BLOCKSTORE_ERR_SLOT_FULL -2 /* no space left for slots */
#define FD_BLOCKSTORE_ERR_SHRED_MISSING -4
#define FD_BLOCKSTORE_ERR_SLOT_MISSING -5
#define FD_BLOCKSTORE_ERR_SHRED_INVALID -7 /* shred was invalid */
#define FD_BLOCKSTORE_ERR_DESHRED_INVALID -8 /* deshredded block was invalid */
#define FD_BLOCKSTORE_ERR_NO_MEM -9 /* no mem */
#define FD_BLOCKSTORE_ERR_UNKNOWN -99
98 :
99 0 : static inline char const * fd_blockstore_strerror( int err ) {
100 0 : switch( err ) {
101 0 : case FD_BLOCKSTORE_SUCCESS: return "success";
102 0 : case FD_BLOCKSTORE_ERR_INVAL: return "bad input";
103 0 : case FD_BLOCKSTORE_ERR_AGAIN: return "try again";
104 0 : case FD_BLOCKSTORE_ERR_CORRUPT: return "corruption detected";
105 0 : case FD_BLOCKSTORE_ERR_EMPTY: return "empty";
106 0 : case FD_BLOCKSTORE_ERR_FULL: return "full";
107 0 : case FD_BLOCKSTORE_ERR_KEY: return "key not found";
108 0 : default: break;
109 0 : }
110 0 : return "unknown";
111 0 : }
112 :
113 : struct fd_shred_key {
114 : ulong slot;
115 : uint idx;
116 : };
117 : typedef struct fd_shred_key fd_shred_key_t;
118 :
119 : static const fd_shred_key_t fd_shred_key_null = { 0 };
120 : #define FD_SHRED_KEY_NULL fd_shred_key_null
121 : #define FD_SHRED_KEY_INVAL(key) (!((key).slot) & !((key).idx))
122 0 : #define FD_SHRED_KEY_EQ(k0,k1) (!(((k0).slot) ^ ((k1).slot))) & !(((k0).idx) ^ (((k1).idx)))
123 0 : #define FD_SHRED_KEY_HASH(key) ((uint)(((key).slot)<<15UL) | (((key).idx))) /* current max shred idx is 32KB = 2 << 15*/
124 :
125 : /* fd_buf_shred is a thin wrapper around fd_shred_t that facilitates
126 : buffering data shreds before all the shreds for a slot have been
127 : received. After all shreds are received, these buffered shreds are
128 : released back into memory pool and future queries for the shreds are
129 : offset into the block data directly.
130 :
131 : The blockstore is only aware of data shreds and all APIs involving
132 : shreds refers to data shreds.
133 :
134 : Shreds are buffered into a map as they are received:
135 :
136 : | 0 | 1 | 2 | x | x | 5 | x |
137 : ^ ^
138 : c r
139 :
140 : c = "consumed" = contiguous window starting from index 0
141 : r = "received" = highest index received so far
142 :
143 : Shred memory layout while stored in the map:
144 :
145 : | shred hdr | shred payload |
146 : */
struct __attribute__((aligned(128UL))) fd_buf_shred {
  fd_shred_key_t key;  /* map key: (slot, shred idx) */
  ulong          prev; /* intrusive link managed by fd_buf_shred_pool (fd_pool_para, instantiated below) */
  ulong          next; /* intrusive link managed by fd_buf_shred_map chaining (fd_map_chain_para, instantiated below) */
  ulong          memo; /* managed by fd_buf_shred_map (presumably a cached key hash -- TODO confirm vs fd_map_chain_para.c) */
  int eqvoc; /* we've seen an equivocating version of this
                shred (same key but different payload). */
  union {
    fd_shred_t hdr; /* shred header */
    uchar buf[FD_SHRED_MIN_SZ]; /* the entire shred buffer, both header and payload. */
  };
};
typedef struct fd_buf_shred fd_buf_shred_t;
160 :
/* fd_buf_shred_pool: element pool backing the buffered shreds
   (fd_pool_para template instantiation). */
#define POOL_NAME fd_buf_shred_pool
#define POOL_ELE_T fd_buf_shred_t
#include "../../util/tmpl/fd_pool_para.c"

/* fd_buf_shred_map: concurrent chained map of (slot, idx) ->
   fd_buf_shred_t (fd_map_chain_para template instantiation).  Key
   equality / hashing reuse the FD_SHRED_KEY_* macros above. */
#define MAP_NAME fd_buf_shred_map
#define MAP_ELE_T fd_buf_shred_t
#define MAP_KEY_T fd_shred_key_t
#define MAP_KEY_EQ(k0,k1) (FD_SHRED_KEY_EQ(*k0,*k1))
#define MAP_KEY_EQ_IS_SLOW 1
#define MAP_KEY_HASH(key,seed) (FD_SHRED_KEY_HASH(*key)^seed)
#include "../../util/tmpl/fd_map_chain_para.c"

/* fd_slot_deque: dynamically-sized deque of slot numbers
   (fd_deque_dynamic template instantiation). */
#define DEQUE_NAME fd_slot_deque
#define DEQUE_T ulong
#include "../../util/tmpl/fd_deque_dynamic.c"
176 :
/* fd_block_shred_t is a shred that has been assembled into a block. The
   shred begins at `off` relative to the start of the block's data
   region. */
struct fd_block_shred {
  fd_shred_t hdr; /* inline copy of the data shred header (stored by value, not a pointer) */
  ulong off; /* offset to the payload relative to the start of the block's data region */
};
typedef struct fd_block_shred fd_block_shred_t;
185 :
186 : /*
187 : * fd_block_entry_batch_t is a microblock/entry batch within a block.
188 : * The offset is relative to the start of the block's data region,
189 : * and indicates where the batch ends. The (exclusive) end offset of
190 : * batch i is the (inclusive) start offset of batch i+1. The 0th batch
191 : * always starts at offset 0.
192 : * On the wire, the presence of one of the COMPLETE flags in a data
193 : * shred marks the end of a batch.
194 : * In other words, batch ends are aligned with shred ends, and batch
195 : * starts are aligned with shred starts. Usually a batch comprises
196 : * multiple shreds, and a block comprises multiple batches.
197 : * This information is useful because bincode deserialization needs to
198 : * be performed on a per-batch basis. Precisely a single array of
199 : * microblocks/entries is expected to be deserialized from a batch.
200 : * Trailing bytes in each batch are ignored by default.
201 : */
struct fd_block_entry_batch {
  ulong end_off; /* exclusive end offset of this batch, relative to the start of the block's data region */
};
typedef struct fd_block_entry_batch fd_block_entry_batch_t;
206 :
207 : /* fd_block_micro_t is a microblock ("entry" in Solana parlance) within
208 : a block. The microblock begins at `off` relative to the start of the
209 : block's data region. */
struct fd_block_micro {
  ulong off; /* offset into the block's data region where this microblock (entry) begins */
};
typedef struct fd_block_micro fd_block_micro_t;
214 :
215 : /* If the 0th bit is set, this indicates the block is preparing, which
216 : means it might be partially executed e.g. a subset of the microblocks
217 : have been executed. It is not safe to remove, relocate, or modify
218 : the block in any way at this time.
219 :
220 : Callers holding a pointer to a block should always make sure to
221 : inspect this flag.
222 :
223 : Other flags mainly provide useful metadata for read-only callers, eg.
224 : RPC. */
225 :
#define FD_BLOCK_FLAG_RECEIVING 0 /* xxxxxxx1 still receiving shreds */
#define FD_BLOCK_FLAG_COMPLETED 1 /* xxxxxx1x received the block ie. all shreds (SLOT_COMPLETE) */
#define FD_BLOCK_FLAG_REPLAYING 2 /* xxxxx1xx replay in progress (DO NOT REMOVE) */
#define FD_BLOCK_FLAG_PROCESSED 3 /* xxxx1xxx successfully replayed the block */
#define FD_BLOCK_FLAG_EQVOCSAFE 4 /* xxx1xxxx 52% of cluster has voted on this (slot, bank hash) */
#define FD_BLOCK_FLAG_CONFIRMED 5 /* xx1xxxxx 2/3 of cluster has voted on this (slot, bank hash) */
#define FD_BLOCK_FLAG_FINALIZED 6 /* x1xxxxxx 2/3 of cluster has rooted this slot */
#define FD_BLOCK_FLAG_DEADBLOCK 7 /* 1xxxxxxx failed to replay the block */
234 :
235 : /* Rewards assigned after block is executed */
236 :
struct fd_block_rewards {
  ulong collected_fees; /* fees collected while executing the block -- TODO confirm units (lamports) */
  fd_hash_t leader;     /* NOTE(review): typed fd_hash_t but named `leader` -- presumably the leader's pubkey bytes, confirm */
  ulong post_balance;   /* balance after the reward is applied -- TODO confirm which account */
};
typedef struct fd_block_rewards fd_block_rewards_t;
243 :
/* NOTE(review): all 8 flag bits [0, 8) are assigned by the
   FD_BLOCK_FLAG_* defines above; no reserved bits remain in the uchar.

   To avoid confusion, please use the `fd_bits.h` API
   ie. `fd_uchar_set_bit`, `fd_uchar_extract_bit`. */
248 :
/* fd_block_set: fixed-size bit set over shred idxs [0, FD_SHRED_BLK_MAX)
   (fd_set template instantiation). */
#define SET_NAME fd_block_set
#define SET_MAX FD_SHRED_BLK_MAX
#include "../../util/tmpl/fd_set.c"
252 :
/* fd_block_info_t is the per-slot metadata entry of the block map:
   fork ancestry, shred-windowing progress, hashes, and the gaddr of
   the assembled block data. */
struct fd_block_info {
  ulong slot; /* map key */
  ulong next; /* reserved for use by the map template (comment previously said
                 fd_map_giant.c, but the instantiation below is
                 fd_map_slot_para -- TODO confirm which template owns this) */

  /* Ancestry */

  ulong parent_slot;
  ulong child_slots[FD_BLOCKSTORE_CHILD_SLOT_MAX];
  ulong child_slot_cnt;

  /* Metadata */

  /* To be banished after offline ledger replay is removed. These fields
     are not used for replay. */
  ulong block_height;
  fd_hash_t block_hash;
  fd_hash_t bank_hash;

  ulong fec_cnt; /* the number of FEC sets in the slot */
  uchar flags;   /* bit vector of FD_BLOCK_FLAG_* (use fd_uchar_set_bit / fd_uchar_extract_bit) */
  long ts;       /* the wallclock time when we finished receiving the block. */

  /* Windowing

     Shreds are buffered into a map as they are received:

     | 0 | 1 | 2 | x | x | 5 | x |
           ^       ^           ^
           c       b           r

     c = "consumed" = contiguous shred idxs that have been consumed.
                      the "consumer" is replay and the idx is
                      incremented after replaying each block slice.
     b = "buffered" = contiguous shred idxs that have been buffered.
                      when buffered == block_slice_end the next slice of
                      a block is ready for replay.
     r = "received" = highest shred idx received so far. used to detect
                      when repair is needed.
  */

  uint consumed_idx; /* the highest shred idx we've contiguously consumed (consecutive from 0). */
  uint buffered_idx; /* the highest shred idx we've contiguously buffered (consecutive from 0). */
  uint received_idx; /* the highest shred idx we've received (can be out-of-order). */

  uint data_complete_idx; /* the highest shred idx wrt contiguous entry batches (inclusive). */
  uint slot_complete_idx; /* the highest shred idx for the entire slot (inclusive). */

  /* This is a bit vec (fd_set) that tracks every shred idx marked with
     FD_SHRED_DATA_FLAG_DATA_COMPLETE. The bit position in the fd_set
     corresponds to the shred's index. Note shreds can be received
     out-of-order so higher bits might be set before lower bits. */

  /* NOTE(review): an fd_set covering FD_SHRED_BLK_MAX bits normally
     needs FD_SHRED_BLK_MAX/64 ulong words; dividing by sizeof(ulong)
     (= 8 bytes, not 64 bits) appears to oversize this array 8x.
     Harmless but wasteful -- confirm against the fd_set.c template
     before changing (layout is shared memory / ABI sensitive). */

  fd_block_set_t data_complete_idxs[FD_SHRED_BLK_MAX / sizeof(ulong)];

  /* Helpers for batching tick verification */

  ulong ticks_consumed;
  ulong tick_hash_count_accum;
  fd_hash_t in_poh_hash; /* TODO: might not be best place to hold this */

  /* Block */

  ulong block_gaddr; /* global address to the start of the allocated fd_block_t */
};
typedef struct fd_block_info fd_block_info_t;
318 :
319 : #define MAP_NAME fd_block_map
320 0 : #define MAP_ELE_T fd_block_info_t
321 0 : #define MAP_KEY slot
322 0 : #define MAP_ELE_IS_FREE(ctx, ele) ((ele)->slot == ULONG_MAX)
323 0 : #define MAP_ELE_FREE(ctx, ele) ((ele)->slot = ULONG_MAX)
324 0 : #define MAP_ELE_MOVE(ctx,dst,src) do { MAP_ELE_T * _src = (src); (*(dst)) = *_src; _src->MAP_KEY = (MAP_KEY_T)ULONG_MAX; } while(0)
325 0 : #define MAP_KEY_HASH(key, seed) (void)(seed), (*(key))
326 : #include "../../util/tmpl/fd_map_slot_para.c"
327 :
328 0 : #define BLOCK_INFO_LOCK_CNT 1024UL
329 : #define BLOCK_INFO_PROBE_CNT 2UL
330 : /*
331 : Rationale for block_map parameters:
332 : - each lock manages block_max / lock_cnt elements, so with block_max
333 : at 4096, each lock would manage 4 contiguous elements.
334 : - Since keys are unique and increment by 1, we can index key to map
335 : bucket by taking key % ele_max directly. This way in theory we
336 : have perfect hashing and never need to probe.
337 : - This breaks when we store more than 4096 contiguous slots,
338 : i.e.: slot 0 collides with slot 4096, but this is at heart an
339 : OOM issue.
340 : - Causes possible contention - consider if we execute n, but are
341 : storing shreds for n+1 -- these are managed by the same lock.
342 : Perhaps opportunity for optimization.
343 : */
344 :
345 : /* fd_block_idx is an in-memory index of finalized blocks that have been
346 : archived to disk. It records the slot together with the byte offset
347 : relative to the start of the file. */
348 :
struct fd_block_idx {
  ulong slot;           /* map key: slot number of the archived block */
  ulong next;           /* reserved for use by fd_map_dynamic (instantiated below) */
  uint hash;            /* reserved for use by fd_map_dynamic -- TODO confirm (memoized key hash?) */
  ulong off;            /* byte offset of the block relative to the start of the archival file */
  fd_hash_t block_hash; /* block (poh) hash of the archived block */
  fd_hash_t bank_hash;  /* bank hash of the archived block */
};
typedef struct fd_block_idx fd_block_idx_t;
358 :
359 : #define MAP_NAME fd_block_idx
360 0 : #define MAP_T fd_block_idx_t
361 0 : #define MAP_KEY slot
362 0 : #define MAP_KEY_HASH(key) ((uint)(key)) /* finalized slots are guaranteed to be unique so perfect hashing */
363 0 : #define MAP_KEY_INVAL(k) (k == ULONG_MAX)
364 : #include "../../util/tmpl/fd_map_dynamic.c"
365 :
366 : /* fd_blockstore_archiver outlines the format of metadata
367 : at the start of an archive file - needed so that archive
368 : files can be read back on initialization. */
369 :
struct fd_blockstore_archiver {
  ulong fd_size_max; /* maximum size (bytes) of the archival file */
  ulong num_blocks;  /* number of blocks in the archival file. needed for reading back */
  ulong head;        /* location of least recently written block (file byte offset -- TODO confirm) */
  ulong tail;        /* location after most recently written block (file byte offset -- TODO confirm) */
};
typedef struct fd_blockstore_archiver fd_blockstore_archiver_t;
/* Archived blocks start immediately after this metadata header. */
#define FD_BLOCKSTORE_ARCHIVE_START sizeof(fd_blockstore_archiver_t)
378 :
379 : /* CONCURRENCY NOTES FOR BLOCKSTORE ENJOINERS:
380 :
381 : With the parallelization of the shred map and block map, parts of the
382 : blockstore are concurrent, and parts are not. Block map and shred map
383 : have their own locks, which are managed through the
384 : query_try/query_test APIs. When accessing buf_shred_t and
385 : block_info_t items then, the caller does not need to use
386 : blockstore_start/end_read/write. However, the
387 : blockstore_start/end_read/write still protects the blockstore_shmem_t
388 : object. If you are reading and writing any blockstore_shmem fields
389 : and at the same time accessing the block_info_t or buf_shred_t, you
390 : should call both the blockstore_start/end_read/write APIs AND the map
391 : query_try/test APIs. These are locks of separate concerns and will
392 : not deadlock with each other. TODO update docs when we switch to
393 : fenced read/write for primitive fields in shmem_t. */
struct __attribute__((aligned(FD_BLOCKSTORE_ALIGN))) fd_blockstore_shmem {

  /* Metadata */

  ulong magic;            /* == FD_BLOCKSTORE_MAGIC when this region holds a formatted blockstore */
  ulong blockstore_gaddr; /* wksp gaddr of this shmem region; a join recovers the backing wksp from it (see fd_blockstore_wksp) */
  ulong wksp_tag;         /* tag used for this blockstore's wksp allocations */
  ulong seed;             /* seed for the blockstore's hash functions */

  /* Persistence */

  fd_blockstore_archiver_t archiver;
  ulong mrw_slot; /* most recently written slot */

  /* Slot metadata */

  ulong lps; /* latest processed slot */
  ulong hcs; /* highest confirmed slot */
  ulong wmk; /* watermark. DO NOT MODIFY DIRECTLY. */

  /* Config limits */

  ulong shred_max; /* maximum # of shreds that can be held in memory */
  ulong block_max; /* maximum # of blocks that can be held in memory */
  ulong idx_max;   /* maximum # of blocks that can be indexed from the archival file */
  ulong alloc_max; /* maximum bytes that can be allocated */

  //ulong block_map_gaddr; /* map of slot->(slot_meta, block) */
  ulong block_idx_gaddr;  /* map of slot->byte offset in archival file */
  ulong slot_deque_gaddr; /* deque of slot numbers */

  ulong alloc_gaddr; /* gaddr of the blockstore's fd_alloc (see fd_blockstore_alloc) */
};
typedef struct fd_blockstore_shmem fd_blockstore_shmem_t;
428 :
/* fd_blockstore_t is a local join to the blockstore. This is specific
   to the local address space and should not be shared across tiles. */
431 :
struct fd_blockstore {

  /* shared memory region */

  fd_blockstore_shmem_t * shmem; /* read/writes to shmem must call fd_blockstore_start_read()*/

  /* local join handles */

  fd_buf_shred_pool_t shred_pool[1]; /* local join to the buffered shred pool */
  fd_buf_shred_map_t shred_map[1];   /* local join to the buffered shred map */
  fd_block_map_t block_map[1];       /* local join to the block map (slot -> fd_block_info_t) */
};
typedef struct fd_blockstore fd_blockstore_t;
445 :
446 : FD_PROTOTYPES_BEGIN
447 :
448 : /* Construction API */
449 :
/* fd_blockstore_align returns FD_BLOCKSTORE_ALIGN, the alignment
   required of the shared memory region backing a blockstore. */
FD_FN_CONST static inline ulong
fd_blockstore_align( void ) {
  return FD_BLOCKSTORE_ALIGN;
}
454 :
455 : /* fd_blockstore_footprint returns the footprint of the entire
456 : blockstore shared memory region occupied by `fd_blockstore_shmem_t`
457 : including data structures. */
458 :
FD_FN_CONST static inline ulong
fd_blockstore_footprint( ulong shred_max, ulong block_max, ulong idx_max ) {
  /* TODO -- when removing, make change in fd_blockstore_new as well */
  block_max = fd_ulong_pow2_up( block_max ); /* map bucket count must be a power of two */
  ulong lock_cnt = fd_ulong_min( block_max, BLOCK_INFO_LOCK_CNT );

  /* lg of the block idx table size (fd_block_idx is an fd_map_dynamic,
     whose footprint is parameterized by lg of the slot count) */
  int lg_idx_max = fd_ulong_find_msb( fd_ulong_pow2_up( idx_max ) );
  /* NOTE(review): the APPEND order below (shmem hdr, shred elements,
     shred pool, shred map, block infos, block map, block idx, slot
     deque, alloc) must stay in sync with the layout performed by
     fd_blockstore_new. */
  return FD_LAYOUT_FINI(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_INIT,
      alignof(fd_blockstore_shmem_t), sizeof(fd_blockstore_shmem_t) ),
      alignof(fd_buf_shred_t), sizeof(fd_buf_shred_t) * shred_max ),
      fd_buf_shred_pool_align(), fd_buf_shred_pool_footprint() ),
      fd_buf_shred_map_align(), fd_buf_shred_map_footprint( shred_max ) ),
      alignof(fd_block_info_t), sizeof(fd_block_info_t) * block_max ),
      fd_block_map_align(), fd_block_map_footprint( block_max, lock_cnt, BLOCK_INFO_PROBE_CNT ) ),
      fd_block_idx_align(), fd_block_idx_footprint( lg_idx_max ) ),
      fd_slot_deque_align(), fd_slot_deque_footprint( block_max ) ),
      fd_alloc_align(), fd_alloc_footprint() ),
    fd_blockstore_align() );
}
488 :
489 : /* fd_blockstore_new formats a memory region with the appropriate
490 : alignment and footprint into a blockstore. shmem points in the
491 : caller's address space of the memory region to format. Returns shmem
492 : on success (blockstore has ownership of the memory region) and NULL
493 : on failure (no changes, logs details). Caller is not joined on
494 : return. The blockstore will be empty and unlocked. */
495 :
496 : void *
497 : fd_blockstore_new( void * shmem,
498 : ulong wksp_tag,
499 : ulong seed,
500 : ulong shred_max,
501 : ulong block_max,
502 : ulong idx_max );
503 :
504 : /* fd_blockstore_join joins a blockstore. ljoin points to a
505 : fd_blockstore_t compatible memory region in the caller's address
506 : space used to hold info about the local join, shblockstore points in
507 : the caller's address space to the memory region containing the
508 : blockstore. Returns a handle to the caller's local join on success
509 : (join has ownership of the ljoin region) and NULL on failure (no
510 : changes, logs details). */
511 :
512 : fd_blockstore_t *
513 : fd_blockstore_join( void * ljoin, void * shblockstore );
514 :
515 : void *
516 : fd_blockstore_leave( fd_blockstore_t * blockstore );
517 :
518 : void *
519 : fd_blockstore_delete( void * shblockstore );
520 :
521 : /* fd_blockstore_init initializes a blockstore with the given
522 : `slot_bank`. This bank is used for initializing fields (SMR, etc.),
523 : and should be the bank upon finishing a snapshot load if booting from
524 : a snapshot, genesis bank otherwise. It is also used to "fake" the
525 : snapshot block as if that block's data were available. The metadata
526 : for this block's slot will be populated (fd_block_map_t) but the
527 : actual block data (fd_block_t) won't exist. This is done to bootstrap
528 : the various components for live replay (turbine, repair, etc.)
529 :
530 : `fd` is a file descriptor for the blockstore archival file. As part
531 : of `init`, blockstore rebuilds an in-memory index of the archival
532 : file. */
533 :
534 : fd_blockstore_t *
535 : fd_blockstore_init( fd_blockstore_t * blockstore,
536 : int fd,
537 : ulong fd_size_max,
538 : ulong slot );
539 :
540 : /* fd_blockstore_fini finalizes a blockstore.
541 :
542 : IMPORTANT! Caller MUST hold the read lock when calling this
543 : function. */
544 :
545 : void
546 : fd_blockstore_fini( fd_blockstore_t * blockstore );
547 :
548 : /* Accessors */
549 :
550 : /* fd_blockstore_wksp returns the local join to the wksp backing the
551 : blockstore. The lifetime of the returned pointer is at least as long
552 : as the lifetime of the local join. Assumes blockstore is a current
553 : local join. */
554 :
555 : FD_FN_PURE static inline fd_wksp_t *
556 0 : fd_blockstore_wksp( fd_blockstore_t * blockstore ) {
557 0 : return (fd_wksp_t *)( ( (ulong)blockstore->shmem ) - blockstore->shmem->blockstore_gaddr );
558 0 : }
559 :
560 : /* fd_blockstore_wksp_tag returns the workspace allocation tag used by
561 : the blockstore for its wksp allocations. Will be positive. Assumes
562 : blockstore is a current local join. */
563 :
564 : FD_FN_PURE static inline ulong
565 0 : fd_blockstore_wksp_tag( fd_blockstore_t const * blockstore ) {
566 0 : return blockstore->shmem->wksp_tag;
567 0 : }
568 :
569 : /* fd_blockstore_seed returns the hash seed used by the blockstore for various hash
570 : functions. Arbitrary value. Assumes blockstore is a current local join.
571 : TODO: consider renaming hash_seed? */
572 : FD_FN_PURE static inline ulong
573 0 : fd_blockstore_seed( fd_blockstore_t const * blockstore ) {
574 0 : return blockstore->shmem->seed;
575 0 : }
576 :
577 : /* fd_block_idx returns a pointer in the caller's address space to the
578 : fd_block_idx_t in the blockstore wksp. Assumes blockstore is local
579 : join. Lifetime of the returned pointer is that of the local join. */
580 :
581 : FD_FN_PURE static inline fd_block_idx_t *
582 0 : fd_blockstore_block_idx( fd_blockstore_t * blockstore ) {
583 0 : return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore ), blockstore->shmem->block_idx_gaddr );
584 0 : }
585 :
586 : /* fd_slot_deque returns a pointer in the caller's address space to the
587 : fd_slot_deque_t in the blockstore wksp. Assumes blockstore is local
588 : join. Lifetime of the returned pointer is that of the local join. */
589 :
590 : FD_FN_PURE static inline ulong *
591 0 : fd_blockstore_slot_deque( fd_blockstore_t * blockstore ) {
592 0 : return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore), blockstore->shmem->slot_deque_gaddr );
593 0 : }
594 :
595 : /* fd_blockstore_alloc returns a pointer in the caller's address space to
596 : the blockstore's allocator. */
597 :
598 : FD_FN_PURE static inline fd_alloc_t * /* Lifetime is that of the local join */
599 0 : fd_blockstore_alloc( fd_blockstore_t * blockstore ) {
600 0 : return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore), blockstore->shmem->alloc_gaddr );
601 0 : }
602 :
603 : /* fd_blockstore_shred_test returns 1 if a shred keyed by (slot, idx) is
604 : already in the blockstore and 0 otherwise. */
605 :
606 : int
607 : fd_blockstore_shred_test( fd_blockstore_t * blockstore, ulong slot, uint idx );
608 :
609 : /* fd_buf_shred_query_copy_data queries the blockstore for shred at
610 : slot, shred_idx. Copies the shred data to the given buffer and
611 : returns the data size. Returns -1 on failure.
612 :
613 : IMPORTANT! Caller MUST hold the read lock when calling this
614 : function. */
615 :
616 : long
617 : fd_buf_shred_query_copy_data( fd_blockstore_t * blockstore,
618 : ulong slot,
619 : uint shred_idx,
620 : void * buf,
621 : ulong buf_max );
622 :
623 : /* fd_blockstore_block_hash_query performs a blocking query (concurrent
624 : writers are not blocked) for the block hash of slot. Returns
625 : FD_BLOCKSTORE_SUCCESS on success and FD_BLOCKSTORE_ERR_KEY if slot is
626 : not in blockstore. Cannot fail. On success, a copy of the block
627 : hash will be populated in `block_hash`. Retains no interest in
628 : `slot` or `block_hash`.
629 :
630 : The block hash is the final poh hash for a slot and available on the
631 : last microblock header. */
632 :
633 : int
634 : fd_blockstore_block_hash_query( fd_blockstore_t * blockstore, ulong slot, fd_hash_t * block_hash );
635 :
636 : /* fd_blockstore_bank_hash_query performs a blocking query (concurrent
637 : writers are not blocked) for the bank hash of slot. Returns
638 : FD_BLOCKSTORE_SUCCESS on success and FD_BLOCKSTORE_ERR_KEY if slot is
639 : not in blockstore. Cannot fail. On success, a copy of the bank hash
640 : will be populated in `bank_hash`. Retains no interest in `slot` or
641 : `bank_hash`.
642 :
643 : The bank hash is a hash of the execution state (the "bank") after
644 : executing the block for a given slot. */
645 :
646 : int
647 : fd_blockstore_bank_hash_query( fd_blockstore_t * blockstore, ulong slot, fd_hash_t * bank_hash );
648 :
649 : /* fd_blockstore_block_map_query queries the blockstore for the block
650 : map entry at slot. Returns a pointer to the slot meta or NULL if not
651 : in blockstore.
652 :
653 : IMPORTANT! This should only be used for single-threaded / offline
654 : use-cases as it does not test the query. Read notes below for
655 : block_map usage in live. */
656 :
657 : fd_block_info_t *
658 : fd_blockstore_block_map_query( fd_blockstore_t * blockstore, ulong slot );
659 :
660 : /* IMPORTANT! NOTES FOR block_map USAGE:
661 :
662 : The block_info entries must be queried using the query_try/query_test
663 : pattern. This will frequently look like:
664 :
665 : int err = FD_MAP_ERR_AGAIN;
666 : loop while( err == FD_MAP_ERR_AGAIN )
667 : block_map_query_t query;
668 : err = fd_block_map_query_try( nonblocking );
669 : block_info_t * ele = fd_block_map_query_ele(query);
670 : if ERROR is FD_MAP_ERR_KEY, then the slot is not found.
671 : if ERROR is FD_MAP_ERR_AGAIN, then immediately continue.
672 : // important to handle ALL possible return err codes *before*
673 : // accessing the ele, as the ele will be the sentinel (usually NULL)
674 : speculatively execute <stuff>
675 : - no side effects
676 : - no early return
677 : err = fd_block_map_query_test(query)
678 : end loop
679 :
680 : Some accessors are provided to callers that already do this pattern,
681 : and handle the looping querying. For example, block_hash_copy, and
682 : parent_slot_query. However, for most caller use cases, it would be
   much more efficient to use the query_try/query_test pattern directly.
684 :
685 : Example: if you are accessing a block_info_t m, and m->parent_slot to
686 : the blockstore->shmem->smr, then you will need to start_write on the
687 : blockstore, query_try for the block_info_t object, set
688 : shmem->smr = meta->parent_slot, and then query_test, AND call
689 : blockstore_end_write. In the case that there's block_info contention,
690 : i.e. another thread is removing the block_info_t object of interest
691 : as we are trying to access it, the query_test will ERR_AGAIN, we will
692 : loop back and try again, hit the FD_MAP_ERR_KEY condition
693 : (and exit the loop gracefully), and we will have an incorrectly set
694 : shmem->smr.
695 :
696 : So depending on the complexity of what's being executed, it's easiest
697 : to directly copy what you need from the block_info_t into a variable
698 : outside the context of the loop, and use it further below, ex:
699 :
700 : ulong map_item = NULL_ITEM;
701 : loop {
702 : query_try
703 : map_item = ele->map_item; // like parent_slot
704 : query_test
705 : }
706 : check if map_item is NULL_ITEM
707 : fd_blockstore_start_write
708 : use map_item
709 : fd_blockstore_end_write
710 :
711 : Writes and updates (blocking). The pattern is:
712 : int err = fd_block_map_prepare( &slot, query, blocking );
713 : block_info_t * ele = fd_block_map_query_ele(query);
714 :
715 : IF slot was an existing key, then ele->slot == slot, and you are MODIFYING
716 : <modify ele>
717 : If slot was not an existing key, then ele->slot == 0, and you are INSERTING
718 : ele->slot = slot;
719 : <initialize ele>
720 :
721 : fd_block_map_publish(query); // will always succeed */
722 :
723 : /* fd_blockstore_parent_slot_query queries the parent slot of slot.
724 :
725 : This is non-blocking. */
726 : ulong
727 : fd_blockstore_parent_slot_query( fd_blockstore_t * blockstore, ulong slot );
728 :
729 : /* fd_blockstore_block_map_query_volatile is the same as above except it
730 : only copies out the metadata (fd_block_map_t). Returns
731 : FD_BLOCKSTORE_SLOT_MISSING if slot is missing, otherwise
732 : FD_BLOCKSTORE_SUCCESS. */
733 :
734 : int
735 : fd_blockstore_block_map_query_volatile( fd_blockstore_t * blockstore,
736 : int fd,
737 : ulong slot,
738 : fd_block_info_t * block_info_out ) ;
739 :
/* fd_blockstore_block_info_test tests if a block meta entry exists for
   the given slot.  Returns 1 if the entry exists and 0 otherwise.

   IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
   this function (presumably because the test itself queries the block
   map -- see the concurrency notes above). */
int
fd_blockstore_block_info_test( fd_blockstore_t * blockstore, ulong slot );
747 :
/* fd_blockstore_block_info_remove removes the block meta entry for the
   given slot.  Returns FD_BLOCKSTORE_SUCCESS if an entry for slot
   existed (and was removed) and an error code otherwise.

   IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
   this function. */
int
fd_blockstore_block_info_remove( fd_blockstore_t * blockstore, ulong slot );
756 :
/* fd_blockstore_slot_remove removes slot from blockstore, including all
   relevant internal structures (block meta entry, buffered shreds,
   etc.).

   IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
   this function. */
void
fd_blockstore_slot_remove( fd_blockstore_t * blockstore, ulong slot );
764 :
765 : /* Operations */
766 :
/* fd_blockstore_shred_insert inserts shred into the blockstore, fast
   O(1).  Insertion can fail, e.g. if this shred is already in the
   blockstore or the blockstore is full; failures are not reported to
   the caller (the function returns void).

   NOTE(review): this comment previously claimed the function returns
   the slot's `consumed_idx` on success and FD_SHRED_IDX_NULL on error,
   but the declaration below returns void -- the stale return-value
   documentation has been removed.  Confirm against the implementation.

   fd_blockstore_shred_insert will manage locking, so the caller
   should NOT be acquiring the blockstore read/write lock before
   calling this function. */

void
fd_blockstore_shred_insert( fd_blockstore_t * blockstore, fd_shred_t const * shred );
779 :
/* fd_blockstore_shred_remove removes the buffered (not yet assembled)
   shred at index idx for the given slot.

   NOTE(review): this comment previously referred to the function as
   fd_blockstore_buffered_shreds_remove and claimed it removes all the
   unassembled shreds for a slot, but the declaration takes a single
   shred idx -- confirm the intended semantics in the .c file. */
void
fd_blockstore_shred_remove( fd_blockstore_t * blockstore, ulong slot, uint idx );
784 :
/* fd_blockstore_slice_query queries for the block slice beginning from
   shred `start_idx`, ending at `end_idx`, inclusive.  Validates
   start_idx and end_idx as valid batch boundaries.  Copies at most
   `max` bytes of the shred payloads into `buf`, and returns
   FD_BLOCKSTORE_NO_MEM if the buffer is too small.

   Returns FD_BLOCKSTORE_SUCCESS (0) on success and a FD_MAP_ERR
   (negative) on failure.  On success, `buf` will be populated with the
   copied slice and `buf_sz` will contain the number of bytes copied.
   Caller must ignore the values of `buf` and `buf_sz` on failure.

   Implementation is lockfree and safe with concurrent operations on
   blockstore. */

int
fd_blockstore_slice_query( fd_blockstore_t * blockstore,
                           ulong slot,
                           uint start_idx,
                           uint end_idx,
                           ulong max,
                           uchar * buf,
                           ulong * buf_sz );
807 :
/* fd_blockstore_shreds_complete returns nonzero if all shreds for slot
   have been received, 0 otherwise.  Intended as a replacement anywhere
   the code queries for an fd_block_t * only to test for existence
   without actually using the block data.  Semantically equivalent to
   query_block( slot ) != NULL.

   Implementation is lockfree and safe with concurrent operations on
   blockstore. */
int
fd_blockstore_shreds_complete( fd_blockstore_t * blockstore, ulong slot );
816 :
/* fd_blockstore_publish publishes all blocks up to the provided
   watermark `wmk` (presumably the caller's smr -- see review note
   below).  Publishing entails 1. pruning and 2. archiving.  Pruning
   removes any blocks that are not part of the same fork as the smr
   (hence the name pruning, like pruning the branches of a tree).
   Archiving removes from memory any slots < smr that are on the same
   fork, but writes those blocks out to disk using the provided file
   descriptor to the archival file `fd`.

   Note that slots < smr are ancestors of the smr, and are therefore
   finalized slots which is why they are archived.  Blocks removed as a
   result of pruning are not finalized, and therefore not archived.

   NOTE(review): this comment previously said publishing runs "until
   the current blockstore smr (blockstore->smr)" but never mentioned
   the explicit `wmk` parameter -- confirm the relationship between wmk
   and blockstore->smr against the implementation.

   IMPORTANT! Caller MUST hold the write lock when calling this
   function. */

void
fd_blockstore_publish( fd_blockstore_t * blockstore, int fd, ulong wmk );
834 :
/* fd_blockstore_log_block_status logs the status of block map entries
   near around_slot.  NOTE(review): the exact window of slots logged is
   not visible from this declaration -- see the .c file. */
void
fd_blockstore_log_block_status( fd_blockstore_t * blockstore, ulong around_slot );
837 :
/* fd_blockstore_log_mem_usage logs the memory usage of blockstore in a
   human-readable format.

   IMPORTANT! Caller MUST hold the read lock when calling this
   function. */

void
fd_blockstore_log_mem_usage( fd_blockstore_t * blockstore );
843 :
844 : FD_PROTOTYPES_END
845 :
/* BLOCK_ARCHIVING compile-time switch, default 0 (disabled).  May be
   overridden by defining it before including this header or on the
   compiler command line.  NOTE(review): presumably gates the block
   archiving path described in fd_blockstore_publish above -- confirm
   in the .c file. */
#ifndef BLOCK_ARCHIVING
#define BLOCK_ARCHIVING 0
#endif
849 :
850 : #endif /* HEADER_fd_src_flamenco_runtime_fd_blockstore_h */
|