Line data Source code
1 : #ifndef HEADER_fd_src_flamenco_runtime_fd_blockstore_h
2 : #define HEADER_fd_src_flamenco_runtime_fd_blockstore_h
3 :
4 : /* Blockstore is a high-performance database for in-memory indexing and
5 : durably storing blocks.
6 :
7 : `fd_blockstore` defines a number of useful types e.g. `fd_block_t`,
8 : `fd_block_shred`, etc.
9 :
10 : The blockstore alloc is used for allocating wksp resources for shred
11 : headers, microblock headers, and blocks. This is an fd_alloc.
12 : Allocations from this allocator will be tagged with wksp_tag and
13 : operations on this allocator will use concurrency group 0. */
14 :
15 : #include "../../ballet/block/fd_microblock.h"
16 : #include "../../ballet/shred/fd_deshredder.h"
17 : #include "../../ballet/shred/fd_shred.h"
18 : #include "../fd_flamenco_base.h"
19 : #include "../types/fd_types.h"
20 : #include "fd_rwseq_lock.h"
21 : #include "stdbool.h"
22 : #include <fcntl.h>
23 :
24 : /* FD_BLOCKSTORE_ALIGN specifies the alignment needed for blockstore.
25 : ALIGN is double x86 cache line to mitigate various kinds of false
26 : sharing (eg. ACLPF adjacent cache line prefetch). */
27 :
28 0 : #define FD_BLOCKSTORE_ALIGN (128UL)
29 :
30 : /* FD_BLOCKSTORE_MAGIC defines a magic number for verifying the memory
31 : of blockstore is not corrupted. */
32 :
33 0 : #define FD_BLOCKSTORE_MAGIC (0xf17eda2ce7b10c00UL) /* firedancer bloc version 0 */
34 :
35 : /* DO NOT MODIFY. */
36 : // #define FD_BUF_SHRED_MAP_MAX (1UL << 24UL) /* 16 million shreds can be buffered */
37 :
38 : /* TODO this can be removed if we explicitly manage a memory pool for
39 : the fd_block_map_t entries */
40 0 : #define FD_BLOCKSTORE_CHILD_SLOT_MAX (32UL) /* the maximum # of children a slot can have */
41 0 : #define FD_BLOCKSTORE_ARCHIVE_MIN_SIZE (1UL << 26UL) /* 64MB := ceil(MAX_DATA_SHREDS_PER_SLOT*1228) */
42 :
43 : /* FD_SLICE_ALIGN specifies the alignment needed for a block slice.
44 : ALIGN is double x86 cache line to mitigate various kinds of false
45 : sharing (eg. ACLPF adjacent cache line prefetch). */
46 :
47 : #define FD_SLICE_ALIGN (128UL)
48 :
49 : /* FD_SLICE_MAX specifies the maximum size of an entry batch. This is
50 : equivalent to the maximum size of a block (ie. a block with a single
51 : entry batch). */
52 :
53 0 : #define FD_SLICE_MAX (FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT)
54 :
55 : /* 64 ticks per slot, and then one min size transaction per microblock
56 : for all the remaining microblocks.
57 : This bound should be used along with the transaction parser and tick
58 : verifier to enforce the assumptions.
59 : This is NOT a standalone conservative bound against malicious
60 : validators.
61 : A tighter bound could probably be derived if necessary. */
62 :
63 0 : #define FD_MICROBLOCK_MAX_PER_SLOT ((FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT - 64UL*sizeof(fd_microblock_hdr_t)) / (sizeof(fd_microblock_hdr_t)+FD_TXN_MIN_SERIALIZED_SZ) + 64UL) /* 200,796 */
64 : /* 64 ticks per slot, and a single gigantic microblock containing min
65 : size transactions. */
66 : #define FD_TXN_MAX_PER_SLOT ((FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT - 65UL*sizeof(fd_microblock_hdr_t)) / (FD_TXN_MIN_SERIALIZED_SZ)) /* 272,635 */
67 :
68 : // TODO centralize these
69 : // https://github.com/firedancer-io/solana/blob/v1.17.5/sdk/program/src/clock.rs#L34
70 : #define FD_MS_PER_TICK 6
71 :
72 : // https://github.com/firedancer-io/solana/blob/v1.17.5/core/src/repair/repair_service.rs#L55
73 : #define FD_REPAIR_TIMEOUT (200 / FD_MS_PER_TICK)
74 :
/* Blockstore return codes. Success codes are non-negative; error codes
   are negative. All negative constants are parenthesized so the macros
   expand safely in any expression context (CERT PRE02-C).

   NOTE(review): the FD_BLOCKSTORE_ERR_SHRED_* / _SLOT_* / _DESHRED_* /
   _NO_MEM family shares numeric values with the generic family above it
   (e.g. ERR_SHRED_FULL == ERR_INVAL == -1, ERR_SLOT_MISSING ==
   ERR_FULL == -5), so fd_blockstore_strerror can only report the
   generic meaning for those values. TODO consolidate the two
   overlapping families. */

#define FD_BLOCKSTORE_SUCCESS               0
#define FD_BLOCKSTORE_SUCCESS_SLOT_COMPLETE 1
#define FD_BLOCKSTORE_ERR_INVAL   (-1)
#define FD_BLOCKSTORE_ERR_AGAIN   (-2)
#define FD_BLOCKSTORE_ERR_CORRUPT (-3)
#define FD_BLOCKSTORE_ERR_EMPTY   (-4)
#define FD_BLOCKSTORE_ERR_FULL    (-5)
#define FD_BLOCKSTORE_ERR_KEY     (-6)
#define FD_BLOCKSTORE_ERR_SHRED_FULL      (-1)  /* no space left for shreds */
#define FD_BLOCKSTORE_ERR_SLOT_FULL       (-2)  /* no space left for slots */
#define FD_BLOCKSTORE_ERR_SHRED_MISSING   (-4)
#define FD_BLOCKSTORE_ERR_SLOT_MISSING    (-5)
#define FD_BLOCKSTORE_ERR_SHRED_INVALID   (-7)  /* shred was invalid */
#define FD_BLOCKSTORE_ERR_DESHRED_INVALID (-8)  /* deshredded block was invalid */
#define FD_BLOCKSTORE_ERR_NO_MEM          (-9)  /* no mem */
#define FD_BLOCKSTORE_ERR_UNKNOWN         (-99)

/* fd_blockstore_strerror converts an FD_BLOCKSTORE_SUCCESS / ERR code
   into a human readable cstr. Always returns a valid cstr with
   infinite lifetime; unrecognized codes map to "unknown". */

static inline char const *
fd_blockstore_strerror( int err ) {
  switch( err ) {
  case FD_BLOCKSTORE_SUCCESS:     return "success";
  case FD_BLOCKSTORE_ERR_INVAL:   return "bad input";
  case FD_BLOCKSTORE_ERR_AGAIN:   return "try again";
  case FD_BLOCKSTORE_ERR_CORRUPT: return "corruption detected";
  case FD_BLOCKSTORE_ERR_EMPTY:   return "empty";
  case FD_BLOCKSTORE_ERR_FULL:    return "full";
  case FD_BLOCKSTORE_ERR_KEY:     return "key not found";
  default: break;
  }
  return "unknown";
}
105 :
106 : struct fd_shred_key {
107 : ulong slot;
108 : uint idx;
109 : };
110 : typedef struct fd_shred_key fd_shred_key_t;
111 :
112 : static const fd_shred_key_t fd_shred_key_null = { 0 };
113 : #define FD_SHRED_KEY_NULL fd_shred_key_null
114 : #define FD_SHRED_KEY_INVAL(key) (!((key).slot) & !((key).idx))
115 0 : #define FD_SHRED_KEY_EQ(k0,k1) (!(((k0).slot) ^ ((k1).slot))) & !(((k0).idx) ^ (((k1).idx)))
116 0 : #define FD_SHRED_KEY_HASH(key) ((uint)(((key).slot)<<15UL) | (((key).idx))) /* current max shred idx is 32KB = 2 << 15*/
117 :
118 : /* fd_buf_shred is a thin wrapper around fd_shred_t that facilitates
119 : buffering data shreds before all the shreds for a slot have been
120 : received. After all shreds are received, these buffered shreds are
121 : released back into memory pool and future queries for the shreds are
122 : offset into the block data directly.
123 :
124 : The blockstore is only aware of data shreds and all APIs involving
125 : shreds refers to data shreds.
126 :
127 : Shreds are buffered into a map as they are received:
128 :
129 : | 0 | 1 | 2 | x | x | 5 | x |
130 : ^ ^
131 : c r
132 :
133 : c = "consumed" = contiguous window starting from index 0
134 : r = "received" = highest index received so far
135 :
136 : Shred memory layout while stored in the map:
137 :
138 : | shred hdr | shred payload |
139 : */
struct __attribute__((aligned(128UL))) fd_buf_shred {
  fd_shred_key_t key;   /* map key: (slot, shred idx) */
  ulong          prev;  /* reserved for use by fd_pool_para / fd_map_chain_para */
  ulong          next;  /* reserved for use by fd_pool_para / fd_map_chain_para */
  ulong          memo;  /* reserved for use by fd_map_chain_para (presumably the
                           memoized key hash -- see template; confirm) */
  int eqvoc; /* we've seen an equivocating version of this
                shred (same key but different payload). */
  union {
    fd_shred_t hdr;                  /* shred header */
    uchar      buf[FD_SHRED_MIN_SZ]; /* the entire shred buffer, both header and payload. */
  };
};
typedef struct fd_buf_shred fd_buf_shred_t;
153 :
154 : #define POOL_NAME fd_buf_shred_pool
155 0 : #define POOL_ELE_T fd_buf_shred_t
156 : #include "../../util/tmpl/fd_pool_para.c"
157 :
158 : #define MAP_NAME fd_buf_shred_map
159 0 : #define MAP_ELE_T fd_buf_shred_t
160 0 : #define MAP_KEY_T fd_shred_key_t
161 0 : #define MAP_KEY_EQ(k0,k1) (FD_SHRED_KEY_EQ(*k0,*k1))
162 : #define MAP_KEY_EQ_IS_SLOW 1
163 0 : #define MAP_KEY_HASH(key,seed) (FD_SHRED_KEY_HASH(*key)^seed)
164 : #include "../../util/tmpl/fd_map_chain_para.c"
165 :
166 : #define DEQUE_NAME fd_slot_deque
167 0 : #define DEQUE_T ulong
168 : #include "../../util/tmpl/fd_deque_dynamic.c"
169 :
/* fd_block_shred_t is a shred that has been assembled into a block. The
   shred begins at `off` relative to the start of the block's data
   region. */
struct fd_block_shred {
  fd_shred_t hdr; /* copy of the data shred header (stored inline, not a pointer) */
  ulong      off; /* offset to the payload relative to the start of the block's data region */
};
typedef struct fd_block_shred fd_block_shred_t;
178 :
179 : /*
180 : * fd_block_entry_batch_t is a microblock/entry batch within a block.
181 : * The offset is relative to the start of the block's data region,
182 : * and indicates where the batch ends. The (exclusive) end offset of
183 : * batch i is the (inclusive) start offset of batch i+1. The 0th batch
184 : * always starts at offset 0.
185 : * On the wire, the presence of one of the COMPLETE flags in a data
186 : * shred marks the end of a batch.
187 : * In other words, batch ends are aligned with shred ends, and batch
188 : * starts are aligned with shred starts. Usually a batch comprises
189 : * multiple shreds, and a block comprises multiple batches.
190 : * This information is useful because bincode deserialization needs to
191 : * be performed on a per-batch basis. Precisely a single array of
192 : * microblocks/entries is expected to be deserialized from a batch.
193 : * Trailing bytes in each batch are ignored by default.
194 : */
struct fd_block_entry_batch {
  ulong end_off; /* exclusive end offset of this batch within the
                    block's data region (== inclusive start offset of
                    the next batch; batch 0 starts at offset 0). */
};
typedef struct fd_block_entry_batch fd_block_entry_batch_t;
199 :
200 : /* fd_block_micro_t is a microblock ("entry" in Solana parlance) within
201 : a block. The microblock begins at `off` relative to the start of the
202 : block's data region. */
struct fd_block_micro {
  ulong off; /* offset at which the microblock begins, relative to the
                start of the block's data region */
};
typedef struct fd_block_micro fd_block_micro_t;
207 :
208 : /* If the 0th bit is set, this indicates the block is preparing, which
209 : means it might be partially executed e.g. a subset of the microblocks
210 : have been executed. It is not safe to remove, relocate, or modify
211 : the block in any way at this time.
212 :
213 : Callers holding a pointer to a block should always make sure to
214 : inspect this flag.
215 :
216 : Other flags mainly provide useful metadata for read-only callers, eg.
217 : RPC. */
218 :
#define FD_BLOCK_FLAG_RECEIVING 0 /* xxxxxxx1 still receiving shreds */
#define FD_BLOCK_FLAG_COMPLETED 1 /* xxxxxx1x received the block ie. all shreds (SLOT_COMPLETE) */
#define FD_BLOCK_FLAG_REPLAYING 2 /* xxxxx1xx replay in progress (DO NOT REMOVE) */
#define FD_BLOCK_FLAG_PROCESSED 3 /* xxxx1xxx successfully replayed the block */
#define FD_BLOCK_FLAG_EQVOCSAFE 4 /* xxx1xxxx 52% of cluster has voted on this (slot, bank hash) */
#define FD_BLOCK_FLAG_CONFIRMED 5 /* xx1xxxxx 2/3 of cluster has voted on this (slot, bank hash) */
#define FD_BLOCK_FLAG_FINALIZED 6 /* x1xxxxxx 2/3 of cluster has rooted this slot */
#define FD_BLOCK_FLAG_DEADBLOCK 7 /* 1xxxxxxx failed to replay the block */
227 :
/* Rewards assigned after block is executed */

struct fd_block_rewards {
  ulong     collected_fees; /* fees collected while executing the block */
  fd_hash_t leader;         /* block leader (pubkey-sized value -- TODO confirm semantics) */
  ulong     post_balance;   /* balance after the reward was applied -- TODO confirm whose */
};
typedef struct fd_block_rewards fd_block_rewards_t;
236 :
237 : /* Remaining bits [4, 8) are reserved.
238 :
239 : To avoid confusion, please use `fd_bits.h` API
240 : ie. `fd_uchar_set_bit`, `fd_uchar_extract_bit`. */
241 :
242 : #define SET_NAME fd_block_set
243 : #define SET_MAX FD_SHRED_BLK_MAX
244 : #include "../../util/tmpl/fd_set.c"
245 :
/* fd_block_info_t is the block map entry: all per-slot metadata the
   blockstore tracks while receiving, assembling, and replaying a
   block. Concurrent access must follow the block_map query_try /
   query_test and prepare/publish patterns documented below. */

struct fd_block_info {
  ulong slot; /* map key */
  ulong next; /* reserved for use by fd_map_giant.c
                 (NOTE(review): the map template instantiated below is
                 fd_map_slot_para.c -- this comment may be stale) */

  /* Ancestry */

  ulong parent_slot;                              /* slot of this block's parent */
  ulong child_slots[FD_BLOCKSTORE_CHILD_SLOT_MAX]; /* slots of this block's children */
  ulong child_slot_cnt;                           /* # of valid entries in child_slots */

  /* Metadata */

  ulong block_height;
  fd_hash_t block_hash;  /* final PoH hash of the slot (last microblock header) */
  fd_hash_t bank_hash;   /* hash of the execution state after replaying the block */
  fd_hash_t merkle_hash; /* the last FEC set's merkle hash */
  ulong fec_cnt;         /* the number of FEC sets in the slot */
  uchar flags;           /* bit vec of FD_BLOCK_FLAG_* (see above) */
  long  ts;              /* the wallclock time when we finished receiving the block. */

  /* Windowing

     Shreds are buffered into a map as they are received:

     | 0 | 1 | 2 | x | x | 5 | x |
               ^           ^   ^
               c           b   r

     c = "consumed" = contiguous shred idxs that have been consumed.
                      the "consumer" is replay and the idx is
                      incremented after replaying each block slice.
     b = "buffered" = contiguous shred idxs that have been buffered.
                      when buffered == block_slice_end the next slice of
                      a block is ready for replay.
     r = "received" = highest shred idx received so far. used to detect
                      when repair is needed.
  */

  uint consumed_idx; /* the highest shred idx we've contiguously consumed (consecutive from 0). */
  uint buffered_idx; /* the highest shred idx we've contiguously buffered (consecutive from 0). */
  uint received_idx; /* the highest shred idx we've received (can be out-of-order). */

  uint data_complete_idx; /* the highest shred idx wrt contiguous entry batches (inclusive). */
  uint slot_complete_idx; /* the highest shred idx for the entire slot (inclusive). */

  /* This is a bit vec (fd_set) that tracks every shred idx marked with
     FD_SHRED_DATA_FLAG_DATA_COMPLETE. The bit position in the fd_set
     corresponds to the shred's index. Note shreds can be received
     out-of-order so higher bits might be set before lower bits.

     NOTE(review): FD_SHRED_BLK_MAX / sizeof(ulong) words is 8x the
     FD_SHRED_BLK_MAX / 64 words an fd_set of SET_MAX bits needs --
     possibly intentional slack; verify against fd_set.c. */

  fd_block_set_t data_complete_idxs[FD_SHRED_BLK_MAX / sizeof(ulong)];

  /* Helpers for batching tick verification */

  ulong ticks_consumed;        /* # of ticks verified so far */
  ulong tick_hash_count_accum; /* running hash count used by tick verification */
  fd_hash_t in_poh_hash;       /* TODO: might not be best place to hold this */

  /* Block */

  ulong block_gaddr; /* global address to the start of the allocated fd_block_t */
};
typedef struct fd_block_info fd_block_info_t;
309 :
310 : #define MAP_NAME fd_block_map
311 0 : #define MAP_ELE_T fd_block_info_t
312 0 : #define MAP_KEY slot
313 0 : #define MAP_ELE_IS_FREE(ctx, ele) ((ele)->slot == ULONG_MAX)
314 0 : #define MAP_ELE_FREE(ctx, ele) ((ele)->slot = ULONG_MAX)
315 0 : #define MAP_ELE_MOVE(ctx,dst,src) do { MAP_ELE_T * _src = (src); (*(dst)) = *_src; _src->MAP_KEY = (MAP_KEY_T)ULONG_MAX; } while(0)
316 0 : #define MAP_KEY_HASH(key, seed) (void)(seed), (*(key))
317 : #include "../../util/tmpl/fd_map_slot_para.c"
318 :
319 0 : #define BLOCK_INFO_LOCK_CNT 1024UL
320 : #define BLOCK_INFO_PROBE_CNT 2UL
321 : /*
322 : Rationale for block_map parameters:
323 : - each lock manages block_max / lock_cnt elements, so with block_max
324 : at 4096, each lock would manage 4 contiguous elements.
325 : - Since keys are unique and increment by 1, we can index key to map
326 : bucket by taking key % ele_max directly. This way in theory we
327 : have perfect hashing and never need to probe.
328 : - This breaks when we store more than 4096 contiguous slots,
329 : i.e.: slot 0 collides with slot 4096, but this is at heart an
330 : OOM issue.
331 : - Causes possible contention - consider if we execute n, but are
332 : storing shreds for n+1 -- these are managed by the same lock.
333 : Perhaps opportunity for optimization.
334 : */
335 :
/* fd_block_idx is an in-memory index of finalized blocks that have been
   archived to disk. It records the slot together with the byte offset
   relative to the start of the file. */

struct fd_block_idx {
  ulong     slot;       /* map key: slot number */
  ulong     next;       /* reserved for use by fd_map_dynamic.c */
  uint      hash;       /* reserved for use by fd_map_dynamic.c (presumably the
                           memoized key hash -- confirm against template) */
  ulong     off;        /* byte offset of this block within the archival file */
  fd_hash_t block_hash; /* copy of the slot's block (final PoH) hash */
  fd_hash_t bank_hash;  /* copy of the slot's bank hash */
};
typedef struct fd_block_idx fd_block_idx_t;
349 :
350 : #define MAP_NAME fd_block_idx
351 0 : #define MAP_T fd_block_idx_t
352 0 : #define MAP_KEY slot
353 0 : #define MAP_KEY_HASH(key) ((uint)(key)) /* finalized slots are guaranteed to be unique so perfect hashing */
354 0 : #define MAP_KEY_INVAL(k) (k == ULONG_MAX)
355 : #include "../../util/tmpl/fd_map_dynamic.c"
356 :
/* fd_blockstore_archiver outlines the format of metadata
   at the start of an archive file - needed so that archive
   files can be read back on initialization. */

struct fd_blockstore_archiver {
  ulong fd_size_max; /* maximum size of the archival file */
  ulong num_blocks;  /* number of blocks in the archival file. needed for reading back */
  ulong head;        /* location of least recently written block */
  ulong tail;        /* location after most recently written block */
};
typedef struct fd_blockstore_archiver fd_blockstore_archiver_t;
/* Archived blocks begin immediately after this metadata header. */
#define FD_BLOCKSTORE_ARCHIVE_START sizeof(fd_blockstore_archiver_t)
369 :
370 : /* CONCURRENCY NOTES FOR BLOCKSTORE ENJOINERS:
371 :
372 : With the parallelization of the shred map and block map, parts of the
373 : blockstore are concurrent, and parts are not. Block map and shred map
374 : have their own locks, which are managed through the
375 : query_try/query_test APIs. When accessing buf_shred_t and
376 : block_info_t items then, the caller does not need to use
377 : blockstore_start/end_read/write. However, the
378 : blockstore_start/end_read/write still protects the blockstore_shmem_t
379 : object. If you are reading and writing any blockstore_shmem fields
380 : and at the same time accessing the block_info_t or buf_shred_t, you
381 : should call both the blockstore_start/end_read/write APIs AND the map
382 : query_try/test APIs. These are locks of separate concerns and will
383 : not deadlock with each other. TODO update docs when we switch to
384 : fenced read/write for primitive fields in shmem_t. */
/* fd_blockstore_shmem_t is the shared-memory (wksp-resident) portion of
   the blockstore, shared across tiles. Reads/writes of these fields
   must follow the concurrency notes above. */
struct __attribute__((aligned(FD_BLOCKSTORE_ALIGN))) fd_blockstore_shmem {

  /* Metadata */

  ulong magic;            /* == FD_BLOCKSTORE_MAGIC iff this region is a formatted blockstore */
  ulong blockstore_gaddr; /* wksp gaddr of this shmem region (lets joins recover the wksp laddr) */
  ulong wksp_tag;         /* wksp allocation tag used for the blockstore's allocations */
  ulong seed;             /* hash seed for the blockstore's hash functions */

  /* Persistence */

  fd_blockstore_archiver_t archiver;
  ulong mrw_slot; /* most recently written slot */

  /* Slot metadata */

  ulong lps; /* latest processed slot */
  ulong hcs; /* highest confirmed slot */
  ulong wmk; /* watermark. DO NOT MODIFY DIRECTLY. */

  /* Config limits */

  ulong shred_max; /* maximum # of shreds that can be held in memory */
  ulong block_max; /* maximum # of blocks that can be held in memory */
  ulong idx_max;   /* maximum # of blocks that can be indexed from the archival file */
  ulong alloc_max; /* maximum bytes that can be allocated */

  //ulong block_map_gaddr;  /* map of slot->(slot_meta, block) */
  ulong block_idx_gaddr;  /* map of slot->byte offset in archival file */
  ulong slot_deque_gaddr; /* deque of slot numbers */

  ulong alloc_gaddr; /* gaddr of the blockstore's fd_alloc */
};
typedef struct fd_blockstore_shmem fd_blockstore_shmem_t;
419 :
/* fd_blockstore_t is a local join to the blockstore. This is specific
   to the local address space and should not be shared across tiles. */

struct fd_blockstore {

  /* shared memory region */

  fd_blockstore_shmem_t * shmem; /* read/writes to shmem must call fd_blockstore_start_read()*/

  /* local join handles */

  fd_buf_shred_pool_t shred_pool[1]; /* local join to the buffered shred pool */
  fd_buf_shred_map_t  shred_map[1];  /* local join to the buffered shred map */
  fd_block_map_t      block_map[1];  /* local join to the block info map */
};
typedef struct fd_blockstore fd_blockstore_t;
436 :
437 : FD_PROTOTYPES_BEGIN
438 :
439 : /* Construction API */
440 :
/* fd_blockstore_align returns the byte alignment required of the
   memory region backing a blockstore (FD_BLOCKSTORE_ALIGN). */

FD_FN_CONST static inline ulong
fd_blockstore_align( void ) {
  return FD_BLOCKSTORE_ALIGN;
}
445 :
446 : /* fd_blockstore_footprint returns the footprint of the entire
447 : blockstore shared memory region occupied by `fd_blockstore_shmem_t`
448 : including data structures. */
449 :
450 : FD_FN_CONST static inline ulong
451 0 : fd_blockstore_footprint( ulong shred_max, ulong block_max, ulong idx_max ) {
452 : /* TODO -- when removing, make change in fd_blockstore_new as well */
453 0 : block_max = fd_ulong_pow2_up( block_max );
454 0 : ulong lock_cnt = fd_ulong_min( block_max, BLOCK_INFO_LOCK_CNT );
455 :
456 0 : int lg_idx_max = fd_ulong_find_msb( fd_ulong_pow2_up( idx_max ) );
457 0 : return FD_LAYOUT_FINI(
458 0 : FD_LAYOUT_APPEND(
459 0 : FD_LAYOUT_APPEND(
460 0 : FD_LAYOUT_APPEND(
461 0 : FD_LAYOUT_APPEND(
462 0 : FD_LAYOUT_APPEND(
463 0 : FD_LAYOUT_APPEND(
464 0 : FD_LAYOUT_APPEND(
465 0 : FD_LAYOUT_APPEND(
466 0 : FD_LAYOUT_APPEND(
467 0 : FD_LAYOUT_INIT,
468 0 : alignof(fd_blockstore_shmem_t), sizeof(fd_blockstore_shmem_t) ),
469 0 : alignof(fd_buf_shred_t), sizeof(fd_buf_shred_t) * shred_max ),
470 0 : fd_buf_shred_pool_align(), fd_buf_shred_pool_footprint() ),
471 0 : fd_buf_shred_map_align(), fd_buf_shred_map_footprint( shred_max ) ),
472 0 : alignof(fd_block_info_t), sizeof(fd_block_info_t) * block_max ),
473 0 : fd_block_map_align(), fd_block_map_footprint( block_max, lock_cnt, BLOCK_INFO_PROBE_CNT ) ),
474 0 : fd_block_idx_align(), fd_block_idx_footprint( lg_idx_max ) ),
475 0 : fd_slot_deque_align(), fd_slot_deque_footprint( block_max ) ),
476 0 : fd_alloc_align(), fd_alloc_footprint() ),
477 0 : fd_blockstore_align() );
478 0 : }
479 :
480 : /* fd_blockstore_new formats a memory region with the appropriate
481 : alignment and footprint into a blockstore. shmem points in the
482 : caller's address space of the memory region to format. Returns shmem
483 : on success (blockstore has ownership of the memory region) and NULL
484 : on failure (no changes, logs details). Caller is not joined on
485 : return. The blockstore will be empty and unlocked. */
486 :
487 : void *
488 : fd_blockstore_new( void * shmem,
489 : ulong wksp_tag,
490 : ulong seed,
491 : ulong shred_max,
492 : ulong block_max,
493 : ulong idx_max );
494 :
495 : /* fd_blockstore_join joins a blockstore. ljoin points to a
496 : fd_blockstore_t compatible memory region in the caller's address
497 : space used to hold info about the local join, shblockstore points in
498 : the caller's address space to the memory region containing the
499 : blockstore. Returns a handle to the caller's local join on success
500 : (join has ownership of the ljoin region) and NULL on failure (no
501 : changes, logs details). */
502 :
503 : fd_blockstore_t *
504 : fd_blockstore_join( void * ljoin, void * shblockstore );
505 :
506 : void *
507 : fd_blockstore_leave( fd_blockstore_t * blockstore );
508 :
509 : void *
510 : fd_blockstore_delete( void * shblockstore );
511 :
512 : /* fd_blockstore_init initializes a blockstore with the given
513 : `slot_bank`. This bank is used for initializing fields (SMR, etc.),
514 : and should be the bank upon finishing a snapshot load if booting from
515 : a snapshot, genesis bank otherwise. It is also used to "fake" the
516 : snapshot block as if that block's data were available. The metadata
517 : for this block's slot will be populated (fd_block_map_t) but the
518 : actual block data (fd_block_t) won't exist. This is done to bootstrap
519 : the various components for live replay (turbine, repair, etc.)
520 :
521 : `fd` is a file descriptor for the blockstore archival file. As part
522 : of `init`, blockstore rebuilds an in-memory index of the archival
523 : file. */
524 :
525 : fd_blockstore_t *
526 : fd_blockstore_init( fd_blockstore_t * blockstore,
527 : int fd,
528 : ulong fd_size_max,
529 : ulong slot );
530 :
531 : /* fd_blockstore_fini finalizes a blockstore.
532 :
533 : IMPORTANT! Caller MUST hold the read lock when calling this
534 : function. */
535 :
536 : void
537 : fd_blockstore_fini( fd_blockstore_t * blockstore );
538 :
539 : /* Accessors */
540 :
541 : /* fd_blockstore_wksp returns the local join to the wksp backing the
542 : blockstore. The lifetime of the returned pointer is at least as long
543 : as the lifetime of the local join. Assumes blockstore is a current
544 : local join. */
545 :
546 : FD_FN_PURE static inline fd_wksp_t *
547 0 : fd_blockstore_wksp( fd_blockstore_t * blockstore ) {
548 0 : return (fd_wksp_t *)( ( (ulong)blockstore->shmem ) - blockstore->shmem->blockstore_gaddr );
549 0 : }
550 :
551 : /* fd_blockstore_wksp_tag returns the workspace allocation tag used by
552 : the blockstore for its wksp allocations. Will be positive. Assumes
553 : blockstore is a current local join. */
554 :
555 : FD_FN_PURE static inline ulong
556 0 : fd_blockstore_wksp_tag( fd_blockstore_t const * blockstore ) {
557 0 : return blockstore->shmem->wksp_tag;
558 0 : }
559 :
560 : /* fd_blockstore_seed returns the hash seed used by the blockstore for various hash
561 : functions. Arbitrary value. Assumes blockstore is a current local join.
562 : TODO: consider renaming hash_seed? */
563 : FD_FN_PURE static inline ulong
564 0 : fd_blockstore_seed( fd_blockstore_t const * blockstore ) {
565 0 : return blockstore->shmem->seed;
566 0 : }
567 :
568 : /* fd_block_idx returns a pointer in the caller's address space to the
569 : fd_block_idx_t in the blockstore wksp. Assumes blockstore is local
570 : join. Lifetime of the returned pointer is that of the local join. */
571 :
572 : FD_FN_PURE static inline fd_block_idx_t *
573 0 : fd_blockstore_block_idx( fd_blockstore_t * blockstore ) {
574 0 : return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore ), blockstore->shmem->block_idx_gaddr );
575 0 : }
576 :
577 : /* fd_slot_deque returns a pointer in the caller's address space to the
578 : fd_slot_deque_t in the blockstore wksp. Assumes blockstore is local
579 : join. Lifetime of the returned pointer is that of the local join. */
580 :
581 : FD_FN_PURE static inline ulong *
582 0 : fd_blockstore_slot_deque( fd_blockstore_t * blockstore ) {
583 0 : return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore), blockstore->shmem->slot_deque_gaddr );
584 0 : }
585 :
586 : /* fd_blockstore_alloc returns a pointer in the caller's address space to
587 : the blockstore's allocator. */
588 :
589 : FD_FN_PURE static inline fd_alloc_t * /* Lifetime is that of the local join */
590 0 : fd_blockstore_alloc( fd_blockstore_t * blockstore ) {
591 0 : return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore), blockstore->shmem->alloc_gaddr );
592 0 : }
593 :
594 : /* fd_blockstore_shred_test returns 1 if a shred keyed by (slot, idx) is
595 : already in the blockstore and 0 otherwise. */
596 :
597 : int
598 : fd_blockstore_shred_test( fd_blockstore_t * blockstore, ulong slot, uint idx );
599 :
600 : /* fd_buf_shred_query_copy_data queries the blockstore for shred at
601 : slot, shred_idx. Copies the shred data to the given buffer and
602 : returns the data size. Returns -1 on failure.
603 :
604 : IMPORTANT! Caller MUST hold the read lock when calling this
605 : function. */
606 :
607 : long
608 : fd_buf_shred_query_copy_data( fd_blockstore_t * blockstore,
609 : ulong slot,
610 : uint shred_idx,
611 : void * buf,
612 : ulong buf_max );
613 :
/* fd_blockstore_block_hash_query performs a blocking query (concurrent
   writers are not blocked) for the block hash of slot. Returns
   FD_BLOCKSTORE_SUCCESS on success and FD_BLOCKSTORE_ERR_KEY if slot is
   not in blockstore; these are the only possible return values. On
   success, a copy of the block hash will be populated in `block_hash`.
   Retains no interest in `slot` or `block_hash`.
620 :
621 : The block hash is the final poh hash for a slot and available on the
622 : last microblock header. */
623 :
624 : int
625 : fd_blockstore_block_hash_query( fd_blockstore_t * blockstore, ulong slot, fd_hash_t * block_hash );
626 :
/* fd_blockstore_bank_hash_query performs a blocking query (concurrent
   writers are not blocked) for the bank hash of slot. Returns
   FD_BLOCKSTORE_SUCCESS on success and FD_BLOCKSTORE_ERR_KEY if slot is
   not in blockstore; these are the only possible return values. On
   success, a copy of the bank hash will be populated in `bank_hash`.
   Retains no interest in `slot` or `bank_hash`.
633 :
634 : The bank hash is a hash of the execution state (the "bank") after
635 : executing the block for a given slot. */
636 :
637 : int
638 : fd_blockstore_bank_hash_query( fd_blockstore_t * blockstore, ulong slot, fd_hash_t * bank_hash );
639 :
640 : /* fd_blockstore_block_map_query queries the blockstore for the block
641 : map entry at slot. Returns a pointer to the slot meta or NULL if not
642 : in blockstore.
643 :
644 : IMPORTANT! This should only be used for single-threaded / offline
645 : use-cases as it does not test the query. Read notes below for
646 : block_map usage in live. */
647 :
648 : fd_block_info_t *
649 : fd_blockstore_block_map_query( fd_blockstore_t * blockstore, ulong slot );
650 :
651 : /* IMPORTANT! NOTES FOR block_map USAGE:
652 :
653 : The block_info entries must be queried using the query_try/query_test
654 : pattern. This will frequently look like:
655 :
656 : int err = FD_MAP_ERR_AGAIN;
657 : loop while( err == FD_MAP_ERR_AGAIN )
658 : block_map_query_t query;
659 : err = fd_block_map_query_try( nonblocking );
660 : block_info_t * ele = fd_block_map_query_ele(query);
661 : if ERROR is FD_MAP_ERR_KEY, then the slot is not found.
662 : if ERROR is FD_MAP_ERR_AGAIN, then immediately continue.
663 : // important to handle ALL possible return err codes *before*
664 : // accessing the ele, as the ele will be the sentinel (usually NULL)
665 : speculatively execute <stuff>
666 : - no side effects
667 : - no early return
668 : err = fd_block_map_query_test(query)
669 : end loop
670 :
671 : Some accessors are provided to callers that already do this pattern,
672 : and handle the looping querying. For example, block_hash_copy, and
673 : parent_slot_query. However, for most caller use cases, it would be
674 : much more effecient to use the query_try/query_test pattern directly.
675 :
676 : Example: if you are accessing a block_info_t m, and m->parent_slot to
677 : the blockstore->shmem->smr, then you will need to start_write on the
678 : blockstore, query_try for the block_info_t object, set
679 : shmem->smr = meta->parent_slot, and then query_test, AND call
680 : blockstore_end_write. In the case that there's block_info contention,
681 : i.e. another thread is removing the block_info_t object of interest
682 : as we are trying to access it, the query_test will ERR_AGAIN, we will
683 : loop back and try again, hit the FD_MAP_ERR_KEY condition
684 : (and exit the loop gracefully), and we will have an incorrectly set
685 : shmem->smr.
686 :
687 : So depending on the complexity of what's being executed, it's easiest
688 : to directly copy what you need from the block_info_t into a variable
689 : outside the context of the loop, and use it further below, ex:
690 :
691 : ulong map_item = NULL_ITEM;
692 : loop {
693 : query_try
694 : map_item = ele->map_item; // like parent_slot
695 : query_test
696 : }
697 : check if map_item is NULL_ITEM
698 : fd_blockstore_start_write
699 : use map_item
700 : fd_blockstore_end_write
701 :
702 : Writes and updates (blocking). The pattern is:
703 : int err = fd_block_map_prepare( &slot, query, blocking );
704 : block_info_t * ele = fd_block_map_query_ele(query);
705 :
706 : IF slot was an existing key, then ele->slot == slot, and you are MODIFYING
707 : <modify ele>
708 : If slot was not an existing key, then ele->slot == 0, and you are INSERTING
709 : ele->slot = slot;
710 : <initialize ele>
711 :
712 : fd_block_map_publish(query); // will always succeed */
713 :
714 : /* fd_blockstore_parent_slot_query queries the parent slot of slot.
715 :
716 : This is non-blocking. */
717 : ulong
718 : fd_blockstore_parent_slot_query( fd_blockstore_t * blockstore, ulong slot );
719 :
/* fd_blockstore_block_map_query_volatile is the same as
   fd_blockstore_block_map_query except it only copies out the metadata
   (fd_block_info_t). Returns FD_BLOCKSTORE_ERR_SLOT_MISSING if slot is
   missing, otherwise FD_BLOCKSTORE_SUCCESS. */
724 :
725 : int
726 : fd_blockstore_block_map_query_volatile( fd_blockstore_t * blockstore,
727 : int fd,
728 : ulong slot,
729 : fd_block_info_t * block_info_out ) ;
730 :
731 : /* fd_blockstore_block_info_test tests if a block meta entry exists for
732 : the given slot. Returns 1 if the entry exists and 0 otherwise.
733 :
734 : IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
735 : this function. */
736 : int
737 : fd_blockstore_block_info_test( fd_blockstore_t * blockstore, ulong slot );
738 :
739 : /* fd_blockstore_block_info_remove removes a block meta entry for
740 : the given slot. Returns SUCCESS if the entry exists and an
741 : error code otherwise.
742 :
743 : IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
744 : this function. */
745 : int
746 : fd_blockstore_block_info_remove( fd_blockstore_t * blockstore, ulong slot );
747 :
748 : /* fd_blockstore_slot_remove removes slot from blockstore, including all
749 : relevant internal structures.
750 :
751 : IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
752 : this function. */
753 : void
754 : fd_blockstore_slot_remove( fd_blockstore_t * blockstore, ulong slot );
755 :
756 : /* Operations */
757 :
758 : /* fd_blockstore_shred_insert inserts shred into the blockstore, fast
759 : O(1). NOTE(review): the declaration below returns void, so no
760 : `consumed_idx` is returned to the caller (an earlier revision of this
761 : comment claimed one). Failure cases, handled internally, include this
762 : shred already being in the blockstore or the blockstore being full.
763 :
764 : fd_blockstore_shred_insert will manage locking, so the caller
765 : should NOT be acquiring the blockstore read/write lock before
766 : calling this function. */
767 :
768 : void
769 : fd_blockstore_shred_insert( fd_blockstore_t * blockstore, fd_shred_t const * shred );
770 :
771 : /* fd_blockstore_shred_remove removes the buffered (unassembled) shred
772 : at index idx for slot. NOTE(review): an older comment said "all the
unassembled shreds for a slot"; the idx parameter suggests a single
shred -- confirm against the implementation. */
773 : void
774 : fd_blockstore_shred_remove( fd_blockstore_t * blockstore, ulong slot, uint idx );
775 :
776 : /* fd_blockstore_slice_query queries for the block slice beginning from
777 : shred `start_idx`, ending at `end_idx`, inclusive. Validates start
778 : and end_idx as valid batch boundaries. Copies at most `max` bytes of
779 : the shred payloads, and returns FD_BLOCKSTORE_NO_MEM if the buffer is
780 : too small.
781 :
782 : Returns FD_BLOCKSTORE_SUCCESS (0) on success and a FD_MAP_ERR
783 : (negative) on failure. On success, `buf` will be populated with the
784 : copied slice and `buf_sz` will contain the number of bytes copied.
785 : Caller must ignore the values of `buf` and `buf_sz` on failure.
786 :
787 : Implementation is lockfree and safe with concurrent operations on
788 : blockstore. */
789 :
790 : int
791 : fd_blockstore_slice_query( fd_blockstore_t * blockstore,
792 : ulong slot,
793 : uint start_idx,
794 : uint end_idx,
795 : ulong max,
796 : uchar * buf,
797 : ulong * buf_sz );
798 :
799 : /* fd_blockstore_shreds_complete should be used wherever code queries
800 : for an fd_block_t * only to test existence, without using the block
801 : data. It is semantically equivalent to query_block( slot ) != NULL.
802 :
803 : Implementation is lockfree and safe with concurrent operations on
804 : blockstore. */
805 : int
806 : fd_blockstore_shreds_complete( fd_blockstore_t * blockstore, ulong slot );
807 :
808 : /* fd_blockstore_block_height_update sets the block height.
809 :
810 : IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
811 : this function. */
812 : void
813 : fd_blockstore_block_height_update( fd_blockstore_t * blockstore, ulong slot, ulong block_height );
814 :
815 : ulong
816 : fd_blockstore_block_height_query( fd_blockstore_t * blockstore, ulong slot );
817 :
818 : /* fd_blockstore_publish publishes all blocks up to the watermark `wmk`
819 : (NOTE(review): an older comment said `blockstore->smr`; the declaration
820 : takes `wmk` explicitly -- confirm the two coincide). Publishing entails
821 : 1. pruning and 2. archiving. Pruning removes any blocks not on the
822 : same fork as the watermark (hence the name, like pruning the branches
823 : of a tree). Archiving removes from memory any slots < wmk on the same
824 : fork, writing those blocks to the archival file via descriptor `fd`.
825 :
826 : Note that slots < smr are ancestors of the smr, and are therefore
827 : finalized slots which is why they are archived. Blocks removed as a
828 : result of pruning are not finalized, and therefore not archived.
829 :
830 : IMPORTANT! Caller MUST hold the write lock when calling this
831 : function. */
832 :
833 : void
834 : fd_blockstore_publish( fd_blockstore_t * blockstore, int fd, ulong wmk );
835 :
836 : void
837 : fd_blockstore_log_block_status( fd_blockstore_t * blockstore, ulong around_slot );
838 :
839 : /* fd_blockstore_log_mem_usage logs the memory usage of blockstore in a
840 : human-readable format. Caller MUST hold the read lock. */
841 :
842 : void
843 : fd_blockstore_log_mem_usage( fd_blockstore_t * blockstore );
844 :
845 : FD_PROTOTYPES_END
846 :
847 : #ifndef BLOCK_ARCHIVING
848 : #define BLOCK_ARCHIVING 0
849 : #endif
850 :
851 : #endif /* HEADER_fd_src_flamenco_runtime_fd_blockstore_h */
|