Line data Source code
1 : #ifndef HEADER_fd_src_flamenco_runtime_fd_blockstore_h
2 : #define HEADER_fd_src_flamenco_runtime_fd_blockstore_h
3 :
4 : /* Blockstore is a high-performance database for in-memory indexing and
5 : durably storing blocks.
6 :
7 : `fd_blockstore` defines a number of useful types e.g. `fd_block_t`,
8 : `fd_block_shred`, etc.
9 :
10 : The blockstore alloc is used for allocating wksp resources for shred
11 : headers, microblock headers, and blocks. This is an fd_alloc.
12 : Allocations from this allocator will be tagged with wksp_tag and
13 : operations on this allocator will use concurrency group 0. */
14 :
15 : #include "../../ballet/block/fd_microblock.h"
16 : #include "../../ballet/shred/fd_deshredder.h"
17 : #include "../../ballet/shred/fd_shred.h"
18 : #include "../fd_flamenco_base.h"
19 : #include "../types/fd_types.h"
20 : #include "fd_rwseq_lock.h"
21 : #include "stdbool.h"
22 : #include <fcntl.h>
23 :
24 : /* FD_BLOCKSTORE_ALIGN specifies the alignment needed for blockstore.
25 : ALIGN is double x86 cache line to mitigate various kinds of false
26 : sharing (eg. ACLPF adjacent cache line prefetch). */
27 :
28 0 : #define FD_BLOCKSTORE_ALIGN (128UL)
29 :
30 : /* FD_BLOCKSTORE_MAGIC defines a magic number for verifying the memory
31 : of blockstore is not corrupted. */
32 :
33 0 : #define FD_BLOCKSTORE_MAGIC (0xf17eda2ce7b10c00UL) /* firedancer bloc version 0 */
34 :
35 : /* DO NOT MODIFY. */
36 : // #define FD_BUF_SHRED_MAP_MAX (1UL << 24UL) /* 16 million shreds can be buffered */
37 : // #define FD_TXN_MAP_LG_MAX (24) /* 16 million txns can be stored in the txn map */
38 :
39 : /* TODO this can be removed if we explicitly manage a memory pool for
40 : the fd_block_map_t entries */
41 0 : #define FD_BLOCKSTORE_CHILD_SLOT_MAX (32UL) /* the maximum # of children a slot can have */
42 0 : #define FD_BLOCKSTORE_ARCHIVE_MIN_SIZE (1UL << 26UL) /* 64MB := ceil(MAX_DATA_SHREDS_PER_SLOT*1228) */
43 :
44 : /* FD_SLICE_ALIGN specifies the alignment needed for a block slice.
45 : ALIGN is double x86 cache line to mitigate various kinds of false
46 : sharing (eg. ACLPF adjacent cache line prefetch). */
47 :
48 : #define FD_SLICE_ALIGN (128UL)
49 :
50 : /* FD_SLICE_MAX specifies the maximum size of an entry batch. This is
51 : equivalent to the maximum size of a block (ie. a block with a single
52 : entry batch). */
53 :
54 0 : #define FD_SLICE_MAX (FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT)
55 :
56 : /* 64 ticks per slot, and then one min size transaction per microblock
57 : for all the remaining microblocks.
58 : This bound should be used along with the transaction parser and tick
59 : verifier to enforce the assumptions.
60 : This is NOT a standalone conservative bound against malicious
61 : validators.
62 : A tighter bound could probably be derived if necessary. */
63 :
64 0 : #define FD_MICROBLOCK_MAX_PER_SLOT ((FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT - 64UL*sizeof(fd_microblock_hdr_t)) / (sizeof(fd_microblock_hdr_t)+FD_TXN_MIN_SERIALIZED_SZ) + 64UL) /* 200,796 */
65 : /* 64 ticks per slot, and a single gigantic microblock containing min
66 : size transactions. */
67 0 : #define FD_TXN_MAX_PER_SLOT ((FD_SHRED_DATA_PAYLOAD_MAX_PER_SLOT - 65UL*sizeof(fd_microblock_hdr_t)) / (FD_TXN_MIN_SERIALIZED_SZ)) /* 272,635 */
68 :
69 : // TODO centralize these
70 : // https://github.com/firedancer-io/solana/blob/v1.17.5/sdk/program/src/clock.rs#L34
71 : #define FD_MS_PER_TICK 6
72 :
73 : // https://github.com/firedancer-io/solana/blob/v1.17.5/core/src/repair/repair_service.rs#L55
74 : #define FD_REPAIR_TIMEOUT (200 / FD_MS_PER_TICK)
75 :
/* Blockstore return / error codes.

   NOTE(review): there are two overlapping families here.  The
   FD_BLOCKSTORE_ERR_{SHRED,SLOT,TXN,...}_* codes below reuse the
   numeric values of the generic codes above (eg. ERR_SHRED_FULL ==
   ERR_INVAL == -1), and fd_blockstore_strerror only decodes the
   generic family.  Consider unifying. */

#define FD_BLOCKSTORE_SUCCESS 0
#define FD_BLOCKSTORE_SUCCESS_SLOT_COMPLETE 1
#define FD_BLOCKSTORE_ERR_INVAL (-1)
#define FD_BLOCKSTORE_ERR_AGAIN (-2)
#define FD_BLOCKSTORE_ERR_CORRUPT (-3)
#define FD_BLOCKSTORE_ERR_EMPTY (-4)
#define FD_BLOCKSTORE_ERR_FULL (-5)
#define FD_BLOCKSTORE_ERR_KEY (-6)
#define FD_BLOCKSTORE_ERR_SHRED_FULL -1 /* no space left for shreds */
#define FD_BLOCKSTORE_ERR_SLOT_FULL -2 /* no space left for slots */
#define FD_BLOCKSTORE_ERR_TXN_FULL -3 /* no space left for txns */
#define FD_BLOCKSTORE_ERR_SHRED_MISSING -4 /* shred not found */
#define FD_BLOCKSTORE_ERR_SLOT_MISSING -5 /* slot not found */
#define FD_BLOCKSTORE_ERR_TXN_MISSING -6 /* txn not found */
#define FD_BLOCKSTORE_ERR_SHRED_INVALID -7 /* shred was invalid */
#define FD_BLOCKSTORE_ERR_DESHRED_INVALID -8 /* deshredded block was invalid */
#define FD_BLOCKSTORE_ERR_NO_MEM -9 /* no mem */
#define FD_BLOCKSTORE_ERR_UNKNOWN -99 /* catch-all */
94 :
95 0 : static inline char const * fd_blockstore_strerror( int err ) {
96 0 : switch( err ) {
97 0 : case FD_BLOCKSTORE_SUCCESS: return "success";
98 0 : case FD_BLOCKSTORE_ERR_INVAL: return "bad input";
99 0 : case FD_BLOCKSTORE_ERR_AGAIN: return "try again";
100 0 : case FD_BLOCKSTORE_ERR_CORRUPT: return "corruption detected";
101 0 : case FD_BLOCKSTORE_ERR_EMPTY: return "empty";
102 0 : case FD_BLOCKSTORE_ERR_FULL: return "full";
103 0 : case FD_BLOCKSTORE_ERR_KEY: return "key not found";
104 0 : default: break;
105 0 : }
106 0 : return "unknown";
107 0 : }
108 :
109 : struct fd_shred_key {
110 : ulong slot;
111 : uint idx;
112 : };
113 : typedef struct fd_shred_key fd_shred_key_t;
114 :
115 : static const fd_shred_key_t fd_shred_key_null = { 0 };
116 : #define FD_SHRED_KEY_NULL fd_shred_key_null
117 : #define FD_SHRED_KEY_INVAL(key) (!((key).slot) & !((key).idx))
118 0 : #define FD_SHRED_KEY_EQ(k0,k1) (!(((k0).slot) ^ ((k1).slot))) & !(((k0).idx) ^ (((k1).idx)))
119 0 : #define FD_SHRED_KEY_HASH(key) ((uint)(((key).slot)<<15UL) | (((key).idx))) /* current max shred idx is 32KB = 2 << 15*/
120 :
121 :
122 :
123 : /* fd_buf_shred is a thin wrapper around fd_shred_t that facilitates
124 : buffering data shreds before all the shreds for a slot have been
125 : received. After all shreds are received, these buffered shreds are
126 : released back into memory pool and future queries for the shreds are
127 : offset into the block data directly.
128 :
129 : The blockstore is only aware of data shreds and all APIs involving
130 : shreds refers to data shreds.
131 :
132 : Shreds are buffered into a map as they are received:
133 :
134 : | 0 | 1 | 2 | x | x | 5 | x |
135 : ^ ^
136 : c r
137 :
138 : c = "consumed" = contiguous window starting from index 0
139 : r = "received" = highest index received so far
140 :
141 : Shred memory layout while stored in the map:
142 :
143 : | shred hdr | shred payload |
144 : */
struct __attribute__((aligned(128UL))) fd_buf_shred {
  fd_shred_key_t key;  /* map key: (slot, data shred idx) */
  ulong prev;          /* reserved for the fd_buf_shred_pool / fd_buf_shred_map templates -- do not touch */
  ulong next;          /* reserved for the fd_buf_shred_pool / fd_buf_shred_map templates -- do not touch */
  ulong memo;          /* reserved for fd_buf_shred_map (presumably a cached key hash -- TODO confirm) */
  int eqvoc; /* we've seen an equivocating version of this
                shred (same key but different payload). */
  union {
    fd_shred_t hdr;             /* shred header */
    uchar buf[FD_SHRED_MIN_SZ]; /* the entire shred buffer, both header and payload. */
  };
};
typedef struct fd_buf_shred fd_buf_shred_t;
158 :
/* fd_buf_shred_pool: concurrent element pool of fd_buf_shred_t (API
   generated by fd_pool_para.c). */
#define POOL_NAME fd_buf_shred_pool
#define POOL_ELE_T fd_buf_shred_t
#include "../../util/tmpl/fd_pool_para.c"

/* fd_buf_shred_map: concurrent map of fd_shred_key_t -> fd_buf_shred_t
   (API generated by fd_map_para.c).  Note the generated code wraps
   FD_SHRED_KEY_EQ / FD_SHRED_KEY_HASH in its own parentheses. */
#define MAP_NAME fd_buf_shred_map
#define MAP_ELE_T fd_buf_shred_t
#define MAP_KEY_T fd_shred_key_t
#define MAP_KEY_EQ(k0,k1) (FD_SHRED_KEY_EQ(*k0,*k1))
#define MAP_KEY_EQ_IS_SLOW 1
#define MAP_KEY_HASH(key,seed) (FD_SHRED_KEY_HASH(*key)^seed)
#include "../../util/tmpl/fd_map_para.c"

/* fd_slot_deque: dynamically sized deque of slot numbers (API
   generated by fd_deque_dynamic.c). */
#define DEQUE_NAME fd_slot_deque
#define DEQUE_T ulong
#include "../../util/tmpl/fd_deque_dynamic.c"
174 :
175 : /* fd_block_shred_t is a shred that has been assembled into a block. The
176 : shred begins at `off` relative to the start of the block's data
177 : region. */
struct fd_block_shred {
  fd_shred_t hdr; /* the data shred header (stored inline by value, not a pointer) */
  ulong off;      /* offset to the payload relative to the start of the block's data region */
};
typedef struct fd_block_shred fd_block_shred_t;
183 :
184 : /*
185 : * fd_block_entry_batch_t is a microblock/entry batch within a block.
186 : * The offset is relative to the start of the block's data region,
187 : * and indicates where the batch ends. The (exclusive) end offset of
188 : * batch i is the (inclusive) start offset of batch i+1. The 0th batch
189 : * always starts at offset 0.
190 : * On the wire, the presence of one of the COMPLETE flags in a data
191 : * shred marks the end of a batch.
192 : * In other words, batch ends are aligned with shred ends, and batch
193 : * starts are aligned with shred starts. Usually a batch comprises
194 : * multiple shreds, and a block comprises multiple batches.
195 : * This information is useful because bincode deserialization needs to
196 : * be performed on a per-batch basis. Precisely a single array of
197 : * microblocks/entries is expected to be deserialized from a batch.
198 : * Trailing bytes in each batch are ignored by default.
199 : */
struct fd_block_entry_batch {
  ulong end_off; /* exclusive end offset of this batch, relative to the start of the block's data region */
};
typedef struct fd_block_entry_batch fd_block_entry_batch_t;

/* fd_block_micro_t is a microblock ("entry" in Solana parlance) within
   a block. The microblock begins at `off` relative to the start of the
   block's data region. */
struct fd_block_micro {
  ulong off; /* offset into block data */
};
typedef struct fd_block_micro fd_block_micro_t;

/* fd_block_txn_t is a transaction that has been parsed and is part of a
   block. The transaction begins at `off` relative to the start of the
   block's data region. */
struct fd_block_txn {
  ulong txn_off; /* offset into block data of transaction */
  ulong id_off;  /* offset into block data of transaction identifiers */
  ulong sz;      /* transaction size in bytes -- TODO confirm serialized vs parsed */
};
typedef struct fd_block_txn fd_block_txn_t;
222 :
223 : /* If the 0th bit is set, this indicates the block is preparing, which
224 : means it might be partially executed e.g. a subset of the microblocks
225 : have been executed. It is not safe to remove, relocate, or modify
226 : the block in any way at this time.
227 :
228 : Callers holding a pointer to a block should always make sure to
229 : inspect this flag.
230 :
231 : Other flags mainly provide useful metadata for read-only callers, eg.
232 : RPC. */
233 :
/* These are bit *indices* (for use with fd_uchar_set_bit /
   fd_uchar_extract_bit per the note below), not masks.  Bit diagrams
   corrected to match each index. */
#define FD_BLOCK_FLAG_RECEIVING 0 /* xxxxxxx1 still receiving shreds */
#define FD_BLOCK_FLAG_COMPLETED 1 /* xxxxxx1x received the block ie. all shreds (SLOT_COMPLETE) */
#define FD_BLOCK_FLAG_REPLAYING 2 /* xxxxx1xx replay in progress (DO NOT REMOVE) */
#define FD_BLOCK_FLAG_PROCESSED 3 /* xxxx1xxx successfully replayed the block */
#define FD_BLOCK_FLAG_EQVOCSAFE 4 /* xxx1xxxx 52% of cluster has voted on this (slot, bank hash) */
#define FD_BLOCK_FLAG_CONFIRMED 5 /* xx1xxxxx 2/3 of cluster has voted on this (slot, bank hash) */
#define FD_BLOCK_FLAG_FINALIZED 6 /* x1xxxxxx 2/3 of cluster has rooted this slot */
#define FD_BLOCK_FLAG_DEADBLOCK 7 /* 1xxxxxxx failed to replay the block */
242 :
243 : /* Rewards assigned after block is executed */
244 :
struct fd_block_rewards {
  ulong collected_fees; /* fees collected for the block -- presumably lamports, TODO confirm units */
  fd_hash_t leader;     /* slot leader -- NOTE(review): stored as fd_hash_t, presumably the leader's pubkey */
  ulong post_balance;   /* leader balance after the reward is applied -- TODO confirm */
};
typedef struct fd_block_rewards fd_block_rewards_t;
251 :
/* NOTE(review): the "remaining bits [4, 8) are reserved" note that was
   here looks stale -- bits 4-7 are used by
   FD_BLOCK_FLAG_{EQVOCSAFE,CONFIRMED,FINALIZED,DEADBLOCK} above;
   confirm and update.

   To avoid confusion, please use `fd_bits.h` API
   ie. `fd_uchar_set_bit`, `fd_uchar_extract_bit`. */
256 :
257 : #define SET_NAME fd_block_set
258 : #define SET_MAX FD_SHRED_MAX_PER_SLOT
259 : #include "../../util/tmpl/fd_set.c"
260 :
struct fd_block_info {
  ulong slot; /* map key. 0 marks a free map element (see MAP_ELE_IS_FREE below) */
  ulong next; /* reserved for use by fd_map_giant.c -- NOTE(review): the map below is fd_map_slot_para now; comment may be stale */

  /* Ancestry */

  ulong parent_slot;                               /* slot of this block's parent */
  ulong child_slots[FD_BLOCKSTORE_CHILD_SLOT_MAX]; /* slots of known children */
  ulong child_slot_cnt;                            /* number of valid entries in child_slots */

  /* Metadata */

  ulong block_height;    /* height of this block -- TODO confirm semantics vs slot */
  fd_hash_t block_hash;  /* final poh hash of the slot (from the last microblock header) */
  fd_hash_t bank_hash;   /* hash of the execution state after replaying this block */
  fd_hash_t merkle_hash; /* the last FEC set's merkle hash */
  ulong fec_cnt;         /* the number of FEC sets in the slot */
  uchar flags;           /* bit vec of FD_BLOCK_FLAG_* bit indices (use fd_uchar_set_bit / fd_uchar_extract_bit) */
  long ts;               /* the wallclock time when we finished receiving the block. */

  /* Windowing

     Shreds are buffered into a map as they are received:

     | 0 | 1 | 2 | x | x | 5 | x |
               ^           ^   ^
               c           b   r

     c = "consumed" = contiguous shred idxs that have been consumed.
                      the "consumer" is replay and the idx is
                      incremented after replaying each block slice.
     b = "buffered" = contiguous shred idxs that have been buffered.
                      when buffered == block_slice_end the next slice of
                      a block is ready for replay.
     r = "received" = highest shred idx received so far. used to detect
                      when repair is needed.
  */

  uint consumed_idx; /* the highest shred idx we've contiguously consumed (consecutive from 0). */
  uint buffered_idx; /* the highest shred idx we've contiguously buffered (consecutive from 0). */
  uint received_idx; /* the highest shred idx we've received (can be out-of-order). */

  uint data_complete_idx; /* the highest shred idx wrt contiguous entry batches (inclusive). */
  uint slot_complete_idx; /* the highest shred idx for the entire slot (inclusive). */

  /* This is a bit vec (fd_set) that tracks every shred idx marked with
     FD_SHRED_DATA_FLAG_DATA_COMPLETE. The bit position in the fd_set
     corresponds to the shred's index. Note shreds can be received
     out-of-order so higher bits might be set before lower bits. */

  /* NOTE(review): fd_set word cnt is SET_MAX/64 (bits per ulong);
     dividing by sizeof(ulong)==8 looks 8x oversized -- confirm before
     shrinking. */
  fd_block_set_t data_complete_idxs[FD_SHRED_MAX_PER_SLOT / sizeof(ulong)];

  /* Helpers for batching tick verification */

  ulong ticks_consumed;        /* ticks verified so far -- TODO confirm */
  ulong tick_hash_count_accum; /* accumulated tick hash count -- TODO confirm */
  fd_hash_t in_poh_hash; /* TODO: might not be best place to hold this */

  /* Block */

  ulong block_gaddr; /* global address to the start of the allocated fd_block_t */
};
typedef struct fd_block_info fd_block_info_t;
324 :
325 : /* Needed due to redefinition of err codes in slot_para */
326 : #undef FD_MAP_SUCCESS
327 : #undef FD_MAP_ERR_INVAL
328 : #undef FD_MAP_ERR_AGAIN
329 : #undef FD_MAP_ERR_KEY
330 : #undef FD_MAP_FLAG_BLOCKING
331 :
/* fd_block_map: concurrent map of slot -> fd_block_info_t (API
   generated by fd_map_slot_para.c).  slot 0 marks a free element;
   keys hash to themselves (see rationale below). */
#define MAP_NAME fd_block_map
#define MAP_ELE_T fd_block_info_t
#define MAP_KEY slot
#define MAP_ELE_IS_FREE(ctx, ele) ((ele)->slot == 0)
#define MAP_ELE_FREE(ctx, ele) ((ele)->slot = 0)
#define MAP_KEY_HASH(key, seed) (void)(seed), (*(key)) /* identity hash; seed deliberately unused */
#include "../../util/tmpl/fd_map_slot_para.c"

#define BLOCK_INFO_LOCK_CNT 1024UL /* # of locks sharded across block map elements (see rationale below) */
#define BLOCK_INFO_PROBE_CNT 2UL   /* max probe depth (see rationale below) */
342 : /*
343 : Rationale for block_map parameters:
344 : - each lock manages block_max / lock_cnt elements, so with block_max
345 : at 4096, each lock would manage 4 contiguous elements.
346 : - Since keys are unique and increment by 1, we can index key to map
347 : bucket by taking key % ele_max directly. This way in theory we
348 : have perfect hashing and never need to probe.
349 : - This breaks when we store more than 4096 contiguous slots,
350 : i.e.: slot 0 collides with slot 4096, but this is at heart an
351 : OOM issue.
352 : - Causes possible contention - consider if we execute n, but are
353 : storing shreds for n+1 -- these are managed by the same lock.
354 : Perhaps opportunity for optimization.
355 : */
356 :
357 : /* fd_block_idx is an in-memory index of finalized blocks that have been
358 : archived to disk. It records the slot together with the byte offset
359 : relative to the start of the file. */
360 :
struct fd_block_idx {
  ulong slot; /* map key: the archived block's slot */
  ulong next; /* reserved for use by fd_map_dynamic.c */
  uint hash;  /* NOTE(review): purpose not evident from this header -- confirm (cached key hash?) */
  ulong off;  /* byte offset of the block relative to the start of the archival file */
  fd_hash_t block_hash; /* block hash of the archived block */
  fd_hash_t bank_hash;  /* bank hash of the archived block */
};
typedef struct fd_block_idx fd_block_idx_t;

#define MAP_NAME fd_block_idx
#define MAP_T fd_block_idx_t
#define MAP_KEY slot
#define MAP_KEY_HASH(key) ((uint)(key)) /* finalized slots are guaranteed to be unique so perfect hashing */
#include "../../util/tmpl/fd_map_dynamic.c"
376 :
/* fd_txn_key_t is a transaction signature (ed25519), stored as ulong
   words for cheap equality / hashing. */
struct fd_txn_key {
  ulong v[FD_ED25519_SIG_SZ / sizeof( ulong )];
};
typedef struct fd_txn_key fd_txn_key_t;

/* fd_txn_map_t maps a txn signature to the txn's location: the slot of
   the containing block plus offset/sz within that block's data, and
   the global address / size of the txn metadata. */
struct fd_txn_map {
  fd_txn_key_t sig; /* map key: the transaction signature */
  ulong next;       /* reserved for use by fd_map_giant.c */
  ulong slot;       /* slot of the block containing the txn */
  ulong offset;     /* offset of the txn within the block data */
  ulong sz;         /* txn size in bytes */
  ulong meta_gaddr; /* ptr to the transaction metadata */
  ulong meta_sz;    /* metadata size */
};
typedef struct fd_txn_map fd_txn_map_t;

/* Key comparison / hashing helpers for the map below (defined in the
   corresponding .c). */
FD_FN_PURE int fd_txn_key_equal(fd_txn_key_t const * k0, fd_txn_key_t const * k1);
FD_FN_PURE ulong fd_txn_key_hash(fd_txn_key_t const * k, ulong seed);

#define MAP_NAME fd_txn_map
#define MAP_T fd_txn_map_t
#define MAP_KEY sig
#define MAP_KEY_T fd_txn_key_t
#define MAP_KEY_EQ(k0,k1) fd_txn_key_equal(k0,k1)
#define MAP_KEY_HASH(k,seed) fd_txn_key_hash(k, seed)
#include "../../util/tmpl/fd_map_giant.c"
403 :
404 : /* fd_blockstore_archiver outlines the format of metadata
405 : at the start of an archive file - needed so that archive
406 : files can be read back on initialization. */
407 :
struct fd_blockstore_archiver {
  ulong fd_size_max; /* maximum size of the archival file */
  ulong num_blocks;  /* number of blocks in the archival file. needed for reading back */
  ulong head;        /* location of least recently written block -- presumably a byte offset, TODO confirm */
  ulong tail;        /* location after most recently written block -- presumably a byte offset, TODO confirm */
};
typedef struct fd_blockstore_archiver fd_blockstore_archiver_t;
/* Archived blocks begin immediately after this header. */
#define FD_BLOCKSTORE_ARCHIVE_START sizeof(fd_blockstore_archiver_t)
416 :
417 : /* CONCURRENCY NOTES FOR BLOCKSTORE ENJOINERS:
418 :
419 : With the parallelization of the shred map and block map, parts of the
420 : blockstore are concurrent, and parts are not. Block map and shred map
421 : have their own locks, which are managed through the
422 : query_try/query_test APIs. When accessing buf_shred_t and
423 : block_info_t items then, the caller does not need to use
424 : blockstore_start/end_read/write. However, the
425 : blockstore_start/end_read/write still protects the blockstore_shmem_t
426 : object. If you are reading and writing any blockstore_shmem fields
427 : and at the same time accessing the block_info_t or buf_shred_t, you
428 : should call both the blockstore_start/end_read/write APIs AND the map
429 : query_try/test APIs. These are locks of separate concerns and will
430 : not deadlock with each other. TODO update docs when we switch to
431 : fenced read/write for primitive fields in shmem_t. */
struct __attribute__((aligned(FD_BLOCKSTORE_ALIGN))) fd_blockstore_shmem {

  /* Metadata */

  ulong magic;            /* == FD_BLOCKSTORE_MAGIC when the region is formatted */
  ulong blockstore_gaddr; /* wksp gaddr of this shmem region itself (lets a join recover the wksp base, see fd_blockstore_wksp) */
  ulong wksp_tag;         /* tag for all wksp allocations owned by this blockstore */
  ulong seed;             /* hash seed used by the blockstore's hash functions */

  /* Persistence */

  fd_blockstore_archiver_t archiver;
  ulong mrw_slot; /* most recently written slot */

  /* Slot metadata */

  ulong lps; /* latest processed slot */
  ulong hcs; /* highest confirmed slot */
  ulong wmk; /* watermark. DO NOT MODIFY DIRECTLY. */

  /* Config limits */

  ulong shred_max; /* maximum # of shreds that can be held in memory */
  ulong block_max; /* maximum # of blocks that can be held in memory */
  ulong idx_max;   /* maximum # of blocks that can be indexed from the archival file */
  ulong txn_max;   /* maximum # of transactions that can be indexed from blocks */
  ulong alloc_max; /* maximum bytes that can be allocated */

  /* wksp gaddrs of the data structures owned by the blockstore */

  //ulong block_map_gaddr; /* map of slot->(slot_meta, block) */
  ulong block_idx_gaddr;  /* map of slot->byte offset in archival file */
  ulong slot_deque_gaddr; /* deque of slot numbers */
  ulong txn_map_gaddr;    /* map of txn signature->(slot, offset, sz) */
  ulong alloc_gaddr;      /* fd_alloc used for the blockstore's wksp allocations (see top-of-file notes) */
};
typedef struct fd_blockstore_shmem fd_blockstore_shmem_t;
467 :
/* fd_blockstore_t is a local join to the blockstore. This is specific
   to the local address space and should not be shared across tiles. */
470 :
struct fd_blockstore {

  /* shared memory region */

  fd_blockstore_shmem_t * shmem; /* read/writes to shmem must call fd_blockstore_start_read()*/

  /* local join handles (per-tile; see fd_blockstore_join) */

  fd_buf_shred_pool_t shred_pool[1]; /* local join to the buffered shred pool */
  fd_buf_shred_map_t shred_map[1];   /* local join to the buffered shred map */
  fd_block_map_t block_map[1];       /* local join to the block map (slot -> fd_block_info_t) */
};
typedef struct fd_blockstore fd_blockstore_t;
484 :
485 : FD_PROTOTYPES_BEGIN
486 :
487 : /* Construction API */
488 :
/* fd_blockstore_align returns FD_BLOCKSTORE_ALIGN, the byte alignment
   required for a blockstore shared memory region (double cache line,
   see the macro's rationale above). */
FD_FN_CONST static inline ulong
fd_blockstore_align( void ) {
  return FD_BLOCKSTORE_ALIGN;
}
493 :
494 : /* fd_blockstore_footprint returns the footprint of the entire
495 : blockstore shared memory region occupied by `fd_blockstore_shmem_t`
496 : including data structures. */
497 :
FD_FN_CONST static inline ulong
fd_blockstore_footprint( ulong shred_max, ulong block_max, ulong idx_max, ulong txn_max ) {
  /* TODO -- when removing, make change in fd_blockstore_new as well */
  block_max = fd_ulong_pow2_up( block_max ); /* block map element cnt must be a power of two */
  ulong lock_cnt = fd_ulong_min( block_max, BLOCK_INFO_LOCK_CNT );

  int lg_idx_max = fd_ulong_find_msb( fd_ulong_pow2_up( idx_max ) ); /* fd_map_dynamic takes lg2 of its capacity */
  /* Layout: shmem header, shred elements, shred pool, shred map, block
     info elements, block map, block idx, slot deque, txn map, alloc.
     NOTE(review): this APPEND order must stay in sync with the
     formatting order in fd_blockstore_new -- confirm when editing. */
  return FD_LAYOUT_FINI(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_APPEND(
    FD_LAYOUT_INIT,
      alignof(fd_blockstore_shmem_t), sizeof(fd_blockstore_shmem_t) ),
      alignof(fd_buf_shred_t), sizeof(fd_buf_shred_t) * shred_max ),
      fd_buf_shred_pool_align(), fd_buf_shred_pool_footprint() ),
      fd_buf_shred_map_align(), fd_buf_shred_map_footprint( shred_max ) ),
      alignof(fd_block_info_t), sizeof(fd_block_info_t) * block_max ),
      fd_block_map_align(), fd_block_map_footprint( block_max, lock_cnt, BLOCK_INFO_PROBE_CNT ) ),
      fd_block_idx_align(), fd_block_idx_footprint( lg_idx_max ) ),
      fd_slot_deque_align(), fd_slot_deque_footprint( block_max ) ),
      fd_txn_map_align(), fd_txn_map_footprint( txn_max ) ),
      fd_alloc_align(), fd_alloc_footprint() ),
    fd_blockstore_align() );
}
529 :
530 : /* fd_blockstore_new formats a memory region with the appropriate
531 : alignment and footprint into a blockstore. shmem points in the the
532 : caller's address space of the memory region to format. Returns shmem
533 : on success (blockstore has ownership of the memory region) and NULL
534 : on failure (no changes, logs details). Caller is not joined on
535 : return. The blockstore will be empty and unlocked. */
536 :
537 : void *
538 : fd_blockstore_new( void * shmem,
539 : ulong wksp_tag,
540 : ulong seed,
541 : ulong shred_max,
542 : ulong block_max,
543 : ulong idx_max,
544 : ulong txn_max );
545 :
546 : /* fd_blockstore_join joins a blockstore. ljoin points to a
547 : fd_blockstore_t compatible memory region in the caller's address
548 : space used to hold info about the local join, shblockstore points in
549 : the caller's address space to the memory region containing the
550 : blockstore. Returns a handle to the caller's local join on success
551 : (join has ownership of the ljoin region) and NULL on failure (no
552 : changes, logs details). */
553 :
554 : fd_blockstore_t *
555 : fd_blockstore_join( void * ljoin, void * shblockstore );
556 :
557 : void *
558 : fd_blockstore_leave( fd_blockstore_t * blockstore );
559 :
560 : void *
561 : fd_blockstore_delete( void * shblockstore );
562 :
563 : /* fd_blockstore_init initializes a blockstore with the given
564 : `slot_bank`. This bank is used for initializing fields (SMR, etc.),
565 : and should be the bank upon finishing a snapshot load if booting from
566 : a snapshot, genesis bank otherwise. It is also used to "fake" the
567 : snapshot block as if that block's data were available. The metadata
568 : for this block's slot will be populated (fd_block_map_t) but the
569 : actual block data (fd_block_t) won't exist. This is done to bootstrap
570 : the various components for live replay (turbine, repair, etc.)
571 :
572 : `fd` is a file descriptor for the blockstore archival file. As part
573 : of `init`, blockstore rebuilds an in-memory index of the archival
574 : file. */
575 :
576 : fd_blockstore_t *
577 : fd_blockstore_init( fd_blockstore_t * blockstore, int fd, ulong fd_size_max, fd_slot_bank_t const * slot_bank );
578 :
579 : /* fd_blockstore_fini finalizes a blockstore.
580 :
581 : IMPORTANT! Caller MUST hold the read lock when calling this
582 : function. */
583 :
584 : void
585 : fd_blockstore_fini( fd_blockstore_t * blockstore );
586 :
587 : /* Accessors */
588 :
589 : /* fd_blockstore_wksp returns the local join to the wksp backing the
590 : blockstore. The lifetime of the returned pointer is at least as long
591 : as the lifetime of the local join. Assumes blockstore is a current
592 : local join. */
593 :
594 : FD_FN_PURE static inline fd_wksp_t *
595 0 : fd_blockstore_wksp( fd_blockstore_t * blockstore ) {
596 0 : return (fd_wksp_t *)( ( (ulong)blockstore->shmem ) - blockstore->shmem->blockstore_gaddr );
597 0 : }
598 :
599 : /* fd_blockstore_wksp_tag returns the workspace allocation tag used by
600 : the blockstore for its wksp allocations. Will be positive. Assumes
601 : blockstore is a current local join. */
602 :
603 : FD_FN_PURE static inline ulong
604 0 : fd_blockstore_wksp_tag( fd_blockstore_t const * blockstore ) {
605 0 : return blockstore->shmem->wksp_tag;
606 0 : }
607 :
608 : /* fd_blockstore_seed returns the hash seed used by the blockstore for various hash
609 : functions. Arbitrary value. Assumes blockstore is a current local join.
610 : TODO: consider renaming hash_seed? */
611 : FD_FN_PURE static inline ulong
612 0 : fd_blockstore_seed( fd_blockstore_t const * blockstore ) {
613 0 : return blockstore->shmem->seed;
614 0 : }
615 :
616 : /* fd_block_idx returns a pointer in the caller's address space to the
617 : fd_block_idx_t in the blockstore wksp. Assumes blockstore is local
618 : join. Lifetime of the returned pointer is that of the local join. */
619 :
620 : FD_FN_PURE static inline fd_block_idx_t *
621 0 : fd_blockstore_block_idx( fd_blockstore_t * blockstore ) {
622 0 : return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore ), blockstore->shmem->block_idx_gaddr );
623 0 : }
624 :
625 : /* fd_slot_deque returns a pointer in the caller's address space to the
626 : fd_slot_deque_t in the blockstore wksp. Assumes blockstore is local
627 : join. Lifetime of the returned pointer is that of the local join. */
628 :
629 : FD_FN_PURE static inline ulong *
630 0 : fd_blockstore_slot_deque( fd_blockstore_t * blockstore ) {
631 0 : return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore), blockstore->shmem->slot_deque_gaddr );
632 0 : }
633 :
/* fd_blockstore_txn_map returns a pointer in the caller's address space
   to the blockstore's txn map. Assumes blockstore is local join.
   Lifetime of the returned pointer is that of the local join. */
637 :
638 : FD_FN_PURE static inline fd_txn_map_t *
639 0 : fd_blockstore_txn_map( fd_blockstore_t * blockstore ) {
640 0 : return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore), blockstore->shmem->txn_map_gaddr );
641 0 : }
642 :
643 : /* fd_blockstore_alloc returns a pointer in the caller's address space to
644 : the blockstore's allocator. */
645 :
646 : FD_FN_PURE static inline fd_alloc_t * /* Lifetime is that of the local join */
647 0 : fd_blockstore_alloc( fd_blockstore_t * blockstore ) {
648 0 : return fd_wksp_laddr_fast( fd_blockstore_wksp( blockstore), blockstore->shmem->alloc_gaddr );
649 0 : }
650 :
651 : /* fd_blockstore_shred_test returns 1 if a shred keyed by (slot, idx) is
652 : already in the blockstore and 0 otherwise. */
653 :
654 : int
655 : fd_blockstore_shred_test( fd_blockstore_t * blockstore, ulong slot, uint idx );
656 :
657 : /* fd_buf_shred_query_copy_data queries the blockstore for shred at
658 : slot, shred_idx. Copies the shred data to the given buffer and
659 : returns the data size. Returns -1 on failure.
660 :
661 : IMPORTANT! Caller MUST hold the read lock when calling this
662 : function. */
663 :
664 : long
665 : fd_buf_shred_query_copy_data( fd_blockstore_t * blockstore,
666 : ulong slot,
667 : uint shred_idx,
668 : void * buf,
669 : ulong buf_max );
670 :
671 : /* fd_blockstore_block_hash_query performs a blocking query (concurrent
672 : writers are not blocked) for the block hash of slot. Returns
673 : FD_BLOCKSTORE_SUCCESS on success and FD_BLOCKSTORE_ERR_KEY if slot is
674 : not in blockstore. Cannot fail. On success, a copy of the block
675 : hash will be populated in `block_hash`. Retains no interest in
676 : `slot` or `block_hash`.
677 :
678 : The block hash is the final poh hash for a slot and available on the
679 : last microblock header. */
680 :
681 : int
682 : fd_blockstore_block_hash_query( fd_blockstore_t * blockstore, ulong slot, fd_hash_t * block_hash );
683 :
684 : /* fd_blockstore_bank_hash_query performs a blocking query (concurrent
685 : writers are not blocked) for the bank hash of slot. Returns
686 : FD_BLOCKSTORE_SUCCESS on success and FD_BLOCKSTORE_ERR_KEY if slot is
687 : not in blockstore. Cannot fail. On success, a copy of the bank hash
688 : will be populated in `bank_hash`. Retains no interest in `slot` or
689 : `bank_hash`.
690 :
691 : The bank hash is a hash of the execution state (the "bank") after
692 : executing the block for a given slot. */
693 :
694 : int
695 : fd_blockstore_bank_hash_query( fd_blockstore_t * blockstore, ulong slot, fd_hash_t * bank_hash );
696 :
697 : /* fd_blockstore_block_map_query queries the blockstore for the block
698 : map entry at slot. Returns a pointer to the slot meta or NULL if not
699 : in blockstore.
700 :
701 : IMPORTANT! This should only be used for single-threaded / offline
702 : use-cases as it does not test the query. Read notes below for
703 : block_map usage in live. */
704 :
705 : fd_block_info_t *
706 : fd_blockstore_block_map_query( fd_blockstore_t * blockstore, ulong slot );
707 :
708 : /* IMPORTANT! NOTES FOR block_map USAGE:
709 :
710 : The block_info entries must be queried using the query_try/query_test
711 : pattern. This will frequently look like:
712 :
713 : int err = FD_MAP_ERR_AGAIN;
714 : loop while( err == FD_MAP_ERR_AGAIN )
715 : block_map_query_t query;
716 : err = fd_block_map_query_try( nonblocking );
717 : block_info_t * ele = fd_block_map_query_ele(query);
718 : if ERROR is FD_MAP_ERR_KEY, then the slot is not found.
719 : if ERROR is FD_MAP_ERR_AGAIN, then immediately continue.
720 : // important to handle ALL possible return err codes *before*
721 : // accessing the ele, as the ele will be the sentinel (usually NULL)
722 : speculatively execute <stuff>
723 : - no side effects
724 : - no early return
725 : err = fd_block_map_query_test(query)
726 : end loop
727 :
728 : Some accessors are provided to callers that already do this pattern,
729 : and handle the looping querying. For example, block_hash_copy, and
730 : parent_slot_query. However, for most caller use cases, it would be
731 : much more effecient to use the query_try/query_test pattern directly.
732 :
733 : Example: if you are accessing a block_info_t m, and m->parent_slot to
734 : the blockstore->shmem->smr, then you will need to start_write on the
735 : blockstore, query_try for the block_info_t object, set
736 : shmem->smr = meta->parent_slot, and then query_test, AND call
   blockstore_end_write. In the case that there is block_info contention,
   i.e. another thread is removing the block_info_t object of interest
   while we are trying to access it, query_test will return ERR_AGAIN; we
   will loop back and try again, hit the FD_MAP_ERR_KEY condition (and
   exit the loop gracefully), and be left with an incorrectly set
   shmem->smr.
743 :
744 : So depending on the complexity of what's being executed, it's easiest
745 : to directly copy what you need from the block_info_t into a variable
746 : outside the context of the loop, and use it further below, ex:
747 :
748 : ulong map_item = NULL_ITEM;
749 : loop {
750 : query_try
751 : map_item = ele->map_item; // like parent_slot
752 : query_test
753 : }
754 : check if map_item is NULL_ITEM
755 : fd_blockstore_start_write
756 : use map_item
757 : fd_blockstore_end_write
758 :
759 : Writes and updates (blocking). The pattern is:
760 : int err = fd_block_map_prepare( &slot, query, blocking );
761 : block_info_t * ele = fd_block_map_query_ele(query);
762 :
763 : IF slot was an existing key, then ele->slot == slot, and you are MODIFYING
764 : <modify ele>
765 : If slot was not an existing key, then ele->slot == 0, and you are INSERTING
766 : ele->slot = slot;
767 : <initialize ele>
768 :
769 : fd_block_map_publish(query); // will always succeed */
770 :
/* fd_blockstore_parent_slot_query queries the parent slot of slot.
   (The return value when slot is not present in the block map is not
   documented here -- presumably a sentinel value; confirm against the
   implementation.)

   This is non-blocking. */
ulong
fd_blockstore_parent_slot_query( fd_blockstore_t * blockstore, ulong slot );
776 :
/* fd_blockstore_block_data_query_volatile queries the block map entry
   (metadata and block data) in a lock-free thread-safe manner that does
   not block writes.  Copies the metadata (fd_block_map_t) into
   block_info_out.  Allocates a new block data (uchar *) using
   alloc, copies the block data into it, and sets the block_data_out
   pointer.  Caller provides the allocator via alloc for the copied
   block data (an allocator is needed because the block data sz is not
   known a priori).  On success, also populates parent_block_hash_out
   and block_rewards_out.  fd is a file descriptor, presumably for the
   blockstore archival file (see fd_blockstore_publish) so archived
   slots can be read back -- TODO confirm.  Returns
   FD_BLOCKSTORE_SLOT_MISSING if slot is missing: caller MUST ignore out
   pointers in this case.  Otherwise this call cannot fail and returns
   FD_BLOCKSTORE_SUCCESS. */

int
fd_blockstore_block_data_query_volatile( fd_blockstore_t * blockstore,
                                         int fd,
                                         ulong slot,
                                         fd_valloc_t alloc,
                                         fd_hash_t * parent_block_hash_out,
                                         fd_block_info_t * block_info_out,
                                         fd_block_rewards_t * block_rewards_out,
                                         uchar ** block_data_out,
                                         ulong * block_data_sz_out );
798 :
799 : /* fd_blockstore_block_map_query_volatile is the same as above except it
800 : only copies out the metadata (fd_block_map_t). Returns
801 : FD_BLOCKSTORE_SLOT_MISSING if slot is missing, otherwise
802 : FD_BLOCKSTORE_SUCCESS. */
803 :
804 : int
805 : fd_blockstore_block_map_query_volatile( fd_blockstore_t * blockstore,
806 : int fd,
807 : ulong slot,
808 : fd_block_info_t * block_info_out ) ;
809 :
/* fd_blockstore_txn_query queries the transaction data for the given
   signature.  Returns a pointer to the txn map entry on success
   (presumably NULL when the signature is not found -- confirm in the
   implementation).

   IMPORTANT! Caller MUST hold the read lock when calling this
   function. */

fd_txn_map_t *
fd_blockstore_txn_query( fd_blockstore_t * blockstore, uchar const sig[static FD_ED25519_SIG_SZ] );
818 :
/* fd_blockstore_txn_query_volatile queries the transaction data for the
   given signature in a thread safe manner.  The transaction data is
   copied out.  txn_data_out can be NULL if you are only interested in
   the transaction metadata.  On success, also copies out the containing
   block's timestamp into blk_ts and its flags into blk_flags.  fd is a
   file descriptor, presumably for the archival file so archived blocks
   can be read -- TODO confirm.  Return value is presumably
   FD_BLOCKSTORE_SUCCESS or an FD_BLOCKSTORE_* error code -- confirm in
   the implementation. */
int
fd_blockstore_txn_query_volatile( fd_blockstore_t * blockstore,
                                  int fd,
                                  uchar const sig[static FD_ED25519_SIG_SZ],
                                  fd_txn_map_t * txn_out,
                                  long * blk_ts,
                                  uchar * blk_flags,
                                  uchar txn_data_out[FD_TXN_MTU] );
830 :
/* fd_blockstore_block_info_test tests if a block meta entry exists for
   the given slot.  Returns 1 if the entry exists and 0 otherwise.

   IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
   this function (presumably because this routine itself issues block
   map queries -- see the block_map usage notes above). */
int
fd_blockstore_block_info_test( fd_blockstore_t * blockstore, ulong slot );
838 :
/* fd_blockstore_block_info_remove removes the block meta entry for
   the given slot.  Returns SUCCESS if the entry exists and an
   error code otherwise.

   IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
   this function (presumably because this routine itself operates on
   the block map -- see the block_map usage notes above). */
int
fd_blockstore_block_info_remove( fd_blockstore_t * blockstore, ulong slot );
847 :
/* fd_blockstore_slot_remove removes slot from blockstore, including all
   relevant internal structures (block map entry, buffered shreds, txn
   map entries -- presumably; see the implementation for the exact set).

   IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
   this function. */
void
fd_blockstore_slot_remove( fd_blockstore_t * blockstore, ulong slot );
855 :
856 : /* Operations */
857 :
/* fd_blockstore_shred_insert inserts shred into the blockstore, fast
   O(1).  Returns the current `consumed_idx` for the shred's slot if
   insert is successful, otherwise returns FD_SHRED_IDX_NULL on error.
   Reasons for error include this shred is already in the blockstore or
   the blockstore is full.

   NOTE(review): the return-value description above contradicts the
   `void` return type below -- either the comment is stale or the
   prototype should return the consumed_idx.  Confirm against the
   implementation and callers.

   fd_blockstore_shred_insert will manage locking, so the caller
   should NOT be acquiring the blockstore read/write lock before
   calling this function. */

void
fd_blockstore_shred_insert( fd_blockstore_t * blockstore, fd_shred_t const * shred );
870 :
/* fd_blockstore_shred_remove removes the buffered (unassembled) shred
   at index idx for the given slot.

   NOTE(review): the comment previously referred to
   fd_blockstore_buffered_shreds_remove and claimed ALL unassembled
   shreds for a slot are removed, but the prototype takes a single
   shred idx -- confirm the intended semantics against the
   implementation. */
void
fd_blockstore_shred_remove( fd_blockstore_t * blockstore, ulong slot, uint idx );
875 :
/* fd_blockstore_slice_query queries for the block slice beginning from
   shred `start_idx`.  Copies at most `max` bytes of the shred payloads
   consecutively from `start_idx` until the first {DATA, SLOT}_COMPLETES
   (`end_idx` presumably bounds the range of shreds scanned -- confirm
   its exact relationship to the COMPLETES flags in the
   implementation).

   Returns FD_BLOCKSTORE_SUCCESS (0) on success and a FD_MAP_ERR
   (negative) on failure.  On success, `buf` will be populated with the
   copied slice and `buf_sz` will contain the number of bytes copied.
   Caller must ignore the values of `buf` and `buf_sz` on failure.

   Implementation is lockfree and safe with concurrent operations on
   blockstore. */

int
fd_blockstore_slice_query( fd_blockstore_t * blockstore,
                           ulong slot,
                           uint start_idx,
                           uint end_idx,
                           ulong max,
                           uchar * buf,
                           ulong * buf_sz );
896 :
/* fd_blockstore_shreds_complete should be a replacement for anywhere
   that is querying for an fd_block_t * for existence but not actually
   using the block data.  Semantically equivalent to
   query_block( slot ) != NULL, i.e. presumably returns nonzero (true)
   iff all shreds for slot have been received -- confirm against the
   implementation.

   Implementation is lockfree and safe with concurrent operations on
   blockstore. */
int
fd_blockstore_shreds_complete( fd_blockstore_t * blockstore, ulong slot );
905 :
/* fd_blockstore_block_height_update sets the block height for slot to
   block_height.

   IMPORTANT! Caller MUST NOT be in a block_map_t prepare when calling
   this function (presumably because this routine itself prepares /
   publishes the slot's block map entry -- see the usage notes above). */
void
fd_blockstore_block_height_update( fd_blockstore_t * blockstore, ulong slot, ulong block_height );
912 :
/* fd_blockstore_publish publishes all blocks until the current
   blockstore smr (`blockstore->smr`).  Publishing entails 1. pruning
   and 2. archiving.  Pruning removes any blocks that are not part of
   the same fork as the smr (hence the name pruning, like pruning the
   branches of a tree).  Archiving removes from memory any slots < smr
   that are on the same fork, but writes those blocks out to disk using
   the provided file descriptor to the archival file `fd`.

   Note that slots < smr are ancestors of the smr, and are therefore
   finalized slots which is why they are archived.  Blocks removed as a
   result of pruning are not finalized, and therefore not archived.

   NOTE(review): the prose above refers to `blockstore->smr`, but the
   prototype takes an explicit watermark `wmk` -- presumably publishing
   proceeds up to wmk and the comment is stale; confirm against the
   implementation.

   IMPORTANT! Caller MUST hold the write lock when calling this
   function. */

void
fd_blockstore_publish( fd_blockstore_t * blockstore, int fd, ulong wmk );
930 :
/* fd_blockstore_log_block_status logs status information for block map
   entries in the vicinity of around_slot.  Diagnostic logging only;
   the exact slot window and locking requirements are defined by the
   implementation -- TODO document them here. */
void
fd_blockstore_log_block_status( fd_blockstore_t * blockstore, ulong around_slot );
933 :
/* fd_blockstore_log_mem_usage logs the memory usage of blockstore in a
   human-readable format.  Caller MUST hold the read lock (presumably
   because this walks shared blockstore structures while logging --
   confirm). */

void
fd_blockstore_log_mem_usage( fd_blockstore_t * blockstore );
939 :
940 : FD_PROTOTYPES_END
941 :
/* BLOCK_ARCHIVING is a compile-time feature switch, off (0) by
   default.  The #ifndef guard allows it to be overridden via build
   flags or by defining it before including this header. */
#ifndef BLOCK_ARCHIVING
#define BLOCK_ARCHIVING 0
#endif
945 :
946 : #endif /* HEADER_fd_src_flamenco_runtime_fd_blockstore_h */
|