Line data Source code
1 : #include "fd_snapshot_create.h"
2 : #include "../runtime/sysvar/fd_sysvar_epoch_schedule.h"
3 : #include "../../ballet/zstd/fd_zstd.h"
4 : #include "../runtime/fd_hashes.h"
5 : #include "../runtime/fd_runtime.h"
6 :
7 : #include <errno.h>
8 : #include <stdio.h>
9 : #include <stdlib.h>
10 : #include <sys/stat.h>
11 : #include <sys/types.h>
12 : #include <unistd.h>
13 : #include <zstd.h>
14 :
15 : static uchar padding[ FD_SNAPSHOT_ACC_ALIGN ] = {0};
16 : static fd_account_meta_t default_meta = { .magic = FD_ACCOUNT_META_MAGIC };
17 :
18 : static inline fd_account_meta_t *
19 0 : fd_snapshot_create_get_default_meta( ulong slot ) {
20 0 : default_meta.slot = slot;
21 0 : return &default_meta;
22 0 : }
23 :
24 : static inline void
25 : fd_snapshot_create_populate_acc_vecs( fd_snapshot_ctx_t * snapshot_ctx,
26 : fd_solana_manifest_serializable_t * manifest,
27 : fd_tar_writer_t * writer,
28 0 : ulong * out_cap ) {
29 :
30 : /* The append vecs need to be described in an index in the manifest so a
31 : reader knows what account files to look for. These files are technically
32 : slot indexed, but the Firedancer implementation of the Solana snapshot
33 : produces far fewer indices. These storages are for the accounts
34 : that were modified and deleted in the most recent slot because that
35 : information is used by the Agave client to calculate and verify the
36 : bank hash for the given slot. This is done as an optimization to avoid
37 : having to slot index the Firedancer accounts db which would incur a large
38 : performance hit.
39 :
40 : To avoid iterating through the root twice to determine what accounts were
41 : touched in the snapshot slot and what accounts were touched in the
42 : other slots, we will create an array of pubkey pointers for all accounts
43 : that were touched in the snapshot slot. This buffer can be safely sized to
44 : the maximum amount of writable accounts that are possible in a non-epoch
45 : boundary slot. The rationale for this bound is explained in fd_runtime.h.
46 : We will not attempt to create a snapshot on an epoch boundary.
47 :
48 : TODO: We must add compaction here. */
49 :
50 0 : fd_pubkey_t * * snapshot_slot_keys = fd_valloc_malloc( snapshot_ctx->valloc, alignof(fd_pubkey_t*), sizeof(fd_pubkey_t*) * FD_WRITABLE_ACCS_IN_SLOT );
51 0 : ulong snapshot_slot_key_cnt = 0UL;
52 :
53 : /* We will dynamically resize the number of incremental keys because the upper
54 : bound will be roughly 8 bytes * writable accs in a slot * number of slots
55 : since the last full snapshot which can quickly grow to be severalgigabytes
56 : or more. In the normal case, this won't require dynamic resizing. */
57 0 : #define FD_INCREMENTAL_KEY_INIT_BOUND (100000UL)
58 0 : ulong incremental_key_bound = FD_INCREMENTAL_KEY_INIT_BOUND;
59 0 : ulong incremental_key_cnt = 0UL;
60 0 : fd_funk_rec_key_t const * * incremental_keys = snapshot_ctx->is_incremental ?
61 0 : fd_valloc_malloc( snapshot_ctx->valloc, alignof(fd_funk_rec_key_t*), sizeof(fd_funk_rec_key_t*) * incremental_key_bound ) :
62 0 : NULL;
63 :
64 0 : #undef FD_INCREMENTAL_KEY_INIT_BOUND
65 :
66 : /* In order to size out the accounts DB index in the manifest, we must
67 : iterate through funk and accumulate the size of all of the records
68 : from all slots before the snapshot_slot. */
69 :
70 0 : fd_funk_t * funk = snapshot_ctx->acc_mgr->funk;
71 0 : ulong prev_sz = 0UL;
72 0 : ulong tombstones_cnt = 0UL;
73 0 : for( fd_funk_rec_t const * rec = fd_funk_txn_first_rec( funk, NULL ); NULL != rec; rec = fd_funk_txn_next_rec( funk, rec ) ) {
74 :
75 0 : if( !fd_funk_key_is_acc( rec->pair.key ) ) {
76 0 : continue;
77 0 : }
78 :
79 0 : tombstones_cnt++;
80 :
81 0 : int is_tombstone = rec->flags & FD_FUNK_REC_FLAG_ERASE;
82 0 : uchar const * raw = fd_funk_val( rec, fd_funk_wksp( funk ) );
83 0 : fd_account_meta_t * metadata = is_tombstone ? fd_snapshot_create_get_default_meta( fd_funk_rec_get_erase_data( rec ) ) :
84 0 : (fd_account_meta_t*)raw;
85 :
86 0 : if( !metadata ) {
87 0 : continue;
88 0 : }
89 :
90 0 : if( metadata->magic!=FD_ACCOUNT_META_MAGIC ) {
91 0 : continue;
92 0 : }
93 :
94 0 : if( snapshot_ctx->is_incremental ) {
95 : /* We only care about accounts that were modified since the last
96 : snapshot slot for incremental snapshots.
97 :
98 : We also need to keep track of the capitalization for all of the
99 : accounts that are in the incremental as this is verified. */
100 0 : if( metadata->slot<=snapshot_ctx->last_snap_slot ) {
101 0 : continue;
102 0 : }
103 0 : incremental_keys[ incremental_key_cnt++ ] = rec->pair.key;
104 0 : *out_cap += metadata->info.lamports;
105 :
106 0 : if( FD_UNLIKELY( incremental_key_cnt==incremental_key_bound ) ) {
107 : /* Dynamically resize if needed. */
108 0 : incremental_key_bound *= 2UL;
109 0 : fd_funk_rec_key_t const * * new_incremental_keys = fd_valloc_malloc( snapshot_ctx->valloc,
110 0 : alignof(fd_funk_rec_key_t*),
111 0 : sizeof(fd_funk_rec_key_t*) * incremental_key_bound );
112 0 : fd_memcpy( new_incremental_keys, incremental_keys, sizeof(fd_funk_rec_key_t*) * incremental_key_cnt );
113 0 : fd_valloc_free( snapshot_ctx->valloc, incremental_keys );
114 0 : incremental_keys = new_incremental_keys;
115 0 : }
116 0 : }
117 :
118 : /* We know that all of the accounts from the snapshot slot can fit into
119 : one append vec, so we ignore all accounts from the snapshot slot. */
120 :
121 0 : if( metadata->slot==snapshot_ctx->slot ) {
122 0 : continue;
123 0 : }
124 :
125 0 : prev_sz += metadata->dlen + sizeof(fd_solana_account_hdr_t);
126 :
127 0 : }
128 :
129 : /* At this point we have sized out all of the relevant accounts that will
130 : be included in the snapshot. Now we must populate each of the append vecs
131 : and update the index as we go.
132 :
133 : When we account for the number of slots we need to consider one append vec
134 : for the snapshot slot and try to maximally fill up the others: an append
135 : vec has a protocol-defined maximum size in Agave. */
136 :
137 0 : ulong num_slots = 1UL + prev_sz / FD_SNAPSHOT_APPEND_VEC_SZ_MAX +
138 0 : (prev_sz % FD_SNAPSHOT_APPEND_VEC_SZ_MAX ? 1UL : 0UL);
139 :
140 0 : fd_solana_accounts_db_fields_t * accounts_db = &manifest->accounts_db;
141 :
142 0 : accounts_db->storages_len = num_slots;
143 0 : accounts_db->storages = fd_valloc_malloc( snapshot_ctx->valloc,
144 0 : FD_SNAPSHOT_SLOT_ACC_VECS_ALIGN,
145 0 : sizeof(fd_snapshot_slot_acc_vecs_t) * accounts_db->storages_len );
146 0 : accounts_db->version = 1UL;
147 0 : accounts_db->slot = snapshot_ctx->slot;
148 0 : accounts_db->historical_roots_len = 0UL;
149 0 : accounts_db->historical_roots = NULL;
150 0 : accounts_db->historical_roots_with_hash_len = 0UL;
151 0 : accounts_db->historical_roots_with_hash = NULL;
152 :
153 0 : for( ulong i=0UL; i<num_slots; i++ ) {
154 : /* Populate the storages for each slot. As a note, the slot number only
155 : matters for the snapshot slot. The other slot numbers don't affect
156 : consensus at all. Agave also maintains an invariant that there can
157 : only be one account vec per storage. */
158 :
159 0 : accounts_db->storages[ i ].account_vecs_len = 1UL;
160 0 : accounts_db->storages[ i ].account_vecs = fd_valloc_malloc( snapshot_ctx->valloc,
161 0 : FD_SNAPSHOT_ACC_VEC_ALIGN,
162 0 : sizeof(fd_snapshot_acc_vec_t) * accounts_db->storages[ i ].account_vecs_len );
163 0 : accounts_db->storages[ i ].account_vecs[ 0 ].file_sz = 0UL;
164 0 : accounts_db->storages[ i ].account_vecs[ 0 ].id = i + 1UL;
165 0 : accounts_db->storages[ i ].slot = snapshot_ctx->slot - i;
166 0 : }
167 :
168 : /* At this point we have iterated through all of the accounts and created
169 : the index. We are now ready to generate a snapshot hash. For both
170 : snapshots we need to generate two hashes:
171 : 1. The accounts hash. This is a simple hash of all of the accounts
172 : included in the snapshot.
173 : 2. The snapshot hash. This is a hash of the accounts hash and the epoch
174 : account hash. If the EAH is not included, then the accounts hash ==
175 : snapshot hash.
176 :
177 : There is some nuance as to which hash goes where. For full snapshots,
178 : the accounts hash in the bank hash info is the accounts hash. The hash in
179 : the filename is the snapshot hash.
180 :
181 : For incremental snapshots, the account hash in the bank hash info field is
182 : left zeroed out. The full snapshot's hash is in the incremental persistence
183 : field. The incremental snapshot's accounts hash is included in the
184 : incremental persistence field. The hash in the filename is the snapshot
185 : hash. */
186 :
187 0 : int err;
188 0 : if( !snapshot_ctx->is_incremental ) {
189 0 : err = fd_snapshot_service_hash( &snapshot_ctx->acc_hash,
190 0 : &snapshot_ctx->snap_hash,
191 0 : &snapshot_ctx->slot_bank,
192 0 : &snapshot_ctx->epoch_bank,
193 0 : snapshot_ctx->acc_mgr->funk,
194 0 : snapshot_ctx->tpool,
195 0 : snapshot_ctx->valloc );
196 0 : accounts_db->bank_hash_info.accounts_hash = snapshot_ctx->acc_hash;
197 0 : } else {
198 0 : err = fd_snapshot_service_inc_hash( &snapshot_ctx->acc_hash,
199 0 : &snapshot_ctx->snap_hash,
200 0 : &snapshot_ctx->slot_bank,
201 0 : &snapshot_ctx->epoch_bank,
202 0 : snapshot_ctx->acc_mgr->funk,
203 0 : incremental_keys,
204 0 : incremental_key_cnt,
205 0 : snapshot_ctx->valloc );
206 0 : fd_valloc_free( snapshot_ctx->valloc, incremental_keys );
207 :
208 0 : fd_memset( &accounts_db->bank_hash_info.accounts_hash, 0, sizeof(fd_hash_t) );
209 0 : }
210 :
211 0 : FD_LOG_NOTICE(( "Hashes calculated acc_hash=%s snapshot_hash=%s",
212 0 : FD_BASE58_ENC_32_ALLOCA(&snapshot_ctx->acc_hash),
213 0 : FD_BASE58_ENC_32_ALLOCA(&snapshot_ctx->snap_hash) ));
214 :
215 0 : if( FD_UNLIKELY( err ) ) {
216 0 : FD_LOG_ERR(( "Unable to calculate snapshot hash" ));
217 0 : }
218 :
219 0 : fd_memset( &accounts_db->bank_hash_info.stats, 0, sizeof(fd_bank_hash_stats_t) );
220 :
221 : /* Now, we have calculated the relevant hashes for the accounts.
222 : Because the files are serially written out for tar and we need to prepend
223 : the manifest, we must reserve space in the archive for the solana manifest. */
224 :
225 0 : if( snapshot_ctx->is_incremental ) {
226 0 : manifest->bank_incremental_snapshot_persistence = fd_valloc_malloc( snapshot_ctx->valloc,
227 0 : FD_BANK_INCREMENTAL_SNAPSHOT_PERSISTENCE_ALIGN,
228 0 : sizeof(fd_bank_incremental_snapshot_persistence_t) );
229 0 : }
230 :
231 0 : ulong manifest_sz = fd_solana_manifest_serializable_size( manifest );
232 :
233 0 : char buffer[ FD_SNAPSHOT_DIR_MAX ];
234 0 : err = snprintf( buffer, FD_SNAPSHOT_DIR_MAX, "snapshots/%lu/%lu", snapshot_ctx->slot, snapshot_ctx->slot );
235 0 : if( FD_UNLIKELY( err<0 ) ) {
236 0 : FD_LOG_ERR(( "Unable to format manifest name string" ));
237 0 : }
238 :
239 0 : err = fd_tar_writer_new_file( writer, buffer );
240 0 : if( FD_UNLIKELY( err ) ) {
241 0 : FD_LOG_ERR(( "Unable to create snapshot manifest file" ));
242 0 : }
243 :
244 : /* TODO: We want to eliminate having to write back into the tar file. This
245 : will enable the snapshot service to only use one file per snapshot.
246 : In order to do this, we must precompute the index in the manifest
247 : completely. This will allow us to stream out a compressed snapshot. */
248 :
249 0 : err = fd_tar_writer_make_space( writer, manifest_sz );
250 0 : if( FD_UNLIKELY( err ) ) {
251 0 : FD_LOG_ERR(( "Unable to make space for snapshot manifest file" ));
252 0 : }
253 :
254 0 : err = fd_tar_writer_fini_file( writer );
255 0 : if( FD_UNLIKELY( err ) ) {
256 0 : FD_LOG_ERR(( "Unable to finalize snapshot manifest file" ));
257 0 : }
258 :
259 : /* We have made space for the manifest and are ready to append the append
260 : vec files directly into the tar archive. We will iterate through all of
261 : the records in the funk root and create/populate an append vec for
262 : previous slots. Just record the pubkeys for the latest slot to populate
263 : the append vec after. If the append vec is full, write into the next one. */
264 :
265 0 : ulong curr_slot = 1UL;
266 0 : fd_snapshot_acc_vec_t * prev_accs = &accounts_db->storages[ curr_slot ].account_vecs[ 0UL ];
267 :
268 0 : err = snprintf( buffer, FD_SNAPSHOT_DIR_MAX, "accounts/%lu.%lu", snapshot_ctx->slot - curr_slot, prev_accs->id );
269 0 : if( FD_UNLIKELY( err<0 ) ) {
270 0 : FD_LOG_ERR(( "Unable to format previous accounts name string" ));
271 0 : }
272 :
273 0 : err = fd_tar_writer_new_file( writer, buffer );
274 0 : if( FD_UNLIKELY( err ) ) {
275 0 : FD_LOG_ERR(( "Unable to create previous accounts file" ));
276 0 : }
277 :
278 0 : fd_funk_rec_t * * tombstones = snapshot_ctx->is_incremental ? NULL :
279 0 : fd_valloc_malloc( snapshot_ctx->valloc, alignof(fd_funk_rec_t*), sizeof(fd_funk_rec_t*) * tombstones_cnt );
280 0 : tombstones_cnt = 0UL;
281 :
282 0 : for( fd_funk_rec_t const * rec = fd_funk_txn_first_rec( funk, NULL ); NULL != rec; rec = fd_funk_txn_next_rec( funk, rec ) ) {
283 :
284 : /* Get the account data. */
285 :
286 0 : if( !fd_funk_key_is_acc( rec->pair.key ) ) {
287 0 : continue;
288 0 : }
289 :
290 0 : fd_pubkey_t const * pubkey = fd_type_pun_const( rec->pair.key[0].uc );
291 0 : int is_tombstone = rec->flags & FD_FUNK_REC_FLAG_ERASE;
292 0 : uchar const * raw = fd_funk_val( rec, fd_funk_wksp( funk ) );
293 0 : fd_account_meta_t * metadata = is_tombstone ? fd_snapshot_create_get_default_meta( fd_funk_rec_get_erase_data( rec ) ) :
294 0 : (fd_account_meta_t*)raw;
295 :
296 0 : if( !snapshot_ctx->is_incremental && is_tombstone ) {
297 : /* If we are in a full snapshot, we need to gather all of the accounts
298 : that we plan on deleting. */
299 0 : tombstones[ tombstones_cnt++ ] = (fd_funk_rec_t*)rec;
300 0 : }
301 :
302 0 : if( !metadata ) {
303 0 : continue;
304 0 : }
305 :
306 0 : if( metadata->magic!=FD_ACCOUNT_META_MAGIC ) {
307 0 : continue;
308 0 : }
309 :
310 : /* Don't iterate through accounts that were touched before the last full
311 : snapshot. */
312 0 : if( snapshot_ctx->is_incremental && metadata->slot<=snapshot_ctx->last_snap_slot ) {
313 0 : continue;
314 0 : }
315 :
316 0 : uchar const * acc_data = raw + metadata->hlen;
317 :
318 : /* All accounts that were touched in the snapshot slot should be in
319 : a different append vec so that Agave can calculate the snapshot slot's
320 : bank hash. We don't want to include them in an arbitrary append vec. */
321 :
322 0 : if( metadata->slot==snapshot_ctx->slot ) {
323 0 : snapshot_slot_keys[ snapshot_slot_key_cnt++ ] = (fd_pubkey_t*)pubkey;
324 0 : continue;
325 0 : }
326 :
327 : /* We don't want to iterate over tombstones if the snapshot is not
328 : incremental */
329 0 : if( !snapshot_ctx->is_incremental && is_tombstone ) {
330 0 : continue;
331 0 : }
332 :
333 0 : ulong new_sz = prev_accs->file_sz + sizeof(fd_solana_account_hdr_t) + fd_ulong_align_up( metadata->dlen, FD_SNAPSHOT_ACC_ALIGN );
334 :
335 0 : if( new_sz>FD_SNAPSHOT_APPEND_VEC_SZ_MAX ) {
336 :
337 : /* When the current append vec is full, finish writing it, start writing
338 : into the next append vec. */
339 :
340 0 : err = fd_tar_writer_fini_file( writer );
341 0 : if( FD_UNLIKELY( err ) ) {
342 0 : FD_LOG_ERR(( "Unable to finalize previous accounts file" ));
343 0 : }
344 :
345 0 : prev_accs = &accounts_db->storages[ ++curr_slot ].account_vecs[ 0UL ];
346 :
347 0 : err = snprintf( buffer, FD_SNAPSHOT_DIR_MAX, "accounts/%lu.%lu", snapshot_ctx->slot - curr_slot, prev_accs->id );
348 0 : if( FD_UNLIKELY( err<0 ) ) {
349 0 : FD_LOG_ERR(( "Unable to format previous accounts name string" ));
350 0 : }
351 :
352 0 : err = fd_tar_writer_new_file( writer, buffer );
353 0 : if( FD_UNLIKELY( err ) ) {
354 0 : FD_LOG_ERR(( "Unable to create previous accounts file" ));
355 0 : }
356 0 : }
357 :
358 0 : prev_accs->file_sz += sizeof(fd_solana_account_hdr_t) + fd_ulong_align_up( metadata->dlen, FD_SNAPSHOT_ACC_ALIGN );
359 :
360 :
361 : /* Write out the header. */
362 :
363 0 : fd_solana_account_hdr_t header = {0};
364 : /* Stored meta */
365 0 : header.meta.write_version_obsolete = 0UL;
366 0 : header.meta.data_len = metadata->dlen;
367 0 : fd_memcpy( header.meta.pubkey, pubkey, sizeof(fd_pubkey_t) );
368 : /* Account Meta */
369 0 : header.info.lamports = metadata->info.lamports;
370 0 : header.info.rent_epoch = header.info.lamports ? metadata->info.rent_epoch : 0UL;
371 0 : fd_memcpy( header.info.owner, metadata->info.owner, sizeof(fd_pubkey_t) );
372 0 : header.info.executable = metadata->info.executable;
373 : /* Hash */
374 0 : fd_memcpy( &header.hash, metadata->hash, sizeof(fd_hash_t) );
375 :
376 0 : err = fd_tar_writer_write_file_data( writer, &header, sizeof(fd_solana_account_hdr_t) );
377 0 : if( FD_UNLIKELY( err ) ) {
378 0 : FD_LOG_ERR(( "Unable to stream out account header to tar archive" ));
379 0 : }
380 :
381 : /* Write out the file data. */
382 :
383 0 : err = fd_tar_writer_write_file_data( writer, acc_data, metadata->dlen );
384 0 : if( FD_UNLIKELY( err ) ) {
385 0 : FD_LOG_ERR(( "Unable to stream out account data to tar archive" ));
386 0 : }
387 :
388 0 : ulong align_sz = fd_ulong_align_up( metadata->dlen, FD_SNAPSHOT_ACC_ALIGN ) - metadata->dlen;
389 0 : err = fd_tar_writer_write_file_data( writer, padding, align_sz );
390 0 : if( FD_UNLIKELY( err ) ) {
391 0 : FD_LOG_ERR( ("Unable to stream out account padding to tar archive" ));
392 0 : }
393 0 : }
394 :
395 0 : err = fd_tar_writer_fini_file( writer );
396 0 : if( FD_UNLIKELY( err ) ) {
397 0 : FD_LOG_ERR(( "Unable to finalize previous accounts file" ));
398 0 : }
399 :
400 : /* Now write out the append vec for the snapshot slot. Again, this is needed
401 : because the snapshot slot's accounts must be in their append vec in order
402 : to verify the bank hash for the snapshot slot in the Agave client. */
403 :
404 0 : fd_snapshot_acc_vec_t * curr_accs = &accounts_db->storages[ 0UL ].account_vecs[ 0UL ];
405 0 : err = snprintf( buffer, FD_SNAPSHOT_DIR_MAX, "accounts/%lu.%lu", snapshot_ctx->slot, curr_accs->id );
406 0 : if( FD_UNLIKELY( err<0 ) ) {
407 0 : FD_LOG_ERR(( "Unable to format current accounts name string" ));
408 0 : }
409 :
410 0 : err = fd_tar_writer_new_file( writer, buffer );
411 0 : if( FD_UNLIKELY( err ) ) {
412 0 : FD_LOG_ERR(( "Unable to create current accounts file" ));
413 0 : }
414 :
415 0 : for( ulong i=0UL; i<snapshot_slot_key_cnt; i++ ) {
416 :
417 0 : fd_pubkey_t const * pubkey = snapshot_slot_keys[i];
418 0 : fd_funk_rec_key_t key = fd_acc_funk_key( pubkey );
419 :
420 0 : fd_funk_rec_t const * rec = fd_funk_rec_query( funk, NULL, &key );
421 0 : if( FD_UNLIKELY( !rec ) ) {
422 0 : FD_LOG_ERR(( "Previously found record can no longer be found" ));
423 0 : }
424 :
425 0 : int is_tombstone = rec->flags & FD_FUNK_REC_FLAG_ERASE;
426 0 : uchar const * raw = fd_funk_val( rec, fd_funk_wksp( funk ) );
427 0 : fd_account_meta_t * metadata = is_tombstone ? fd_snapshot_create_get_default_meta( fd_funk_rec_get_erase_data( rec ) ) :
428 0 : (fd_account_meta_t*)raw;
429 :
430 0 : if( FD_UNLIKELY( !metadata ) ) {
431 0 : FD_LOG_ERR(( "Record should have non-NULL metadata" ));
432 0 : }
433 :
434 0 : if( FD_UNLIKELY( metadata->magic!=FD_ACCOUNT_META_MAGIC ) ) {
435 0 : FD_LOG_ERR(( "Record should have valid magic" ));
436 0 : }
437 :
438 0 : uchar const * acc_data = raw + metadata->hlen;
439 :
440 0 : curr_accs->file_sz += sizeof(fd_solana_account_hdr_t) + fd_ulong_align_up( metadata->dlen, FD_SNAPSHOT_ACC_ALIGN );
441 :
442 : /* Write out the header. */
443 0 : fd_solana_account_hdr_t header = {0};
444 : /* Stored meta */
445 0 : header.meta.write_version_obsolete = 0UL;
446 0 : header.meta.data_len = metadata->dlen;
447 0 : fd_memcpy( header.meta.pubkey, pubkey, sizeof(fd_pubkey_t) );
448 : /* Account Meta */
449 0 : header.info.lamports = metadata->info.lamports;
450 0 : header.info.rent_epoch = header.info.lamports ? metadata->info.rent_epoch : 0UL;
451 0 : fd_memcpy( header.info.owner, metadata->info.owner, sizeof(fd_pubkey_t) );
452 0 : header.info.executable = metadata->info.executable;
453 : /* Hash */
454 0 : fd_memcpy( &header.hash, metadata->hash, sizeof(fd_hash_t) );
455 :
456 :
457 0 : err = fd_tar_writer_write_file_data( writer, &header, sizeof(fd_solana_account_hdr_t) );
458 0 : if( FD_UNLIKELY( err ) ) {
459 0 : FD_LOG_ERR(( "Unable to stream out account header to tar archive" ));
460 0 : }
461 0 : err = fd_tar_writer_write_file_data( writer, acc_data, metadata->dlen );
462 0 : if( FD_UNLIKELY( err ) ) {
463 0 : FD_LOG_ERR(( "Unable to stream out account data to tar archive" ));
464 0 : }
465 0 : ulong align_sz = fd_ulong_align_up( metadata->dlen, FD_SNAPSHOT_ACC_ALIGN ) - metadata->dlen;
466 0 : err = fd_tar_writer_write_file_data( writer, padding, align_sz );
467 0 : if( FD_UNLIKELY( err ) ) {
468 0 : FD_LOG_ERR(( "Unable to stream out account padding to tar archive" ));
469 0 : }
470 0 : }
471 :
472 0 : err = fd_tar_writer_fini_file( writer );
473 0 : if( FD_UNLIKELY( err ) ) {
474 0 : FD_LOG_ERR(( "Unable to finish writing out file" ));
475 0 : }
476 :
477 : /* TODO: At this point we must implement compaction to the snapshot service.
478 : Without this, we are actually not cleaning up any tombstones from funk. */
479 :
480 0 : if( snapshot_ctx->is_incremental ) {
481 0 : fd_funk_start_write( funk );
482 0 : err = fd_funk_rec_forget( funk, tombstones, tombstones_cnt );
483 0 : if( FD_UNLIKELY( err!=FD_FUNK_SUCCESS ) ) {
484 0 : FD_LOG_ERR(( "Unable to forget tombstones" ));
485 0 : }
486 0 : FD_LOG_NOTICE(( "Compacted %lu tombstone records", tombstones_cnt ));
487 0 : fd_funk_end_write( funk );
488 0 : }
489 :
490 0 : fd_valloc_free( snapshot_ctx->valloc, snapshot_slot_keys );
491 0 : fd_valloc_free( snapshot_ctx->valloc, tombstones );
492 :
493 0 : }
494 :
/* Convert the runtime's stakes cache (old_stakes) into a form that can be
   reserialized into the Agave-compatible snapshot format (new_stakes).
   Vote account data is re-read from the accounts db, stale stake
   delegations are evicted, and live delegations are refreshed in place
   (old_stakes' delegation pool is mutated and then shared with new_stakes).
   Aborts via FD_LOG_ERR on any lookup/decode failure. */
static void
fd_snapshot_create_serialiable_stakes( fd_snapshot_ctx_t *        snapshot_ctx,
                                       fd_stakes_t *              old_stakes,
                                       fd_stakes_serializable_t * new_stakes ) {

  /* The deserialized stakes cache that is used by the runtime can't be
     reserialized into the format that Agave uses. For every vote account
     in the stakes struct, the Firedancer client holds a decoded copy of the
     vote state. However, this vote state can't be reserialized back into the
     full vote account data.

     This poses a problem in the Agave client because upon boot, Agave
     verifies that for all of the vote accounts in the stakes struct, the data
     in the cache is the same as the data in the accounts db.

     The other problem is that the Firedancer stakes cache does not evict old
     entries and doesn't update delegations within the cache. The cache will
     just insert new pubkeys as stake accounts are created/delegated to. To
     make the cache conformant for the snapshot, old accounts should be removed
     from the snapshot and all of the delegations should be updated. */

  /* First populate the vote accounts using the vote accounts/stakes cache.
     We can populate over all of the fields except we can't reserialize the
     vote account data. Instead we will copy over the raw contents of all of
     the vote accounts. */

  /* NOTE(review): 15000 appears to be a floor on the pool size so the map can
     absorb growth — confirm where this minimum comes from. */
  ulong vote_accounts_len                      = fd_vote_accounts_pair_t_map_size( old_stakes->vote_accounts.vote_accounts_pool, old_stakes->vote_accounts.vote_accounts_root );
  new_stakes->vote_accounts.vote_accounts_pool = fd_vote_accounts_pair_serializable_t_map_alloc( snapshot_ctx->valloc, fd_ulong_max(vote_accounts_len, 15000 ) );
  new_stakes->vote_accounts.vote_accounts_root = NULL;

  for( fd_vote_accounts_pair_t_mapnode_t * n = fd_vote_accounts_pair_t_map_minimum(
    old_stakes->vote_accounts.vote_accounts_pool,
    old_stakes->vote_accounts.vote_accounts_root );
    n;
    n = fd_vote_accounts_pair_t_map_successor( old_stakes->vote_accounts.vote_accounts_pool, n ) ) {

    fd_vote_accounts_pair_serializable_t_mapnode_t * new_node = fd_vote_accounts_pair_serializable_t_map_acquire( new_stakes->vote_accounts.vote_accounts_pool );
    new_node->elem.key   = n->elem.key;
    new_node->elem.stake = n->elem.stake;
    /* Now to populate the value, lookup the account using the acc mgr.
       The raw account bytes are copied verbatim so they round-trip through
       Agave's cache-vs-accounts-db consistency check. */
    FD_BORROWED_ACCOUNT_DECL( vote_acc );
    int err = fd_acc_mgr_view( snapshot_ctx->acc_mgr, NULL, &n->elem.key, vote_acc );
    if( FD_UNLIKELY( err ) ) {
      FD_LOG_ERR(( "Failed to view vote account from stakes cache %s", FD_BASE58_ENC_32_ALLOCA(&n->elem.key) ));
    }

    new_node->elem.value.lamports   = vote_acc->const_meta->info.lamports;
    new_node->elem.value.data_len   = vote_acc->const_meta->dlen;
    new_node->elem.value.data       = fd_valloc_malloc( snapshot_ctx->valloc, 8UL, vote_acc->const_meta->dlen );
    fd_memcpy( new_node->elem.value.data, vote_acc->const_data, vote_acc->const_meta->dlen );
    fd_memcpy( &new_node->elem.value.owner, &vote_acc->const_meta->info.owner, sizeof(fd_pubkey_t) );
    new_node->elem.value.executable = vote_acc->const_meta->info.executable;
    new_node->elem.value.rent_epoch = vote_acc->const_meta->info.rent_epoch;
    fd_vote_accounts_pair_serializable_t_map_insert( new_stakes->vote_accounts.vote_accounts_pool, &new_stakes->vote_accounts.vote_accounts_root, new_node );

  }

  /* Stale stake delegations should also be removed or updated in the cache.
     TODO: This will likely be changed in the near future as the stake
     program is migrated to a bpf program. It will likely be replaced by an
     index of stake/vote accounts. */

  /* The successor (nn) is captured before a node may be removed so the
     traversal stays valid while evicting. */
  FD_BORROWED_ACCOUNT_DECL( stake_acc );
  fd_delegation_pair_t_mapnode_t * nn = NULL;
  for( fd_delegation_pair_t_mapnode_t * n = fd_delegation_pair_t_map_minimum(
      old_stakes->stake_delegations_pool, old_stakes->stake_delegations_root ); n; n=nn ) {

    nn = fd_delegation_pair_t_map_successor( old_stakes->stake_delegations_pool, n );

    int err = fd_acc_mgr_view( snapshot_ctx->acc_mgr, NULL, &n->elem.account, stake_acc );
    if( FD_UNLIKELY( err ) ) {
      /* If the stake account doesn't exist, the cache is stale and the entry
         just needs to be evicted. */
      fd_delegation_pair_t_map_remove( old_stakes->stake_delegations_pool, &old_stakes->stake_delegations_root, n );
      fd_delegation_pair_t_map_release( old_stakes->stake_delegations_pool, n );
    } else {
      /* Otherwise, just update the delegation in case it is stale. */
      fd_bincode_decode_ctx_t ctx = {
        .data    = stake_acc->const_data,
        .dataend = stake_acc->const_data + stake_acc->const_meta->dlen,
        .valloc  = snapshot_ctx->valloc
      };
      fd_stake_state_v2_t stake_state = {0};
      err = fd_stake_state_v2_decode( &stake_state, &ctx );
      if( FD_UNLIKELY( err ) ) {
        FD_LOG_ERR(( "Failed to decode stake state" ));
      }
      /* NOTE(review): the decoded state's discriminant is not checked here —
         this assumes every cached delegation's account decodes to the
         "stake" variant; confirm uninitialized/initialized variants cannot
         appear in the delegations pool. */
      n->elem.delegation = stake_state.inner.stake.stake.delegation;
    }
  }

  /* Copy over the rest of the fields as they are the same. Note that the
     delegation pool/root are shared with old_stakes, not deep-copied. */

  new_stakes->stake_delegations_pool = old_stakes->stake_delegations_pool;
  new_stakes->stake_delegations_root = old_stakes->stake_delegations_root;
  new_stakes->unused                 = old_stakes->unused;
  new_stakes->epoch                  = old_stakes->epoch;
  new_stakes->stake_history          = old_stakes->stake_history;

}
595 :
596 : static inline void
597 : fd_snapshot_create_populate_bank( fd_snapshot_ctx_t * snapshot_ctx,
598 0 : fd_serializable_versioned_bank_t * bank ) {
599 :
600 0 : fd_slot_bank_t * slot_bank = &snapshot_ctx->slot_bank;
601 0 : fd_epoch_bank_t * epoch_bank = &snapshot_ctx->epoch_bank;
602 :
603 : /* The blockhash queue has to be copied over along with all of its entries.
604 : As a note, the size is 300 but in fact is of size 301 due to a knwon bug
605 : in the agave client that is emulated by the firedancer client. */
606 :
607 0 : bank->blockhash_queue.last_hash_index = slot_bank->block_hash_queue.last_hash_index;
608 0 : bank->blockhash_queue.last_hash = fd_valloc_malloc( snapshot_ctx->valloc, FD_HASH_ALIGN, FD_HASH_FOOTPRINT );
609 0 : fd_memcpy( bank->blockhash_queue.last_hash, slot_bank->block_hash_queue.last_hash, sizeof(fd_hash_t) );
610 :
611 0 : bank->blockhash_queue.ages_len = fd_hash_hash_age_pair_t_map_size( slot_bank->block_hash_queue.ages_pool, slot_bank->block_hash_queue.ages_root);
612 0 : bank->blockhash_queue.ages = fd_valloc_malloc( snapshot_ctx->valloc, FD_HASH_HASH_AGE_PAIR_ALIGN, bank->blockhash_queue.ages_len * sizeof(fd_hash_hash_age_pair_t) );
613 0 : bank->blockhash_queue.max_age = FD_BLOCKHASH_QUEUE_SIZE;
614 :
615 0 : fd_block_hash_queue_t * queue = &slot_bank->block_hash_queue;
616 0 : fd_hash_hash_age_pair_t_mapnode_t * nn = NULL;
617 0 : ulong blockhash_queue_idx = 0UL;
618 0 : for( fd_hash_hash_age_pair_t_mapnode_t * n = fd_hash_hash_age_pair_t_map_minimum( queue->ages_pool, queue->ages_root ); n; n = nn ) {
619 0 : nn = fd_hash_hash_age_pair_t_map_successor( queue->ages_pool, n );
620 0 : fd_memcpy( &bank->blockhash_queue.ages[ blockhash_queue_idx++ ], &n->elem, sizeof(fd_hash_hash_age_pair_t) );
621 0 : }
622 :
623 :
624 :
625 : /* Ancestor can be omitted to boot off of for both clients */
626 :
627 0 : bank->ancestors_len = 0UL;
628 0 : bank->ancestors = NULL;
629 :
630 0 : bank->hash = slot_bank->banks_hash;
631 0 : bank->parent_hash = slot_bank->prev_banks_hash;
632 0 : bank->parent_slot = slot_bank->prev_slot;
633 0 : bank->hard_forks = slot_bank->hard_forks;
634 0 : bank->transaction_count = slot_bank->transaction_count;
635 0 : bank->signature_count = slot_bank->parent_signature_cnt;
636 0 : bank->capitalization = slot_bank->capitalization;
637 0 : bank->tick_height = slot_bank->tick_height;
638 0 : bank->max_tick_height = slot_bank->max_tick_height;
639 :
640 : /* The hashes_per_tick needs to be copied over from the epoch bank because
641 : the pointer could go out of bounds during an epoch boundary. */
642 0 : bank->hashes_per_tick = fd_valloc_malloc( snapshot_ctx->valloc, alignof(ulong), sizeof(ulong) );
643 0 : fd_memcpy( bank->hashes_per_tick, &epoch_bank->hashes_per_tick, sizeof(ulong) );
644 :
645 0 : bank->ticks_per_slot = FD_TICKS_PER_SLOT;
646 0 : bank->ns_per_slot = epoch_bank->ns_per_slot;
647 0 : bank->genesis_creation_time = epoch_bank->genesis_creation_time;
648 0 : bank->slots_per_year = epoch_bank->slots_per_year;
649 :
650 : /* This value can be set to 0 because the Agave client recomputes this value
651 : and the firedancer client doesn't use it. */
652 :
653 0 : bank->accounts_data_len = 0UL;
654 :
655 0 : bank->slot = snapshot_ctx->slot;
656 0 : bank->epoch = fd_slot_to_epoch( &epoch_bank->epoch_schedule, bank->slot, NULL );
657 0 : bank->block_height = slot_bank->block_height;
658 :
659 : /* Collector id can be left as null for both clients */
660 :
661 0 : fd_memset( &bank->collector_id, 0, sizeof(fd_pubkey_t) );
662 :
663 0 : bank->collector_fees = slot_bank->collected_execution_fees + slot_bank->collected_priority_fees;
664 0 : bank->fee_calculator.lamports_per_signature = slot_bank->lamports_per_signature;
665 0 : bank->fee_rate_governor = slot_bank->fee_rate_governor;
666 0 : bank->collected_rent = slot_bank->collected_rent;
667 :
668 0 : bank->rent_collector.epoch = bank->epoch;
669 0 : bank->rent_collector.epoch_schedule = epoch_bank->rent_epoch_schedule;
670 0 : bank->rent_collector.slots_per_year = epoch_bank->slots_per_year;
671 0 : bank->rent_collector.rent = epoch_bank->rent;
672 :
673 0 : bank->epoch_schedule = epoch_bank->epoch_schedule;
674 0 : bank->inflation = epoch_bank->inflation;
675 :
676 : /* Unused accounts can be left as NULL for both clients. */
677 :
678 0 : fd_memset( &bank->unused_accounts, 0, sizeof(fd_unused_accounts_t) );
679 :
680 : /* We need to copy over the stakes for two epochs despite the Agave client
681 : providing the stakes for 6 epochs. These stakes need to be copied over
682 : because of the fact that the leader schedule computation uses the two
683 : previous epoch stakes.
684 :
685 : TODO: This field has been deprecated by agave and has instead been
686 : replaced with the versioned epoch stakes field in the manifest. The
687 : firedancer client will populate the deprecated field. */
688 :
689 0 : fd_epoch_epoch_stakes_pair_t * relevant_epoch_stakes = fd_valloc_malloc( snapshot_ctx->valloc, FD_EPOCH_EPOCH_STAKES_PAIR_ALIGN, 2UL * sizeof(fd_epoch_epoch_stakes_pair_t) );
690 0 : fd_memset( &relevant_epoch_stakes[0], 0UL, sizeof(fd_epoch_epoch_stakes_pair_t) );
691 0 : fd_memset( &relevant_epoch_stakes[1], 0UL, sizeof(fd_epoch_epoch_stakes_pair_t) );
692 0 : relevant_epoch_stakes[0].key = bank->epoch;
693 0 : relevant_epoch_stakes[0].value.stakes.vote_accounts = slot_bank->epoch_stakes;
694 0 : relevant_epoch_stakes[1].key = bank->epoch+1UL;
695 0 : relevant_epoch_stakes[1].value.stakes.vote_accounts = epoch_bank->next_epoch_stakes;
696 :
697 0 : bank->epoch_stakes_len = 2UL;
698 0 : bank->epoch_stakes = relevant_epoch_stakes;
699 0 : bank->is_delta = snapshot_ctx->is_incremental;
700 :
701 : /* The firedancer runtime currently maintains a version of the stakes which
702 : can't be reserialized into a format that is compatible with the Solana
703 : snapshot format. Therefore, we must recompute the data structure using
704 : the pubkeys from the stakes cache that is currently in the epoch context. */
705 :
706 0 : fd_snapshot_create_serialiable_stakes( snapshot_ctx, &epoch_bank->stakes, &bank->stakes );
707 :
708 0 : }
709 :
/* fd_snapshot_create_setup_and_validate_ctx prepares the snapshot context
   for snapshot creation: it initializes the account manager, decodes the
   epoch bank and slot bank records out of funk into the context, sanity
   checks the context fields, and truncates the two scratch files used
   during snapshot creation.  Every failure aborts the process via
   FD_LOG_ERR. */
static inline void
fd_snapshot_create_setup_and_validate_ctx( fd_snapshot_ctx_t * snapshot_ctx ) {

  fd_funk_t * funk = snapshot_ctx->funk;

  /* Initialize the account manager. */

  uchar * mem = fd_valloc_malloc( snapshot_ctx->valloc, FD_ACC_MGR_ALIGN, FD_ACC_MGR_FOOTPRINT );
  snapshot_ctx->acc_mgr = fd_acc_mgr_new( mem, funk );
  if( FD_UNLIKELY( !snapshot_ctx->acc_mgr ) ) {
    FD_LOG_ERR(( "Failed to initialize account manager" ));
  }

  /* First the epoch bank.  The funk record stores a uint magic (encoding
     format tag) followed by the bincode-encoded bank. */

  fd_funk_rec_key_t epoch_id = fd_runtime_epoch_bank_key();
  fd_funk_rec_t const * epoch_rec = fd_funk_rec_query( funk, NULL, &epoch_id );
  if( FD_UNLIKELY( !epoch_rec ) ) {
    FD_LOG_ERR(( "Failed to read epoch bank record: missing record" ));
  }
  void * epoch_val = fd_funk_val( epoch_rec, fd_funk_wksp( funk ) );

  /* The record must be large enough to at least hold the format magic. */

  if( FD_UNLIKELY( fd_funk_val_sz( epoch_rec )<sizeof(uint) ) ) {
    FD_LOG_ERR(( "Failed to read epoch bank record: empty record" ));
  }

  uint epoch_magic = *(uint*)epoch_val;

  /* Decode everything after the leading magic. */

  fd_bincode_decode_ctx_t epoch_decode_ctx = {
    .data = (uchar*)epoch_val + sizeof(uint),
    .dataend = (uchar*)epoch_val + fd_funk_val_sz( epoch_rec ),
    .valloc = snapshot_ctx->valloc
  };

  if( FD_UNLIKELY( epoch_magic!=FD_RUNTIME_ENC_BINCODE ) ) {
    FD_LOG_ERR(( "Epoch bank record has wrong magic" ));
  }

  int err = fd_epoch_bank_decode( &snapshot_ctx->epoch_bank, &epoch_decode_ctx );
  if( FD_UNLIKELY( err!=FD_BINCODE_SUCCESS ) ) {
    FD_LOG_ERR(( "Failed to decode epoch bank" ));
  }

  /* Now the slot bank.  Same record layout: a uint magic followed by the
     bincode-encoded bank. */

  fd_funk_rec_key_t slot_id = fd_runtime_slot_bank_key();
  fd_funk_rec_t const * slot_rec = fd_funk_rec_query( funk, NULL, &slot_id );
  if( FD_UNLIKELY( !slot_rec ) ) {
    FD_LOG_ERR(( "Failed to read slot bank record: missing record" ));
  }
  void * slot_val = fd_funk_val( slot_rec, fd_funk_wksp( funk ) );

  if( FD_UNLIKELY( fd_funk_val_sz( slot_rec )<sizeof(uint) ) ) {
    FD_LOG_ERR(( "Failed to read slot bank record: empty record" ));
  }

  uint slot_magic = *(uint*)slot_val;

  fd_bincode_decode_ctx_t slot_decode_ctx = {
    .data = (uchar*)slot_val + sizeof(uint),
    .dataend = (uchar*)slot_val + fd_funk_val_sz( slot_rec ),
    .valloc = snapshot_ctx->valloc
  };

  if( FD_UNLIKELY( slot_magic!=FD_RUNTIME_ENC_BINCODE ) ) {
    FD_LOG_ERR(( "Slot bank record has wrong magic" ));
  }

  err = fd_slot_bank_decode( &snapshot_ctx->slot_bank, &slot_decode_ctx );
  if( FD_UNLIKELY( err!=FD_BINCODE_SUCCESS ) ) {
    FD_LOG_ERR(( "Failed to decode slot bank" ));
  }

  /* Validate that the snapshot context is setup correctly */

  if( FD_UNLIKELY( !snapshot_ctx->out_dir ) ) {
    FD_LOG_ERR(( "Snapshot directory is not set" ));
  }

  /* A snapshot can only be produced for a slot at or behind the slot bank's
     current slot. */

  if( FD_UNLIKELY( snapshot_ctx->slot>snapshot_ctx->slot_bank.slot ) ) {
    FD_LOG_ERR(( "Snapshot slot=%lu is greater than the current slot=%lu",
                 snapshot_ctx->slot, snapshot_ctx->slot_bank.slot ));
  }

  /* Truncate the two files used for snapshot creation and seek to its start. */

  /* lseek to offset 0 with SEEK_SET returns 0 on success; any other value
     (including the -1 error return) is treated as a failure here. */

  long seek = lseek( snapshot_ctx->tmp_fd, 0, SEEK_SET );
  if( FD_UNLIKELY( seek ) ) {
    FD_LOG_ERR(( "Failed to seek to the start of the file" ));
  }

  if( FD_UNLIKELY( ftruncate( snapshot_ctx->tmp_fd, 0UL ) < 0 ) ) {
    FD_LOG_ERR(( "Failed to truncate the temporary file" ));
  }

  seek = lseek( snapshot_ctx->snapshot_fd, 0, SEEK_SET );
  if( FD_UNLIKELY( seek ) ) {
    FD_LOG_ERR(( "Failed to seek to the start of the file" ));
  }

  if( FD_UNLIKELY( ftruncate( snapshot_ctx->snapshot_fd, 0UL ) < 0 ) ) {
    FD_LOG_ERR(( "Failed to truncate the snapshot file" ));
  }

}
815 :
816 : static inline void
817 0 : fd_snapshot_create_setup_writer( fd_snapshot_ctx_t * snapshot_ctx ) {
818 :
819 : /* Setup a tar writer. */
820 :
821 0 : uchar * writer_mem = fd_valloc_malloc( snapshot_ctx->valloc, fd_tar_writer_align(), fd_tar_writer_footprint() );
822 0 : snapshot_ctx->writer = fd_tar_writer_new( writer_mem, snapshot_ctx->tmp_fd );
823 0 : if( FD_UNLIKELY( !snapshot_ctx->writer ) ) {
824 0 : FD_LOG_ERR(( "Unable to create a tar writer" ));
825 0 : }
826 0 : }
827 :
828 : static inline void
829 0 : fd_snapshot_create_write_version( fd_snapshot_ctx_t * snapshot_ctx ) {
830 :
831 : /* The first file in the tar archive should be the version file.. */
832 :
833 0 : int err = fd_tar_writer_new_file( snapshot_ctx->writer, FD_SNAPSHOT_VERSION_FILE );
834 0 : if( FD_UNLIKELY( err ) ) {
835 0 : FD_LOG_ERR(( "Failed to create the version file" ));
836 0 : }
837 :
838 0 : err = fd_tar_writer_write_file_data( snapshot_ctx->writer, FD_SNAPSHOT_VERSION, FD_SNAPSHOT_VERSION_LEN);
839 0 : if( FD_UNLIKELY( err ) ) {
840 0 : FD_LOG_ERR(( "Failed to create the version file" ));
841 0 : }
842 :
843 0 : err = fd_tar_writer_fini_file( snapshot_ctx->writer );
844 0 : if( FD_UNLIKELY( err ) ) {
845 0 : FD_LOG_ERR(( "Failed to create the version file" ));
846 0 : }
847 :
848 0 : }
849 :
850 : static inline void
851 0 : fd_snapshot_create_write_status_cache( fd_snapshot_ctx_t * snapshot_ctx ) {
852 :
853 0 : FD_SCRATCH_SCOPE_BEGIN {
854 :
855 : /* First convert the existing status cache into a snapshot-friendly format. */
856 :
857 0 : fd_bank_slot_deltas_t slot_deltas_new = {0};
858 0 : int err = fd_txncache_get_entries( snapshot_ctx->status_cache,
859 0 : &slot_deltas_new );
860 0 : if( FD_UNLIKELY( err ) ) {
861 0 : FD_LOG_ERR(( "Failed to get entries from the status cache" ));
862 0 : }
863 0 : ulong bank_slot_deltas_sz = fd_bank_slot_deltas_size( &slot_deltas_new );
864 0 : uchar * out_status_cache = fd_valloc_malloc( snapshot_ctx->valloc,
865 0 : FD_BANK_SLOT_DELTAS_ALIGN,
866 0 : bank_slot_deltas_sz );
867 0 : fd_bincode_encode_ctx_t encode_status_cache = {
868 0 : .data = out_status_cache,
869 0 : .dataend = out_status_cache + bank_slot_deltas_sz,
870 0 : };
871 0 : if( FD_UNLIKELY( fd_bank_slot_deltas_encode( &slot_deltas_new, &encode_status_cache ) ) ) {
872 0 : FD_LOG_ERR(( "Failed to encode the status cache" ));
873 0 : }
874 :
875 : /* Now write out the encoded buffer to the tar archive. */
876 :
877 0 : err = fd_tar_writer_new_file( snapshot_ctx->writer, FD_SNAPSHOT_STATUS_CACHE_FILE );
878 0 : if( FD_UNLIKELY( err ) ) {
879 0 : FD_LOG_ERR(( "Failed to create the status cache file" ));
880 0 : }
881 0 : err = fd_tar_writer_write_file_data( snapshot_ctx->writer, out_status_cache, bank_slot_deltas_sz );
882 0 : if( FD_UNLIKELY( err ) ) {
883 0 : FD_LOG_ERR(( "Failed to create the status cache file" ));
884 0 : }
885 0 : err = fd_tar_writer_fini_file( snapshot_ctx->writer );
886 0 : if( FD_UNLIKELY( err ) ) {
887 0 : FD_LOG_ERR(( "Failed to create the status cache file" ));
888 0 : }
889 :
890 : /* Registers all roots and unconstipates the status cache. */
891 :
892 0 : fd_txncache_flush_constipated_slots( snapshot_ctx->status_cache );
893 :
894 0 : fd_valloc_free( snapshot_ctx->valloc, out_status_cache );
895 :
896 0 : } FD_SCRATCH_SCOPE_END;
897 :
898 0 : }
899 :
/* fd_snapshot_create_write_manifest_and_acc_vecs builds the snapshot
   manifest (bank fields plus the append vec index), writes out the account
   files, back-fills the manifest into the space reserved for it in the tar
   archive, and then tears down the per-snapshot heap allocations.  For a
   full snapshot, the accounts hash and capitalization are returned to the
   caller through out_hash / out_capitalization. */
static inline void
fd_snapshot_create_write_manifest_and_acc_vecs( fd_snapshot_ctx_t * snapshot_ctx,
                                                fd_hash_t * out_hash,
                                                ulong * out_capitalization ) {


  fd_solana_manifest_serializable_t manifest = {0};

  /* Copy in all the fields of the bank. */

  fd_snapshot_create_populate_bank( snapshot_ctx, &manifest.bank );

  /* Populate the rest of the manifest, except for the append vec index. */

  manifest.lamports_per_signature = snapshot_ctx->slot_bank.lamports_per_signature;
  manifest.epoch_account_hash = &snapshot_ctx->slot_bank.epoch_account_hash;

  /* FIXME: The versioned epoch stakes needs to be implemented. Right now if
     we try to create a snapshot on or near an epoch boundary, we will produce
     an invalid snapshot. */

  manifest.versioned_epoch_stakes_len = 0UL;
  manifest.versioned_epoch_stakes = NULL;

  /* Populate the append vec index and write out the corresponding acc files. */

  ulong incr_capitalization = 0UL;
  fd_snapshot_create_populate_acc_vecs( snapshot_ctx, &manifest, snapshot_ctx->writer, &incr_capitalization );

  /* Once the append vec index is populated and the hashes are calculated,
     propagate the hashes to the correct fields. As a note, the last_snap_hash
     is the full snapshot's account hash.

     NOTE(review): bank_incremental_snapshot_persistence is dereferenced
     below; it is presumably allocated inside
     fd_snapshot_create_populate_acc_vecs when is_incremental is set (it is
     freed at the bottom of this function) -- confirm. */

  if( snapshot_ctx->is_incremental ) {
    manifest.bank_incremental_snapshot_persistence->full_slot = snapshot_ctx->last_snap_slot;
    fd_memcpy( &manifest.bank_incremental_snapshot_persistence->full_hash, snapshot_ctx->last_snap_acc_hash, sizeof(fd_hash_t) );
    manifest.bank_incremental_snapshot_persistence->full_capitalization = snapshot_ctx->last_snap_capitalization;
    manifest.bank_incremental_snapshot_persistence->incremental_hash = snapshot_ctx->acc_hash;
    manifest.bank_incremental_snapshot_persistence->incremental_capitalization = incr_capitalization;
  } else {
    memcpy( out_hash, &manifest.accounts_db.bank_hash_info.accounts_hash, sizeof(fd_hash_t) );
    *out_capitalization = snapshot_ctx->slot_bank.capitalization;
  }

  /* At this point, all of the account files are written out and the append
     vec index is populated in the manifest. We have already reserved space
     in the archive for the manifest. All we need to do now is encode the
     manifest and write it in. */

  ulong manifest_sz = fd_solana_manifest_serializable_size( &manifest );
  uchar * out_manifest = fd_valloc_malloc( snapshot_ctx->valloc, FD_SOLANA_MANIFEST_SERIALIZABLE_ALIGN, manifest_sz );

  fd_bincode_encode_ctx_t encode = {
    .data = out_manifest,
    .dataend = out_manifest + manifest_sz
  };

  int err = fd_solana_manifest_serializable_encode( &manifest, &encode );
  if( FD_UNLIKELY( err ) ) {
    FD_LOG_ERR(( "Failed to encode the manifest" ));
  }

  /* fd_tar_writer_fill_space back-fills the previously reserved region of
     the archive with the encoded manifest. */

  err = fd_tar_writer_fill_space( snapshot_ctx->writer, out_manifest, manifest_sz );
  if( FD_UNLIKELY( err ) ) {
    FD_LOG_ERR(( "Failed to write out the manifest" ));
  }

  void * mem = fd_tar_writer_delete( snapshot_ctx->writer );
  if( FD_UNLIKELY( !mem ) ) {
    FD_LOG_ERR(( "Unable to delete the tar writer" ));
  }

  fd_bincode_destroy_ctx_t destroy = {
    .valloc = snapshot_ctx->valloc
  };

  /* This is kind of a hack but we need to do this so we don't accidentally
     corrupt memory when we try to double destroy. Everything below is
     things that aren't stack allocated from the manifest including the banks. */

  fd_stakes_serializable_destroy( &manifest.bank.stakes, &destroy );
  fd_block_hash_vec_destroy( &manifest.bank.blockhash_queue, &destroy );
  fd_valloc_free( snapshot_ctx->valloc, manifest.bank.epoch_stakes );
  fd_epoch_bank_destroy( &snapshot_ctx->epoch_bank, &destroy );
  fd_slot_bank_destroy( &snapshot_ctx->slot_bank, &destroy );
  if( snapshot_ctx->is_incremental ) {
    fd_valloc_free( snapshot_ctx->valloc, manifest.bank_incremental_snapshot_persistence );
  }
  fd_valloc_free( snapshot_ctx->valloc, out_manifest );

}
991 :
992 : static inline void
993 0 : fd_snapshot_create_compress( fd_snapshot_ctx_t * snapshot_ctx ) {
994 :
995 : /* Compress the file using zstd. First open the non-compressed file and
996 : create a file for the compressed file. The reason why we can't do this
997 : as we stream out the snapshot archive is that we write back into the
998 : manifest buffer.
999 :
1000 : TODO: A way to eliminate this and to just stream out
1001 : 1 compressed file would be to totally precompute the index such that
1002 : we don't have to write back into funk.
1003 :
1004 : TODO: Currently, the snapshot service interfaces directly with the zstd
1005 : library but a generalized cstream defined in fd_zstd should be used
1006 : instead. */
1007 :
1008 0 : ulong in_buf_sz = ZSTD_CStreamInSize();
1009 0 : ulong zstd_buf_sz = ZSTD_CStreamOutSize();
1010 0 : ulong out_buf_sz = ZSTD_CStreamOutSize();
1011 :
1012 0 : char * in_buf = fd_valloc_malloc( snapshot_ctx->valloc, FD_ZSTD_CSTREAM_ALIGN, in_buf_sz );
1013 0 : char * zstd_buf = fd_valloc_malloc( snapshot_ctx->valloc, FD_ZSTD_CSTREAM_ALIGN, out_buf_sz );
1014 0 : char * out_buf = fd_valloc_malloc( snapshot_ctx->valloc, FD_ZSTD_CSTREAM_ALIGN, out_buf_sz );
1015 :
1016 : /* Reopen the tarball and open/overwrite the filename for the compressed,
1017 : finalized full snapshot. Setup the zstd compression stream. */
1018 :
1019 0 : int err = 0;
1020 :
1021 0 : ZSTD_CStream * cstream = ZSTD_createCStream();
1022 0 : if( FD_UNLIKELY( !cstream ) ) {
1023 0 : FD_LOG_ERR(( "Failed to create the zstd compression stream" ));
1024 0 : }
1025 0 : ZSTD_initCStream( cstream, ZSTD_CLEVEL_DEFAULT );
1026 :
1027 0 : fd_io_buffered_ostream_t ostream[1];
1028 :
1029 0 : if( FD_UNLIKELY( !fd_io_buffered_ostream_init( ostream, snapshot_ctx->snapshot_fd, out_buf, out_buf_sz ) ) ) {
1030 0 : FD_LOG_ERR(( "Failed to initialize the ostream" ));
1031 0 : }
1032 :
1033 0 : long seek = lseek( snapshot_ctx->snapshot_fd, 0, SEEK_SET );
1034 0 : if( FD_UNLIKELY( seek!=0L ) ) {
1035 0 : FD_LOG_ERR(( "Failed to seek to the start of the file" ));
1036 0 : }
1037 :
1038 : /* At this point, the tar archive and the new zstd file is open. The zstd
1039 : streamer is still open. Now, we are ready to read in bytes and stream
1040 : compress them. We will keep going until we see an EOF in a tar archive. */
1041 :
1042 0 : ulong in_sz = in_buf_sz;
1043 :
1044 0 : ulong off = (ulong)lseek( snapshot_ctx->tmp_fd, 0, SEEK_SET );
1045 0 : if( FD_UNLIKELY( off ) ) {
1046 0 : FD_LOG_ERR(( "Failed to seek to the beginning of the file" ));
1047 0 : }
1048 :
1049 0 : while( in_sz==in_buf_sz ) {
1050 :
1051 : /* Read chunks from the file. There isn't really a need to use a streamed
1052 : reader here because we will read in the max size buffer for every single
1053 : file read except for the very last one.
1054 :
1055 : in_sz will only not equal in_buf_sz on the last read. */
1056 0 : err = fd_io_read( snapshot_ctx->tmp_fd, in_buf, 0UL, in_buf_sz, &in_sz );
1057 0 : if( FD_UNLIKELY( err ) ) {
1058 0 : FD_LOG_ERR(( "Failed to read in the file" ));
1059 0 : }
1060 :
1061 : /* Compress the in memory buffer and add it to the output stream. */
1062 :
1063 0 : ZSTD_inBuffer input = { in_buf, in_sz, 0UL };
1064 0 : while( input.pos<input.size ) {
1065 0 : ZSTD_outBuffer output = { zstd_buf, zstd_buf_sz, 0UL };
1066 0 : ulong ret = ZSTD_compressStream( cstream, &output, &input );
1067 :
1068 0 : if( FD_UNLIKELY( ZSTD_isError( ret ) ) ) {
1069 0 : FD_LOG_ERR(( "Compression error: %s\n", ZSTD_getErrorName( ret ) ));
1070 0 : }
1071 :
1072 0 : err = fd_io_buffered_ostream_write( ostream, zstd_buf, output.pos );
1073 0 : if( FD_UNLIKELY( err ) ) {
1074 0 : FD_LOG_ERR(( "Failed to write out the compressed file" ));
1075 0 : }
1076 0 : }
1077 0 : }
1078 :
1079 : /* Now flush any bytes left in the zstd buffer, cleanup open file
1080 : descriptors, and deinit any data structures. */
1081 :
1082 0 : ZSTD_outBuffer output = { zstd_buf, zstd_buf_sz, 0UL };
1083 0 : ulong remaining = ZSTD_endStream( cstream, &output );
1084 :
1085 0 : if( FD_UNLIKELY( ZSTD_isError( remaining ) ) ) {
1086 0 : FD_LOG_ERR(( "Unable to end the zstd stream" ));
1087 0 : }
1088 0 : if( output.pos>0UL ) {
1089 0 : fd_io_buffered_ostream_write( ostream, zstd_buf, output.pos );
1090 0 : }
1091 :
1092 0 : fd_valloc_free( snapshot_ctx->valloc, in_buf );
1093 0 : fd_valloc_free( snapshot_ctx->valloc, zstd_buf );
1094 0 : fd_valloc_free( snapshot_ctx->valloc, out_buf );
1095 :
1096 0 : ZSTD_freeCStream( cstream ); /* Works even if cstream is null */
1097 0 : err = fd_io_buffered_ostream_flush( ostream );
1098 0 : if( FD_UNLIKELY( err ) ) {
1099 0 : FD_LOG_ERR(( "Failed to flush the ostream" ));
1100 0 : }
1101 :
1102 : /* Assuming that there was a successful write, make the compressed
1103 : snapshot file readable and servable. */
1104 :
1105 0 : char tmp_directory_buf_zstd[ FD_SNAPSHOT_DIR_MAX ];
1106 0 : err = snprintf( tmp_directory_buf_zstd, FD_SNAPSHOT_DIR_MAX, "%s/%s", snapshot_ctx->out_dir, snapshot_ctx->is_incremental ? FD_SNAPSHOT_TMP_INCR_ARCHIVE_ZSTD : FD_SNAPSHOT_TMP_FULL_ARCHIVE_ZSTD );
1107 0 : if( FD_UNLIKELY( err<0 ) ) {
1108 0 : FD_LOG_ERR(( "Failed to format directory string" ));
1109 0 : }
1110 :
1111 0 : char directory_buf_zstd[ FD_SNAPSHOT_DIR_MAX ];
1112 0 : if( !snapshot_ctx->is_incremental ) {
1113 0 : err = snprintf( directory_buf_zstd, FD_SNAPSHOT_DIR_MAX, "%s/snapshot-%lu-%s.tar.zst",
1114 0 : snapshot_ctx->out_dir, snapshot_ctx->slot, FD_BASE58_ENC_32_ALLOCA(&snapshot_ctx->snap_hash) );
1115 0 : } else {
1116 0 : err = snprintf( directory_buf_zstd, FD_SNAPSHOT_DIR_MAX, "%s/incremental-snapshot-%lu-%lu-%s.tar.zst",
1117 0 : snapshot_ctx->out_dir, snapshot_ctx->last_snap_slot, snapshot_ctx->slot, FD_BASE58_ENC_32_ALLOCA(&snapshot_ctx->snap_hash) );
1118 0 : }
1119 :
1120 0 : if( FD_UNLIKELY( err<0 ) ) {
1121 0 : FD_LOG_ERR(( "Failed to format directory string" ));
1122 0 : }
1123 :
1124 0 : err = rename( tmp_directory_buf_zstd, directory_buf_zstd );
1125 0 : if( FD_UNLIKELY( err<0 ) ) {
1126 0 : FD_LOG_ERR(( "Failed to rename file from %s to %s (%i-%s)", tmp_directory_buf_zstd, directory_buf_zstd, errno, fd_io_strerror( errno ) ));
1127 0 : }
1128 :
1129 0 : }
1130 :
1131 : void
1132 : fd_snapshot_create_new_snapshot( fd_snapshot_ctx_t * snapshot_ctx,
1133 : fd_hash_t * out_hash,
1134 0 : ulong * out_capitalization ) {
1135 :
1136 0 : FD_SCRATCH_SCOPE_BEGIN {
1137 :
1138 0 : FD_LOG_NOTICE(( "Starting to produce a snapshot for slot=%lu in directory=%s", snapshot_ctx->slot, snapshot_ctx->out_dir ));
1139 :
1140 : /* Validate that the snapshot_ctx is setup correctly. */
1141 :
1142 0 : fd_snapshot_create_setup_and_validate_ctx( snapshot_ctx );
1143 :
1144 : /* Setup the tar archive writer. */
1145 :
1146 0 : fd_snapshot_create_setup_writer( snapshot_ctx );
1147 :
1148 : /* Write out the version file. */
1149 :
1150 0 : fd_snapshot_create_write_version( snapshot_ctx );
1151 :
1152 : /* Dump the status cache and append it to the tar archive. */
1153 :
1154 0 : fd_snapshot_create_write_status_cache( snapshot_ctx );
1155 :
1156 : /* Populate and write out the manifest and append vecs. */
1157 :
1158 0 : fd_snapshot_create_write_manifest_and_acc_vecs( snapshot_ctx, out_hash, out_capitalization );
1159 :
1160 : /* Compress the tar file and write it out to the specified directory. */
1161 :
1162 0 : fd_snapshot_create_compress( snapshot_ctx );
1163 :
1164 0 : FD_LOG_NOTICE(("Finished producing a snapshot" ));
1165 :
1166 0 : } FD_SCRATCH_SCOPE_END;
1167 0 : }
|