Line data Source code
1 : #include "fd_snapshot_create.h"
2 : #include "../runtime/sysvar/fd_sysvar_epoch_schedule.h"
3 : #include "../../ballet/zstd/fd_zstd.h"
4 : #include "../runtime/fd_hashes.h"
5 : #include "../runtime/fd_runtime.h"
6 :
7 : #include <errno.h>
8 : #include <stdio.h>
9 : #include <stdlib.h>
10 : #include <sys/stat.h>
11 : #include <sys/types.h>
12 : #include <unistd.h>
13 : #include <zstd.h>
14 :
/* Zero-filled scratch buffer used to pad each account's data out to
   FD_SNAPSHOT_ACC_ALIGN when streaming append vec entries into the tar
   archive. */
static uchar padding[ FD_SNAPSHOT_ACC_ALIGN ] = {0};
/* Template account metadata handed out for tombstone (erased) funk
   records, which have no backing fd_account_meta_t of their own.  Only
   .magic is set here; .slot is patched in per-record by
   fd_snapshot_create_get_default_meta.  All other fields (dlen, info,
   hash, ...) stay zero from static initialization. */
static fd_account_meta_t default_meta = { .magic = FD_ACCOUNT_META_MAGIC };
17 :
18 : static inline fd_account_meta_t *
19 0 : fd_snapshot_create_get_default_meta( ulong slot ) {
20 0 : default_meta.slot = slot;
21 0 : return &default_meta;
22 0 : }
23 :
24 : static inline void
25 : fd_snapshot_create_populate_acc_vecs( fd_snapshot_ctx_t * snapshot_ctx,
26 : fd_solana_manifest_t * manifest,
27 : fd_tar_writer_t * writer,
28 0 : ulong * out_cap ) {
29 :
30 : /* The append vecs need to be described in an index in the manifest so a
31 : reader knows what account files to look for. These files are technically
32 : slot indexed, but the Firedancer implementation of the Solana snapshot
33 : produces far fewer indices. These storages are for the accounts
34 : that were modified and deleted in the most recent slot because that
35 : information is used by the Agave client to calculate and verify the
36 : bank hash for the given slot. This is done as an optimization to avoid
37 : having to slot index the Firedancer accounts db which would incur a large
38 : performance hit.
39 :
40 : To avoid iterating through the root twice to determine what accounts were
41 : touched in the snapshot slot and what accounts were touched in the
42 : other slots, we will create an array of pubkey pointers for all accounts
43 : that were touched in the snapshot slot. This buffer can be safely sized to
44 : the maximum amount of writable accounts that are possible in a non-epoch
45 : boundary slot. The rationale for this bound is explained in fd_runtime.h.
46 : We will not attempt to create a snapshot on an epoch boundary.
47 :
48 : TODO: We must add compaction here. */
49 :
50 0 : fd_pubkey_t * * snapshot_slot_keys = fd_spad_alloc( snapshot_ctx->spad, alignof(fd_pubkey_t*), sizeof(fd_pubkey_t*) * FD_WRITABLE_ACCS_IN_SLOT );
51 0 : ulong snapshot_slot_key_cnt = 0UL;
52 :
53 : /* We will dynamically resize the number of incremental keys because the upper
54 : bound will be roughly 8 bytes * writable accs in a slot * number of slots
55 : since the last full snapshot which can quickly grow to be severalgigabytes
56 : or more. In the normal case, this won't require dynamic resizing. */
57 0 : #define FD_INCREMENTAL_KEY_INIT_BOUND (100000UL)
58 0 : ulong incremental_key_bound = FD_INCREMENTAL_KEY_INIT_BOUND;
59 0 : ulong incremental_key_cnt = 0UL;
60 0 : fd_funk_rec_key_t const * * incremental_keys = snapshot_ctx->is_incremental ?
61 0 : fd_spad_alloc( snapshot_ctx->spad, alignof(fd_funk_rec_key_t*), sizeof(fd_funk_rec_key_t*) * incremental_key_bound ) :
62 0 : NULL;
63 :
64 0 : #undef FD_INCREMENTAL_KEY_INIT_BOUND
65 :
66 : /* In order to size out the accounts DB index in the manifest, we must
67 : iterate through funk and accumulate the size of all of the records
68 : from all slots before the snapshot_slot. */
69 :
70 0 : fd_funk_t * funk = snapshot_ctx->acc_mgr->funk;
71 0 : ulong prev_sz = 0UL;
72 0 : ulong tombstones_cnt = 0UL;
73 0 : for( fd_funk_rec_t const * rec = fd_funk_txn_first_rec( funk, NULL ); NULL != rec; rec = fd_funk_txn_next_rec( funk, rec ) ) {
74 :
75 0 : if( !fd_funk_key_is_acc( rec->pair.key ) ) {
76 0 : continue;
77 0 : }
78 :
79 0 : tombstones_cnt++;
80 :
81 0 : int is_tombstone = rec->flags & FD_FUNK_REC_FLAG_ERASE;
82 0 : uchar const * raw = fd_funk_val( rec, fd_funk_wksp( funk ) );
83 0 : fd_account_meta_t * metadata = is_tombstone ? fd_snapshot_create_get_default_meta( fd_funk_rec_get_erase_data( rec ) ) :
84 0 : (fd_account_meta_t*)raw;
85 :
86 0 : if( !metadata ) {
87 0 : continue;
88 0 : }
89 :
90 0 : if( metadata->magic!=FD_ACCOUNT_META_MAGIC ) {
91 0 : continue;
92 0 : }
93 :
94 0 : if( snapshot_ctx->is_incremental ) {
95 : /* We only care about accounts that were modified since the last
96 : snapshot slot for incremental snapshots.
97 :
98 : We also need to keep track of the capitalization for all of the
99 : accounts that are in the incremental as this is verified. */
100 0 : if( metadata->slot<=snapshot_ctx->last_snap_slot ) {
101 0 : continue;
102 0 : }
103 0 : incremental_keys[ incremental_key_cnt++ ] = rec->pair.key;
104 0 : *out_cap += metadata->info.lamports;
105 :
106 0 : if( FD_UNLIKELY( incremental_key_cnt==incremental_key_bound ) ) {
107 : /* Dynamically resize if needed. */
108 0 : incremental_key_bound *= 2UL;
109 0 : fd_funk_rec_key_t const * * new_incremental_keys = fd_spad_alloc( snapshot_ctx->spad,
110 0 : alignof(fd_funk_rec_key_t*),
111 0 : sizeof(fd_funk_rec_key_t*) * incremental_key_bound );
112 0 : fd_memcpy( new_incremental_keys, incremental_keys, sizeof(fd_funk_rec_key_t*) * incremental_key_cnt );
113 0 : fd_valloc_free( fd_spad_virtual( snapshot_ctx->spad ), incremental_keys );
114 0 : incremental_keys = new_incremental_keys;
115 0 : }
116 0 : }
117 :
118 : /* We know that all of the accounts from the snapshot slot can fit into
119 : one append vec, so we ignore all accounts from the snapshot slot. */
120 :
121 0 : if( metadata->slot==snapshot_ctx->slot ) {
122 0 : continue;
123 0 : }
124 :
125 0 : prev_sz += metadata->dlen + sizeof(fd_solana_account_hdr_t);
126 :
127 0 : }
128 :
129 : /* At this point we have sized out all of the relevant accounts that will
130 : be included in the snapshot. Now we must populate each of the append vecs
131 : and update the index as we go.
132 :
133 : When we account for the number of slots we need to consider one append vec
134 : for the snapshot slot and try to maximally fill up the others: an append
135 : vec has a protocol-defined maximum size in Agave. */
136 :
137 0 : ulong num_slots = 1UL + prev_sz / FD_SNAPSHOT_APPEND_VEC_SZ_MAX +
138 0 : (prev_sz % FD_SNAPSHOT_APPEND_VEC_SZ_MAX ? 1UL : 0UL);
139 :
140 0 : fd_solana_accounts_db_fields_t * accounts_db = &manifest->accounts_db;
141 :
142 0 : accounts_db->storages_len = num_slots;
143 0 : accounts_db->storages = fd_spad_alloc( snapshot_ctx->spad,
144 0 : FD_SNAPSHOT_SLOT_ACC_VECS_ALIGN,
145 0 : sizeof(fd_snapshot_slot_acc_vecs_t) * accounts_db->storages_len );
146 0 : accounts_db->version = 1UL;
147 0 : accounts_db->slot = snapshot_ctx->slot;
148 0 : accounts_db->historical_roots_len = 0UL;
149 0 : accounts_db->historical_roots = NULL;
150 0 : accounts_db->historical_roots_with_hash_len = 0UL;
151 0 : accounts_db->historical_roots_with_hash = NULL;
152 :
153 0 : for( ulong i=0UL; i<num_slots; i++ ) {
154 : /* Populate the storages for each slot. As a note, the slot number only
155 : matters for the snapshot slot. The other slot numbers don't affect
156 : consensus at all. Agave also maintains an invariant that there can
157 : only be one account vec per storage. */
158 :
159 0 : accounts_db->storages[ i ].account_vecs_len = 1UL;
160 0 : accounts_db->storages[ i ].account_vecs = fd_spad_alloc( snapshot_ctx->spad,
161 0 : FD_SNAPSHOT_ACC_VEC_ALIGN,
162 0 : sizeof(fd_snapshot_acc_vec_t) * accounts_db->storages[ i ].account_vecs_len );
163 0 : accounts_db->storages[ i ].account_vecs[ 0 ].file_sz = 0UL;
164 0 : accounts_db->storages[ i ].account_vecs[ 0 ].id = i + 1UL;
165 0 : accounts_db->storages[ i ].slot = snapshot_ctx->slot - i;
166 0 : }
167 :
168 : /* At this point we have iterated through all of the accounts and created
169 : the index. We are now ready to generate a snapshot hash. For both
170 : snapshots we need to generate two hashes:
171 : 1. The accounts hash. This is a simple hash of all of the accounts
172 : included in the snapshot.
173 : 2. The snapshot hash. This is a hash of the accounts hash and the epoch
174 : account hash. If the EAH is not included, then the accounts hash ==
175 : snapshot hash.
176 :
177 : There is some nuance as to which hash goes where. For full snapshots,
178 : the accounts hash in the bank hash info is the accounts hash. The hash in
179 : the filename is the snapshot hash.
180 :
181 : For incremental snapshots, the account hash in the bank hash info field is
182 : left zeroed out. The full snapshot's hash is in the incremental persistence
183 : field. The incremental snapshot's accounts hash is included in the
184 : incremental persistence field. The hash in the filename is the snapshot
185 : hash. */
186 :
187 0 : int err;
188 0 : if( !snapshot_ctx->is_incremental ) {
189 :
190 0 : err = fd_snapshot_service_hash( &snapshot_ctx->acc_hash,
191 0 : &snapshot_ctx->snap_hash,
192 0 : &snapshot_ctx->slot_bank,
193 0 : &snapshot_ctx->epoch_bank,
194 0 : snapshot_ctx->acc_mgr->funk,
195 0 : snapshot_ctx->tpool,
196 0 : snapshot_ctx->spad,
197 0 : snapshot_ctx->features );
198 0 : accounts_db->bank_hash_info.accounts_hash = snapshot_ctx->acc_hash;
199 0 : } else {
200 0 : err = fd_snapshot_service_inc_hash( &snapshot_ctx->acc_hash,
201 0 : &snapshot_ctx->snap_hash,
202 0 : &snapshot_ctx->slot_bank,
203 0 : &snapshot_ctx->epoch_bank,
204 0 : snapshot_ctx->acc_mgr->funk,
205 0 : incremental_keys,
206 0 : incremental_key_cnt,
207 0 : snapshot_ctx->spad,
208 0 : snapshot_ctx->features );
209 0 : fd_valloc_free( fd_spad_virtual( snapshot_ctx->spad ), incremental_keys );
210 :
211 0 : fd_memset( &accounts_db->bank_hash_info.accounts_hash, 0, sizeof(fd_hash_t) );
212 0 : }
213 :
214 0 : FD_LOG_NOTICE(( "Hashes calculated acc_hash=%s snapshot_hash=%s",
215 0 : FD_BASE58_ENC_32_ALLOCA(&snapshot_ctx->acc_hash),
216 0 : FD_BASE58_ENC_32_ALLOCA(&snapshot_ctx->snap_hash) ));
217 :
218 0 : if( FD_UNLIKELY( err ) ) {
219 0 : FD_LOG_ERR(( "Unable to calculate snapshot hash" ));
220 0 : }
221 :
222 0 : fd_memset( &accounts_db->bank_hash_info.stats, 0, sizeof(fd_bank_hash_stats_t) );
223 :
224 : /* Now, we have calculated the relevant hashes for the accounts.
225 : Because the files are serially written out for tar and we need to prepend
226 : the manifest, we must reserve space in the archive for the solana manifest. */
227 :
228 0 : if( snapshot_ctx->is_incremental ) {
229 0 : manifest->bank_incremental_snapshot_persistence = fd_spad_alloc( snapshot_ctx->spad,
230 0 : FD_BANK_INCREMENTAL_SNAPSHOT_PERSISTENCE_ALIGN,
231 0 : sizeof(fd_bank_incremental_snapshot_persistence_t) );
232 0 : }
233 :
234 0 : ulong manifest_sz = fd_solana_manifest_size( manifest );
235 :
236 0 : char buffer[ FD_SNAPSHOT_DIR_MAX ];
237 0 : err = snprintf( buffer, FD_SNAPSHOT_DIR_MAX, "snapshots/%lu/%lu", snapshot_ctx->slot, snapshot_ctx->slot );
238 0 : if( FD_UNLIKELY( err<0 ) ) {
239 0 : FD_LOG_ERR(( "Unable to format manifest name string" ));
240 0 : }
241 :
242 0 : err = fd_tar_writer_new_file( writer, buffer );
243 0 : if( FD_UNLIKELY( err ) ) {
244 0 : FD_LOG_ERR(( "Unable to create snapshot manifest file" ));
245 0 : }
246 :
247 : /* TODO: We want to eliminate having to write back into the tar file. This
248 : will enable the snapshot service to only use one file per snapshot.
249 : In order to do this, we must precompute the index in the manifest
250 : completely. This will allow us to stream out a compressed snapshot. */
251 :
252 0 : err = fd_tar_writer_make_space( writer, manifest_sz );
253 0 : if( FD_UNLIKELY( err ) ) {
254 0 : FD_LOG_ERR(( "Unable to make space for snapshot manifest file" ));
255 0 : }
256 :
257 0 : err = fd_tar_writer_fini_file( writer );
258 0 : if( FD_UNLIKELY( err ) ) {
259 0 : FD_LOG_ERR(( "Unable to finalize snapshot manifest file" ));
260 0 : }
261 :
262 : /* We have made space for the manifest and are ready to append the append
263 : vec files directly into the tar archive. We will iterate through all of
264 : the records in the funk root and create/populate an append vec for
265 : previous slots. Just record the pubkeys for the latest slot to populate
266 : the append vec after. If the append vec is full, write into the next one. */
267 :
268 0 : ulong curr_slot = 1UL;
269 0 : fd_snapshot_acc_vec_t * prev_accs = &accounts_db->storages[ curr_slot ].account_vecs[ 0UL ];
270 :
271 0 : err = snprintf( buffer, FD_SNAPSHOT_DIR_MAX, "accounts/%lu.%lu", snapshot_ctx->slot - curr_slot, prev_accs->id );
272 0 : if( FD_UNLIKELY( err<0 ) ) {
273 0 : FD_LOG_ERR(( "Unable to format previous accounts name string" ));
274 0 : }
275 :
276 0 : err = fd_tar_writer_new_file( writer, buffer );
277 0 : if( FD_UNLIKELY( err ) ) {
278 0 : FD_LOG_ERR(( "Unable to create previous accounts file" ));
279 0 : }
280 :
281 0 : fd_funk_rec_t * * tombstones = snapshot_ctx->is_incremental ? NULL :
282 0 : fd_spad_alloc( snapshot_ctx->spad, alignof(fd_funk_rec_t*), sizeof(fd_funk_rec_t*) * tombstones_cnt );
283 0 : tombstones_cnt = 0UL;
284 :
285 0 : for( fd_funk_rec_t const * rec = fd_funk_txn_first_rec( funk, NULL ); NULL != rec; rec = fd_funk_txn_next_rec( funk, rec ) ) {
286 :
287 : /* Get the account data. */
288 :
289 0 : if( !fd_funk_key_is_acc( rec->pair.key ) ) {
290 0 : continue;
291 0 : }
292 :
293 0 : fd_pubkey_t const * pubkey = fd_type_pun_const( rec->pair.key[0].uc );
294 0 : int is_tombstone = rec->flags & FD_FUNK_REC_FLAG_ERASE;
295 0 : uchar const * raw = fd_funk_val( rec, fd_funk_wksp( funk ) );
296 0 : fd_account_meta_t * metadata = is_tombstone ? fd_snapshot_create_get_default_meta( fd_funk_rec_get_erase_data( rec ) ) :
297 0 : (fd_account_meta_t*)raw;
298 :
299 0 : if( !snapshot_ctx->is_incremental && is_tombstone ) {
300 : /* If we are in a full snapshot, we need to gather all of the accounts
301 : that we plan on deleting. */
302 0 : tombstones[ tombstones_cnt++ ] = (fd_funk_rec_t*)rec;
303 0 : }
304 :
305 0 : if( !metadata ) {
306 0 : continue;
307 0 : }
308 :
309 0 : if( metadata->magic!=FD_ACCOUNT_META_MAGIC ) {
310 0 : continue;
311 0 : }
312 :
313 : /* Don't iterate through accounts that were touched before the last full
314 : snapshot. */
315 0 : if( snapshot_ctx->is_incremental && metadata->slot<=snapshot_ctx->last_snap_slot ) {
316 0 : continue;
317 0 : }
318 :
319 0 : uchar const * acc_data = raw + metadata->hlen;
320 :
321 : /* All accounts that were touched in the snapshot slot should be in
322 : a different append vec so that Agave can calculate the snapshot slot's
323 : bank hash. We don't want to include them in an arbitrary append vec. */
324 :
325 0 : if( metadata->slot==snapshot_ctx->slot ) {
326 0 : snapshot_slot_keys[ snapshot_slot_key_cnt++ ] = (fd_pubkey_t*)pubkey;
327 0 : continue;
328 0 : }
329 :
330 : /* We don't want to iterate over tombstones if the snapshot is not
331 : incremental */
332 0 : if( !snapshot_ctx->is_incremental && is_tombstone ) {
333 0 : continue;
334 0 : }
335 :
336 0 : ulong new_sz = prev_accs->file_sz + sizeof(fd_solana_account_hdr_t) + fd_ulong_align_up( metadata->dlen, FD_SNAPSHOT_ACC_ALIGN );
337 :
338 0 : if( new_sz>FD_SNAPSHOT_APPEND_VEC_SZ_MAX ) {
339 :
340 : /* When the current append vec is full, finish writing it, start writing
341 : into the next append vec. */
342 :
343 0 : err = fd_tar_writer_fini_file( writer );
344 0 : if( FD_UNLIKELY( err ) ) {
345 0 : FD_LOG_ERR(( "Unable to finalize previous accounts file" ));
346 0 : }
347 :
348 0 : prev_accs = &accounts_db->storages[ ++curr_slot ].account_vecs[ 0UL ];
349 :
350 0 : err = snprintf( buffer, FD_SNAPSHOT_DIR_MAX, "accounts/%lu.%lu", snapshot_ctx->slot - curr_slot, prev_accs->id );
351 0 : if( FD_UNLIKELY( err<0 ) ) {
352 0 : FD_LOG_ERR(( "Unable to format previous accounts name string" ));
353 0 : }
354 :
355 0 : err = fd_tar_writer_new_file( writer, buffer );
356 0 : if( FD_UNLIKELY( err ) ) {
357 0 : FD_LOG_ERR(( "Unable to create previous accounts file" ));
358 0 : }
359 0 : }
360 :
361 0 : prev_accs->file_sz += sizeof(fd_solana_account_hdr_t) + fd_ulong_align_up( metadata->dlen, FD_SNAPSHOT_ACC_ALIGN );
362 :
363 :
364 : /* Write out the header. */
365 :
366 0 : fd_solana_account_hdr_t header = {0};
367 : /* Stored meta */
368 0 : header.meta.write_version_obsolete = 0UL;
369 0 : header.meta.data_len = metadata->dlen;
370 0 : fd_memcpy( header.meta.pubkey, pubkey, sizeof(fd_pubkey_t) );
371 : /* Account Meta */
372 0 : header.info.lamports = metadata->info.lamports;
373 0 : header.info.rent_epoch = header.info.lamports ? metadata->info.rent_epoch : 0UL;
374 0 : fd_memcpy( header.info.owner, metadata->info.owner, sizeof(fd_pubkey_t) );
375 0 : header.info.executable = metadata->info.executable;
376 : /* Hash */
377 0 : fd_memcpy( &header.hash, metadata->hash, sizeof(fd_hash_t) );
378 :
379 0 : err = fd_tar_writer_write_file_data( writer, &header, sizeof(fd_solana_account_hdr_t) );
380 0 : if( FD_UNLIKELY( err ) ) {
381 0 : FD_LOG_ERR(( "Unable to stream out account header to tar archive" ));
382 0 : }
383 :
384 : /* Write out the file data. */
385 :
386 0 : err = fd_tar_writer_write_file_data( writer, acc_data, metadata->dlen );
387 0 : if( FD_UNLIKELY( err ) ) {
388 0 : FD_LOG_ERR(( "Unable to stream out account data to tar archive" ));
389 0 : }
390 :
391 0 : ulong align_sz = fd_ulong_align_up( metadata->dlen, FD_SNAPSHOT_ACC_ALIGN ) - metadata->dlen;
392 0 : err = fd_tar_writer_write_file_data( writer, padding, align_sz );
393 0 : if( FD_UNLIKELY( err ) ) {
394 0 : FD_LOG_ERR( ("Unable to stream out account padding to tar archive" ));
395 0 : }
396 0 : }
397 :
398 0 : err = fd_tar_writer_fini_file( writer );
399 0 : if( FD_UNLIKELY( err ) ) {
400 0 : FD_LOG_ERR(( "Unable to finalize previous accounts file" ));
401 0 : }
402 :
403 : /* Now write out the append vec for the snapshot slot. Again, this is needed
404 : because the snapshot slot's accounts must be in their append vec in order
405 : to verify the bank hash for the snapshot slot in the Agave client. */
406 :
407 0 : fd_snapshot_acc_vec_t * curr_accs = &accounts_db->storages[ 0UL ].account_vecs[ 0UL ];
408 0 : err = snprintf( buffer, FD_SNAPSHOT_DIR_MAX, "accounts/%lu.%lu", snapshot_ctx->slot, curr_accs->id );
409 0 : if( FD_UNLIKELY( err<0 ) ) {
410 0 : FD_LOG_ERR(( "Unable to format current accounts name string" ));
411 0 : }
412 :
413 0 : err = fd_tar_writer_new_file( writer, buffer );
414 0 : if( FD_UNLIKELY( err ) ) {
415 0 : FD_LOG_ERR(( "Unable to create current accounts file" ));
416 0 : }
417 :
418 0 : for( ulong i=0UL; i<snapshot_slot_key_cnt; i++ ) {
419 :
420 0 : fd_pubkey_t const * pubkey = snapshot_slot_keys[i];
421 0 : fd_funk_rec_key_t key = fd_acc_funk_key( pubkey );
422 :
423 0 : fd_funk_rec_t const * rec = fd_funk_rec_query( funk, NULL, &key );
424 0 : if( FD_UNLIKELY( !rec ) ) {
425 0 : FD_LOG_ERR(( "Previously found record can no longer be found" ));
426 0 : }
427 :
428 0 : int is_tombstone = rec->flags & FD_FUNK_REC_FLAG_ERASE;
429 0 : uchar const * raw = fd_funk_val( rec, fd_funk_wksp( funk ) );
430 0 : fd_account_meta_t * metadata = is_tombstone ? fd_snapshot_create_get_default_meta( fd_funk_rec_get_erase_data( rec ) ) :
431 0 : (fd_account_meta_t*)raw;
432 :
433 0 : if( FD_UNLIKELY( !metadata ) ) {
434 0 : FD_LOG_ERR(( "Record should have non-NULL metadata" ));
435 0 : }
436 :
437 0 : if( FD_UNLIKELY( metadata->magic!=FD_ACCOUNT_META_MAGIC ) ) {
438 0 : FD_LOG_ERR(( "Record should have valid magic" ));
439 0 : }
440 :
441 0 : uchar const * acc_data = raw + metadata->hlen;
442 :
443 0 : curr_accs->file_sz += sizeof(fd_solana_account_hdr_t) + fd_ulong_align_up( metadata->dlen, FD_SNAPSHOT_ACC_ALIGN );
444 :
445 : /* Write out the header. */
446 0 : fd_solana_account_hdr_t header = {0};
447 : /* Stored meta */
448 0 : header.meta.write_version_obsolete = 0UL;
449 0 : header.meta.data_len = metadata->dlen;
450 0 : fd_memcpy( header.meta.pubkey, pubkey, sizeof(fd_pubkey_t) );
451 : /* Account Meta */
452 0 : header.info.lamports = metadata->info.lamports;
453 0 : header.info.rent_epoch = header.info.lamports ? metadata->info.rent_epoch : 0UL;
454 0 : fd_memcpy( header.info.owner, metadata->info.owner, sizeof(fd_pubkey_t) );
455 0 : header.info.executable = metadata->info.executable;
456 : /* Hash */
457 0 : fd_memcpy( &header.hash, metadata->hash, sizeof(fd_hash_t) );
458 :
459 :
460 0 : err = fd_tar_writer_write_file_data( writer, &header, sizeof(fd_solana_account_hdr_t) );
461 0 : if( FD_UNLIKELY( err ) ) {
462 0 : FD_LOG_ERR(( "Unable to stream out account header to tar archive" ));
463 0 : }
464 0 : err = fd_tar_writer_write_file_data( writer, acc_data, metadata->dlen );
465 0 : if( FD_UNLIKELY( err ) ) {
466 0 : FD_LOG_ERR(( "Unable to stream out account data to tar archive" ));
467 0 : }
468 0 : ulong align_sz = fd_ulong_align_up( metadata->dlen, FD_SNAPSHOT_ACC_ALIGN ) - metadata->dlen;
469 0 : err = fd_tar_writer_write_file_data( writer, padding, align_sz );
470 0 : if( FD_UNLIKELY( err ) ) {
471 0 : FD_LOG_ERR(( "Unable to stream out account padding to tar archive" ));
472 0 : }
473 0 : }
474 :
475 0 : err = fd_tar_writer_fini_file( writer );
476 0 : if( FD_UNLIKELY( err ) ) {
477 0 : FD_LOG_ERR(( "Unable to finish writing out file" ));
478 0 : }
479 :
480 : /* TODO: At this point we must implement compaction to the snapshot service.
481 : Without this, we are actually not cleaning up any tombstones from funk. */
482 :
483 0 : if( snapshot_ctx->is_incremental ) {
484 0 : fd_funk_start_write( funk );
485 0 : err = fd_funk_rec_forget( funk, tombstones, tombstones_cnt );
486 0 : if( FD_UNLIKELY( err!=FD_FUNK_SUCCESS ) ) {
487 0 : FD_LOG_ERR(( "Unable to forget tombstones" ));
488 0 : }
489 0 : FD_LOG_NOTICE(( "Compacted %lu tombstone records", tombstones_cnt ));
490 0 : fd_funk_end_write( funk );
491 0 : }
492 :
493 0 : fd_valloc_free( fd_spad_virtual( snapshot_ctx->spad ), snapshot_slot_keys );
494 0 : fd_valloc_free( fd_spad_virtual( snapshot_ctx->spad ), tombstones );
495 :
496 0 : }
497 :
/* Rebuild the stakes cache (old_stakes) into a form (new_stakes) that can
   be reserialized into the Agave-compatible snapshot format.  Vote
   accounts are re-populated with their raw on-chain account data (the
   runtime's decoded vote state can't be reserialized), and stale stake
   delegations are evicted or refreshed in place.

   NOTE: old_stakes is mutated (stale delegation entries are removed and
   surviving delegations updated); new_stakes shares the delegation pool
   and stake history with old_stakes rather than deep-copying them. */
static void
fd_snapshot_create_serialiable_stakes( fd_snapshot_ctx_t * snapshot_ctx,
                                       fd_stakes_t *       old_stakes,
                                       fd_stakes_t *       new_stakes ) {

  /* The deserialized stakes cache that is used by the runtime can't be
     reserialized into the format that Agave uses. For every vote account
     in the stakes struct, the Firedancer client holds a decoded copy of the
     vote state. However, this vote state can't be reserialized back into the
     full vote account data.

     This poses a problem in the Agave client client because upon boot, Agave
     verifies that for all of the vote accounts in the stakes struct, the data
     in the cache is the same as the data in the accounts db.

     The other problem is that the Firedancer stakes cache does not evict old
     entries and doesn't update delegations within the cache. The cache will
     just insert new pubkeys as stake accounts are created/delegated to. To
     make the cache conformant for the snapshot, old accounts should be removed
     from the snapshot and all of the delegations should be updated. */

  /* First populate the vote accounts using the vote accounts/stakes cache.
     We can populate over all of the fields except we can't reserialize the
     vote account data. Instead we will copy over the raw contents of all of
     the vote accounts. */

  /* Size the new vote-accounts pool to exactly the number of entries in
     the old map; allocation comes from the snapshot spad. */
  ulong vote_accounts_len = fd_vote_accounts_pair_t_map_size( old_stakes->vote_accounts.vote_accounts_pool, old_stakes->vote_accounts.vote_accounts_root );
  uchar * pool_mem = fd_spad_alloc( snapshot_ctx->spad, fd_vote_accounts_pair_t_map_align(), fd_vote_accounts_pair_t_map_footprint( vote_accounts_len ) );
  new_stakes->vote_accounts.vote_accounts_pool = fd_vote_accounts_pair_t_map_join( fd_vote_accounts_pair_t_map_new( pool_mem, vote_accounts_len ) );
  new_stakes->vote_accounts.vote_accounts_root = NULL;

  /* In-order walk of the old vote-accounts map; each entry is copied into
     a freshly acquired node with the raw account bytes read from funk. */
  for( fd_vote_accounts_pair_t_mapnode_t * n = fd_vote_accounts_pair_t_map_minimum(
      old_stakes->vote_accounts.vote_accounts_pool,
      old_stakes->vote_accounts.vote_accounts_root );
      n;
      n = fd_vote_accounts_pair_t_map_successor( old_stakes->vote_accounts.vote_accounts_pool, n ) ) {

    fd_vote_accounts_pair_t_mapnode_t * new_node = fd_vote_accounts_pair_t_map_acquire( new_stakes->vote_accounts.vote_accounts_pool );
    new_node->elem.key   = n->elem.key;
    new_node->elem.stake = n->elem.stake;
    /* Now to populate the value, lookup the account using the acc mgr */
    FD_TXN_ACCOUNT_DECL( vote_acc );
    /* A vote account present in the cache but missing from the accounts db
       is treated as fatal — the cache and db must agree. */
    int err = fd_acc_mgr_view( snapshot_ctx->acc_mgr, NULL, &n->elem.key, vote_acc );
    if( FD_UNLIKELY( err ) ) {
      FD_LOG_ERR(( "Failed to view vote account from stakes cache %s", FD_BASE58_ENC_32_ALLOCA(&n->elem.key) ));
    }

    /* Copy the raw account body (lamports, data, owner, flags) so Agave's
       boot-time cache-vs-db comparison passes. */
    new_node->elem.value.lamports   = vote_acc->const_meta->info.lamports;
    new_node->elem.value.data_len   = vote_acc->const_meta->dlen;
    new_node->elem.value.data       = fd_spad_alloc( snapshot_ctx->spad, 8UL, vote_acc->const_meta->dlen );
    fd_memcpy( new_node->elem.value.data, vote_acc->const_data, vote_acc->const_meta->dlen );
    fd_memcpy( &new_node->elem.value.owner, &vote_acc->const_meta->info.owner, sizeof(fd_pubkey_t) );
    new_node->elem.value.executable = vote_acc->const_meta->info.executable;
    new_node->elem.value.rent_epoch = vote_acc->const_meta->info.rent_epoch;
    fd_vote_accounts_pair_t_map_insert( new_stakes->vote_accounts.vote_accounts_pool, &new_stakes->vote_accounts.vote_accounts_root, new_node );

  }

  /* Stale stake delegations should also be removed or updated in the cache.
     TODO: This will likely be changed in the near future as the stake
     program is migrated to a bpf program. It will likely be replaced by an
     index of stake/vote accounts. */

  FD_TXN_ACCOUNT_DECL( stake_acc );
  /* nn caches the successor before a possible remove, so iteration is safe
     under in-loop node removal. */
  fd_delegation_pair_t_mapnode_t * nn = NULL;
  for( fd_delegation_pair_t_mapnode_t * n = fd_delegation_pair_t_map_minimum(
      old_stakes->stake_delegations_pool, old_stakes->stake_delegations_root ); n; n=nn ) {

    nn = fd_delegation_pair_t_map_successor( old_stakes->stake_delegations_pool, n );

    int err = fd_acc_mgr_view( snapshot_ctx->acc_mgr, NULL, &n->elem.account, stake_acc );
    if( FD_UNLIKELY( err ) ) {
      /* If the stake account doesn't exist, the cache is stale and the entry
         just needs to be evicted. */
      fd_delegation_pair_t_map_remove( old_stakes->stake_delegations_pool, &old_stakes->stake_delegations_root, n );
      fd_delegation_pair_t_map_release( old_stakes->stake_delegations_pool, n );
    } else {
      /* Otherwise, just update the delegation in case it is stale. */
      fd_bincode_decode_ctx_t ctx = {
        .data    = stake_acc->const_data,
        .dataend = stake_acc->const_data + stake_acc->const_meta->dlen,
      };

      ulong total_sz = 0UL;
      err = fd_stake_state_v2_decode_footprint( &ctx, &total_sz );
      if( FD_UNLIKELY( err ) ) {
        FD_LOG_ERR(( "Failed to decode stake state footprint" ));
      }

      uchar * mem = fd_spad_alloc( snapshot_ctx->spad, FD_STAKE_STATE_V2_ALIGN, total_sz );
      if( FD_UNLIKELY( !mem ) ) {
        FD_LOG_ERR(( "Failed to allocate memory for stake state" ));
      }

      fd_stake_state_v2_t * stake_state = fd_stake_state_v2_decode( mem, &ctx );

      /* NOTE(review): reads inner.stake without checking the stake state's
         discriminant — assumes every cached delegation's account decodes to
         the Stake variant.  Confirm initialized/uninitialized stake accounts
         can't reach this path. */
      n->elem.delegation = stake_state->inner.stake.stake.delegation;
    }
  }

  /* Copy over the rest of the fields as they are the same. */

  /* Shallow copy: new_stakes aliases old_stakes' (now cleaned) delegation
     pool and stake history rather than duplicating them. */
  new_stakes->stake_delegations_pool = old_stakes->stake_delegations_pool;
  new_stakes->stake_delegations_root = old_stakes->stake_delegations_root;
  new_stakes->unused                 = old_stakes->unused;
  new_stakes->epoch                  = old_stakes->epoch;
  new_stakes->stake_history          = old_stakes->stake_history;

}
607 :
/* Populate the manifest's versioned bank from the snapshot context's slot
   and epoch banks.  Field-by-field translation of Firedancer runtime state
   into the Agave-compatible snapshot bank; allocations for variable-length
   fields come from the snapshot spad. */
static inline void
fd_snapshot_create_populate_bank( fd_snapshot_ctx_t *   snapshot_ctx,
                                  fd_versioned_bank_t * bank ) {

  fd_slot_bank_t *  slot_bank  = &snapshot_ctx->slot_bank;
  fd_epoch_bank_t * epoch_bank = &snapshot_ctx->epoch_bank;

  /* The blockhash queue has to be copied over along with all of its entries.
     As a note, the size is 300 but in fact is of size 301 due to a known bug
     in the agave client that is emulated by the firedancer client. */

  bank->blockhash_queue.last_hash_index = slot_bank->block_hash_queue.last_hash_index;
  bank->blockhash_queue.last_hash       = fd_spad_alloc( snapshot_ctx->spad, FD_HASH_ALIGN, FD_HASH_FOOTPRINT );
  fd_memcpy( bank->blockhash_queue.last_hash, slot_bank->block_hash_queue.last_hash, sizeof(fd_hash_t) );

  bank->blockhash_queue.ages_len = fd_hash_hash_age_pair_t_map_size( slot_bank->block_hash_queue.ages_pool, slot_bank->block_hash_queue.ages_root);
  bank->blockhash_queue.ages     = fd_spad_alloc( snapshot_ctx->spad, FD_HASH_HASH_AGE_PAIR_ALIGN, bank->blockhash_queue.ages_len * sizeof(fd_hash_hash_age_pair_t) );
  bank->blockhash_queue.max_age  = FD_BLOCKHASH_QUEUE_SIZE;

  /* Flatten the age map into the manifest's contiguous array, in key order. */
  fd_block_hash_queue_t *             queue              = &slot_bank->block_hash_queue;
  fd_hash_hash_age_pair_t_mapnode_t * nn                 = NULL;
  ulong                               blockhash_queue_idx = 0UL;
  for( fd_hash_hash_age_pair_t_mapnode_t * n = fd_hash_hash_age_pair_t_map_minimum( queue->ages_pool, queue->ages_root ); n; n = nn ) {
    nn = fd_hash_hash_age_pair_t_map_successor( queue->ages_pool, n );
    fd_memcpy( &bank->blockhash_queue.ages[ blockhash_queue_idx++ ], &n->elem, sizeof(fd_hash_hash_age_pair_t) );
  }



  /* Ancestor can be omitted to boot off of for both clients */

  bank->ancestors_len = 0UL;
  bank->ancestors     = NULL;

  bank->hash              = slot_bank->banks_hash;
  bank->parent_hash       = slot_bank->prev_banks_hash;
  bank->parent_slot       = slot_bank->prev_slot;
  bank->hard_forks        = slot_bank->hard_forks;
  bank->transaction_count = slot_bank->transaction_count;
  bank->signature_count   = slot_bank->parent_signature_cnt;
  bank->capitalization    = slot_bank->capitalization;
  bank->tick_height       = slot_bank->tick_height;
  bank->max_tick_height   = slot_bank->max_tick_height;

  /* The hashes_per_tick needs to be copied over from the epoch bank because
     the pointer could go out of bounds during an epoch boundary. */
  bank->hashes_per_tick = fd_spad_alloc( snapshot_ctx->spad, alignof(ulong), sizeof(ulong) );
  fd_memcpy( bank->hashes_per_tick, &epoch_bank->hashes_per_tick, sizeof(ulong) );

  bank->ticks_per_slot        = FD_TICKS_PER_SLOT;
  bank->ns_per_slot           = epoch_bank->ns_per_slot;
  bank->genesis_creation_time = epoch_bank->genesis_creation_time;
  bank->slots_per_year        = epoch_bank->slots_per_year;

  /* This value can be set to 0 because the Agave client recomputes this value
     and the firedancer client doesn't use it. */

  bank->accounts_data_len = 0UL;

  bank->slot         = snapshot_ctx->slot;
  bank->epoch        = fd_slot_to_epoch( &epoch_bank->epoch_schedule, bank->slot, NULL );
  bank->block_height = slot_bank->block_height;

  /* Collector id can be left as null for both clients */

  fd_memset( &bank->collector_id, 0, sizeof(fd_pubkey_t) );

  bank->collector_fees                         = slot_bank->collected_execution_fees + slot_bank->collected_priority_fees;
  bank->fee_calculator.lamports_per_signature  = slot_bank->lamports_per_signature;
  bank->fee_rate_governor                      = slot_bank->fee_rate_governor;
  bank->collected_rent                         = slot_bank->collected_rent;

  bank->rent_collector.epoch          = bank->epoch;
  bank->rent_collector.epoch_schedule = epoch_bank->rent_epoch_schedule;
  bank->rent_collector.slots_per_year = epoch_bank->slots_per_year;
  bank->rent_collector.rent           = epoch_bank->rent;

  bank->epoch_schedule = epoch_bank->epoch_schedule;
  bank->inflation      = epoch_bank->inflation;

  /* Unused accounts can be left as NULL for both clients. */

  fd_memset( &bank->unused_accounts, 0, sizeof(fd_unused_accounts_t) );

  /* We need to copy over the stakes for two epochs despite the Agave client
     providing the stakes for 6 epochs. These stakes need to be copied over
     because of the fact that the leader schedule computation uses the two
     previous epoch stakes.

     TODO: This field has been deprecated by agave and has instead been
     replaced with the versioned epoch stakes field in the manifest. The
     firedancer client will populate the deprecated field. */

  /* Entry [0] = current epoch's stakes, entry [1] = next epoch's stakes;
     only vote_accounts is filled in, the rest stays zeroed. */
  fd_epoch_epoch_stakes_pair_t * relevant_epoch_stakes = fd_spad_alloc( snapshot_ctx->spad, FD_EPOCH_EPOCH_STAKES_PAIR_ALIGN, 2UL * sizeof(fd_epoch_epoch_stakes_pair_t) );
  fd_memset( &relevant_epoch_stakes[0], 0UL, sizeof(fd_epoch_epoch_stakes_pair_t) );
  fd_memset( &relevant_epoch_stakes[1], 0UL, sizeof(fd_epoch_epoch_stakes_pair_t) );
  relevant_epoch_stakes[0].key                        = bank->epoch;
  relevant_epoch_stakes[0].value.stakes.vote_accounts = slot_bank->epoch_stakes;
  relevant_epoch_stakes[1].key                        = bank->epoch+1UL;
  relevant_epoch_stakes[1].value.stakes.vote_accounts = epoch_bank->next_epoch_stakes;

  bank->epoch_stakes_len = 2UL;
  bank->epoch_stakes     = relevant_epoch_stakes;
  /* is_delta marks the bank as belonging to an incremental snapshot. */
  bank->is_delta         = snapshot_ctx->is_incremental;

  /* The firedancer runtime currently maintains a version of the stakes which
     can't be reserialized into a format that is compatible with the Solana
     snapshot format. Therefore, we must recompute the data structure using
     the pubkeys from the stakes cache that is currently in the epoch context. */

  fd_snapshot_create_serialiable_stakes( snapshot_ctx, &epoch_bank->stakes, &bank->stakes );

}
721 :
/* fd_snapshot_create_setup_and_validate_ctx prepares the snapshot context
   for use: it joins an account manager against the funk database, decodes
   the epoch bank and slot bank records out of funk into the context,
   validates the requested snapshot slot and output directory, and resets
   the two scratch files (tmp_fd, snapshot_fd) used during snapshot
   creation.  All failures are fatal: FD_LOG_ERR aborts the process, so on
   return the context is fully initialized. */
static inline void
fd_snapshot_create_setup_and_validate_ctx( fd_snapshot_ctx_t * snapshot_ctx ) {

  fd_funk_t * funk = snapshot_ctx->funk;

  /* Initialize the account manager. */

  uchar * mem = fd_spad_alloc( snapshot_ctx->spad, FD_ACC_MGR_ALIGN, FD_ACC_MGR_FOOTPRINT );
  snapshot_ctx->acc_mgr = fd_acc_mgr_new( mem, funk );
  if( FD_UNLIKELY( !snapshot_ctx->acc_mgr ) ) {
    FD_LOG_ERR(( "Failed to initialize account manager" ));
  }

  /* First the epoch bank.  The bank records in funk are laid out as a uint
     encoding magic followed by the bincode-encoded payload. */

  fd_funk_rec_key_t     epoch_id  = fd_runtime_epoch_bank_key();
  fd_funk_rec_t const * epoch_rec = fd_funk_rec_query( funk, NULL, &epoch_id );
  if( FD_UNLIKELY( !epoch_rec ) ) {
    FD_LOG_ERR(( "Failed to read epoch bank record: missing record" ));
  }
  void * epoch_val = fd_funk_val( epoch_rec, fd_funk_wksp( funk ) );

  /* The record must at least be large enough to hold the magic. */
  if( FD_UNLIKELY( fd_funk_val_sz( epoch_rec )<sizeof(uint) ) ) {
    FD_LOG_ERR(( "Failed to read epoch bank record: empty record" ));
  }

  uint epoch_magic = *(uint*)epoch_val;

  /* Decode everything after the leading magic. */
  fd_bincode_decode_ctx_t epoch_decode_ctx = {
    .data    = (uchar*)epoch_val + sizeof(uint),
    .dataend = (uchar*)epoch_val + fd_funk_val_sz( epoch_rec ),
  };

  if( FD_UNLIKELY( epoch_magic!=FD_RUNTIME_ENC_BINCODE ) ) {
    FD_LOG_ERR(( "Epoch bank record has wrong magic" ));
  }

  /* Size the decode first so the spad allocation is exact. */
  ulong total_sz = 0UL;
  int   err      = fd_epoch_bank_decode_footprint( &epoch_decode_ctx, &total_sz );
  if( FD_UNLIKELY( err!=FD_BINCODE_SUCCESS ) ) {
    FD_LOG_ERR(( "Failed to decode epoch bank" ));
  }

  uchar * epoch_bank_mem = fd_spad_alloc( snapshot_ctx->spad, FD_EPOCH_BANK_ALIGN, total_sz );
  if( FD_UNLIKELY( !epoch_bank_mem ) ) {
    FD_LOG_ERR(( "Failed to allocate memory for epoch bank" ));
  }

  fd_epoch_bank_decode( epoch_bank_mem, &epoch_decode_ctx );

  /* NOTE(review): only the fixed-size top-level struct is copied into the
     ctx; presumably any variable-length fields of the decoded bank point
     back into the spad allocation above, which must outlive the ctx —
     confirm against fd_epoch_bank_decode. */
  fd_memcpy( &snapshot_ctx->epoch_bank, epoch_bank_mem, sizeof(fd_epoch_bank_t) );

  /* Now the slot bank, decoded the same way as the epoch bank. */

  fd_funk_rec_key_t     slot_id  = fd_runtime_slot_bank_key();
  fd_funk_rec_t const * slot_rec = fd_funk_rec_query( funk, NULL, &slot_id );
  if( FD_UNLIKELY( !slot_rec ) ) {
    FD_LOG_ERR(( "Failed to read slot bank record: missing record" ));
  }
  void * slot_val = fd_funk_val( slot_rec, fd_funk_wksp( funk ) );

  if( FD_UNLIKELY( fd_funk_val_sz( slot_rec )<sizeof(uint) ) ) {
    FD_LOG_ERR(( "Failed to read slot bank record: empty record" ));
  }

  uint slot_magic = *(uint*)slot_val;

  fd_bincode_decode_ctx_t slot_decode_ctx = {
    .data    = (uchar*)slot_val + sizeof(uint),
    .dataend = (uchar*)slot_val + fd_funk_val_sz( slot_rec ),
  };

  if( FD_UNLIKELY( slot_magic!=FD_RUNTIME_ENC_BINCODE ) ) {
    FD_LOG_ERR(( "Slot bank record has wrong magic" ));
  }

  total_sz = 0UL;
  err = fd_slot_bank_decode_footprint( &slot_decode_ctx, &total_sz );
  if( FD_UNLIKELY( err!=FD_BINCODE_SUCCESS ) ) {
    FD_LOG_ERR(( "Failed to decode slot bank" ));
  }

  uchar * slot_bank_mem = fd_spad_alloc( snapshot_ctx->spad, FD_SLOT_BANK_ALIGN, total_sz );
  if( FD_UNLIKELY( !slot_bank_mem ) ) {
    FD_LOG_ERR(( "Failed to allocate memory for slot bank" ));
  }

  fd_slot_bank_decode( slot_bank_mem, &slot_decode_ctx );

  /* Same shallow-copy caveat as the epoch bank above. */
  memcpy( &snapshot_ctx->slot_bank, slot_bank_mem, sizeof(fd_slot_bank_t) );

  /* Validate that the snapshot context is setup correctly */

  if( FD_UNLIKELY( !snapshot_ctx->out_dir ) ) {
    FD_LOG_ERR(( "Snapshot directory is not set" ));
  }

  /* A snapshot can only be taken for a slot at or before the bank's slot. */
  if( FD_UNLIKELY( snapshot_ctx->slot>snapshot_ctx->slot_bank.slot ) ) {
    FD_LOG_ERR(( "Snapshot slot=%lu is greater than the current slot=%lu",
                 snapshot_ctx->slot, snapshot_ctx->slot_bank.slot ));
  }

  /* Truncate the two files used for snapshot creation and seek to its start.
     lseek returns the resulting offset, so seeking to 0 must yield 0; any
     nonzero result (including -1 on error) is treated as a failure. */

  long seek = lseek( snapshot_ctx->tmp_fd, 0, SEEK_SET );
  if( FD_UNLIKELY( seek ) ) {
    FD_LOG_ERR(( "Failed to seek to the start of the file" ));
  }

  if( FD_UNLIKELY( ftruncate( snapshot_ctx->tmp_fd, 0UL ) < 0 ) ) {
    FD_LOG_ERR(( "Failed to truncate the temporary file" ));
  }

  seek = lseek( snapshot_ctx->snapshot_fd, 0, SEEK_SET );
  if( FD_UNLIKELY( seek ) ) {
    FD_LOG_ERR(( "Failed to seek to the start of the file" ));
  }

  if( FD_UNLIKELY( ftruncate( snapshot_ctx->snapshot_fd, 0UL ) < 0 ) ) {
    FD_LOG_ERR(( "Failed to truncate the snapshot file" ));
  }

}
845 :
846 : static inline void
847 0 : fd_snapshot_create_setup_writer( fd_snapshot_ctx_t * snapshot_ctx ) {
848 :
849 : /* Setup a tar writer. */
850 :
851 0 : uchar * writer_mem = fd_spad_alloc( snapshot_ctx->spad, fd_tar_writer_align(), fd_tar_writer_footprint() );
852 0 : snapshot_ctx->writer = fd_tar_writer_new( writer_mem, snapshot_ctx->tmp_fd );
853 0 : if( FD_UNLIKELY( !snapshot_ctx->writer ) ) {
854 0 : FD_LOG_ERR(( "Unable to create a tar writer" ));
855 0 : }
856 0 : }
857 :
858 : static inline void
859 0 : fd_snapshot_create_write_version( fd_snapshot_ctx_t * snapshot_ctx ) {
860 :
861 : /* The first file in the tar archive should be the version file.. */
862 :
863 0 : int err = fd_tar_writer_new_file( snapshot_ctx->writer, FD_SNAPSHOT_VERSION_FILE );
864 0 : if( FD_UNLIKELY( err ) ) {
865 0 : FD_LOG_ERR(( "Failed to create the version file" ));
866 0 : }
867 :
868 0 : err = fd_tar_writer_write_file_data( snapshot_ctx->writer, FD_SNAPSHOT_VERSION, FD_SNAPSHOT_VERSION_LEN);
869 0 : if( FD_UNLIKELY( err ) ) {
870 0 : FD_LOG_ERR(( "Failed to create the version file" ));
871 0 : }
872 :
873 0 : err = fd_tar_writer_fini_file( snapshot_ctx->writer );
874 0 : if( FD_UNLIKELY( err ) ) {
875 0 : FD_LOG_ERR(( "Failed to create the version file" ));
876 0 : }
877 :
878 0 : }
879 :
880 : static inline void
881 0 : fd_snapshot_create_write_status_cache( fd_snapshot_ctx_t * snapshot_ctx ) {
882 :
883 : /* First convert the existing status cache into a snapshot-friendly format. */
884 :
885 0 : fd_bank_slot_deltas_t slot_deltas_new = {0};
886 0 : int err = fd_txncache_get_entries( snapshot_ctx->status_cache,
887 0 : &slot_deltas_new,
888 0 : snapshot_ctx->spad );
889 0 : if( FD_UNLIKELY( err ) ) {
890 0 : FD_LOG_ERR(( "Failed to get entries from the status cache" ));
891 0 : }
892 0 : ulong bank_slot_deltas_sz = fd_bank_slot_deltas_size( &slot_deltas_new );
893 0 : uchar * out_status_cache = fd_spad_alloc( snapshot_ctx->spad,
894 0 : FD_BANK_SLOT_DELTAS_ALIGN,
895 0 : bank_slot_deltas_sz );
896 0 : fd_bincode_encode_ctx_t encode_status_cache = {
897 0 : .data = out_status_cache,
898 0 : .dataend = out_status_cache + bank_slot_deltas_sz,
899 0 : };
900 0 : if( FD_UNLIKELY( fd_bank_slot_deltas_encode( &slot_deltas_new, &encode_status_cache ) ) ) {
901 0 : FD_LOG_ERR(( "Failed to encode the status cache" ));
902 0 : }
903 :
904 : /* Now write out the encoded buffer to the tar archive. */
905 :
906 0 : err = fd_tar_writer_new_file( snapshot_ctx->writer, FD_SNAPSHOT_STATUS_CACHE_FILE );
907 0 : if( FD_UNLIKELY( err ) ) {
908 0 : FD_LOG_ERR(( "Failed to create the status cache file" ));
909 0 : }
910 0 : err = fd_tar_writer_write_file_data( snapshot_ctx->writer, out_status_cache, bank_slot_deltas_sz );
911 0 : if( FD_UNLIKELY( err ) ) {
912 0 : FD_LOG_ERR(( "Failed to create the status cache file" ));
913 0 : }
914 0 : err = fd_tar_writer_fini_file( snapshot_ctx->writer );
915 0 : if( FD_UNLIKELY( err ) ) {
916 0 : FD_LOG_ERR(( "Failed to create the status cache file" ));
917 0 : }
918 :
919 : /* Registers all roots and unconstipates the status cache. */
920 :
921 0 : fd_txncache_flush_constipated_slots( snapshot_ctx->status_cache );
922 :
923 0 : }
924 :
/* fd_snapshot_create_write_manifest_and_acc_vecs builds the snapshot
   manifest (bank fields plus the append vec index), streams the account
   files into the tar archive, then encodes the manifest and writes it into
   the space that was reserved for it earlier in the archive.  For full
   snapshots the accounts hash and capitalization are reported back through
   out_hash and out_capitalization; for incremental snapshots the results
   are instead recorded in the manifest's incremental snapshot persistence
   entry. */
static inline void
fd_snapshot_create_write_manifest_and_acc_vecs( fd_snapshot_ctx_t * snapshot_ctx,
                                                fd_hash_t *         out_hash,
                                                ulong *             out_capitalization ) {


  fd_solana_manifest_t manifest = {0};

  /* Copy in all the fields of the bank. */

  fd_snapshot_create_populate_bank( snapshot_ctx, &manifest.bank );

  /* Populate the rest of the manifest, except for the append vec index. */

  manifest.lamports_per_signature = snapshot_ctx->slot_bank.lamports_per_signature;
  manifest.epoch_account_hash     = &snapshot_ctx->slot_bank.epoch_account_hash;

  /* FIXME: The versioned epoch stakes needs to be implemented. Right now if
     we try to create a snapshot on or near an epoch boundary, we will produce
     an invalid snapshot. */

  manifest.versioned_epoch_stakes_len = 0UL;
  manifest.versioned_epoch_stakes     = NULL;

  /* Populate the append vec index and write out the corresponding acc files. */

  ulong incr_capitalization = 0UL;
  fd_snapshot_create_populate_acc_vecs( snapshot_ctx, &manifest, snapshot_ctx->writer, &incr_capitalization );

  /* Once the append vec index is populated and the hashes are calculated,
     propogate the hashes to the correct fields. As a note, the last_snap_hash
     is the full snapshot's account hash.

     NOTE(review): manifest was zero-initialized above, so the dereference of
     bank_incremental_snapshot_persistence below is only safe if
     fd_snapshot_create_populate_acc_vecs allocates it for incremental
     snapshots — confirm against that function. */

  if( snapshot_ctx->is_incremental ) {
    manifest.bank_incremental_snapshot_persistence->full_slot                  = snapshot_ctx->last_snap_slot;
    fd_memcpy( &manifest.bank_incremental_snapshot_persistence->full_hash, snapshot_ctx->last_snap_acc_hash, sizeof(fd_hash_t) );
    manifest.bank_incremental_snapshot_persistence->full_capitalization        = snapshot_ctx->last_snap_capitalization;
    manifest.bank_incremental_snapshot_persistence->incremental_hash           = snapshot_ctx->acc_hash;
    manifest.bank_incremental_snapshot_persistence->incremental_capitalization = incr_capitalization;
  } else {
    memcpy( out_hash, &manifest.accounts_db.bank_hash_info.accounts_hash, sizeof(fd_hash_t) );
    *out_capitalization = snapshot_ctx->slot_bank.capitalization;
  }

  /* At this point, all of the account files are written out and the append
     vec index is populated in the manifest. We have already reserved space
     in the archive for the manifest. All we need to do now is encode the
     manifest and write it in. */

  ulong   manifest_sz  = fd_solana_manifest_size( &manifest );
  uchar * out_manifest = fd_spad_alloc( snapshot_ctx->spad, fd_solana_manifest_align(), manifest_sz );

  fd_bincode_encode_ctx_t encode = {
    .data    = out_manifest,
    .dataend = out_manifest + manifest_sz
  };

  int err = fd_solana_manifest_encode( &manifest, &encode );
  if( FD_UNLIKELY( err ) ) {
    FD_LOG_ERR(( "Failed to encode the manifest" ));
  }

  err = fd_tar_writer_fill_space( snapshot_ctx->writer, out_manifest, manifest_sz );
  if( FD_UNLIKELY( err ) ) {
    FD_LOG_ERR(( "Failed to write out the manifest" ));
  }

  /* The archive is complete; tear down the tar writer. */
  void * mem = fd_tar_writer_delete( snapshot_ctx->writer );
  if( FD_UNLIKELY( !mem ) ) {
    FD_LOG_ERR(( "Unable to delete the tar writer" ));
  }

}
998 :
999 : static inline void
1000 0 : fd_snapshot_create_compress( fd_snapshot_ctx_t * snapshot_ctx ) {
1001 :
1002 : /* Compress the file using zstd. First open the non-compressed file and
1003 : create a file for the compressed file. The reason why we can't do this
1004 : as we stream out the snapshot archive is that we write back into the
1005 : manifest buffer.
1006 :
1007 : TODO: A way to eliminate this and to just stream out
1008 : 1 compressed file would be to totally precompute the index such that
1009 : we don't have to write back into funk.
1010 :
1011 : TODO: Currently, the snapshot service interfaces directly with the zstd
1012 : library but a generalized cstream defined in fd_zstd should be used
1013 : instead. */
1014 :
1015 0 : ulong in_buf_sz = ZSTD_CStreamInSize();
1016 0 : ulong zstd_buf_sz = ZSTD_CStreamOutSize();
1017 0 : ulong out_buf_sz = ZSTD_CStreamOutSize();
1018 :
1019 0 : char * in_buf = fd_spad_alloc( snapshot_ctx->spad, FD_ZSTD_CSTREAM_ALIGN, in_buf_sz );
1020 0 : char * zstd_buf = fd_spad_alloc( snapshot_ctx->spad, FD_ZSTD_CSTREAM_ALIGN, out_buf_sz );
1021 0 : char * out_buf = fd_spad_alloc( snapshot_ctx->spad, FD_ZSTD_CSTREAM_ALIGN, out_buf_sz );
1022 :
1023 : /* Reopen the tarball and open/overwrite the filename for the compressed,
1024 : finalized full snapshot. Setup the zstd compression stream. */
1025 :
1026 0 : int err = 0;
1027 :
1028 0 : ZSTD_CStream * cstream = ZSTD_createCStream();
1029 0 : if( FD_UNLIKELY( !cstream ) ) {
1030 0 : FD_LOG_ERR(( "Failed to create the zstd compression stream" ));
1031 0 : }
1032 0 : ZSTD_initCStream( cstream, ZSTD_CLEVEL_DEFAULT );
1033 :
1034 0 : fd_io_buffered_ostream_t ostream[1];
1035 :
1036 0 : if( FD_UNLIKELY( !fd_io_buffered_ostream_init( ostream, snapshot_ctx->snapshot_fd, out_buf, out_buf_sz ) ) ) {
1037 0 : FD_LOG_ERR(( "Failed to initialize the ostream" ));
1038 0 : }
1039 :
1040 0 : long seek = lseek( snapshot_ctx->snapshot_fd, 0, SEEK_SET );
1041 0 : if( FD_UNLIKELY( seek!=0L ) ) {
1042 0 : FD_LOG_ERR(( "Failed to seek to the start of the file" ));
1043 0 : }
1044 :
1045 : /* At this point, the tar archive and the new zstd file is open. The zstd
1046 : streamer is still open. Now, we are ready to read in bytes and stream
1047 : compress them. We will keep going until we see an EOF in a tar archive. */
1048 :
1049 0 : ulong in_sz = in_buf_sz;
1050 :
1051 0 : ulong off = (ulong)lseek( snapshot_ctx->tmp_fd, 0, SEEK_SET );
1052 0 : if( FD_UNLIKELY( off ) ) {
1053 0 : FD_LOG_ERR(( "Failed to seek to the beginning of the file" ));
1054 0 : }
1055 :
1056 0 : while( in_sz==in_buf_sz ) {
1057 :
1058 : /* Read chunks from the file. There isn't really a need to use a streamed
1059 : reader here because we will read in the max size buffer for every single
1060 : file read except for the very last one.
1061 :
1062 : in_sz will only not equal in_buf_sz on the last read. */
1063 0 : err = fd_io_read( snapshot_ctx->tmp_fd, in_buf, 0UL, in_buf_sz, &in_sz );
1064 0 : if( FD_UNLIKELY( err ) ) {
1065 0 : FD_LOG_ERR(( "Failed to read in the file" ));
1066 0 : }
1067 :
1068 : /* Compress the in memory buffer and add it to the output stream. */
1069 :
1070 0 : ZSTD_inBuffer input = { in_buf, in_sz, 0UL };
1071 0 : while( input.pos<input.size ) {
1072 0 : ZSTD_outBuffer output = { zstd_buf, zstd_buf_sz, 0UL };
1073 0 : ulong ret = ZSTD_compressStream( cstream, &output, &input );
1074 :
1075 0 : if( FD_UNLIKELY( ZSTD_isError( ret ) ) ) {
1076 0 : FD_LOG_ERR(( "Compression error: %s\n", ZSTD_getErrorName( ret ) ));
1077 0 : }
1078 :
1079 0 : err = fd_io_buffered_ostream_write( ostream, zstd_buf, output.pos );
1080 0 : if( FD_UNLIKELY( err ) ) {
1081 0 : FD_LOG_ERR(( "Failed to write out the compressed file" ));
1082 0 : }
1083 0 : }
1084 0 : }
1085 :
1086 : /* Now flush any bytes left in the zstd buffer, cleanup open file
1087 : descriptors, and deinit any data structures. */
1088 :
1089 0 : ZSTD_outBuffer output = { zstd_buf, zstd_buf_sz, 0UL };
1090 0 : ulong remaining = ZSTD_endStream( cstream, &output );
1091 :
1092 0 : if( FD_UNLIKELY( ZSTD_isError( remaining ) ) ) {
1093 0 : FD_LOG_ERR(( "Unable to end the zstd stream" ));
1094 0 : }
1095 0 : if( output.pos>0UL ) {
1096 0 : fd_io_buffered_ostream_write( ostream, zstd_buf, output.pos );
1097 0 : }
1098 :
1099 0 : ZSTD_freeCStream( cstream ); /* Works even if cstream is null */
1100 0 : err = fd_io_buffered_ostream_flush( ostream );
1101 0 : if( FD_UNLIKELY( err ) ) {
1102 0 : FD_LOG_ERR(( "Failed to flush the ostream" ));
1103 0 : }
1104 :
1105 : /* Assuming that there was a successful write, make the compressed
1106 : snapshot file readable and servable. */
1107 :
1108 0 : char tmp_directory_buf_zstd[ FD_SNAPSHOT_DIR_MAX ];
1109 0 : err = snprintf( tmp_directory_buf_zstd, FD_SNAPSHOT_DIR_MAX, "%s/%s", snapshot_ctx->out_dir, snapshot_ctx->is_incremental ? FD_SNAPSHOT_TMP_INCR_ARCHIVE_ZSTD : FD_SNAPSHOT_TMP_FULL_ARCHIVE_ZSTD );
1110 0 : if( FD_UNLIKELY( err<0 ) ) {
1111 0 : FD_LOG_ERR(( "Failed to format directory string" ));
1112 0 : }
1113 :
1114 0 : char directory_buf_zstd[ FD_SNAPSHOT_DIR_MAX ];
1115 0 : if( !snapshot_ctx->is_incremental ) {
1116 0 : err = snprintf( directory_buf_zstd, FD_SNAPSHOT_DIR_MAX, "%s/snapshot-%lu-%s.tar.zst",
1117 0 : snapshot_ctx->out_dir, snapshot_ctx->slot, FD_BASE58_ENC_32_ALLOCA(&snapshot_ctx->snap_hash) );
1118 0 : } else {
1119 0 : err = snprintf( directory_buf_zstd, FD_SNAPSHOT_DIR_MAX, "%s/incremental-snapshot-%lu-%lu-%s.tar.zst",
1120 0 : snapshot_ctx->out_dir, snapshot_ctx->last_snap_slot, snapshot_ctx->slot, FD_BASE58_ENC_32_ALLOCA(&snapshot_ctx->snap_hash) );
1121 0 : }
1122 :
1123 0 : if( FD_UNLIKELY( err<0 ) ) {
1124 0 : FD_LOG_ERR(( "Failed to format directory string" ));
1125 0 : }
1126 :
1127 0 : err = rename( tmp_directory_buf_zstd, directory_buf_zstd );
1128 0 : if( FD_UNLIKELY( err<0 ) ) {
1129 0 : FD_LOG_ERR(( "Failed to rename file from %s to %s (%i-%s)", tmp_directory_buf_zstd, directory_buf_zstd, errno, fd_io_strerror( errno ) ));
1130 0 : }
1131 :
1132 0 : }
1133 :
/* fd_snapshot_create_new_snapshot drives end-to-end snapshot creation for
   snapshot_ctx->slot: validate the context, set up the tar writer, then
   write the version file, status cache, and manifest + account files, and
   finally compress the archive into snapshot_ctx->out_dir.  out_hash and
   out_capitalization are populated for full (non-incremental) snapshots;
   see fd_snapshot_create_write_manifest_and_acc_vecs.  Every step aborts
   the process on failure, so reaching the final log line implies the
   snapshot was produced successfully. */
void
fd_snapshot_create_new_snapshot( fd_snapshot_ctx_t * snapshot_ctx,
                                 fd_hash_t *         out_hash,
                                 ulong *             out_capitalization ) {

  FD_LOG_NOTICE(( "Starting to produce a snapshot for slot=%lu in directory=%s", snapshot_ctx->slot, snapshot_ctx->out_dir ));

  /* Validate that the snapshot_ctx is setup correctly. */

  fd_snapshot_create_setup_and_validate_ctx( snapshot_ctx );

  /* Setup the tar archive writer. */

  fd_snapshot_create_setup_writer( snapshot_ctx );

  /* Write out the version file. */

  fd_snapshot_create_write_version( snapshot_ctx );

  /* Dump the status cache and append it to the tar archive. */

  fd_snapshot_create_write_status_cache( snapshot_ctx );

  /* Populate and write out the manifest and append vecs. */

  fd_snapshot_create_write_manifest_and_acc_vecs( snapshot_ctx, out_hash, out_capitalization );

  /* Compress the tar file and write it out to the specified directory. */

  fd_snapshot_create_compress( snapshot_ctx );

  FD_LOG_NOTICE(( "Finished producing a snapshot" ));

}
|