Line data Source code
1 : #ifndef HEADER_fd_discof_restore_fd_snapwm_tile_private_h
2 : #define HEADER_fd_discof_restore_fd_snapwm_tile_private_h
3 :
4 : /* fd_snapwm_tile_private.h contains private APIs for the "snapwm" tile,
5 : which is the tile responsible for directing vinyl database writes. */
6 :
7 : #include "utils/fd_slot_delta_parser.h"
8 : #include "utils/fd_ssparse.h"
9 : #include "utils/fd_vinyl_admin.h"
10 : #include "../../ballet/lthash/fd_lthash.h"
11 : #include "../../ballet/lthash/fd_lthash_adder.h"
12 : #include "../../disco/stem/fd_stem.h"
13 : #include "../../disco/topo/fd_topo.h"
14 : #include "../../vinyl/io/fd_vinyl_io.h"
15 : #include "../../vinyl/meta/fd_vinyl_meta.h"
16 :
17 0 : #define FD_SNAPWM_WR_MTU (16UL<<20) /* 16*2^20 (16 MiB when counted in bytes) */
18 0 : #define FD_SNAPWM_PAIR_BATCH_CNT_MAX (FD_SSPARSE_ACC_BATCH_MAX) /* pair batch count limit; defined as FD_SSPARSE_ACC_BATCH_MAX */
19 0 : #define FD_SNAPWM_PAIR_SZ_MAX (fd_vinyl_bstream_pair_sz(FD_RUNTIME_ACC_SZ_MAX)) /* fd_vinyl_bstream_pair_sz evaluated at FD_RUNTIME_ACC_SZ_MAX */
20 0 : #define FD_SNAPWM_PAIR_BATCH_SZ_MAX (FD_SNAPWM_PAIR_BATCH_CNT_MAX*FD_SNAPWM_PAIR_SZ_MAX) /* batch count limit times per-pair size limit */
21 :
22 0 : #define FD_SNAPWM_DUP_META_BATCH_CNT_MAX (FD_SNAPWM_PAIR_BATCH_CNT_MAX) /* same count limit as a pair batch */
23 0 : #define FD_SNAPWM_DUP_META_SZ (sizeof(ulong)+sizeof(fd_vinyl_bstream_phdr_t)) /* one ulong plus one pair header; NOTE(review): presumably the (seq,phdr) given to fd_snapwm_vinyl_duplicate_accounts_batch_append -- confirm */
24 0 : #define FD_SNAPWM_DUP_META_BATCH_SZ (FD_SNAPWM_DUP_META_BATCH_CNT_MAX*FD_SNAPWM_DUP_META_SZ) /* batch count limit times per-entry size */
25 :
26 0 : #define FD_SNAPWM_DUP_BATCH_CREDIT_MIN (1UL) /* NOTE(review): presumably stem credits required by the duplicate accounts batch path -- confirm */
27 0 : #define FD_SNAPWM_DUP_LTHASH_CREDIT_MIN ((FD_LTHASH_LEN_BYTES+(ctx->hash_out.mtu-1))/ctx->hash_out.mtu) /* ceil( FD_LTHASH_LEN_BYTES / hash_out.mtu ). Not hygienic: expands to use a variable named ctx, which must be in scope (with hash_out.mtu!=0) wherever this macro is used */
28 :
29 : struct fd_snapwm_out_link { /* Per-output-link bookkeeping. NOTE(review): field meanings below are inferred from names unless stated otherwise -- confirm against the tile implementation */
30 : ulong idx; /* presumably the index of this link among the tile's out links -- confirm */
31 : fd_wksp_t * mem; /* workspace pointer; presumably the workspace backing the link's data region -- confirm */
32 : ulong chunk0; /* presumably the first usable chunk -- confirm */
33 : ulong wmark; /* presumably the last chunk at which a message may start -- confirm */
34 : ulong chunk; /* presumably the next chunk to write -- confirm */
35 : ulong mtu; /* bytes per message; used as the divisor in FD_SNAPWM_DUP_LTHASH_CREDIT_MIN (via hash_out) */
36 : ulong depth; /* presumably the depth of the link's message ring -- confirm */
37 : ulong const * consumer_fseq; /* read-only here; presumably the consumer's flow-control sequence used for fseq checks -- confirm */
38 : };
39 : typedef struct fd_snapwm_out_link fd_snapwm_out_link_t;
40 :
41 : struct fd_snapwm_tile { /* snapwm tile context; this is the ctx argument taken by the fd_snapwm_vinyl_* functions declared in this header */
42 : int state; /* NOTE(review): the set of valid values is not defined in this header -- see the tile implementation */
43 : uint full : 1; /* loading a full snapshot? */
44 : uint lthash_disabled : 1; /* disable lthash checking? */
45 :
46 : ulong seed;
47 : long boot_timestamp;
48 :
49 : fd_sstxncache_entry_t * txncache_entries;
50 : ulong const * txncache_entries_len_ptr; /* read-only here; presumably points at the entry count of txncache_entries -- confirm */
51 :
52 : struct {
53 : /* Account counters (full + incremental) */
54 : ulong accounts_loaded;
55 : ulong accounts_replaced;
56 : ulong accounts_ignored;
57 :
58 : /* Account counters (snapshot taken for full snapshot only) */
59 : ulong full_accounts_loaded;
60 : ulong full_accounts_replaced;
61 : ulong full_accounts_ignored;
62 : } metrics;
63 :
64 : struct { /* NOTE(review): presumably describes the input link's data region -- confirm */
65 : fd_wksp_t * wksp;
66 : ulong chunk0;
67 : ulong wmark;
68 : ulong mtu;
69 : ulong pos;
70 : } in;
71 :
72 : ulong out_ct_idx; /* NOTE(review): presumably the index of an out link -- confirm which one */
73 : fd_snapwm_out_link_t hash_out; /* out link whose mtu is read by FD_SNAPWM_DUP_LTHASH_CREDIT_MIN */
74 :
75 : struct {
76 : uchar * bstream_mem;
77 : ulong bstream_sz;
78 :
79 : ulong pair_cnt;
80 :
81 : /* Vinyl in either io_wd or io_mm mode */
82 : fd_vinyl_io_t * io; /* presumably the currently active backend (one of the two below) -- confirm */
83 : fd_vinyl_io_t * io_wd; /* fast dumb direct account insertion backend (see fd_snapwm_vinyl_wd_init) */
84 : fd_vinyl_io_t * io_mm; /* generic vinyl accessor backend (see fd_snapwm_vinyl_wd_fini) */
85 : ulong io_seed;
86 :
87 : fd_vinyl_meta_t map[1]; /* vinyl meta map (one-element array so it can be used like a pointer) */
88 :
89 : ulong txn_seq; /* bstream seq of first txn record (in [seq_past,seq_present]) */
90 : uint txn_active : 1; /* NOTE(review): presumably set between txn_begin and txn_commit/txn_cancel -- confirm */
91 : uint txn_commit : 1;
92 :
93 : ulong duplicate_accounts_batch_sz; /* NOTE(review): presumably maintained by fd_snapwm_vinyl_duplicate_accounts_batch_{init,append,fini} -- confirm */
94 : ulong duplicate_accounts_batch_cnt; /* NOTE(review): see duplicate_accounts_batch_sz */
95 :
96 : fd_lthash_adder_t adder;
97 : fd_lthash_value_t running_lthash;
98 :
99 : ulong wr_cnt;
100 : fd_vinyl_admin_t * admin; /* vinyl admin object (see fd_snapwm_vinyl_{init,update}_admin) */
101 :
102 : struct { /* NOTE(review): presumably the bstream seq values handled by fd_snapwm_vinyl_recovery_seq_{backup,apply} -- confirm */
103 : ulong seq_ancient;
104 : ulong seq_past;
105 : ulong seq_present;
106 : ulong seq_future;
107 : } recovery;
108 : } vinyl;
109 : };
110 :
111 : typedef struct fd_snapwm_tile fd_snapwm_tile_t;
112 :
113 : FD_PROTOTYPES_BEGIN
114 :
115 0 : #define FD_SNAPWM_IO_SPAD_MAX (64UL<<20) /* 64 MiB of I/O scratch space */
116 :
117 : /* fd_snapwm_vinyl_privileged_init performs administrative tasks, such
118 : as opening and mapping the bstream file descriptor. */
119 :
120 : void
121 : fd_snapwm_vinyl_privileged_init( fd_snapwm_tile_t * ctx,
122 : fd_topo_t * topo,
123 : fd_topo_tile_t * tile );
124 :
125 : /* fd_snapwm_vinyl_unprivileged_init performs setup tasks after being
126 : sandboxed (i.e. anything that might be exposed to untrusted data). */
127 :
128 : void
129 : fd_snapwm_vinyl_unprivileged_init( fd_snapwm_tile_t * ctx,
130 : fd_topo_t * topo,
131 : fd_topo_tile_t * tile,
132 : void * io_mm_mem,
133 : void * io_wd_mem );
134 :
135 : /* fd_snapwm_vinyl_seccomp returns a seccomp sandbox policy suitable
136 : for vinyl operation. */
137 :
138 : ulong
139 : fd_snapwm_vinyl_seccomp( ulong out_cnt,
140 : struct sock_filter * out );
141 :
142 : /* fd_snapwm_vinyl_reset pauses the snapwr tile (waits for the snapwr
143 : tile to ack) and formats a bstream file to be empty. THIS IS A
144 : DESTRUCTIVE ACTION. */
145 :
146 : void
147 : fd_snapwm_vinyl_reset( fd_snapwm_tile_t * ctx );
148 :
149 : /* fd_snapwm_vinyl_txn_begin starts a transactional burst write.
150 : Assumes vinyl uses the io_mm backend. The write can then either be
151 : committed or cancelled. There is no practical limit on the size of
152 : this burst. */
153 :
154 : void
155 : fd_snapwm_vinyl_txn_begin( fd_snapwm_tile_t * ctx );
156 :
157 : /* fd_snapwm_vinyl_txn_commit finishes a transactional burst write.
158 : Assumes vinyl uses the io_mm backend. Reads through bstream records
159 : written since txn_begin was called and updates the vinyl_meta index. */
160 :
161 : void
162 : fd_snapwm_vinyl_txn_commit( fd_snapwm_tile_t * ctx, fd_stem_context_t * stem );
163 :
164 : /* fd_snapwm_vinyl_txn_cancel abandons a transactional burst write.
165 : Assumes vinyl uses the io_mm backend. Reverts the bstream state to
166 : when txn_begin was called. */
167 :
168 : void
169 : fd_snapwm_vinyl_txn_cancel( fd_snapwm_tile_t * ctx );
170 :
171 : /* fd_snapwm_vinyl_wd_init transitions the vinyl backend from generic
172 : vinyl accessor (io_mm) to fast dumb direct account insertion (io_wd).
173 : This must be called before calling fd_snapwm_process_account_*.
174 : Starts the snapwr tile (waits for the snapwr tile to ack). */
175 :
176 : void
177 : fd_snapwm_vinyl_wd_init( fd_snapwm_tile_t * ctx );
178 :
179 : /* fd_snapwm_vinyl_wd_fini transitions the vinyl backend from fast dumb
180 : direct account insertion (io_wd) back to generic mode (io_mm).
181 : Pauses the snapwr tile (waits for the snapwr tile to ack). */
182 :
183 : void
184 : fd_snapwm_vinyl_wd_fini( fd_snapwm_tile_t * ctx );
185 :
186 : /* fd_snapwm_vinyl_shutdown instructs vinyl-related tiles of the loader
187 : to shut down. Blocks until all affected tiles have acknowledged the
188 : shutdown signal. */
189 :
190 : void
191 : fd_snapwm_vinyl_shutdown( fd_snapwm_tile_t * ctx );
192 :
193 : /* fd_snapwm_vinyl_process_account reads a set of pre-generated bstream
194 : pairs and decides whether to actually add them to the vinyl database.
195 : It supports batch mode as well as single account (pair). */
196 :
197 : void
198 : fd_snapwm_vinyl_process_account( fd_snapwm_tile_t * ctx,
199 : ulong chunk,
200 : ulong acc_cnt,
201 : fd_stem_context_t * stem );
202 :
203 : /* fd_snapwm_vinyl_read_account retrieves an account from the vinyl
204 : database. */
205 :
206 : void
207 : fd_snapwm_vinyl_read_account( fd_snapwm_tile_t * ctx,
208 : void const * acct_addr,
209 : fd_account_meta_t * meta,
210 : uchar * data,
211 : ulong data_max );
212 :
213 : /* fd_snapwm_vinyl_duplicate_accounts_batch_{init,append,fini} handle
214 : duplicate accounts batching when lthash computation is enabled.
215 : The batch is needed to minimize the STEM_BURST, and make the stem
216 : credit handling possible. _fini is responsible for sending the
217 : message downstream.
218 :
219 : Typical usage:
220 : fd_snapwm_vinyl_duplicate_accounts_batch_init( ctx, stem );
221 : for(...) {
222 : ...
223 : fd_snapwm_vinyl_duplicate_accounts_batch_append( ctx, phdr, seq );
224 : }
225 : fd_snapwm_vinyl_duplicate_accounts_batch_fini( ctx, stem );
226 :
227 : They all return 1 on success, and 0 otherwise.
228 :
229 : IMPORTANT: there is an fseq check inside init, since every append
230 : modifies the output link's dcache directly. However, there is no
231 : fseq check inside fini. This is a performance optimization, which
232 : requires no fd_stem_publish between init and fini. */
233 : int
234 : fd_snapwm_vinyl_duplicate_accounts_batch_init( fd_snapwm_tile_t * ctx,
235 : fd_stem_context_t * stem );
236 : int
237 : fd_snapwm_vinyl_duplicate_accounts_batch_append( fd_snapwm_tile_t * ctx,
238 : fd_vinyl_bstream_phdr_t * phdr,
239 : ulong seq );
240 : int
241 : fd_snapwm_vinyl_duplicate_accounts_batch_fini( fd_snapwm_tile_t * ctx,
242 : fd_stem_context_t * stem );
243 :
244 : /* fd_snapwm_vinyl_duplicate_accounts_lthash_{init,append,fini} handle
245 : duplicate accounts lthash local calculation when lthash computation
246 : is enabled. This is typically only needed when the account is an
247 : "old" duplicate (meaning that it corresponds to an older slot than
248 : what is currently in the database). _fini is responsible for
249 : sending the message downstream.
250 :
251 : Typical usage:
252 : fd_snapwm_vinyl_duplicate_accounts_lthash_init( ctx, stem );
253 : for(...) {
254 : ...
255 : fd_snapwm_vinyl_duplicate_accounts_lthash_append( ctx, pair );
256 : }
257 : fd_snapwm_vinyl_duplicate_accounts_lthash_fini( ctx, stem );
258 :
259 : They all return 1 on success, and 0 otherwise.
260 :
261 : IMPORTANT: the fseq check happens only inside fini, since append
262 : only operates on internal variables. Therefore, it is safe to have
263 : fd_stem_publish in between init and fini. */
264 : int
265 : fd_snapwm_vinyl_duplicate_accounts_lthash_init( fd_snapwm_tile_t * ctx,
266 : fd_stem_context_t * stem );
267 : int
268 : fd_snapwm_vinyl_duplicate_accounts_lthash_append( fd_snapwm_tile_t * ctx,
269 : uchar * pair );
270 : int
271 : fd_snapwm_vinyl_duplicate_accounts_lthash_fini( fd_snapwm_tile_t * ctx,
272 : fd_stem_context_t * stem );
273 :
274 : /* fd_snapwm_vinyl_{init,update}_admin provide init and update helper
275 : functions on the vinyl admin object. do_rwlock is a flag indicating
276 : whether the lock is required or not. They return 1 on success and
277 : 0 otherwise. */
278 : int
279 : fd_snapwm_vinyl_init_admin( fd_snapwm_tile_t * ctx,
280 : int do_rwlock );
281 :
282 : int
283 : fd_snapwm_vinyl_update_admin( fd_snapwm_tile_t * ctx,
284 : int do_rwlock );
285 :
286 : /* fd_snapwm_vinyl_recovery_seq_{backup,apply} are helper functions
287 : that handle vinyl io bstream seq backup and apply (for recovery).
288 : Both operate on vinyl io_mm seq values, since this is the io that
289 : keeps track of those values. That means that backup must happen
290 : after io init, and apply must happen before io sync. */
291 :
292 : void
293 : fd_snapwm_vinyl_recovery_seq_backup( fd_snapwm_tile_t * ctx );
294 :
295 : void
296 : fd_snapwm_vinyl_recovery_seq_apply( fd_snapwm_tile_t * ctx );
297 :
298 : /* fd_snapwm_vinyl_revert_full provides the mechanism to revert any
299 : changes that happened during a full snapshot load that has been
300 : cancelled. It frees all elements of the vinyl meta map. Finally,
301 : it reverts the bstream seq(s) in vinyl io and syncs the bstream. */
302 :
303 : void
304 : fd_snapwm_vinyl_revert_full( fd_snapwm_tile_t * ctx );
305 :
306 : /* fd_snapwm_vinyl_revert_incr provides the mechanism to revert any
307 : changes that happened during an incr snapshot load that has been
308 : cancelled. To do this, every bstream pair's phdr info, as well
309 : as the corresponding meta map element's phdr info, is modified to
310 : include val_sz (32 bits), recovery_seq (48 bits) and slot (48 bits)
311 : in the info length of 16 bytes (128 bits). When a new account is
312 : written to the bstream, recovery_seq=0UL is assigned (which works
313 : as a sentinel value). When an account is a duplicate update of an
314 : existing account, the update's recovery_seq corresponds to the seq
315 : value of the existing account in the bstream. This is essentially
316 : a reference to the account that is being superseded.
317 :
318 : bstream: [ full | incr | free )
319 : revert: (*)->......)
320 :
321 : To revert the incremental snapshot, the function walks the bstream
322 : from recovery seq_present (*) towards the future, until all pairs
323 : are processed. If the recovery_seq (in the pair's phdr info) is
324 : 0UL (the sentinel) this account was a new account, and the meta map
325 : entry needs to be freed. If the recovery_seq is less than the
326 : recovery seq_present, the phdr of the pair at recovery_seq is read,
327 : and used to update the meta map element. If the recovery_seq is
328 : greater or equal to the recovery seq_present, this means that the
329 : update was a duplicate on the incr snapshot itself, and it can
330 : be discarded altogether.
331 : Note that as the recovery process moves forward, the meta map entry
332 : and an account update on the incr side of the bstream may see
333 : different recovery_seq values (e.g. consider what happens with
334 : chained duplicate updates). This means that the true recovery_seq
335 : is the one in the phdr info of the bstream pair.
336 : Finally, it reverts the bstream seq(s) in vinyl io and syncs the
337 : bstream. */
338 :
339 : void
340 : fd_snapwm_vinyl_revert_incr( fd_snapwm_tile_t * ctx );
341 :
342 : /* fd_snapin_vinyl_pair_info_{from_parts,update_recovery_seq} are
343 : helper functions to update the pair's info.
344 : fd_snapin_vinyl_pair_info_{val_sz,recovery_seq,slot} are helper
345 : functions to retrieve the corresponding values.
346 : In order to facilitate a recovery process, in particular when an
347 : incr snapshot is cancelled, every bstream pair's phdr info, as well
348 : as the corresponding meta map element's phdr info, is modified to
349 : include val_sz (32 bits), recovery_seq (48 bits) and slot (48 bits)
350 : in the info length of 16 bytes (128 bits). */
351 :
352 : void
353 : fd_snapin_vinyl_pair_info_from_parts( fd_vinyl_info_t * info,
354 : ulong val_sz,
355 : ulong recovery_seq,
356 : ulong slot );
357 : void
358 : fd_snapin_vinyl_pair_info_update_recovery_seq( fd_vinyl_info_t * info,
359 : ulong recovery_seq );
360 : ulong fd_snapin_vinyl_pair_info_val_sz ( fd_vinyl_info_t const * info );
361 : ulong fd_snapin_vinyl_pair_info_recovery_seq( fd_vinyl_info_t const * info );
362 : ulong fd_snapin_vinyl_pair_info_slot ( fd_vinyl_info_t const * info );
363 :
364 : FD_PROTOTYPES_END
365 :
366 : #endif /* HEADER_fd_discof_restore_fd_snapwm_tile_private_h */
|