Line data Source code
1 : #ifndef HEADER_fd_discof_restore_fd_snapwm_tile_private_h
2 : #define HEADER_fd_discof_restore_fd_snapwm_tile_private_h
3 :
4 : /* fd_snapwm_tile_private.h contains private APIs for the "snapwm" tile,
5 : which is the tile responsible for directing vinyl database writes. */
6 :
7 : #include "utils/fd_slot_delta_parser.h"
8 : #include "utils/fd_ssparse.h"
9 : #include "utils/fd_vinyl_admin.h"
10 : #include "../../ballet/lthash/fd_lthash.h"
11 : #include "../../ballet/lthash/fd_lthash_adder.h"
12 : #include "../../disco/stem/fd_stem.h"
13 : #include "../../disco/topo/fd_topo.h"
14 : #include "../../vinyl/io/fd_vinyl_io.h"
15 : #include "../../vinyl/meta/fd_vinyl_meta.h"
16 :
17 0 : #define FD_SNAPWM_WR_MTU (16UL<<20) /* 16 MiB; NOTE(review): presumably the MTU of the write link -- confirm */
18 0 : #define FD_SNAPWM_PAIR_BATCH_CNT_MAX (FD_SSPARSE_ACC_BATCH_MAX) /* max pairs per batch; same limit as FD_SSPARSE_ACC_BATCH_MAX */
19 0 : #define FD_SNAPWM_PAIR_SZ_MAX (fd_vinyl_bstream_pair_sz(FD_RUNTIME_ACC_SZ_MAX)) /* fd_vinyl_bstream_pair_sz applied to FD_RUNTIME_ACC_SZ_MAX */
20 0 : #define FD_SNAPWM_PAIR_BATCH_SZ_MAX (FD_SNAPWM_PAIR_BATCH_CNT_MAX*FD_SNAPWM_PAIR_SZ_MAX) /* bytes for a batch of PAIR_BATCH_CNT_MAX pairs, each of PAIR_SZ_MAX */
21 :
22 0 : #define FD_SNAPWM_DUP_META_BATCH_CNT_MAX (FD_SNAPWM_PAIR_BATCH_CNT_MAX) /* max entries per duplicate-meta batch; equals the pair batch limit */
23 0 : #define FD_SNAPWM_DUP_META_SZ (sizeof(ulong)+sizeof(fd_vinyl_bstream_phdr_t)) /* one ulong plus one pair header; NOTE(review): presumably the seq and phdr passed to duplicate_accounts_batch_append -- confirm */
24 0 : #define FD_SNAPWM_DUP_META_BATCH_SZ (FD_SNAPWM_DUP_META_BATCH_CNT_MAX*FD_SNAPWM_DUP_META_SZ) /* bytes for a batch of DUP_META_BATCH_CNT_MAX entries */
25 :
26 0 : #define FD_SNAPWM_DUP_BATCH_CREDIT_MIN (1UL) /* NOTE(review): presumably the minimum stem credits needed before starting a duplicate batch -- confirm */
27 :
28 : struct fd_snapwm_out_link { /* Bookkeeping for one output link; fd_snapwm_tile embeds one as hash_out. NOTE(review): field meanings below are inferred from names -- confirm against the tile implementation */
29 : ulong idx; /* NOTE(review): presumably the link's output index -- confirm */
30 : fd_wksp_t * mem; /* workspace pointer; NOTE(review): presumably the workspace backing the link's data region -- confirm */
31 : ulong chunk0; /* NOTE(review): presumably the first usable chunk -- confirm */
32 : ulong wmark; /* NOTE(review): presumably the chunk watermark at which chunk wraps to chunk0 -- confirm */
33 : ulong chunk; /* NOTE(review): presumably the next chunk to write -- confirm */
34 : ulong mtu; /* NOTE(review): presumably the max payload size per fragment -- confirm */
35 : ulong depth; /* NOTE(review): presumably the link depth -- confirm */
36 : ulong const * consumer_fseq; /* read-only pointer; NOTE(review): presumably the consumer's flow-control sequence -- confirm */
37 : };
38 : typedef struct fd_snapwm_out_link fd_snapwm_out_link_t;
39 :
40 : struct fd_snapwm_tile { /* Private state of the snapwm tile (the tile directing vinyl database writes) */
41 : int state; /* tile state; the set of valid values is not declared in this header */
42 : uint full : 1; /* loading a full snapshot? */
43 : uint lthash_disabled : 1; /* disable lthash checking? */
44 :
45 : ulong seed; /* NOTE(review): purpose not visible in this header -- confirm */
46 : long boot_timestamp; /* NOTE(review): presumably a timestamp captured at tile boot; units not visible here -- confirm */
47 :
48 : fd_sstxncache_entry_t * txncache_entries; /* NOTE(review): presumably an array of txncache entries -- confirm */
49 : ulong const * txncache_entries_len_ptr; /* read-only pointer; NOTE(review): presumably to the length of txncache_entries -- confirm */
50 :
51 : struct {
52 : /* Account counters (full + incremental) */
53 : ulong accounts_loaded;
54 : ulong accounts_replaced;
55 : ulong accounts_ignored;
56 :
57 : /* Account counters (snapshot taken for full snapshot only) */
58 : ulong full_accounts_loaded;
59 : ulong full_accounts_replaced;
60 : ulong full_accounts_ignored;
61 : } metrics;
62 :
63 : struct { /* NOTE(review): presumably describes the tile's input link -- confirm */
64 : fd_wksp_t * wksp;
65 : ulong chunk0;
66 : ulong wmark;
67 : ulong mtu;
68 : ulong pos; /* NOTE(review): presumably a read position within the current input -- confirm */
69 : } in;
70 :
71 : ulong out_ct_idx; /* NOTE(review): presumably the index of a control output link -- confirm */
72 : fd_snapwm_out_link_t hash_out; /* output link state; NOTE(review): presumably the link used by duplicate_accounts_batch_* -- confirm */
73 :
74 : struct { /* vinyl database state */
75 : uchar * bstream_mem; /* NOTE(review): presumably the mapped bstream region set up by privileged_init -- confirm */
76 : ulong bstream_sz; /* NOTE(review): presumably the size of bstream_mem in bytes -- confirm */
77 :
78 : ulong pair_cnt;
79 : ulong full_pair_cnt; /* NOTE(review): presumably pair_cnt as of the end of the full snapshot -- confirm */
80 : ulong pair_cnt_max;
81 :
82 : /* Vinyl in either io_wd or io_mm mode */
83 : fd_vinyl_io_t * io; /* NOTE(review): presumably aliases whichever of io_wd / io_mm is active -- confirm */
84 : fd_vinyl_io_t * io_wd;
85 : fd_vinyl_io_t * io_mm;
86 : ulong io_seed;
87 :
88 : fd_vinyl_meta_t map[1]; /* vinyl meta map (the index updated on txn commit) */
89 :
90 : ulong txn_seq; /* bstream seq of first txn record (in [seq_past,seq_present]) */
91 : uint txn_active : 1; /* NOTE(review): presumably set between txn_begin and txn_commit/txn_cancel -- confirm */
92 : uint txn_commit : 1; /* NOTE(review): meaning not visible in this header -- confirm */
93 :
94 : ulong duplicate_accounts_batch_sz; /* NOTE(review): presumably bytes accumulated in the current duplicate batch -- confirm */
95 : ulong duplicate_accounts_batch_cnt; /* NOTE(review): presumably entries accumulated in the current duplicate batch -- confirm */
96 :
97 : fd_lthash_adder_t adder;
98 : fd_lthash_value_t running_lthash;
99 : ulong running_capitalization; /* stores capitalization of duplicate accounts */
100 :
101 : ulong wr_cnt; /* NOTE(review): presumably the number of writer (snapwr) tiles -- confirm */
102 : fd_vinyl_admin_t * admin; /* vinyl admin object handled by fd_snapwm_vinyl_{init,update}_admin */
103 :
104 : struct { /* bstream seq values handled by fd_snapwm_vinyl_recovery_seq_{backup,apply} */
105 : ulong seq_ancient;
106 : ulong seq_past;
107 : ulong seq_present;
108 : ulong seq_future;
109 : } recovery;
110 : } vinyl;
111 : };
112 :
113 : typedef struct fd_snapwm_tile fd_snapwm_tile_t;
114 :
115 : FD_PROTOTYPES_BEGIN
116 :
117 0 : #define FD_SNAPWM_IO_SPAD_MAX (64UL<<20) /* 64 MiB of I/O scratch space */
118 :
119 : /* fd_snapwm_vinyl_privileged_init performs administrative tasks, such
120 : as opening and mapping the bstream file descriptor. */
121 :
122 : void
123 : fd_snapwm_vinyl_privileged_init( fd_snapwm_tile_t * ctx, /* snapwm tile state */
124 : fd_topo_t * topo, /* NOTE(review): presumably the topology that contains tile -- confirm */
125 : fd_topo_tile_t * tile ); /* NOTE(review): presumably this tile's topology entry -- confirm */
126 :
127 : /* fd_snapwm_vinyl_unprivileged_init performs setup tasks after being
128 : sandboxed. (anything that might be exposed to untrusted data) */
129 :
130 : void
131 : fd_snapwm_vinyl_unprivileged_init( fd_snapwm_tile_t * ctx, /* snapwm tile state */
132 : fd_topo_t * topo, /* NOTE(review): presumably the topology that contains tile -- confirm */
133 : fd_topo_tile_t * tile, /* NOTE(review): presumably this tile's topology entry -- confirm */
134 : void * io_mm_mem, /* NOTE(review): presumably backing memory for ctx->vinyl.io_mm -- confirm */
135 : void * io_wd_mem ); /* NOTE(review): presumably backing memory for ctx->vinyl.io_wd -- confirm */
136 :
137 : /* fd_snapwm_vinyl_seccomp returns a seccomp sandbox policy suitable
138 : for vinyl operation. */
139 :
140 : ulong /* NOTE(review): presumably the number of sock_filter entries written to out -- confirm */
141 : fd_snapwm_vinyl_seccomp( ulong out_cnt, /* NOTE(review): presumably the capacity of out, in entries -- confirm */
142 : struct sock_filter * out ); /* destination for the seccomp policy */
143 :
144 : /* fd_snapwm_vinyl_reset pauses the snapwr tile (waits for the snapwr
145 : tile to ack) and formats a bstream file to be empty. THIS IS A
146 : DESTRUCTIVE ACTION. */
147 :
148 : void
149 : fd_snapwm_vinyl_reset( fd_snapwm_tile_t * ctx ); /* ctx: snapwm tile state; its vinyl bstream is the one formatted */
150 :
151 : /* fd_snapwm_vinyl_txn_begin starts a transactional burst write.
152 : Assumes vinyl uses the io_mm backend. The write can then either be
153 : committed or cancelled. There is no practical limit on the size of
154 : this burst. Vinyl txn_{begin,commit,cancel} cannot be invoked when
155 : lthash verification is enabled, since a recovery mechanism on failed
156 : snapshots becomes computationally expensive at runtime. Further
157 : details can be found in the recovery code inside the snapwm tile. */
158 :
159 : void
160 : fd_snapwm_vinyl_txn_begin( fd_snapwm_tile_t * ctx ); /* ctx: snapwm tile state; NOTE(review): presumably records ctx->vinyl.txn_seq and sets txn_active -- confirm */
161 :
162 : /* fd_snapwm_vinyl_txn_commit finishes a transactional burst write.
163 : Assumes vinyl uses the io_mm backend. Reads through bstream records
164 : written since txn_begin was called and updates the vinyl_meta index. */
165 :
166 : void
167 : fd_snapwm_vinyl_txn_commit( fd_snapwm_tile_t * ctx ); /* ctx: snapwm tile state; NOTE(review): presumably walks records from ctx->vinyl.txn_seq and updates ctx->vinyl.map -- confirm */
168 :
169 : /* fd_snapwm_vinyl_txn_cancel abandons a transactional burst write.
170 : Assumes vinyl uses the io_mm backend. Reverts the bstream state to
171 : when txn_begin was called. */
172 :
173 : void
174 : fd_snapwm_vinyl_txn_cancel( fd_snapwm_tile_t * ctx ); /* ctx: snapwm tile state; NOTE(review): presumably rewinds the bstream to ctx->vinyl.txn_seq -- confirm */
175 :
176 : /* fd_snapwm_vinyl_wd_init transitions the vinyl backend from generic
177 : vinyl accessor (io_mm) to fast dumb direct account insertion (io_wd).
178 : This must be called before calling fd_snapwm_process_account_*.
179 : Starts the snapwr tile (waits for the snapwr tile to ack). */
180 :
181 : void
182 : fd_snapwm_vinyl_wd_init( fd_snapwm_tile_t * ctx ); /* ctx: snapwm tile state; NOTE(review): presumably switches ctx->vinyl.io to ctx->vinyl.io_wd -- confirm */
183 :
184 : /* fd_snapwm_vinyl_wd_fini transitions the vinyl backend from fast dumb
185 : direct account insertion (io_wd) back to generic mode (io_mm).
186 : Pauses the snapwr tile (waits for the snapwr tile to ack). */
187 :
188 : void
189 : fd_snapwm_vinyl_wd_fini( fd_snapwm_tile_t * ctx ); /* ctx: snapwm tile state; NOTE(review): presumably switches ctx->vinyl.io back to ctx->vinyl.io_mm -- confirm */
190 :
191 : /* fd_snapwm_vinyl_shutdown instructs vinyl-related tiles of the loader
192 : to shut down. Blocks until all affected tiles have acknowledged the
193 : shutdown signal. */
194 :
195 : void
196 : fd_snapwm_vinyl_shutdown( fd_snapwm_tile_t * ctx ); /* ctx: snapwm tile state; the call blocks until the affected tiles acknowledge */
197 :
198 : /* fd_snapwm_vinyl_process_account reads a set of pre-generated bstream
199 : pairs and decides whether to actually add them to the vinyl database.
200 : It supports batch mode as well as single account (pair). */
201 :
202 : void
203 : fd_snapwm_vinyl_process_account( fd_snapwm_tile_t * ctx, /* snapwm tile state */
204 : ulong chunk, /* NOTE(review): presumably the input-link chunk holding the pre-generated pairs -- confirm */
205 : ulong acc_cnt, /* number of accounts (pairs) to process; NOTE(review): presumably 1 in single-account mode and at most FD_SNAPWM_PAIR_BATCH_CNT_MAX -- confirm */
206 : fd_stem_context_t * stem ); /* stem context; NOTE(review): presumably used to publish downstream -- confirm */
207 :
208 : /* fd_snapwm_vinyl_read_account retrieves an account from the vinyl
209 : database. */
210 :
211 : void
212 : fd_snapwm_vinyl_read_account( fd_snapwm_tile_t * ctx, /* snapwm tile state */
213 : void const * acct_addr, /* address of the account to retrieve (read-only) */
214 : fd_account_meta_t * meta, /* NOTE(review): presumably receives the account's metadata -- confirm */
215 : uchar * data, /* NOTE(review): presumably receives the account's data -- confirm */
216 : ulong data_max ); /* NOTE(review): presumably the capacity of data in bytes -- confirm */
217 :
218 : /* fd_snapwm_vinyl_duplicate_accounts_batch_{init,append,fini} handle
219 : duplicate accounts batching when lthash computation is enabled.
220 : The batch is needed to minimize the STEM_BURST, and make the stem
221 : credit handling possible. _fini is responsible for sending the
222 : message downstream.
223 :
224 : Typical usage:
225 : fd_snapwm_vinyl_duplicate_accounts_batch_init( ctx, stem );
226 : for(...) {
227 : ...
228 : fd_snapwm_vinyl_duplicate_accounts_batch_append( ctx, phdr, seq );
229 : }
230 : fd_snapwm_vinyl_duplicate_accounts_batch_fini( ctx, stem );
231 :
232 : They all return 1 on success, and 0 otherwise.
233 :
234 : IMPORTANT: there is an fseq check inside init, since every append
235 : modifies the output link's dcache directly. However, there is no
236 : fseq check inside fini. This is a performance optimization, which
237 : requires no fd_stem_publish between init and fini. */
238 : int /* 1 on success, 0 otherwise */
239 : fd_snapwm_vinyl_duplicate_accounts_batch_init( fd_snapwm_tile_t * ctx, /* snapwm tile state */
240 : fd_stem_context_t * stem ); /* stem context; no fd_stem_publish may happen until the matching _fini */
241 : int /* 1 on success, 0 otherwise */
242 : fd_snapwm_vinyl_duplicate_accounts_batch_append( fd_snapwm_tile_t * ctx, /* snapwm tile state */
243 : fd_vinyl_bstream_phdr_t * phdr, /* pair header of the duplicate account; NOTE(review): presumably copied into the output link's dcache -- confirm */
244 : ulong seq ); /* NOTE(review): presumably the bstream seq associated with phdr -- confirm */
245 : int /* 1 on success, 0 otherwise */
246 : fd_snapwm_vinyl_duplicate_accounts_batch_fini( fd_snapwm_tile_t * ctx, /* snapwm tile state */
247 : fd_stem_context_t * stem ); /* stem context used to send the batch downstream; no fseq check is done here */
248 :
249 : /* fd_snapwm_vinyl_{init,update}_admin provide init and update helper
250 : functions on the vinyl admin object. do_rwlock is a flag indicating
251 : whether the lock is required or not. They return 1 on success and
252 : 0 otherwise. */
253 : int /* 1 on success, 0 otherwise */
254 : fd_snapwm_vinyl_init_admin( fd_snapwm_tile_t * ctx, /* snapwm tile state; NOTE(review): presumably operates on ctx->vinyl.admin -- confirm */
255 : int do_rwlock ); /* flag: whether the lock is required */
256 :
257 : int /* 1 on success, 0 otherwise */
258 : fd_snapwm_vinyl_update_admin( fd_snapwm_tile_t * ctx, /* snapwm tile state; NOTE(review): presumably operates on ctx->vinyl.admin -- confirm */
259 : int do_rwlock ); /* flag: whether the lock is required */
260 :
261 : /* fd_snapwm_vinyl_recovery_seq_{backup,apply} are helper functions
262 : that handle vinyl io bstream seq backup and apply (for recovery).
263 : Both operate on vinyl io_mm seq values, since this is the io that
264 : keeps track of those values. That means that backup must happen
265 : after io init, and apply must happen before io sync. */
266 :
267 : void
268 : fd_snapwm_vinyl_recovery_seq_backup( fd_snapwm_tile_t * ctx ); /* ctx: snapwm tile state; NOTE(review): presumably saves the io_mm seq values into ctx->vinyl.recovery -- confirm */
269 :
270 : void
271 : fd_snapwm_vinyl_recovery_seq_apply( fd_snapwm_tile_t * ctx ); /* ctx: snapwm tile state; NOTE(review): presumably restores the io_mm seq values from ctx->vinyl.recovery -- confirm */
272 :
273 : /* fd_snapwm_vinyl_revert_full provides the mechanism to revert any
274 : changes that happened during a full snapshot load that has been
275 : cancelled. It frees all elements of the vinyl meta map. Finally,
276 : it reverts the bstream seq(s) in vinyl io and syncs the bstream. */
277 :
278 : void
279 : fd_snapwm_vinyl_revert_full( fd_snapwm_tile_t * ctx ); /* ctx: snapwm tile state; NOTE(review): presumably clears ctx->vinyl.map and uses ctx->vinyl.recovery seq values -- confirm */
280 :
281 : /* fd_snapwm_vinyl_revert_incr provides the mechanism to revert any
282 : changes that happened during an incr snapshot load that has been
283 : cancelled. To do this, every bstream pair's phdr info, as well
284 : as the corresponding meta map element's phdr info, is modified to
285 : include val_sz (32 bits), recovery_seq (48 bits) and slot (48 bits)
286 : in the info length of 16 bytes (128 bits). When a new account is
287 : written to the bstream, recovery_seq=0UL is assigned (which works
288 : as a sentinel value). When an account is a duplicate update of an
289 : existing account, the update's recovery_seq corresponds to the seq
290 : value of the existing account in the bstream. This is essentially
291 : a reference to the account that is being superseded.
292 :
293 : bstream: [ full | incr | free )
294 : revert: (*)->......)
295 :
296 : To revert the incremental snapshot, the function walks the bstream
297 : from recovery seq_present (*) towards the future, until all pairs
298 : are processed. If the recovery_seq (in the pair's phdr info) is
299 : 0UL (the sentinel) this account was a new account, and the meta map
300 : entry needs to be freed. If the recovery_seq is less than the
301 : recovery seq_present, the phdr of the pair at recovery_seq is read,
302 : and used to update the meta map element. If the recovery_seq is
303 : greater or equal to the recovery seq_present, this means that the
304 : update was a duplicate on the incr snapshot itself, and it can
305 : be discarded altogether.
306 : Note that as the recovery process moves forward, the meta map entry
307 : and an account update on the incr side of the bstream may see
308 : different recovery_seq values (e.g. consider what happens with
309 : chained duplicate updates). This means that the true recovery_seq
310 : is the one in the phdr info of the bstream pair.
311 : Finally, it reverts the bstream seq(s) in vinyl io and syncs the
312 : bstream. */
313 :
314 : void
315 : fd_snapwm_vinyl_revert_incr( fd_snapwm_tile_t * ctx ); /* ctx: snapwm tile state; NOTE(review): the "recovery seq_present" walk start is presumably ctx->vinyl.recovery.seq_present -- confirm */
316 :
317 : /* fd_snapin_vinyl_pair_info_{from_parts,update_recovery_seq} are
318 : helper functions to update the pair's info.
319 : fd_snapin_vinyl_pair_info_{val_sz,recovery_seq,slot} are helper
320 : functions to retrieve the corresponding values.
321 : In order to facilitate a recovery process, in particular when an
322 : incr snapshot is cancelled, every bstream pair's phdr info, as well
323 : as the corresponding meta map element's phdr info, is modified to
324 : include val_sz (32 bits), recovery_seq (48 bits) and slot (48 bits)
325 : in the info length of 16 bytes (128 bits). */
326 :
327 : void /* NOTE(review): carries the fd_snapin_ prefix although declared in the snapwm private header -- confirm this is intentional */
328 : fd_snapin_vinyl_pair_info_from_parts( fd_vinyl_info_t * info, /* pair info to fill (16 bytes per the layout described for these helpers) */
329 : ulong val_sz, /* stored in 32 bits; NOTE(review): handling of larger values is not visible here */
330 : ulong recovery_seq, /* stored in 48 bits; 0UL is the sentinel for a new account */
331 : ulong slot ); /* stored in 48 bits */
332 : void
333 : fd_snapin_vinyl_pair_info_update_recovery_seq( fd_vinyl_info_t * info, /* pair info to modify; NOTE(review): presumably val_sz and slot are left unchanged -- confirm */
334 : ulong recovery_seq ); /* new recovery_seq (stored in 48 bits) */
335 : ulong fd_snapin_vinyl_pair_info_val_sz ( fd_vinyl_info_t const * info ); /* returns the val_sz (32-bit field) held in info */
336 : ulong fd_snapin_vinyl_pair_info_recovery_seq( fd_vinyl_info_t const * info ); /* returns the recovery_seq (48-bit field) held in info */
337 : ulong fd_snapin_vinyl_pair_info_slot ( fd_vinyl_info_t const * info ); /* returns the slot (48-bit field) held in info */
338 :
339 : FD_PROTOTYPES_END
340 :
341 : #endif /* HEADER_fd_discof_restore_fd_snapwm_tile_private_h */
|