Line data Source code
1 : #ifndef HEADER_fd_src_flamenco_stakes_fd_vote_stakes_h
2 : #define HEADER_fd_src_flamenco_stakes_fd_vote_stakes_h
3 :
4 : #include "../../util/fd_util_base.h"
5 : #include "../types/fd_types_custom.h"
6 :
7 : /* fd_vote_stakes_t is a data structure that stores vote account stake
8 : updates across epoch boundaries. It offers a mapping from vote
9 : account pubkeys to their t_1 and t_2 stakes. The structure is
10 : designed to work with a large amount of vote accounts (in the order
11 : of 10s of millions) along with a relatively small number of forks
12 : across epoch boundaries.
13 :
14 : The structure is designed around these operations:
15 : 1. Inserting updated vote account stakes into a given fork.
16 : 2. Querying the stake for a given vote account with a given fork.
17 :
18 : Concurrent queries are allowed but concurrent inserts are not. This
19 : is fine because the structure is only modified during boot and during
20 : the epoch boundary.
21 :
22 : Given a large number of vote accounts (e.g. 2^25 = 33,554,432), we
23 : need to store the pubkey, t_1 stake, and t_2 stake for each vote
24 : account. We also need to store potentially ~32 forks across each
25 : epoch boundary. If done naively, this would require
26 : 2^25 * (32 + 8 + 8) * 32 = 51GB of memory not including any overhead
27 : for maintaining lookups for accounts.
28 :
29 : To avoid this, we can use some important runtime protocol properties.
30 : The most notable is that across forks, we will only have a small
31 : number of differences in vote account stakes: this is because
32 : realistically very few vote accounts will have differences in stakes
33 : that are caused by forks right before the epoch boundary. So we can
34 : set our bound on elements as the total number of vote accounts +
35 : stakes across forks. Let's say this is 2^26 if the max number of
36 : vote accounts we want to support is 2^25. Now to store our index we
37 : only need:
38 : 2^26 * (32 + 8 + 8) = 3GB of memory (not including overhead).
39 : Our index structure will look like this:
40 : pool<pubkey, stake_t_1, stake_t_2>
41 :
42 : What is described above the index of all vote accounts. We need to
43 : account for the stakes across forks. We have to maintain a list of
44 : all of the index entries used by each fork. It is sufficient to use
45 : a uint list of indices into the index. So each fork is just:
46 : pool<uint>.
47 :
48 : 4 (uint pool idx) * 2^25 (# vote accounts) * 32 (# forks) = 4GiB
49 :
50 : For a given fork, when we insert a vote account we check if:
51 : 1. The vote account + stake pair is already in the index. If it is,
52 : we just increment a reference to the pair. Add it to the list of
53 : pool indices that the fork maintains.
54 : 2. The vote account + stake pair is not in the index. If it is not,
55 : we need to insert it into the index and assume a reference of 1.
56 : Add it to the local list of pool indices that the fork maintains.
57 : In order to make both of these cases performant we need a mapping
58 : from pubkey + stake to the index pool element. This is simply
59 : represented as:
60 : map<(pubkey+stake), index_pool_idx>.
61 :
62 : Now for queries, we need a way to query the t_1 and t_2 stake for a
63 : pubkey on a given fork. The problem is the above map requires the
64 : t_1 stake as part of the key and there are potentially multiple
65 : entries for a given pubkey. This is solved with a
66 : multi_map<pubkey, index_pool_idx>. We query for a given pubkey and
67 : iterate through all of the entries: this is an acceptable trade-off
68 : since there will almost always be one element for a given pubkey
69 : (except around the epoch boundary). However, for each index entry
70 : we need a way to make sure that it's the one that is in our fork's
71 : list of indices. This is solved with a map for each fork that is
72 : keyed by the index pool idx.
73 : map<index_pool_idx, fork list entry>.
74 :
75 : Now, we can quickly insert entries for a given fork and also do fast
76 : queries for a given pubkey.
77 :
78 : The only remaining operation is updating the root fork. If we are
79 : updating which fork is the root, we can safely assume that all other
80 : forks are no longer valid:
81 : 1. either the fork was a competing fork that executed the epoch
82 : boundary and is no longer needed
83 : 2. the fork corresponds to a fork for the previous epoch boundary.
84 :
85 : For any fork that's being removed, we need to reset its fork's pool
86 : and remove any references to the index pool entries. If an index
87 : entry has a reference count of 0, we can remove it from the index
88 : entirely. Under the hood, the forks in use are stored in a deque;
89 : when a root is being advanced, all entries from the deque are removed
90 : and each removed fork's entries are released.
91 :
92 : The memory footprint of what is actually described above is larger
93 : because each key of the index needs to be a compound of
94 : the pubkey, stake_t_1, node_account_t_1, and epoch.
95 :
96 : All public APIs internally acquire and release a read-write lock
97 : so the caller does not need to manage synchronization. Read-only
98 : operations (query, ele_cnt, get_root_idx) acquire a shared read
99 : lock. Mutating operations and the iterator acquire an exclusive
100 : write lock.
101 :
102 : fd_vote_stakes is the Firedancer equivalent to the Agave client
103 : epoch stakes data structure. */
104 :
105 : FD_PROTOTYPES_BEGIN
106 :
107 9387 : #define FD_VOTE_STAKES_ALIGN (128UL)
108 :
109 : struct fd_vote_stakes;
110 : typedef struct fd_vote_stakes fd_vote_stakes_t;
111 :
112 : /* fd_vote_stakes_align returns the minimum alignment required for the
113 : fd_vote_stakes_t struct. */
114 :
115 : ulong
116 : fd_vote_stakes_align( void );
117 :
118 : /* fd_vote_stakes_footprint returns the minimum footprint required for
119 : the fd_vote_stakes_t object given the max number of vote accounts,
120 : the max fork width (number of forks that can cross the epoch
121 : boundary), and the max number of map chains. The map chains should
122 : be a power of 2 that is roughly equal to the expected number of vote
123 : accounts and not the maximum. */
124 :
125 : ulong
126 : fd_vote_stakes_footprint( ulong max_vote_accounts,
127 : ulong expected_vote_accounts,
128 : ulong max_fork_width );
129 :
130 :
131 : /* fd_vote_stakes_new creates a new fd_vote_stakes_t object given a
132 : region of memory sized out according to fd_vote_stakes_footprint.
133 : The underlying data storage will actually support
134 : 2 * max_vote_accounts entries because entries are deduped across
135 : forks after all of the entries have already been inserted. */
136 :
137 : void *
138 : fd_vote_stakes_new( void * shmem,
139 : ulong max_vote_accounts,
140 : ulong expected_vote_accounts,
141 : ulong max_fork_width,
142 : ulong seed );
143 :
144 :
145 : /* fd_vote_stakes_join joins a valid fd_vote_stakes_t object from a
146 : region of memory. */
147 :
148 : fd_vote_stakes_t *
149 : fd_vote_stakes_join( void * shmem );
150 :
151 : /* fd_vote_stakes_root_{insert, update}_key are APIs for
152 : inserting and updating keys for the root fork. These
153 : operations are split out in order to support the snapshot loading
154 : process. The set of stakes from the T-1 epoch are inserted into
155 : the root fork with a call to fd_vote_stakes_root_insert_key. The
156 : set of stakes from the T-2 epoch are updated with a call to
157 : fd_vote_stakes_root_update_meta. The caller is responsible for
158 : ensuring that for a given pubkey, insert_key is called before
159 : update_meta. It is important that these APIs should only be called
160 : while the root fork is the only and current fork in use.
161 :
162 : If update_meta is called on a key that has not had a corresponding
163 : insert_key call, a key is created into the root fork with a t-1 stake
164 : of 0. This usually means the vote account has been deleted, but it
165 : can be possible in the case where the only staker of a vote account
166 : has been marked delinquent in epoch T-1 and needs to be counted
167 : towards clock calculation for the rest of the epoch. */
168 :
169 : void
170 : fd_vote_stakes_root_insert_key( fd_vote_stakes_t * vote_stakes,
171 : fd_pubkey_t const * pubkey,
172 : fd_pubkey_t const * node_account_t_1,
173 : ulong stake_t_1,
174 : uchar commission_t_1,
175 : ulong epoch );
176 :
177 : void
178 : fd_vote_stakes_root_update_meta( fd_vote_stakes_t * vote_stakes,
179 : fd_pubkey_t const * pubkey,
180 : fd_pubkey_t const * node_account_t_2,
181 : ulong stake_t_2,
182 : uchar commission_t_2,
183 : ulong epoch );
184 :
185 : /* fd_vote_stakes_insert inserts a new vote account stake into a given
186 : fork. If the element already exists on a different fork, then the
187 : reference is incremented in the index and the fork will now have an
188 : element which points to the vote stake index element. */
189 :
190 : void
191 : fd_vote_stakes_insert( fd_vote_stakes_t * vote_stakes,
192 : ushort fork_idx,
193 : fd_pubkey_t const * pubkey,
194 : fd_pubkey_t const * node_account_t_1,
195 : fd_pubkey_t const * node_account_t_2,
196 : ulong stake_t_1,
197 : ulong stake_t_2,
198 : uchar commission_t_1,
199 : uchar commission_t_2,
200 : ulong epoch );
201 :
202 : /* fd_vote_stakes_genesis_fini finalizes the vote stakes on the genesis
203 : block. Any vote stakes that have been inserted will be updated to
204 : have identical T-1/T-2 stakes and node accounts. This function
205 : assumes that all vote accounts have already been inserted into the
206 : genesis fork. */
207 :
208 : void
209 : fd_vote_stakes_genesis_fini( fd_vote_stakes_t * vote_stakes );
210 :
211 : /* fd_vote_stakes_new_child creates a new child fork and returns the
212 : index identifier for the new fork. */
213 :
214 : ushort
215 : fd_vote_stakes_new_child( fd_vote_stakes_t * vote_stakes );
216 :
217 : /* fd_vote_stakes_advance_root will move the root fork to the new
218 : candidate root fork. If the root_idx is equal to the root, this
219 : function is a no-op. However, if the root is different, all other
220 : child nodes will be removed from the structure. */
221 :
222 : void
223 : fd_vote_stakes_advance_root( fd_vote_stakes_t * vote_stakes,
224 : ushort root_idx );
225 :
226 : /* fd_vote_stakes_query_stake queries the stake for a given vote account
227 : in the given fork. If the element is found returns 1, otherwise
228 : returns 0. If any of the optional fields are set to NULL, then their
229 : corresponding value will not be set. If the stake_t_{1,2}_out_opt is
230 : set to 0UL and the record is found, that means the vote account
231 : either did not exist at the end of the t-{1,2} epoch boundary or had
232 : zero stake: they are treated as the same thing. */
233 :
234 : int
235 : fd_vote_stakes_query( fd_vote_stakes_t * vote_stakes,
236 : ushort fork_idx,
237 : fd_pubkey_t const * pubkey,
238 : ulong * stake_t_1_out_opt,
239 : ulong * stake_t_2_out_opt,
240 : fd_pubkey_t * node_account_t_1_out_opt,
241 : fd_pubkey_t * node_account_t_2_out_opt,
242 : uchar * commission_t_1_out_opt,
243 : uchar * commission_t_2_out_opt );
244 :
245 : int
246 : fd_vote_stakes_query_pubkey( fd_vote_stakes_t * vote_stakes,
247 : ushort fork_idx,
248 : fd_pubkey_t const * pubkey );
249 :
250 : /* fd_vote_stakes_query_t_1 and fd_vote_stakes_query_t_2 are shortcuts
251 : for querying the t_1 and t_2 stake for a given vote account in the
252 : given fork. 0 is returned if the vote account does not exist for the
253 : epoch or if it has zero stake. If the account is found, stake_out,
254 : node_account_out, and commission_out will be set. */
255 :
256 : int
257 : fd_vote_stakes_query_t_1( fd_vote_stakes_t * vote_stakes,
258 : ushort fork_idx,
259 : fd_pubkey_t const * pubkey,
260 : ulong * stake_out,
261 : fd_pubkey_t * node_account_out,
262 : uchar * commission_out );
263 :
264 : int
265 : fd_vote_stakes_query_t_2( fd_vote_stakes_t * vote_stakes,
266 : ushort fork_idx,
267 : fd_pubkey_t const * pubkey,
268 : ulong * stake_out,
269 : fd_pubkey_t * node_account_out,
270 : uchar * commission_out );
271 :
272 : /* fd_vote_stakes_ele_cnt returns the number of entries for a given
273 : fork. */
274 :
275 : uint
276 : fd_vote_stakes_ele_cnt( fd_vote_stakes_t * vote_stakes,
277 : ushort fork_idx );
278 :
279 : /* fd_vote_stakes_get_root_idx returns the index of the root fork. */
280 :
281 : ushort
282 : fd_vote_stakes_get_root_idx( fd_vote_stakes_t * vote_stakes );
283 :
284 : /* fd_vote_stakes_reset resets the vote stakes object to the initial
285 : state. This is useful for resetting vote stakes if a new snapshot
286 : manifest is being loaded. */
287 :
288 : void
289 : fd_vote_stakes_reset( fd_vote_stakes_t * vote_stakes );
290 :
291 : #define FD_VOTE_STAKES_ITER_FOOTPRINT (16UL)
292 : #define FD_VOTE_STAKES_ITER_ALIGN (8UL)
293 : struct stakes_map_iter_t;
294 : typedef struct stakes_map_iter_t fd_vote_stakes_iter_t;
295 :
296 : /* A caller can iterate through the entries for a given fork. The
297 : iterator is initialized by a call to fd_vote_stakes_fork_iter_init.
298 : The caller is responsible for managing the memory for the iterator.
299 : It is safe to call fd_vote_stakes_fork_iter_next if the result of
300 : fd_vote_stakes_fork_iter_done() == 0. It is safe to call
301 : fd_vote_stakes_fork_iter_ele() to get the current entry if there is
302 : a valid initialized iterator. fd_vote_stakes_fork_iter_next is
303 : called to advance the iterator.
304 :
305 : fd_vote_stakes_fork_iter_init acquires a write lock on the vote
306 : stakes. The caller MUST call fd_vote_stakes_fork_iter_fini after
307 : the iteration loop completes to release the lock.
308 :
309 : It is not safe to call any other vote stakes apis while an iteration
310 : is in progress.
311 :
312 : Example use:
313 : uchar __attribute__((aligned(FD_VOTE_STAKES_ITER_ALIGN))) iter_mem[ FD_VOTE_STAKES_ITER_FOOTPRINT ];
314 : for( fd_vote_stakes_iter_t * iter = fd_vote_stakes_fork_iter_init( vote_stakes, fork_idx, iter_mem );
315 : !fd_vote_stakes_fork_iter_done( vote_stakes, fork_idx, iter );
316 : fd_vote_stakes_fork_iter_next( vote_stakes, fork_idx, iter ) ) {
317 : fd_vote_stakes_fork_iter_ele( vote_stakes, fork_idx, iter, &pubkey, &stake_t_1, &stake_t_2, &node_account_t_1, &node_account_t_2, &commission_t_1, &commission_t_2 );
318 : }
319 : fd_vote_stakes_fork_iter_fini( vote_stakes );
320 :
321 : Under the hood, the vote stakes iterator is a wrapper of the map
322 : chain iterator.
323 :
324 : TODO: fork_idx can probably get absorbed into the iterator. */
325 :
326 : fd_vote_stakes_iter_t *
327 : fd_vote_stakes_fork_iter_init( fd_vote_stakes_t * vote_stakes,
328 : ushort fork_idx,
329 : uchar iter_mem[ static FD_VOTE_STAKES_ITER_FOOTPRINT ] );
330 :
331 : int
332 : fd_vote_stakes_fork_iter_done( fd_vote_stakes_t * vote_stakes,
333 : ushort fork_idx,
334 : fd_vote_stakes_iter_t * iter );
335 :
336 : void
337 : fd_vote_stakes_fork_iter_next( fd_vote_stakes_t * vote_stakes,
338 : ushort fork_idx,
339 : fd_vote_stakes_iter_t * iter );
340 :
341 : void
342 : fd_vote_stakes_fork_iter_ele( fd_vote_stakes_t * vote_stakes,
343 : ushort fork_idx,
344 : fd_vote_stakes_iter_t * iter,
345 : fd_pubkey_t * pubkey_out,
346 : ulong * stake_t_1_out_opt,
347 : ulong * stake_t_2_out_opt,
348 : fd_pubkey_t * node_account_t_1_out_opt,
349 : fd_pubkey_t * node_account_t_2_out_opt,
350 : uchar * commission_t_1_out_opt,
351 : uchar * commission_t_2_out_opt );
352 :
353 : void
354 : fd_vote_stakes_fork_iter_fini( fd_vote_stakes_t * vote_stakes );
355 :
356 : FD_PROTOTYPES_END
357 :
358 : #endif
|