Line data Source code
1 : #ifndef HEADER_fd_src_flamenco_stakes_fd_vote_stakes_h
2 : #define HEADER_fd_src_flamenco_stakes_fd_vote_stakes_h
3 :
4 : #include "../fd_flamenco_base.h"
5 :
6 : /* fd_vote_stakes_t is a data structure that stores vote account stake
7 : updates across epoch boundaries. It offers a mapping from vote
8 : account pubkeys to their t_1 and t_2 stakes. The structure is
9 : designed to work with a large amount of vote accounts (in the order
10 : of 10s of millions) along with a relatively small number of forks
11 : across epoch boundaries.
12 :
13 : The structure is designed around these operations:
14 : 1. Inserting updated vote account stakes into a given fork.
15 : 2. Querying the stake for a given vote account with a given fork.
16 :
17 : Concurrent queries are allowed but concurrent inserts are not. This
18 : is fine because the structure is only modified during boot and during
19 : the epoch boundary.
20 :
21 : Given a large number of vote accounts (e.g. 2^25 = 33,554,432), we
22 : need to store the pubkey, t_1 stake, and t_2 stake for each vote
23 : account. We also need to store potentially ~32 forks across each
24 : epoch boundary. If done naively, this would require
25 : 2^25 * (32 + 8 + 8) * 32 = 51GB of memory not including any overhead
26 : for maintaining lookups for accounts.
27 :
28 : To avoid this, we can use some important runtime protocol properties.
29 : The most notable is that across forks, we will only have a small
30 : number of differences in vote account stakes: this is because
31 : realistically very few vote accounts will have differences in stakes
32 : that are caused by forks right before the epoch boundary. So we can
33 : set our bound on elements as the total number of vote accounts +
34 : stakes across forks. Let's say this is 2^26 if the max number of
35 : vote accounts we want to support is 2^25. Now to store our index we
36 : only need:
37 : 2^26 * (32 + 8 + 8) = 3GB of memory (not including overhead).
38 : Our index structure will look like this:
39 : pool<pubkey, stake_t_1, stake_t_2>
40 :
41 : What is described above the index of all vote accounts. We need to
42 : account for the stakes across forks. We have to maintain a list of
43 : all of the index entries used by each fork. It is sufficient to use
44 : a uint list of indices into the index. So each fork is just:
45 : pool<uint>.
46 :
47 : 4 (uint pool idx) * 2^25 (# vote accounts) * 32 (# forks) = 4GiB
48 :
49 : For a given fork, when we insert a vote account we check if:
50 : 1. The vote account + stake pair is already in the index. If it is,
51 : we just increment a reference to the pair. Add it to the list of
52 : pool indices that the fork maintains.
53 : 2. The vote account + stake pair is not in the index. If it is not,
54 : we need to insert it into the index and assume a reference of 1.
55 : Add it to the local list of pool indices that the fork maintains.
56 : In order to make both of these cases performant we need a mapping
57 : from pubkey + stake to the index pool element. This is simply
58 : represented as:
59 : map<(pubkey+stake), index_pool_idx>.
60 :
61 : Now for queries, we need a way to query the t_1 and t_2 stake for a
62 : pubkey on a given fork. The problem is the above map requires the
63 : t_1 stake as part of the key and there are potentially multiple
64 : entries for a given pubkey. This is solved with a
65 : multi_map<pubkey, index_pool_idx>. We query for a given pubkey and
66 : iterate through all of the entries: this is an acceptable trade-off
67 : since there will almost always be one element for a given pubkey
68 : (except around the epoch boundary). However, for each index entry
69 : we need a way to make sure that it's the one that is in our fork's
70 : list of indices. This is solved with a map for each fork that is
71 : keyed by the index pool idx.
72 : map<index_pool_idx, fork list entry>.
73 :
74 : Now, we can quickly insert entries for a given fork and also do fast
75 : queries for a given pubkey.
76 :
77 : The only remaining operation is updating the root fork. If we are
78 : updating which fork is the root, we can safely assume that all other
79 : forks are no longer valid:
80 : 1. either the fork was a competing fork that executed the epoch
81 : boundary and is no longer needed
82 : 2. the fork corresponds to a fork for the previous epoch boundary.
83 :
84 : For any fork that's being removed, we need to reset its fork's pool
85 : and remove any references to the index pool entries. If an index
86 : entry has a reference count of 0, we can remove it from the index
87 : entirely. Under the hood, the forks in use are stored in a deque;
88 : when a root is being advanced, all entries from the deque are removed
89 : and each removed fork's entries are released.
90 :
91 : The memory footprint of what is actually described above is larger
92 : because each key of the index needs to be a compound of
93 : the pubkey, stake_t_1, node_account_t_1, and epoch.
94 :
95 : All public APIs internally acquire and release a read-write lock
96 : so the caller does not need to manage synchronization. Read-only
97 : operations (query, ele_cnt, get_root_idx) acquire a shared read
98 : lock. Mutating operations and the iterator acquire an exclusive
99 : write lock.
100 :
101 : fd_vote_stakes is the Firedancer equivalent to the Agave client
102 : epoch stakes data structure. */
103 :
104 : FD_PROTOTYPES_BEGIN
105 :
106 14247 : #define FD_VOTE_STAKES_ALIGN (128UL)
107 :
108 : struct fd_vote_stakes;
109 : typedef struct fd_vote_stakes fd_vote_stakes_t;
110 :
111 : /* fd_vote_stakes_align returns the minimum alignment required for the
112 : fd_vote_stakes_t struct. */
113 :
114 : ulong
115 : fd_vote_stakes_align( void );
116 :
117 : /* fd_vote_stakes_footprint returns the minimum footprint required for
118 : the fd_vote_stakes_t object given the max number of vote accounts,
119 : the max fork width (number of forks that can cross the epoch
120 : boundary), and the max number of map chains. The map chains should
121 : be a power of 2 that is roughly equal to the expected number of vote
122 : accounts and not the maximum. */
123 :
124 : ulong
125 : fd_vote_stakes_footprint( ulong max_vote_accounts,
126 : ulong expected_vote_accounts,
127 : ulong max_fork_width );
128 :
129 :
130 : /* fd_vote_stakes_new creates a new fd_vote_stakes_t object given a
131 : region of memory sized out according to fd_vote_stakes_footprint.
132 : The underlying data storage will actually support
133 : 2 * max_vote_accounts entries because entries are deduped across
134 : forks after all of the entries have already been inserted. */
135 :
136 : void *
137 : fd_vote_stakes_new( void * shmem,
138 : ulong max_vote_accounts,
139 : ulong expected_vote_accounts,
140 : ulong max_fork_width,
141 : ulong seed );
142 :
143 :
144 : /* fd_vote_stakes_join joins a valid fd_vote_stakes_t object from a
145 : region of memory. */
146 :
147 : fd_vote_stakes_t *
148 : fd_vote_stakes_join( void * shmem );
149 :
150 : /* fd_vote_stakes_root_{insert, update}_key are APIs for
151 : inserting and updating keys for the root fork. These
152 : operations are split out in order to support the snapshot loading
153 : process. The set of stakes from the T-1 epoch are inserted into
154 : the root fork with a call to fd_vote_stakes_root_insert_key. The
155 : set of stakes from the T-2 epoch are updated with a call to
156 : fd_vote_stakes_root_update_meta. The caller is responsible for
157 : ensuring that for a given pubkey, insert_key is called before
158 : update_meta. It is important that these APIs should only be called
159 : while the root fork is the only and current fork in use.
160 :
161 : If update_meta is called on a key that has not had a corresponding
162 : insert_key call, a key is created into the root fork with a t-1 stake
163 : of 0. This usually means the vote account has been deleted, but it
164 : can be possible in the case where the only staker of a vote account
165 : has been marked delinquent in epoch T-1 and needs to be counted
166 : towards clock calculation for the rest of the epoch. */
167 :
168 : void
169 : fd_vote_stakes_root_insert_key( fd_vote_stakes_t * vote_stakes,
170 : fd_pubkey_t const * pubkey,
171 : fd_pubkey_t const * node_account_t_1,
172 : ulong stake_t_1,
173 : uchar commission_t_1,
174 : ulong epoch );
175 :
176 : void
177 : fd_vote_stakes_root_update_meta( fd_vote_stakes_t * vote_stakes,
178 : fd_pubkey_t const * pubkey,
179 : fd_pubkey_t const * node_account_t_2,
180 : ulong stake_t_2,
181 : uchar commission_t_2,
182 : ulong epoch );
183 :
184 : /* fd_vote_stakes_insert inserts a new vote account stake into a given
185 : fork. If the element already exists on a different fork, then the
186 : reference is incremented in the index and the fork will now have an
187 : element which points to the vote stake index element. */
188 :
189 : void
190 : fd_vote_stakes_insert( fd_vote_stakes_t * vote_stakes,
191 : ushort fork_idx,
192 : fd_pubkey_t const * pubkey,
193 : fd_pubkey_t const * node_account_t_1,
194 : fd_pubkey_t const * node_account_t_2,
195 : ulong stake_t_1,
196 : ulong stake_t_2,
197 : uchar commission_t_1,
198 : uchar commission_t_2,
199 : uchar exists_t_1,
200 : uchar exists_t_2,
201 : ulong epoch );
202 :
203 : /* fd_vote_stakes_genesis_fini finalizes the vote stakes on the genesis
204 : block. Any vote stakes that have been inserted will be updated to
205 : have identical T-1/T-2 stakes and node accounts. This function
206 : assumes that all vote accounts have already been inserted into the
207 : genesis fork. */
208 :
209 : void
210 : fd_vote_stakes_genesis_fini( fd_vote_stakes_t * vote_stakes );
211 :
212 : /* fd_vote_stakes_new_child creates a new child fork and returns the
213 : index identifier for the new fork. */
214 :
215 : ushort
216 : fd_vote_stakes_new_child( fd_vote_stakes_t * vote_stakes );
217 :
218 : /* fd_vote_stakes_advance_root will move the root fork to the new
219 : candidate root fork. If the root_idx is equal to the root, this
220 : function is a no-op. However, if the root is different, all other
221 : child nodes will be removed from the structure. */
222 :
223 : void
224 : fd_vote_stakes_advance_root( fd_vote_stakes_t * vote_stakes,
225 : ushort root_idx );
226 :
227 : /* fd_vote_stakes_query_stake queries the stake for a given vote account
228 : in the given fork. If the element is found returns 1, otherwise
229 : returns 0. If any of the optional fields are set to NULL, then their
230 : corresponding value will not be set. An account can have zero stake
231 : but still exist. */
232 :
233 : int
234 : fd_vote_stakes_query( fd_vote_stakes_t * vote_stakes,
235 : ushort fork_idx,
236 : fd_pubkey_t const * pubkey,
237 : ulong * stake_t_1_out_opt,
238 : ulong * stake_t_2_out_opt,
239 : fd_pubkey_t * node_account_t_1_out_opt,
240 : fd_pubkey_t * node_account_t_2_out_opt,
241 : uchar * commission_t_1_out_opt,
242 : uchar * commission_t_2_out_opt,
243 : uchar * exists_t_1_out_opt,
244 : uchar * exists_t_2_out_opt );
245 :
246 : /* fd_vote_stakes_query_t_1 and fd_vote_stakes_query_t_2 are shortcuts
247 : for querying the t_1 and t_2 stake for a given vote account in the
248 : given fork. 0 is returned if the vote account does not exist for the
249 : epoch. If the account is found, stake_out, node_account_out, and
250 : commission_out will be set. */
251 :
252 : int
253 : fd_vote_stakes_query_t_1( fd_vote_stakes_t * vote_stakes,
254 : ushort fork_idx,
255 : fd_pubkey_t const * pubkey,
256 : ulong * stake_out,
257 : fd_pubkey_t * node_account_out,
258 : uchar * commission_out );
259 :
260 : int
261 : fd_vote_stakes_query_t_2( fd_vote_stakes_t * vote_stakes,
262 : ushort fork_idx,
263 : fd_pubkey_t const * pubkey,
264 : ulong * stake_out,
265 : fd_pubkey_t * node_account_out,
266 : uchar * commission_out );
267 :
268 : /* fd_vote_stakes_ele_cnt returns the number of entries for a given
269 : fork. */
270 :
271 : uint
272 : fd_vote_stakes_ele_cnt( fd_vote_stakes_t * vote_stakes,
273 : ushort fork_idx );
274 :
275 : /* fd_vote_stakes_get_root_idx returns the index of the root fork. */
276 :
277 : ushort
278 : fd_vote_stakes_get_root_idx( fd_vote_stakes_t * vote_stakes );
279 :
280 : /* fd_vote_stakes_reset resets the vote stakes object to the initial
281 : state. This is useful for resetting vote stakes if a new snapshot
282 : manifest is being loaded. */
283 :
284 : void
285 : fd_vote_stakes_reset( fd_vote_stakes_t * vote_stakes );
286 :
287 : #define FD_VOTE_STAKES_ITER_FOOTPRINT (16UL)
288 : #define FD_VOTE_STAKES_ITER_ALIGN (8UL)
289 : struct stakes_map_iter_t;
290 : typedef struct stakes_map_iter_t fd_vote_stakes_iter_t;
291 :
292 : /* A caller can iterate through the entries for a given fork. The
293 : iterator is initialized by a call to fd_vote_stakes_fork_iter_init.
294 : The caller is responsible for managing the memory for the iterator.
295 : It is safe to call fd_vote_stakes_fork_iter_next if the result of
296 : fd_vote_stakes_fork_iter_done() == 0. It is safe to call
297 : fd_vote_stakes_fork_iter_ele() to get the current entry if there is
298 : a valid initialized iterator. fd_vote_stakes_fork_iter_next is
299 : called to advance the iterator.
300 :
301 : fd_vote_stakes_fork_iter_init acquires a write lock on the vote
302 : stakes. The caller MUST call fd_vote_stakes_fork_iter_fini after
303 : the iteration loop completes to release the lock.
304 :
305 : It is not safe to call any other vote stakes apis while an iteration
306 : is in progress.
307 :
308 : Example use:
309 : uchar __attribute__((aligned(FD_VOTE_STAKES_ITER_ALIGN))) iter_mem[ FD_VOTE_STAKES_ITER_FOOTPRINT ];
310 : for( fd_vote_stakes_iter_t * iter = fd_vote_stakes_fork_iter_init( vote_stakes, fork_idx, iter_mem );
311 : !fd_vote_stakes_fork_iter_done( vote_stakes, fork_idx, iter );
312 : fd_vote_stakes_fork_iter_next( vote_stakes, fork_idx, iter ) ) {
313 : fd_vote_stakes_fork_iter_ele( vote_stakes, fork_idx, iter, &pubkey, &stake_t_1, &stake_t_2, &node_account_t_1, &node_account_t_2, &commission_t_1, &commission_t_2 );
314 : }
315 : fd_vote_stakes_fork_iter_fini( vote_stakes );
316 :
317 : Under the hood, the vote stakes iterator is a wrapper of the map
318 : chain iterator.
319 :
320 : TODO: fork_idx can probably get absorbed into the iterator. */
321 :
322 : fd_vote_stakes_iter_t *
323 : fd_vote_stakes_fork_iter_init( fd_vote_stakes_t * vote_stakes,
324 : ushort fork_idx,
325 : uchar iter_mem[ static FD_VOTE_STAKES_ITER_FOOTPRINT ] );
326 :
327 : int
328 : fd_vote_stakes_fork_iter_done( fd_vote_stakes_t * vote_stakes,
329 : ushort fork_idx,
330 : fd_vote_stakes_iter_t * iter );
331 :
332 : void
333 : fd_vote_stakes_fork_iter_next( fd_vote_stakes_t * vote_stakes,
334 : ushort fork_idx,
335 : fd_vote_stakes_iter_t * iter );
336 :
337 : void
338 : fd_vote_stakes_fork_iter_ele( fd_vote_stakes_t * vote_stakes,
339 : ushort fork_idx,
340 : fd_vote_stakes_iter_t * iter,
341 : fd_pubkey_t * pubkey_out,
342 : ulong * stake_t_1_out_opt,
343 : ulong * stake_t_2_out_opt,
344 : fd_pubkey_t * node_account_t_1_out_opt,
345 : fd_pubkey_t * node_account_t_2_out_opt,
346 : uchar * commission_t_1_out_opt,
347 : uchar * commission_t_2_out_opt );
348 :
349 : void
350 : fd_vote_stakes_fork_iter_fini( fd_vote_stakes_t * vote_stakes );
351 :
352 : FD_PROTOTYPES_END
353 :
354 : #endif
|