Line data Source code
1 : #ifndef HEADER_fd_src_choreo_ghost_fd_ghost_h
2 : #define HEADER_fd_src_choreo_ghost_fd_ghost_h
3 :
4 : /* fd_ghost implements Solana's LMD-GHOST ("latest message-driven greedy
5 : heaviest-observed subtree") fork choice rule.
6 :
7 : LMD ("latest message-driven") means only a validator's latest vote
8 : counts. If a validator votes for one fork than subsequently votes
9 : for a different fork, their vote only counts towards the latter fork
10 : and not the former.
11 :
12 : GHOST ("greedy heaviest-observed subtree") describes the fork choice
13 : rule. Forks form a tree, where each node is a block. Here's an
14 : example of a fork tree in which every block is labeled with its slot:
15 :
16 : /-- 3
17 : 1-- 2
18 : \-- 4
19 :
20 : In the above tree 3 and 4 are different forks. The responsibility of
21 : fork choice is to decide whether the validator should vote for 3 or 4
22 : which ultimately determines which fork the cluster converges on.
23 :
24 : In Solana, votes are stake-weighted. Here is the same tree with
25 : stakes associated with each block.
26 :
27 : /-- 3 (30%)
28 : 1 (80%) -- 2 (70%)
29 : \-- 4 (38%)
30 :
31 : 80% of stake voted for 1, 70% for 2, 30% for 3 and 38% for 4. How
32 : does fork choice pick 3 or 4? It traverses down the tree, beginning
33 : from the root (1), then picks (2), then picks (4) where it terminates
34 : and returns 4 as the best leaf.
35 :
36 : greedy: fork choice is a greedy algorithm. During traversal, on
37 : each level of the tree it picks the locally optimal value.
38 :
39 : heaviest: pick based on the heaviest (highest) stake.
40 :
41 : observed: this is the validator's local view, and other validators
42 : may have differing views because they've observed
43 : different votes or different forks.
44 :
45 : subtree: sum the vote stake of an entire subtree rooted at a given
46 : block, not just votes for the individual block itself. In
47 : the tree above, 1 and 2 both have strictly more stake than
48 : 3 or 4. That is because the stake for 3 and 4 both rolled
49 : up into 2, and the stake for 2 rolled up into 1. There
50 : were also votes for 1 and 2 that weren't just roll-ups so
51 : the total stake for a parent can exceed (>=) the sum of its
52 : children.
53 :
54 : The above diagrams used slot numbers for simplicity but ghost in fact
55 : uses the `block_id`, a 32-byte hash that uniquely identifies a block,
56 : to key the elements of the tree. The block_id ensures that when
57 : there is equivocation (two or more blocks labeled with the same slot)
58 : the blocks can be disambiguated.
59 :
60 : Ghost handles equivocation by marking forks invalid for fork choice
61 : if any block along that fork equivocates. For example, consider the
62 : following tree:
63 :
64 : /-- 4'(30%)
65 : 1 (80%) -- 2 (70%)
66 : \-- 4 (38%)
67 :
68 : This is the same tree from earlier except 3 has been replaced with
69 : 4'. There are two equivocating blocks for slot 4: how does ghost
70 : handle this? Ghost marks both 4 and 4' as invalid for fork choice.
71 : So in this example, fork choice will pick 2 as the best leaf.
72 :
73 : Ghost can mark a fork valid again if it becomes "duplicate confirmed"
74 : ie. it has received votes from >=52% of the cluster. Revisiting the
75 : above tree, modified:
76 :
77 : /-- 4'(30%)
78 : 1 (80%) -- 2 (70%)
79 : \-- 4 (52%) <- gossip duplicate confirmed (>=52%)
80 :
81 : Now that 4 is "duplicate confirmed", ghost marks 4 as valid again.
82 : Fork choice picks 4 as the best leaf. Note gossip duplicate
83 : confirmation is separately tracked outside of fd_ghost. See the
84 : fd_ghost API for how that works. */
85 :
86 : #include "../fd_choreo_base.h"
87 :
88 : typedef struct fd_ghost fd_ghost_t; /* forward decl*/
89 :
90 : /* fd_ghost_blk_t implements a left-child, right-sibling n-ary tree.
91 : Each ele maintains the `pool` index of its left-most child
92 : (`child_idx`), its immediate-right sibling (`sibling_idx`), and its
93 : parent (`parent_idx`).
94 :
95 : This tree structure is gaddr-safe and supports accesses and
96 : operations from processes with separate local ghost joins. */
97 :
98 : struct __attribute__((aligned(128UL))) fd_ghost_blk {
99 : fd_hash_t id; /* block_id (merkle root of the last FEC set in the slot) */
100 : ulong slot; /* slot this ele is tracking */
101 : ulong next; /* reserved for internal use by fd_pool, fd_map_chain and fd_ghost_publish */
102 : ulong parent; /* pool idx of the parent */
103 : ulong child; /* pool idx of the left-child */
104 : ulong sibling; /* pool idx of the right-sibling */
105 : ulong stake; /* sum of stake that has voted for this slot or any of its descendants */
106 : ulong total_stake; /* total stake for this blk */
107 : int valid; /* whether this block is valid for fork choice. an equivocating block is valid iff duplicate confirmed */
108 : };
109 : typedef struct fd_ghost_blk fd_ghost_blk_t;
110 :
111 : /* fd_ghost_vtr_t keeps track of what a voter's previously voted for. */
112 :
113 : struct __attribute__((aligned(128UL))) fd_ghost_vtr {
114 : fd_pubkey_t addr; /* map key, vote account address */
115 : ulong next; /* reserved for internal use by fd_pool and fd_map_chain */
116 : ulong prev_stake; /* previous vote stake (vote can be from prior epoch) */
117 : ulong prev_slot; /* previous vote slot */
118 : fd_hash_t prev_block_id; /* previous vote block_id */
119 : };
120 : typedef struct fd_ghost_vtr fd_ghost_vtr_t;
121 :
122 : FD_PROTOTYPES_BEGIN
123 :
124 : /* Constructors */
125 :
126 : /* fd_ghost_{align,footprint} return the required alignment and
127 : footprint of a memory region suitable for use as ghost with up to
128 : blk_max blocks and vtr_max voters. */
129 :
130 : FD_FN_CONST ulong
131 : fd_ghost_align( void );
132 :
133 : FD_FN_CONST ulong
134 : fd_ghost_footprint( ulong blk_max,
135 : ulong vtr_max );
136 :
137 : /* fd_ghost_new formats an unused memory region for use as a ghost.
138 : mem is a non-NULL pointer to this region in the local address space
139 : with the required footprint and alignment. */
140 :
141 : void *
142 : fd_ghost_new( void * shmem,
143 : ulong blk_max,
144 : ulong vtr_max,
145 : ulong seed );
146 :
147 : /* fd_ghost_join joins the caller to the ghost. ghost points to the
148 : first byte of the memory region backing the ghost in the caller's
149 : address space.
150 :
151 : Returns a pointer in the local address space to ghost on success. */
152 :
153 : fd_ghost_t *
154 : fd_ghost_join( void * ghost );
155 :
156 : /* fd_ghost_leave leaves a current local join. Returns a pointer to the
157 : underlying shared memory region on success and NULL on failure (logs
158 : details). Reasons for failure include ghost is NULL. */
159 :
160 : void *
161 : fd_ghost_leave( fd_ghost_t const * ghost );
162 :
163 : /* fd_ghost_delete unformats a memory region used as a ghost.
164 : Assumes only the nobody is joined to the region. Returns a
165 : pointer to the underlying shared memory region or NULL if used
166 : obviously in error (e.g. ghost is obviously not a ghost ... logs
167 : details). The ownership of the memory region is transferred to the
168 : caller. */
169 :
170 : void *
171 : fd_ghost_delete( void * ghost );
172 :
173 : /* Accessors */
174 :
175 : /* fd_ghost_root returns a pointer in the caller's address space to the
176 : root. Assumes ghost is a current local join. */
177 :
178 : fd_ghost_blk_t *
179 : fd_ghost_root( fd_ghost_t * ghost );
180 :
181 : /* fd_ghost_parent returns a pointer in the caller's address space to
182 : blk's parent. Assumes ghost is a current local join and blk is a
183 : valid pointer to a pool element inside ghost. */
184 :
185 : fd_ghost_blk_t *
186 : fd_ghost_parent( fd_ghost_t * ghost, fd_ghost_blk_t * blk );
187 :
188 : /* fd_ghost_width returns the width of the ghost tree. */
189 :
190 : ulong
191 : fd_ghost_width( fd_ghost_t * ghost );
192 :
193 : /* fd_ghost_query returns the block keyed by block_id. Returns NULL if
194 : not found. */
195 :
196 : fd_ghost_blk_t *
197 : fd_ghost_query( fd_ghost_t * ghost,
198 : fd_hash_t const * block_id );
199 :
200 : /* fd_ghost_best returns the best block (according to fork choice) in
201 : the subtree beginning at root. This is the ideal block to vote on or
202 : reset to, and feeds into downstream TowerBFT rules. This is usually
203 : a leaf node in the tree but may not when blocks are marked invalid
204 : due to unconfirmed duplicates. Assumes root is marked valid so this
205 : will never return NULL. */
206 :
207 : fd_ghost_blk_t *
208 : fd_ghost_best( fd_ghost_t * ghost,
209 : fd_ghost_blk_t * root );
210 :
211 : /* fd_ghost_deepest returns the deepest ghost block (highest tree depth)
212 : in the subtree beginning at root. Unlike fd_ghost_best, deepest can
213 : return a block marked invalid for fork choice. In case of ties, the
214 : returned block will be the most recently inserted one. This will
215 : never return NULL. */
216 :
217 : fd_ghost_blk_t *
218 : fd_ghost_deepest( fd_ghost_t * ghost,
219 : fd_ghost_blk_t * root );
220 :
221 :
222 : /* fd_ghost_ancestor returns the ancestor on the same fork as descendant
223 : keyed by ancestor_id. Returns NULL if not found. */
224 :
225 : fd_ghost_blk_t *
226 : fd_ghost_ancestor( fd_ghost_t * ghost,
227 : fd_ghost_blk_t * descendant,
228 : fd_hash_t const * ancestor_id );
229 :
230 : /* fd_ghost_slot_ancestor returns the ancestor on the same fork as
231 : descendant with ancestor_slot. Returns NULL if not found. */
232 :
233 : fd_ghost_blk_t *
234 : fd_ghost_slot_ancestor( fd_ghost_t * ghost,
235 : fd_ghost_blk_t * descendant,
236 : ulong ancestor_slot );
237 :
238 : /* fd_ghost_invalid_ancestor returns the first ancestor on the same fork
239 : as descendant that is marked invalid. Includes checking descendant
240 : itself. Returns NULL if there are no invalid ancestors. */
241 :
242 : fd_ghost_blk_t *
243 : fd_ghost_invalid_ancestor( fd_ghost_t * ghost,
244 : fd_ghost_blk_t * descendant );
245 :
246 : /* Operations */
247 :
248 : /* fd_ghost_insert inserts a new tree node representing a block keyed by
249 : block_id (and slot). parent_block_id is used to link this new block
250 : to its parent in the ghost tree. parent_block_id may only be NULL if
251 : this is the very first ghost insert, in which case this new block
252 : will be set to the ghost root. Returns the new block.*/
253 :
254 : fd_ghost_blk_t *
255 : fd_ghost_insert( fd_ghost_t * ghost,
256 : fd_hash_t const * block_id,
257 : fd_hash_t const * parent_block_id,
258 : ulong slot );
259 :
260 : /* fd_ghost_count_vote return codes. */
261 :
262 15 : #define FD_GHOST_SUCCESS ( 0) /* vote counted successfully */
263 0 : #define FD_GHOST_ERR_NOT_VOTED (-1) /* slot == ULONG_MAX (hasn't voted) */
264 0 : #define FD_GHOST_ERR_VOTE_TOO_OLD (-2) /* slot < root->slot */
265 3 : #define FD_GHOST_ERR_ALREADY_VOTED (-3) /* slot <= prev_slot (not newer) */
266 :
267 : /* fd_ghost_count_vote updates ghost's stake weights with the latest
268 : vote on the given voter's tower. This counts pubkey's stake towards
269 : all ancestors on the same fork as block_id. If the voter has
270 : previously voted, it subtracts the voter's previous stake from all
271 : ancestors of vtr->prev_block_id. This ensures that only the most
272 : recent vote from a voter is counted (see top-level documentation
273 : about the LMD rule). Returns FD_GHOST_SUCCESS on success, or a
274 : negative FD_GHOST_ERR_* code if the vote was not counted.
275 :
276 : TODO the implementation can be made more efficient by incrementally
277 : updating and short-circuiting ancestor traversals. Currently this is
278 : bounded to O(h), where h is the height of ghost ie. O(block_max) in
279 : the worst case. */
280 :
281 : int
282 : fd_ghost_count_vote( fd_ghost_t * ghost,
283 : fd_ghost_blk_t * blk,
284 : fd_pubkey_t const * vtr_addr,
285 : ulong stake,
286 : ulong slot );
287 :
288 : /* fd_ghost_publish publishes newr as the new ghost root, pruning any
289 : blocks not in the subtree beginning from the new root (ie. new root
290 : and all its descendants). Returns the new root. */
291 :
292 : void
293 : fd_ghost_publish( fd_ghost_t * ghost,
294 : fd_ghost_blk_t * newr );
295 :
296 : /* Misc */
297 :
298 : /* fd_ghost_verify checks the ghost is not obviously corrupt. Returns 0
299 : if verify succeeds, -1 otherwise. */
300 :
301 : int
302 : fd_ghost_verify( fd_ghost_t * ghost );
303 :
304 : /* fd_ghost_to_cstr pretty-prints a formatted ghost tree beginning from
305 : root (arbitrary node, not necessarily the current ghost root) to the
306 : buffer cstr of at least cstr_max capacity. Returns a pointer to cstr
307 : and on return cstr_len contains actual bytes written including NUL.
308 :
309 : Typical usage:
310 :
311 : fd_ghost_to_cstr( ghost, fd_ghost_root( ghost ) ); // stringify ghost beginning from the current ghost root
312 :
313 : Alternative usage:
314 :
315 : fd_ghost_ele_t const * ele = fd_ghost_query( slot );
316 : fd_ghost_to_cstr( ghost, fd_ghost_parent( fd_ghost_parent( ele ) ) ); // print from ele's grandparent
317 :
318 : */
319 : char *
320 : fd_ghost_to_cstr( fd_ghost_t const * ghost,
321 : fd_ghost_blk_t const * root,
322 : char * cstr,
323 : ulong cstr_max,
324 : ulong * cstr_len );
325 :
326 : /* fd_ghost_blk_map_remove removes blk from the ghost map. Assumes blk
327 : is in the map. Returns blk. */
328 :
329 : fd_ghost_blk_t *
330 : fd_ghost_blk_map_remove( fd_ghost_t * ghost,
331 : fd_ghost_blk_t * blk );
332 :
333 : /* fd_ghost_blk_map_insert inserts a ghost blk into the ghost map. */
334 :
335 : void
336 : fd_ghost_blk_map_insert( fd_ghost_t * ghost,
337 : fd_ghost_blk_t * blk );
338 :
339 : /* fd_ghost_blk_child returns the left-most child of blk, or NULL if
340 : blk has no children. */
341 :
342 : fd_ghost_blk_t *
343 : fd_ghost_blk_child( fd_ghost_t * ghost,
344 : fd_ghost_blk_t * blk );
345 :
346 : /* fd_ghost_blk_sibling returns the next sibling of blk, or NULL if
347 : blk has no more siblings. */
348 :
349 : fd_ghost_blk_t *
350 : fd_ghost_blk_sibling( fd_ghost_t * ghost,
351 : fd_ghost_blk_t * blk );
352 :
353 : /* fd_ghost_blk_next returns the block linked via blk->next (used for
354 : traversing a temporary linked list built from removed map elements),
355 : or NULL if at the end of the list. */
356 :
357 : fd_ghost_blk_t *
358 : fd_ghost_blk_next( fd_ghost_t * ghost,
359 : fd_ghost_blk_t * blk );
360 :
361 : /* fd_ghost_blk_idx returns the pool index corresponding to blk. */
362 :
363 : ulong
364 : fd_ghost_blk_idx( fd_ghost_t * ghost,
365 : fd_ghost_blk_t * blk );
366 :
367 : /* fd_ghost_blk_idx_null returns the sentinel null index for the ghost
368 : block pool. */
369 :
370 : ulong
371 : fd_ghost_blk_idx_null( fd_ghost_t * ghost );
372 :
373 :
374 : FD_PROTOTYPES_END
375 :
376 : #endif /* HEADER_fd_src_choreo_ghost_fd_ghost_h */
|