Line data Source code
1 : #ifndef HEADER_fd_src_disco_topo_fd_topo_h
2 : #define HEADER_fd_src_disco_topo_fd_topo_h
3 :
4 : #include "../stem/fd_stem.h"
5 : #include "../../tango/fd_tango.h"
6 : #include "../../waltz/xdp/fd_xdp1.h"
7 : #include "../../ballet/base58/fd_base58.h"
8 :
9 : /* Maximum number of workspaces that may be present in a topology. */
10 : #define FD_TOPO_MAX_WKSPS (256UL)
11 : /* Maximum number of links that may be present in a topology. */
12 : #define FD_TOPO_MAX_LINKS (256UL)
13 : /* Maximum number of tiles that may be present in a topology. */
14 : #define FD_TOPO_MAX_TILES (256UL)
15 : /* Maximum number of objects that may be present in a topology. */
16 : #define FD_TOPO_MAX_OBJS (4096UL)
17 : /* Maximum number of links that may go into any one tile in the
18 : topology. */
19 : #define FD_TOPO_MAX_TILE_IN_LINKS ( 128UL)
20 : /* Maximum number of links that a tile may write to. */
21 : #define FD_TOPO_MAX_TILE_OUT_LINKS ( 32UL)
22 : /* Maximum number of objects that a tile can use. */
23 : #define FD_TOPO_MAX_TILE_OBJS ( 256UL)
24 :
25 : /* Maximum number of additional ip addresses */
26 : #define FD_NET_MAX_SRC_ADDR 4
27 :
28 : /* A workspace is a Firedancer specific memory management structure that
29 : sits on top of 1 or more memory mapped gigantic or huge pages mounted
30 : to the hugetlbfs. */
typedef struct {
  ulong id;           /* The ID of this workspace. Indexed from [0, wksp_cnt). When placed in a topology, the ID must be the index of the workspace in the workspaces list. */
  char  name[ 13UL ]; /* The name of this workspace, like "pack". There can be at most one of each workspace name in a topology. NUL terminated, so at most 12 name chars. */

  ulong numa_idx;     /* The index of the NUMA node on the system that this workspace should be allocated from. */

  /* Computed fields.  These are not supplied as configuration but
     calculated as needed. */
  struct {
    ulong page_sz;  /* The size of the pages that this workspace is backed by. One of FD_PAGE_SIZE_*. */
    ulong page_cnt; /* The number of pages that must be mapped to this workspace to store all the data needed by consumers. */
    ulong part_max; /* The maximum number of partitions in the underlying workspace. There can only be this many allocations made at any one time. */

    fd_wksp_t * wksp;            /* The workspace memory in the local process. NULL until the workspace is joined (mapped in). */
    ulong       known_footprint; /* Total size in bytes of all data in Firedancer that will be stored in this workspace at startup. */
    ulong       total_footprint; /* Total size in bytes of all data in Firedancer that could be stored in this workspace, includes known data and loose data. */
  };
} fd_topo_wksp_t;
48 :
49 : /* A link is an mcache in a workspace that has one producer and one or
50 : more consumers. A link may optionally also have a dcache, that holds
51 : fragments referred to by the mcache entries.
52 :
53 : A link belongs to exactly one workspace. A link has exactly one
54 : producer, and 1 or more consumers. Each consumer is either reliable
55 : or not reliable. A link has a depth and a MTU, which correspond to
56 : the depth and MTU of the mcache and dcache respectively. A MTU of
57 : zero means no dcache is needed, as there is no data. */
typedef struct {
  ulong id;           /* The ID of this link. Indexed from [0, link_cnt). When placed in a topology, the ID must be the index of the link in the links list. */
  char  name[ 13UL ]; /* The name of this link, like "pack_bank". There can be multiple of each link name in a topology. NUL terminated, so at most 12 name chars. */
  ulong kind_id;      /* The ID of this link within its name. If there are N links of a particular name, they have IDs [0, N). The pair (name, kind_id) uniquely identifies a link, as does "id" on its own. */

  ulong depth;        /* The depth of the mcache representing the link. */
  ulong mtu;          /* The MTU of data fragments in the mcache. A value of 0 means there is no dcache. */
  ulong burst;        /* The max amount of MTU sized data fragments that might be bursted to the dcache. */

  ulong mcache_obj_id; /* Topology object ID (index into fd_topo_t::objs) of the mcache backing this link. */
  ulong dcache_obj_id; /* Topology object ID of the dcache backing this link. Only meaningful when mtu is nonzero. */

  /* Computed fields.  These are not supplied as configuration but
     calculated as needed. */
  struct {
    fd_frag_meta_t * mcache; /* The mcache of this link. */
    void *           dcache; /* The dcache of this link, if it has one. */
  };
} fd_topo_link_t;
76 :
77 : /* A tile is a unique process that is spawned by Firedancer to represent
78 : one thread of execution. Firedancer sandboxes all tiles to their own
79 : process for security reasons.
80 :
81 : A tile belongs to exactly one workspace. A tile is a consumer of 0
or more links, its inputs. A tile is a producer of 0 or more output
83 : links.
84 :
85 : All input links will be automatically polled by the tile
86 : infrastructure, and output links will automatically source and manage
87 : credits from consumers. */
typedef struct {
  ulong id;                     /* The ID of this tile. Indexed from [0, tile_cnt). When placed in a topology, the ID must be the index of the tile in the tiles list. */
  char  name[ 7UL ];            /* The name of this tile. There can be multiple of each tile name in a topology. NUL terminated, so at most 6 name chars. */
  ulong kind_id;                /* The ID of this tile within its name. If there are N tiles of a particular name, they have IDs [0, N). The pair (name, kind_id) uniquely identifies a tile, as does "id" on its own. */
  int   is_agave;               /* If the tile needs to run in the Agave (Anza) address space or not. */

  ulong cpu_idx;                /* The CPU index to pin the tile on. A value of ULONG_MAX or more indicates the tile should be floating and not pinned to a core. */

  ulong in_cnt;                                       /* The number of links that this tile reads from. */
  ulong in_link_id[ FD_TOPO_MAX_TILE_IN_LINKS ];      /* The link_id of each link that this tile reads from, indexed in [0, in_cnt). */
  int   in_link_reliable[ FD_TOPO_MAX_TILE_IN_LINKS ]; /* If each link that this tile reads from is a reliable or unreliable consumer, indexed in [0, in_cnt). */
  int   in_link_poll[ FD_TOPO_MAX_TILE_IN_LINKS ];    /* If each link that this tile reads from should be polled by the tile infrastructure, indexed in [0, in_cnt).
                                                         If the link is not polled, the tile will not receive frags for it and the tile writer is responsible for
                                                         reading from the link.  The link must be marked as unreliable as it is not flow controlled. */

  ulong out_cnt;                                      /* The number of links that this tile writes to. */
  ulong out_link_id[ FD_TOPO_MAX_TILE_OUT_LINKS ];    /* The link_id of each link that this tile writes to, indexed in [0, out_cnt). */

  ulong tile_obj_id;                                  /* Topology object ID (index into fd_topo_t::objs) for the tile's own scratch memory. */
  ulong metrics_obj_id;                               /* Topology object ID of the shared metrics memory for this tile. */
  ulong in_link_fseq_obj_id[ FD_TOPO_MAX_TILE_IN_LINKS ]; /* Topology object ID of the fseq for each in link, indexed in [0, in_cnt). */

  ulong uses_obj_cnt;                        /* The number of topology objects this tile uses (and therefore must have mapped in). */
  ulong uses_obj_id[ FD_TOPO_MAX_TILE_OBJS ]; /* The object ID of each object used, indexed in [0, uses_obj_cnt). */
  int   uses_obj_mode[ FD_TOPO_MAX_TILE_OBJS ]; /* The mapping mode for each used object, indexed in [0, uses_obj_cnt).  NOTE(review): presumably an FD_SHMEM_JOIN_MODE_* value -- confirm. */

  /* Computed fields.  These are not supplied as configuration but
     calculated as needed. */
  struct {
    ulong * metrics; /* The shared memory for metrics that this tile should write. Consumed by monitoring and metrics writing tiles. */

    /* The fseq of each link that this tile reads from.  Multiple fseqs
       may point to the link, if there are multiple consumers.  An fseq
       can be uniquely identified via (link_id, tile_id), or (link_kind,
       link_kind_id, tile_kind, tile_kind_id) */
    ulong * in_link_fseq[ FD_TOPO_MAX_TILE_IN_LINKS ];
  };

  /* Configuration fields.  These are required to be known by the
     topology so it can determine the total size of Firedancer in
     memory.  Exactly one union member is active, selected by the tile
     name. */
  union {
    struct {
      char  interface[ 16 ];
      ulong xdp_rx_queue_size;
      ulong xdp_tx_queue_size;
      ulong xdp_aio_depth;
      char  xdp_mode[4];
      int   zero_copy;
      uint  src_ip_addr;
      uchar src_mac_addr[6];

      ushort shred_listen_port;
      ushort quic_transaction_listen_port;
      ushort legacy_transaction_listen_port;
      ushort gossip_listen_port;
      ushort repair_intake_listen_port;
      ushort repair_serve_listen_port;

      /* multihoming support */
      ulong multihome_ip_addrs_cnt;
      uint  multihome_ip_addrs[FD_NET_MAX_SRC_ADDR];
    } net;

    struct {
      uint   out_depth;
      uint   reasm_cnt;
      ulong  max_concurrent_connections;
      ulong  max_concurrent_handshakes;
      uint   ip_addr;
      uchar  src_mac_addr[ 6 ];
      ushort quic_transaction_listen_port;
      ulong  idle_timeout_millis;
      uint   ack_delay_millis;
      int    retry;
    } quic;

    struct {
      ulong tcache_depth;
    } verify;

    struct {
      ulong tcache_depth;
    } dedup;

    struct {
      ulong max_pending_transactions;
      ulong bank_tile_count;
      int   larger_max_cost_per_block;
      int   larger_shred_limits_per_block;
      int   use_consumed_cus;
      char  identity_key_path[ PATH_MAX ];
    } pack;

    struct {
      int   lagged_consecutive_leader_start;
      int   plugins_enabled;
      ulong bank_cnt;
      char  identity_key_path[ PATH_MAX ];
    } poh;

    struct {
      ulong  depth;
      uint   ip_addr;
      uchar  src_mac_addr[ 6 ];
      ulong  fec_resolver_depth;
      char   identity_key_path[ PATH_MAX ];
      ushort shred_listen_port;
      int    larger_shred_limits_per_block;
      ulong  expected_shred_version;
    } shred;

    struct {
      ulong disable_blockstore_from_slot;
    } store;

    struct {
      char identity_key_path[ PATH_MAX ];
    } sign;

    struct {
      uint   listen_addr;
      ushort listen_port;

      int    is_voting;

      char   cluster[ 32 ];
      char   identity_key_path[ PATH_MAX ];
    } gui;

    struct {
      uint   prometheus_listen_addr;
      ushort prometheus_listen_port;
    } metric;

    struct {
      int   tx_metadata_storage;
      char  capture[ PATH_MAX ];
      char  funk_checkpt[ PATH_MAX ];
      ulong funk_rec_max;
      ulong funk_sz_gb;
      ulong funk_txn_max;
      char  funk_file[ PATH_MAX ];
      char  genesis[ PATH_MAX ];
      char  incremental[ PATH_MAX ];
      char  slots_replayed[ PATH_MAX ];
      char  snapshot[ PATH_MAX ];
      char  status_cache[ PATH_MAX ];
      ulong tpool_thread_count;
      char  cluster_version[ 32 ];
      int   in_wen_restart;
      char  tower_checkpt[ PATH_MAX ];
      char  wen_restart_coordinator[ FD_BASE58_ENCODED_32_SZ ];
      int   plugins_enabled;

      /* not specified in TOML */

      char  identity_key_path[ PATH_MAX ];
      uint  ip_addr;
      uchar src_mac_addr[ 6 ];
      int   vote;
      char  vote_account_path[ PATH_MAX ];
      ulong bank_tile_count;
      ulong full_interval;
      ulong incremental_interval;

      char  blockstore_file[ PATH_MAX ];
      char  blockstore_checkpt[ PATH_MAX ];
    } replay;

    struct {
      ushort send_to_port;
      uint   send_to_ip_addr;
      ulong  conn_cnt;
      int    no_quic;
    } benchs;

    struct {
      ushort rpc_port;
      uint   rpc_ip_addr;
    } bencho;

    struct {
      ulong accounts_cnt;
      int   mode;
      float contending_fraction;
      float cu_price_spread;
    } benchg;

    /* Firedancer-only tile configs */

    struct {
      ushort gossip_listen_port;
      ulong  entrypoints_cnt;
      uint   entrypoints[16];
      ulong  peer_ports_cnt;
      ushort peer_ports[16];

      uint   ip_addr;
      uchar  src_mac_addr[ 6 ];
      char   identity_key_path[ PATH_MAX ];
      ushort tvu_port;
      ushort tvu_fwd_port;
      ushort tpu_port;
      ushort tpu_vote_port;
      ushort repair_serve_port;
      ulong  expected_shred_version;
      int    plugins_enabled;
    } gossip;

    struct {
      ushort repair_intake_listen_port;
      ushort repair_serve_listen_port;

      /* non-config */

      uint  ip_addr;
      uchar src_mac_addr[ 6 ];
      char  identity_key_path[ PATH_MAX ];
    } repair;

    struct {
      char  slots_pending[PATH_MAX];

      ulong expected_shred_version;

      /* non-config */

      char  identity_key_path[ PATH_MAX ];
      char  shred_cap_archive[ PATH_MAX ];
      char  shred_cap_replay[ PATH_MAX ];

      int   in_wen_restart;

      char  blockstore_file[ PATH_MAX ];
      char  blockstore_restore[ PATH_MAX ];
    } store_int;

    struct {
      ushort tpu_listen_port;

      /* non-config */

      uint  ip_addr;
      uchar src_mac_addr[ 6 ];
      char  identity_key_path[ PATH_MAX ];
    } sender;

    struct {
      char identity_key_path[ PATH_MAX ];
    } eqvoc;

    struct {
      ushort rpc_port;
      ushort tpu_port;
      uint   tpu_ip_addr;
      char   identity_key_path[ PATH_MAX ];
    } rpcserv;

    struct {
      ulong full_interval;
      ulong incremental_interval;
      char  out_dir[ PATH_MAX ];
      int   tmp_fd;
      int   tmp_inc_fd;
      int   full_snapshot_fd;
      int   incremental_snapshot_fd;
      ulong hash_tpool_thread_count;
    } snaps;

  };
} fd_topo_tile_t;
358 :
/* An object is a named, sized region of memory carved out of one of the
   workspaces in the topology. */
typedef struct {
  ulong id;           /* The ID of this object. Indexed from [0, obj_cnt). When placed in a topology, the ID must be the index of the object in the objs list. */
  char  name[ 13UL ]; /* The name of this object. NUL terminated, so at most 12 name chars. */
  ulong wksp_id;      /* The ID of the workspace that this object lives in. */

  ulong offset;       /* Byte offset of the object from the start of its workspace (see fd_topo_obj_laddr). */
  ulong footprint;    /* Size of the object in bytes. */
} fd_topo_obj_t;
367 :
368 : /* An fd_topo_t represents the overall structure of a Firedancer
369 : configuration, describing all the workspaces, tiles, and links
370 : between them. */
typedef struct fd_topo_t {
  char  app_name[ 256UL ];  /* Name of this application instance.  NUL terminated. */
  uchar props[ 16384UL ];   /* Opaque property blob.  NOTE(review): semantics not visible in this header -- consumed by topology construction code. */

  ulong wksp_cnt;           /* Number of valid entries in workspaces, in [0, FD_TOPO_MAX_WKSPS]. */
  ulong link_cnt;           /* Number of valid entries in links, in [0, FD_TOPO_MAX_LINKS]. */
  ulong tile_cnt;           /* Number of valid entries in tiles, in [0, FD_TOPO_MAX_TILES]. */
  ulong obj_cnt;            /* Number of valid entries in objs, in [0, FD_TOPO_MAX_OBJS]. */

  fd_topo_wksp_t workspaces[ FD_TOPO_MAX_WKSPS ];
  fd_topo_link_t links[ FD_TOPO_MAX_LINKS ];
  fd_topo_tile_t tiles[ FD_TOPO_MAX_TILES ];
  fd_topo_obj_t  objs[ FD_TOPO_MAX_OBJS ];

  ulong agave_affinity_cnt;                    /* Number of valid entries in agave_affinity_cpu_idx. */
  ulong agave_affinity_cpu_idx[ FD_TILE_MAX ]; /* CPU indices the Agave child process may be pinned to. */
} fd_topo_t;
388 :
/* fd_topo_run_tile_t is the callback table a tile implementation
   provides so the generic runner can size, sandbox, initialize and run
   it.  Callbacks other than run may be NULL depending on the tile --
   NOTE(review): nullability contract not visible in this header. */
typedef struct {
  char const * name;                 /* Tile name this table serves, matching fd_topo_tile_t::name. */

  int          keep_host_networking; /* If the sandbox should keep the host network namespace. */
  ulong        rlimit_file_cnt;      /* RLIMIT_NOFILE value to apply to the tile process. */
  int          for_tpool;            /* NOTE(review): presumably marks tiles spawned as tpool workers -- confirm. */

  /* Fill out with the seccomp filter instructions / file descriptors
     the tile is allowed, returning the count written.  out_cnt /
     out_fds_sz give the capacity of the output buffer. */
  ulong (*populate_allowed_seccomp)( fd_topo_t const * topo, fd_topo_tile_t const * tile, ulong out_cnt, struct sock_filter * out );
  ulong (*populate_allowed_fds )( fd_topo_t const * topo, fd_topo_tile_t const * tile, ulong out_fds_sz, int * out_fds );
  ulong (*scratch_align )( void );                                  /* Alignment required for the tile's scratch region. */
  ulong (*scratch_footprint )( fd_topo_tile_t const * tile );       /* Size in bytes of the tile's scratch region. */
  ulong (*loose_footprint )( fd_topo_tile_t const * tile );         /* Size in bytes of additional loose (unstructured) workspace usage. */
  void  (*privileged_init )( fd_topo_t * topo, fd_topo_tile_t * tile ); /* Init that must happen before the sandbox is applied. */
  void  (*unprivileged_init )( fd_topo_t * topo, fd_topo_tile_t * tile ); /* Init that happens after the sandbox is applied. */
  void  (*run )( fd_topo_t * topo, fd_topo_tile_t * tile );         /* Tile main loop; expected to never return. */
} fd_topo_run_tile_t;
405 :
406 : FD_PROTOTYPES_BEGIN
407 :
408 : FD_FN_CONST static inline ulong
409 0 : fd_topo_workspace_align( void ) {
410 : /* This needs to be the max( align ) of all the child members that
411 : could be aligned into this workspace, otherwise our footprint
412 : calculation will not be correct. For now just set to 4096 but this
413 : should probably be calculated dynamically, or we should reduce
414 : those child aligns if we can. */
415 0 : return 4096UL;
416 0 : }
417 :
418 : FD_FN_PURE static inline void *
419 : fd_topo_obj_laddr( fd_topo_t const * topo,
420 0 : ulong obj_id ) {
421 0 : fd_topo_obj_t const * obj = &topo->objs[ obj_id ];
422 0 : return (void *)((ulong)topo->workspaces[ obj->wksp_id ].wksp + obj->offset);
423 0 : }
424 :
425 : FD_FN_PURE static inline ulong
426 : fd_topo_tile_name_cnt( fd_topo_t const * topo,
427 0 : char const * name ) {
428 0 : ulong cnt = 0;
429 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
430 0 : if( FD_UNLIKELY( !strcmp( topo->tiles[ i ].name, name ) ) ) cnt++;
431 0 : }
432 0 : return cnt;
433 0 : }
434 :
435 : /* Finds the workspace of a given name in the topology. Returns
436 : ULONG_MAX if there is no such workspace. There can be at most one
437 : workspace of a given name. */
438 :
439 : FD_FN_PURE static inline ulong
440 : fd_topo_find_wksp( fd_topo_t const * topo,
441 495 : char const * name ) {
442 6906 : for( ulong i=0; i<topo->wksp_cnt; i++ ) {
443 6906 : if( FD_UNLIKELY( !strcmp( topo->workspaces[ i ].name, name ) ) ) return i;
444 6906 : }
445 0 : return ULONG_MAX;
446 495 : }
447 :
448 : /* Find the tile of a given name and kind_id in the topology, there will
449 : be at most one such tile, since kind_id is unique among the name.
450 : Returns ULONG_MAX if there is no such tile. */
451 :
452 : FD_FN_PURE static inline ulong
453 : fd_topo_find_tile( fd_topo_t const * topo,
454 : char const * name,
455 273 : ulong kind_id ) {
456 3435 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
457 3435 : if( FD_UNLIKELY( !strcmp( topo->tiles[ i ].name, name ) ) && topo->tiles[ i ].kind_id == kind_id ) return i;
458 3435 : }
459 0 : return ULONG_MAX;
460 273 : }
461 :
462 : /* Find the link of a given name and kind_id in the topology, there will
463 : be at most one such link, since kind_id is unique among the name.
464 : Returns ULONG_MAX if there is no such link. */
465 :
466 : FD_FN_PURE static inline ulong
467 : fd_topo_find_link( fd_topo_t const * topo,
468 : char const * name,
469 243 : ulong kind_id ) {
470 3945 : for( ulong i=0; i<topo->link_cnt; i++ ) {
471 3945 : if( FD_UNLIKELY( !strcmp( topo->links[ i ].name, name ) ) && topo->links[ i ].kind_id == kind_id ) return i;
472 3945 : }
473 0 : return ULONG_MAX;
474 243 : }
475 :
476 : FD_FN_PURE static inline ulong
477 : fd_topo_find_tile_in_link( fd_topo_t const * topo,
478 : fd_topo_tile_t const * tile,
479 : char const * name,
480 0 : ulong kind_id ) {
481 0 : for( ulong i=0; i<tile->in_cnt; i++ ) {
482 0 : if( FD_UNLIKELY( !strcmp( topo->links[ tile->in_link_id[ i ] ].name, name ) )
483 0 : && topo->links[ tile->in_link_id[ i ] ].kind_id == kind_id ) return i;
484 0 : }
485 0 : return ULONG_MAX;
486 0 : }
487 :
488 : FD_FN_PURE static inline ulong
489 : fd_topo_find_tile_out_link( fd_topo_t const * topo,
490 : fd_topo_tile_t const * tile,
491 : char const * name,
492 0 : ulong kind_id ) {
493 0 : for( ulong i=0; i<tile->out_cnt; i++ ) {
494 0 : if( FD_UNLIKELY( !strcmp( topo->links[ tile->out_link_id[ i ] ].name, name ) )
495 0 : && topo->links[ tile->out_link_id[ i ] ].kind_id == kind_id ) return i;
496 0 : }
497 0 : return ULONG_MAX;
498 0 : }
499 :
500 : /* Find the id of the tile which is a producer for the given link. If
501 : no tile is a producer for the link, returns ULONG_MAX. This should
502 : not be possible for a well formed and validated topology. */
503 : FD_FN_PURE static inline ulong
504 : fd_topo_find_link_producer( fd_topo_t const * topo,
505 0 : fd_topo_link_t const * link ) {
506 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
507 0 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
508 :
509 0 : for( ulong j=0; j<tile->out_cnt; j++ ) {
510 0 : if( FD_UNLIKELY( tile->out_link_id[ j ] == link->id ) ) return i;
511 0 : }
512 0 : }
513 0 : return ULONG_MAX;
514 0 : }
515 :
516 : /* Given a link, count the number of consumers of that link among all
517 : the tiles in the topology. */
518 : FD_FN_PURE static inline ulong
519 : fd_topo_link_consumer_cnt( fd_topo_t const * topo,
520 99 : fd_topo_link_t const * link ) {
521 99 : ulong cnt = 0;
522 2376 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
523 2277 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
524 7029 : for( ulong j=0; j<tile->in_cnt; j++ ) {
525 4752 : if( FD_UNLIKELY( tile->in_link_id[ j ] == link->id ) ) cnt++;
526 4752 : }
527 2277 : }
528 :
529 99 : return cnt;
530 99 : }
531 :
532 : /* Given a link, count the number of reliable consumers of that link
533 : among all the tiles in the topology. */
534 : FD_FN_PURE static inline ulong
535 : fd_topo_link_reliable_consumer_cnt( fd_topo_t const * topo,
536 0 : fd_topo_link_t const * link ) {
537 0 : ulong cnt = 0;
538 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
539 0 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
540 0 : for( ulong j=0; j<tile->in_cnt; j++ ) {
541 0 : if( FD_UNLIKELY( tile->in_link_id[ j ] == link->id && tile->in_link_reliable[ j ] ) ) cnt++;
542 0 : }
543 0 : }
544 0 :
545 0 : return cnt;
546 0 : }
547 :
548 : /* Join (map into the process) all shared memory (huge/gigantic pages)
549 : needed by the tile, in the given topology. All memory associated
550 : with the tile (aka. used by links that the tile either produces to or
551 : consumes from, or used by the tile itself for its cnc) will be
552 : attached (mapped into the process).
553 :
554 : This is needed to play nicely with the sandbox. Once a process is
555 : sandboxed we can no longer map any memory. */
556 : void
557 : fd_topo_join_tile_workspaces( fd_topo_t * topo,
558 : fd_topo_tile_t * tile );
559 :
560 : /* Join (map into the process) the shared memory (huge/gigantic pages)
561 : for the given workspace. Mode is one of
562 : FD_SHMEM_JOIN_MODE_READ_WRITE or FD_SHMEM_JOIN_MODE_READ_ONLY and
563 : determines the prot argument that will be passed to mmap when mapping
564 : the pages in (PROT_WRITE or PROT_READ respectively). */
565 : void
566 : fd_topo_join_workspace( fd_topo_t * topo,
567 : fd_topo_wksp_t * wksp,
568 : int mode );
569 :
570 : /* Join (map into the process) all shared memory (huge/gigantic pages)
571 : needed by all tiles in the topology. Mode is one of
572 : FD_SHMEM_JOIN_MODE_READ_WRITE or FD_SHMEM_JOIN_MODE_READ_ONLY and
573 : determines the prot argument that will be passed to mmap when
574 : mapping the pages in (PROT_WRITE or PROT_READ respectively). */
575 : void
576 : fd_topo_join_workspaces( fd_topo_t * topo,
577 : int mode );
578 :
579 : /* Leave (unmap from the process) the shared memory needed for the
580 : given workspace in the topology, if it was previously mapped.
581 :
582 : topo and wksp are assumed non-NULL. It is OK if the workspace
583 : has not been previously joined, in which case this is a no-op. */
584 :
585 : void
586 : fd_topo_leave_workspace( fd_topo_t * topo,
587 : fd_topo_wksp_t * wksp );
588 :
589 : /* Leave (unmap from the process) all shared memory needed by all
590 : tiles in the topology, if each of them was mapped.
591 :
592 : topo is assumed non-NULL. Only workspaces which were previously
593 : joined are unmapped. */
594 :
595 : void
596 : fd_topo_leave_workspaces( fd_topo_t * topo );
597 :
598 : /* Create the given workspace needed by the topology on the system.
599 : This does not "join" the workspaces (map their memory into the
600 : process), but only creates the .wksp file and formats it correctly
601 : as a workspace.
602 :
603 : Returns 0 on success and -1 on failure, with errno set to the error.
604 : The only reason for failure currently that will be returned is
605 : ENOMEM, as other unexpected errors will cause the program to exit.
606 :
607 : If update_existing is 1, the workspace will not be created from
608 : scratch but it will be assumed that it already exists from a prior
609 : run and needs to be maybe resized and then have the header
610 : structures reinitialized. This can save a very expensive operation
611 : of zeroing all of the workspace pages. This is dangerous in
612 : production because it can leave stray memory from prior runs around,
613 : and should only be used in development environments. */
614 :
615 : int
616 : fd_topo_create_workspace( fd_topo_t * topo,
617 : fd_topo_wksp_t * wksp,
618 : int update_existing );
619 :
620 : /* Join the standard IPC objects needed by the topology of this particular
621 : tile */
622 :
623 : void
624 : fd_topo_fill_tile( fd_topo_t * topo,
625 : fd_topo_tile_t * tile );
626 :
627 : /* Same as fd_topo_fill_tile but fills in all the objects for a
628 : particular workspace with the given mode. */
629 : void
630 : fd_topo_workspace_fill( fd_topo_t * topo,
631 : fd_topo_wksp_t * wksp );
632 :
633 : /* Apply a function to every object that is resident in the given
634 : workspace in the topology. */
635 :
636 : void
637 : fd_topo_wksp_apply( fd_topo_t * topo,
638 : fd_topo_wksp_t * wksp,
639 : void (* fn )( fd_topo_t const * topo, fd_topo_obj_t const * obj ) );
640 :
641 : /* Same as fd_topo_fill_tile but fills in all tiles in the topology. */
642 :
643 : void
644 : fd_topo_fill( fd_topo_t * topo );
645 :
646 : /* fd_topo_tile_stack_join joins a huge page optimized stack for the
647 : provided tile. The stack is assumed to already exist at a known
648 : path in the hugetlbfs mount. */
649 :
650 : void *
651 : fd_topo_tile_stack_join( char const * app_name,
652 : char const * tile_name,
653 : ulong tile_kind_id );
654 :
655 : /* Install the XDP program needed by the net tiles into the local device
656 : and return the xsk_map_fd. */
657 :
658 : fd_xdp_fds_t
659 : fd_topo_install_xdp( fd_topo_t * topo );
660 :
661 : /* fd_topo_run_single_process runs all the tiles in a single process
662 : (the calling process). This spawns a thread for each tile, switches
663 : that thread to the given UID and GID and then runs the tile in it.
664 : Each thread will never exit, as tiles are expected to run forever.
665 : An error is logged and the application will exit if a tile exits.
666 : The function itself does return after spawning all the threads.
667 :
668 : The threads will not be sandboxed in any way, except switching to the
669 : provided UID and GID, so they will share the same address space, and
670 : not have any seccomp restrictions or use any Linux namespaces. The
671 : calling thread will also switch to the provided UID and GID before
672 : it returns.
673 :
674 : In production, when running with an Agave child process this is
675 : used for spawning certain tiles inside the Agave address space.
676 : It's also useful for tooling and debugging, but is not how the main
677 : production Firedancer process runs. For production, each tile is run
678 : in its own address space with a separate process and full security
679 : sandbox.
680 :
681 : The agave argument determines which tiles are started. If the
682 : argument is 0 or 1, only non-agave (or only agave) tiles are started.
683 : If the argument is any other value, all tiles in the topology are
684 : started regardless of if they are Agave tiles or not. */
685 :
686 : void
687 : fd_topo_run_single_process( fd_topo_t * topo,
688 : int agave,
689 : uint uid,
690 : uint gid,
691 : fd_topo_run_tile_t (* tile_run )( fd_topo_tile_t * tile ),
692 : int * done_futex );
693 :
694 : /* fd_topo_run_tile runs the given tile directly within the current
695 : process (and thread). The function will never return, as tiles are
696 : expected to run forever. An error is logged and the application will
697 : exit if the tile exits.
698 :
699 : The sandbox argument determines if the current process will be
700 : sandboxed fully before starting the tile. The thread will switch to
701 : the UID and GID provided before starting the tile, even if the thread
702 : is not being sandboxed. Although POSIX specifies that all threads in
703 : a process must share a UID and GID, this is not the case on Linux.
704 : The thread will switch to the provided UID and GID without switching
705 : the other threads in the process.
706 :
707 : If keep_controlling_terminal is set to 0, and the sandbox is enabled
708 : the controlling terminal will be detached as an additional sandbox
709 : measure, but you will not be able to send Ctrl+C or other signals
710 : from the terminal. See fd_sandbox.h for more information.
711 :
712 : The allow_fd argument is only used if sandbox is true, and is a file
713 : descriptor which will be allowed to exist in the process. Normally
714 : the sandbox code rejects and aborts if there is an unexpected file
715 : descriptor present on boot. This is helpful to allow a parent
716 : process to be notified on termination of the tile by waiting for a
717 : pipe file descriptor to get closed.
718 :
719 : wait and debugger are both used in debugging. If wait is non-NULL,
720 : the runner will wait until the value pointed to by wait is non-zero
721 : before launching the tile. Likewise, if debugger is non-NULL, the
722 : runner will wait until a debugger is attached before setting the
723 : value pointed to by debugger to non-zero. These are intended to be
724 : used as a pair, where many tiles share a waiting reference, and then
725 : one of the tiles (a tile you want to attach the debugger to) has the
726 : same reference provided as the debugger, so all tiles will stop and
727 : wait for the debugger to attach to it before proceeding. */
728 :
729 : void
730 : fd_topo_run_tile( fd_topo_t * topo,
731 : fd_topo_tile_t * tile,
732 : int sandbox,
733 : int keep_controlling_terminal,
734 : uint uid,
735 : uint gid,
736 : int allow_fd,
737 : volatile int * wait,
738 : volatile int * debugger,
739 : fd_topo_run_tile_t * tile_run );
740 :
741 : /* This is for determining the value of RLIMIT_MLOCK that we need to
742 : successfully run all tiles in separate processes. The value returned
743 : is the maximum amount of memory that will be locked with mlock() by
744 : any individual process in the tree. Specifically, if we have three
745 : tile processes, and they each need to lock 5, 9, and 2 MiB of memory
746 : respectively, RLIMIT_MLOCK needs to be 9 MiB to allow all three
747 : process mlock() calls to succeed.
748 :
749 : Tiles lock memory in three ways. Any workspace they are using, they
750 : lock the entire workspace. Then each tile uses huge pages for the
751 : stack which are also locked, and finally some tiles use private
752 : locked mmaps outside the workspace for storing key material. The
753 : results here include all of this memory together.
754 :
755 : The result is not necessarily the amount of memory used by the tile
756 : process, although it will be quite close. Tiles could potentially
757 : allocate memory (eg, with brk) without needing to lock it, which
758 : would not need to included, and some kernel memory that tiles cause
759 : to be allocated (for example XSK buffers) is also not included. The
760 : actual amount of memory used will not be less than this value. */
761 : FD_FN_PURE ulong
762 : fd_topo_mlock_max_tile( fd_topo_t * topo );
763 :
764 : /* Same as fd_topo_mlock_max_tile, but for loading the entire topology
765 : into one process, rather than a separate process per tile. This is
766 : used, for example, by the configuration code when it creates all the
767 : workspaces, or the monitor that maps the entire system into one
768 : address space. */
769 : FD_FN_PURE ulong
770 : fd_topo_mlock( fd_topo_t * topo );
771 :
772 : /* This returns the number of gigantic pages needed by the topology on
773 : the provided numa node. It includes pages needed by the workspaces,
774 : as well as additional allocations like huge pages for process stacks
775 : and private key storage. */
776 :
777 : FD_FN_PURE ulong
778 : fd_topo_gigantic_page_cnt( fd_topo_t * topo,
779 : ulong numa_idx );
780 :
781 : /* This returns the number of huge pages in the application needed by
782 : the topology on the provided numa node. It includes pages needed by
783 : things placed in the hugetlbfs (workspaces, process stacks). If
784 : include_anonymous is true, it also includes anonymous hugepages which
785 : are needed but are not placed in the hugetlbfs. */
786 :
787 : FD_FN_PURE ulong
788 : fd_topo_huge_page_cnt( fd_topo_t * topo,
789 : ulong numa_idx,
790 : int include_anonymous );
791 :
792 : /* Check all invariants of the given topology to make sure it is valid.
793 : An invalid topology will cause the program to abort with an error
794 : message. */
795 : void
796 : fd_topo_validate( fd_topo_t const * topo );
797 :
798 : /* Prints a message describing the topology to an output stream. If
799 : stdout is true, will be written to stdout, otherwise will be written
800 : as a NOTICE log message to the log file. */
801 : void
802 : fd_topo_print_log( int stdout,
803 : fd_topo_t * topo );
804 :
805 : FD_PROTOTYPES_END
806 :
807 : #endif /* HEADER_fd_src_disco_topo_fd_topo_h */
|