Line data Source code
1 : #ifndef HEADER_fd_src_disco_topo_fd_topo_h
2 : #define HEADER_fd_src_disco_topo_fd_topo_h
3 :
4 : #include "../stem/fd_stem.h"
5 : #include "../../tango/fd_tango.h"
6 : #include "../../waltz/xdp/fd_xdp1.h"
7 : #include "../../ballet/base58/fd_base58.h"
8 : #include "../../util/net/fd_net_headers.h"
9 :
/* Maximum number of workspaces that may be present in a topology. */
#define FD_TOPO_MAX_WKSPS          (256UL)
/* Maximum number of links that may be present in a topology. */
#define FD_TOPO_MAX_LINKS          (256UL)
/* Maximum number of tiles that may be present in a topology. */
#define FD_TOPO_MAX_TILES          (256UL)
/* Maximum number of objects that may be present in a topology. */
#define FD_TOPO_MAX_OBJS           (4096UL)
/* Maximum number of links that may go into any one tile in the
   topology. */
#define FD_TOPO_MAX_TILE_IN_LINKS  ( 128UL)
/* Maximum number of links that a tile may write to. */
#define FD_TOPO_MAX_TILE_OUT_LINKS (  32UL)
/* Maximum number of objects that a tile can use. */
#define FD_TOPO_MAX_TILE_OBJS      ( 256UL)

/* Maximum number of additional ip addresses */
#define FD_NET_MAX_SRC_ADDR 4
28 :
/* A workspace is a Firedancer specific memory management structure that
   sits on top of 1 or more memory mapped gigantic or huge pages mounted
   to the hugetlbfs. */
typedef struct {
  ulong id;           /* The ID of this workspace.  Indexed from [0, wksp_cnt).  When placed in a topology, the ID must be the index of the workspace in the workspaces list. */
  char  name[ 13UL ]; /* The name of this workspace, like "pack".  There can be at most one of each workspace name in a topology. */

  ulong numa_idx;     /* The index of the NUMA node on the system that this workspace should be allocated from. */

  /* Computed fields.  These are not supplied as configuration but calculated as needed. */
  struct {
    ulong page_sz;  /* The size of the pages that this workspace is backed by.  One of FD_PAGE_SIZE_*. */
    ulong page_cnt; /* The number of pages that must be mapped to this workspace to store all the data needed by consumers. */
    ulong part_max; /* The maximum number of partitions in the underlying workspace.  There can only be this many allocations made at any one time. */

    fd_wksp_t * wksp;            /* The workspace memory in the local process. */
    ulong       known_footprint; /* Total size in bytes of all data in Firedancer that will be stored in this workspace at startup. */
    ulong       total_footprint; /* Total size in bytes of all data in Firedancer that could be stored in this workspace, includes known data and loose data. */
  };
} fd_topo_wksp_t;
49 :
/* A link is an mcache in a workspace that has one producer and one or
   more consumers.  A link may optionally also have a dcache, that holds
   fragments referred to by the mcache entries.

   A link belongs to exactly one workspace.  A link has exactly one
   producer, and 1 or more consumers.  Each consumer is either reliable
   or not reliable.  A link has a depth and a MTU, which correspond to
   the depth and MTU of the mcache and dcache respectively.  A MTU of
   zero means no dcache is needed, as there is no data. */
typedef struct {
  ulong id;           /* The ID of this link.  Indexed from [0, link_cnt).  When placed in a topology, the ID must be the index of the link in the links list. */
  char  name[ 13UL ]; /* The name of this link, like "pack_bank".  There can be multiple of each link name in a topology. */
  ulong kind_id;      /* The ID of this link within its name.  If there are N links of a particular name, they have IDs [0, N).  The pair (name, kind_id) uniquely identifies a link, as does "id" on its own. */

  ulong depth;        /* The depth of the mcache representing the link. */
  ulong mtu;          /* The MTU of data fragments in the mcache.  A value of 0 means there is no dcache. */
  ulong burst;        /* The max amount of MTU sized data fragments that might be bursted to the dcache. */

  ulong mcache_obj_id; /* Id (index into topo->objs) of the object backing the mcache. */
  ulong dcache_obj_id; /* Id (index into topo->objs) of the object backing the dcache, if the link has one. */

  /* Computed fields.  These are not supplied as configuration but calculated as needed. */
  struct {
    fd_frag_meta_t * mcache; /* The mcache of this link. */
    void *           dcache; /* The dcache of this link, if it has one. */
  };
} fd_topo_link_t;
77 :
/* A tile is a unique process that is spawned by Firedancer to represent
   one thread of execution.  Firedancer sandboxes all tiles to their own
   process for security reasons.

   A tile belongs to exactly one workspace.  A tile is a consumer of 0
   or more links, its inputs.  A tile is a producer of 0 or more output
   links.

   All input links will be automatically polled by the tile
   infrastructure, and output links will automatically source and manage
   credits from consumers. */
typedef struct {
  ulong id;           /* The ID of this tile.  Indexed from [0, tile_cnt).  When placed in a topology, the ID must be the index of the tile in the tiles list. */
  char  name[ 7UL ];  /* The name of this tile.  There can be multiple of each tile name in a topology. */
  ulong kind_id;      /* The ID of this tile within its name.  If there are n tile of a particular name, they have IDs [0, N).  The pair (name, kind_id) uniquely identifies a tile, as does "id" on its own. */
  int   is_agave;     /* If the tile needs to run in the Agave (Anza) address space or not. */

  ulong cpu_idx;      /* The CPU index to pin the tile on.  A value of ULONG_MAX or more indicates the tile should be floating and not pinned to a core. */

  ulong in_cnt;                                       /* The number of links that this tile reads from. */
  ulong in_link_id[ FD_TOPO_MAX_TILE_IN_LINKS ];      /* The link_id of each link that this tile reads from, indexed in [0, in_cnt). */
  int   in_link_reliable[ FD_TOPO_MAX_TILE_IN_LINKS ]; /* If each link that this tile reads from is a reliable or unreliable consumer, indexed in [0, in_cnt). */
  int   in_link_poll[ FD_TOPO_MAX_TILE_IN_LINKS ];    /* If each link that this tile reads from should be polled by the tile infrastructure, indexed in [0, in_cnt).
                                                         If the link is not polled, the tile will not receive frags for it and the tile writer is responsible for
                                                         reading from the link.  The link must be marked as unreliable as it is not flow controlled. */

  ulong out_cnt;                                      /* The number of links that this tile writes to. */
  ulong out_link_id[ FD_TOPO_MAX_TILE_OUT_LINKS ];    /* The link_id of each link that this tile writes to, indexed in [0, link_cnt). */

  ulong tile_obj_id;      /* Id (index into topo->objs) of the object holding this tile's own state. */
  ulong metrics_obj_id;   /* Id of the object holding the tile's shared metrics memory (see computed "metrics" below). */
  ulong keyswitch_obj_id; /* Id of the object for the tile's keyswitch.  NOTE(review): semantics defined by topology construction code, not visible here. */
  ulong in_link_fseq_obj_id[ FD_TOPO_MAX_TILE_IN_LINKS ]; /* Id of the fseq object for each in link, indexed in [0, in_cnt). */

  ulong uses_obj_cnt;                         /* The number of objects this tile uses. */
  ulong uses_obj_id[ FD_TOPO_MAX_TILE_OBJS ]; /* Id of each object this tile uses, indexed in [0, uses_obj_cnt). */
  int   uses_obj_mode[ FD_TOPO_MAX_TILE_OBJS ]; /* Access mode for each used object, indexed in [0, uses_obj_cnt).  Presumably a FD_SHMEM_JOIN_MODE_* value -- confirm against topology construction code. */

  /* Computed fields.  These are not supplied as configuration but calculated as needed. */
  struct {
    ulong * metrics; /* The shared memory for metrics that this tile should write.  Consumed by monitoring and metrics writing tiles. */

    /* The fseq of each link that this tile reads from.  Multiple fseqs
       may point to the link, if there are multiple consumers.  An fseq
       can be uniquely identified via (link_id, tile_id), or (link_kind,
       link_kind_id, tile_kind, tile_kind_id) */
    ulong * in_link_fseq[ FD_TOPO_MAX_TILE_IN_LINKS ];
  };

  /* Configuration fields.  These are required to be known by the topology so it can determine the
     total size of Firedancer in memory.  Exactly one union member is active, selected by the tile
     name. */
  union {
    struct {
      char  provider[ 8 ]; /* "xdp" or "socket" */
      char  interface[ 16 ];
      ulong xdp_rx_queue_size;
      ulong xdp_tx_queue_size;
      ulong free_ring_depth;
      long  tx_flush_timeout_ns;
      char  xdp_mode[8];
      int   zero_copy;

      ushort shred_listen_port;
      ushort quic_transaction_listen_port;
      ushort legacy_transaction_listen_port;
      ushort gossip_listen_port;
      ushort repair_intake_listen_port;
      ushort repair_serve_listen_port;

      ulong umem_dcache_obj_id;    /* dcache for XDP UMEM frames */
      ulong netdev_dbl_buf_obj_id; /* dbl_buf containing netdev_tbl */
      ulong fib4_main_obj_id;      /* fib4 containing main route table */
      ulong fib4_local_obj_id;     /* fib4 containing local route table */
      ulong neigh4_obj_id;         /* neigh4 hash map header */
      ulong neigh4_ele_obj_id;     /* neigh4 hash map slots */
    } net;

    struct {
      ulong netdev_dbl_buf_obj_id; /* dbl_buf containing netdev_tbl */
      ulong fib4_main_obj_id;      /* fib4 containing main route table */
      ulong fib4_local_obj_id;     /* fib4 containing local route table */
      char  neigh_if[ 16 ];        /* neigh4 interface name */
      ulong neigh4_obj_id;         /* neigh4 hash map header */
      ulong neigh4_ele_obj_id;     /* neigh4 hash map slots */
    } netlink;

    struct {
      uint   out_depth;
      uint   reasm_cnt;
      ulong  max_concurrent_connections;
      ulong  max_concurrent_handshakes;
      ushort quic_transaction_listen_port;
      ulong  idle_timeout_millis;
      uint   ack_delay_millis;
      int    retry;
    } quic;

    struct {
      ulong tcache_depth;
    } verify;

    struct {
      ulong tcache_depth;
    } dedup;

    struct {
      char url[ 256 ];
      char tls_domain_name[ 256 ];
      char identity_key_path[ PATH_MAX ];
    } bundle;

    struct {
      ulong max_pending_transactions;
      ulong bank_tile_count;
      int   larger_max_cost_per_block;
      int   larger_shred_limits_per_block;
      int   use_consumed_cus;
      struct {
        int   enabled;
        uchar tip_distribution_program_addr[ 32 ];
        uchar tip_payment_program_addr[ 32 ];
        uchar tip_distribution_authority[ 32 ];
        ulong commission_bps;
        char  identity_key_path[ PATH_MAX ];
        char  vote_account_path[ PATH_MAX ]; /* or pubkey is okay */
      } bundle;
    } pack;

    struct {
      int   lagged_consecutive_leader_start;
      int   plugins_enabled;
      ulong bank_cnt;
      char  identity_key_path[ PATH_MAX ];
      struct {
        int   enabled;
        uchar tip_payment_program_addr[ 32 ];
        uchar tip_distribution_program_addr[ 32 ];
        char  vote_account_path[ PATH_MAX ];
      } bundle;
    } poh;

    struct {
      ulong  depth;
      ulong  fec_resolver_depth;
      char   identity_key_path[ PATH_MAX ];
      ushort shred_listen_port;
      int    larger_shred_limits_per_block;
      ulong  expected_shred_version;
    } shred;

    struct {
      ulong disable_blockstore_from_slot;
    } store;

    struct {
      char identity_key_path[ PATH_MAX ];
    } sign;

    struct {
      uint   listen_addr;
      ushort listen_port;

      int is_voting;

      char cluster[ 32 ];
      char identity_key_path[ PATH_MAX ];

      ulong max_http_connections;
      ulong max_websocket_connections;
      ulong max_http_request_length;
      ulong send_buffer_size_mb;
    } gui;

    struct {
      uint   prometheus_listen_addr;
      ushort prometheus_listen_port;
    } metric;

    struct {
      ulong fec_max;
      ulong slice_max;

      int   tx_metadata_storage;
      char  capture[ PATH_MAX ];
      char  funk_checkpt[ PATH_MAX ];
      ulong funk_rec_max;
      ulong funk_sz_gb;
      ulong funk_txn_max;
      char  funk_file[ PATH_MAX ];
      char  genesis[ PATH_MAX ];
      char  incremental[ PATH_MAX ];
      char  slots_replayed[ PATH_MAX ];
      char  snapshot[ PATH_MAX ];
      char  status_cache[ PATH_MAX ];
      ulong tpool_thread_count;
      char  cluster_version[ 32 ];
      char  tower_checkpt[ PATH_MAX ];
      int   plugins_enabled;

      /* not specified in TOML */

      char  identity_key_path[ PATH_MAX ];
      uint  ip_addr;
      int   vote;
      char  vote_account_path[ PATH_MAX ];
      ulong bank_tile_count;
      ulong exec_tile_count;
      ulong full_interval;
      ulong incremental_interval;

      char blockstore_file[ PATH_MAX ];
      char blockstore_checkpt[ PATH_MAX ];
    } replay;

    struct {
      int  in_wen_restart;
      int  tower_checkpt_fileno;
      char funk_file[ PATH_MAX ];
      char tower_checkpt[ PATH_MAX ];
      char identity_key_path[ PATH_MAX ];
      char genesis_hash[ FD_BASE58_ENCODED_32_SZ ];
      char restart_coordinator[ FD_BASE58_ENCODED_32_SZ ];
    } restart;

    struct {
      ulong dummy;
    } exec;

    struct {
      ushort send_to_port;
      uint   send_to_ip_addr;
      ulong  conn_cnt;
      int    no_quic;
    } benchs;

    struct {
      ushort rpc_port;
      uint   rpc_ip_addr;
    } bencho;

    struct {
      ulong accounts_cnt;
      int   mode;
      float contending_fraction;
      float cu_price_spread;
    } benchg;

    struct {
      ushort gossip_listen_port;
#     define FD_TOPO_GOSSIP_ENTRYPOINTS_MAX 16
      ulong  entrypoints_cnt;
      fd_ip4_port_t entrypoints[ FD_TOPO_GOSSIP_ENTRYPOINTS_MAX ];
      uint   ip_addr;
      char   identity_key_path[ PATH_MAX ];
      ushort tvu_port;
      ushort tpu_port;
      ushort tpu_quic_port;
      ushort tpu_vote_port;
      ushort repair_serve_port;
      ulong  expected_shred_version;
    } gossip;

    struct {
      ushort repair_intake_listen_port;
      ushort repair_serve_listen_port;
      char   good_peer_cache_file[ PATH_MAX ];

      /* non-config */

      int  good_peer_cache_file_fd;
      char identity_key_path[ PATH_MAX ];
    } repair;

    struct {
      char slots_pending[PATH_MAX];

      ulong expected_shred_version;

      /* non-config */

      char  identity_key_path[ PATH_MAX ];
      char  shred_cap_archive[ PATH_MAX ];
      char  shred_cap_replay[ PATH_MAX ];
      ulong shred_cap_end_slot;

      char blockstore_file[ PATH_MAX ];
      char blockstore_restore[ PATH_MAX ];
    } store_int;

    struct {
      ushort tpu_listen_port;

      /* non-config */

      uint ip_addr;
      char identity_key_path[ PATH_MAX ];
    } sender;

    struct {
      char identity_key_path[ PATH_MAX ];
    } eqvoc;

    struct {
      ushort rpc_port;
      ushort tpu_port;
      uint   tpu_ip_addr;
      char   identity_key_path[ PATH_MAX ];
    } rpcserv;

    struct {
      ulong full_interval;
      ulong incremental_interval;
      char  out_dir[ PATH_MAX ];
      int   tmp_fd;
      int   tmp_inc_fd;
      int   full_snapshot_fd;
      int   incremental_snapshot_fd;
      ulong hash_tpool_thread_count;
    } batch;

    struct {
      uint fake_dst_ip;
    } pktgen;

  };
} fd_topo_tile_t;
404 :
/* An object is a named, sized region of memory carved out of exactly
   one workspace.  Tiles reference objects by id (index into the objs
   list of the topology). */
typedef struct {
  ulong id;           /* The ID of this object.  Indexed from [0, obj_cnt).  When placed in a topology, the ID must be the index of the object in the objs list. */
  char  name[ 13UL ]; /* The name of this object, like "mcache". */
  ulong wksp_id;      /* The ID of the workspace that this object belongs to. */

  ulong offset;       /* Byte offset of the object from the start of its containing workspace (see fd_topo_obj_laddr). */
  ulong footprint;    /* Size of the object in bytes. */
} fd_topo_obj_t;
413 :
/* An fd_topo_t represents the overall structure of a Firedancer
   configuration, describing all the workspaces, tiles, and links
   between them. */
struct fd_topo {
  char  app_name[ 256UL ];  /* Name of the application instance, used e.g. to locate files under the hugetlbfs mount. */
  uchar props[ 16384UL ];   /* Opaque property blob.  NOTE(review): layout/semantics are defined by topology construction code not visible in this header. */

  ulong wksp_cnt; /* Number of workspaces in use, in [0, FD_TOPO_MAX_WKSPS]. */
  ulong link_cnt; /* Number of links in use, in [0, FD_TOPO_MAX_LINKS]. */
  ulong tile_cnt; /* Number of tiles in use, in [0, FD_TOPO_MAX_TILES]. */
  ulong obj_cnt;  /* Number of objects in use, in [0, FD_TOPO_MAX_OBJS]. */

  fd_topo_wksp_t workspaces[ FD_TOPO_MAX_WKSPS ];
  fd_topo_link_t links[ FD_TOPO_MAX_LINKS ];
  fd_topo_tile_t tiles[ FD_TOPO_MAX_TILES ];
  fd_topo_obj_t  objs[ FD_TOPO_MAX_OBJS ];

  ulong agave_affinity_cnt; /* Number of entries in agave_affinity_cpu_idx. */
#if !FD_HAS_NO_AGAVE
  ulong agave_affinity_cpu_idx[ FD_TILE_MAX ]; /* CPU indices the Agave child process threads are pinned to. */
#endif

  ulong max_page_size;           /* 2^21 or 2^30 */
  ulong gigantic_page_threshold; /* see [hugetlbfs.gigantic_page_threshold_mib] */
};
typedef struct fd_topo fd_topo_t;
440 :
/* fd_topo_run_tile_t describes how to launch one kind of tile: its
   sandbox requirements plus the callback table invoked over the tile
   lifecycle (privileged init, then sandboxing, then unprivileged init,
   then run). */
typedef struct {
  char const * name; /* Tile name this runner is for, matching fd_topo_tile_t::name. */

  int   keep_host_networking;  /* If the tile must stay in the host network namespace when sandboxed. */
  int   allow_connect;         /* If the sandbox should permit connect(2)-style syscalls.  NOTE(review): exact syscall set enforced elsewhere. */
  ulong rlimit_file_cnt;       /* RLIMIT_NOFILE to apply to the tile process. */
  ulong rlimit_address_space;  /* RLIMIT_AS to apply to the tile process. */
  ulong rlimit_data;           /* RLIMIT_DATA to apply to the tile process. */
  int   for_tpool;             /* Presumably marks runners used for tpool worker threads -- confirm against callers. */

  ulong (*populate_allowed_seccomp)( fd_topo_t const * topo, fd_topo_tile_t const * tile, ulong out_cnt, struct sock_filter * out );    /* Fill out with the seccomp filter instructions the tile needs; returns count written. */
  ulong (*populate_allowed_fds    )( fd_topo_t const * topo, fd_topo_tile_t const * tile, ulong out_fds_sz, int * out_fds );           /* Fill out_fds with fds allowed to remain open; returns count written. */
  ulong (*scratch_align           )( void );                                    /* Alignment of the tile's scratch region. */
  ulong (*scratch_footprint       )( fd_topo_tile_t const * tile );             /* Size in bytes of the tile's scratch region. */
  ulong (*loose_footprint         )( fd_topo_tile_t const * tile );             /* Additional loose (unstructured) workspace bytes the tile needs. */
  void  (*privileged_init         )( fd_topo_t * topo, fd_topo_tile_t * tile ); /* Init run before the sandbox is applied (may open files, etc.). */
  void  (*unprivileged_init       )( fd_topo_t * topo, fd_topo_tile_t * tile ); /* Init run after the sandbox is applied. */
  void  (*run                     )( fd_topo_t * topo, fd_topo_tile_t * tile ); /* Tile main loop; expected to never return. */
  ulong (*rlimit_file_cnt_fn      )( fd_topo_t const * topo, fd_topo_tile_t const * tile ); /* Optional dynamic override of rlimit_file_cnt. */
} fd_topo_run_tile_t;
461 :
462 : FD_PROTOTYPES_BEGIN
463 :
464 : FD_FN_CONST static inline ulong
465 0 : fd_topo_workspace_align( void ) {
466 : /* This needs to be the max( align ) of all the child members that
467 : could be aligned into this workspace, otherwise our footprint
468 : calculation will not be correct. For now just set to 4096 but this
469 : should probably be calculated dynamically, or we should reduce
470 : those child aligns if we can. */
471 0 : return 4096UL;
472 0 : }
473 :
474 : FD_FN_PURE static inline void *
475 : fd_topo_obj_laddr( fd_topo_t const * topo,
476 0 : ulong obj_id ) {
477 0 : fd_topo_obj_t const * obj = &topo->objs[ obj_id ];
478 0 : return (void *)((ulong)topo->workspaces[ obj->wksp_id ].wksp + obj->offset);
479 0 : }
480 :
481 : FD_FN_PURE static inline ulong
482 : fd_topo_tile_name_cnt( fd_topo_t const * topo,
483 0 : char const * name ) {
484 0 : ulong cnt = 0;
485 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
486 0 : if( FD_UNLIKELY( !strcmp( topo->tiles[ i ].name, name ) ) ) cnt++;
487 0 : }
488 0 : return cnt;
489 0 : }
490 :
491 : /* Finds the workspace of a given name in the topology. Returns
492 : ULONG_MAX if there is no such workspace. There can be at most one
493 : workspace of a given name. */
494 :
495 : FD_FN_PURE static inline ulong
496 : fd_topo_find_wksp( fd_topo_t const * topo,
497 582 : char const * name ) {
498 6375 : for( ulong i=0; i<topo->wksp_cnt; i++ ) {
499 6375 : if( FD_UNLIKELY( !strcmp( topo->workspaces[ i ].name, name ) ) ) return i;
500 6375 : }
501 0 : return ULONG_MAX;
502 582 : }
503 :
504 : /* Find the tile of a given name and kind_id in the topology, there will
505 : be at most one such tile, since kind_id is unique among the name.
506 : Returns ULONG_MAX if there is no such tile. */
507 :
508 : FD_FN_PURE static inline ulong
509 : fd_topo_find_tile( fd_topo_t const * topo,
510 : char const * name,
511 318 : ulong kind_id ) {
512 4449 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
513 4449 : if( FD_UNLIKELY( !strcmp( topo->tiles[ i ].name, name ) ) && topo->tiles[ i ].kind_id == kind_id ) return i;
514 4449 : }
515 0 : return ULONG_MAX;
516 318 : }
517 :
518 : /* Find the link of a given name and kind_id in the topology, there will
519 : be at most one such link, since kind_id is unique among the name.
520 : Returns ULONG_MAX if there is no such link. */
521 :
522 : FD_FN_PURE static inline ulong
523 : fd_topo_find_link( fd_topo_t const * topo,
524 : char const * name,
525 285 : ulong kind_id ) {
526 5280 : for( ulong i=0; i<topo->link_cnt; i++ ) {
527 5280 : if( FD_UNLIKELY( !strcmp( topo->links[ i ].name, name ) ) && topo->links[ i ].kind_id == kind_id ) return i;
528 5280 : }
529 0 : return ULONG_MAX;
530 285 : }
531 :
532 : FD_FN_PURE static inline ulong
533 : fd_topo_find_tile_in_link( fd_topo_t const * topo,
534 : fd_topo_tile_t const * tile,
535 : char const * name,
536 0 : ulong kind_id ) {
537 0 : for( ulong i=0; i<tile->in_cnt; i++ ) {
538 0 : if( FD_UNLIKELY( !strcmp( topo->links[ tile->in_link_id[ i ] ].name, name ) )
539 0 : && topo->links[ tile->in_link_id[ i ] ].kind_id == kind_id ) return i;
540 0 : }
541 0 : return ULONG_MAX;
542 0 : }
543 :
544 : FD_FN_PURE static inline ulong
545 : fd_topo_find_tile_out_link( fd_topo_t const * topo,
546 : fd_topo_tile_t const * tile,
547 : char const * name,
548 0 : ulong kind_id ) {
549 0 : for( ulong i=0; i<tile->out_cnt; i++ ) {
550 0 : if( FD_UNLIKELY( !strcmp( topo->links[ tile->out_link_id[ i ] ].name, name ) )
551 0 : && topo->links[ tile->out_link_id[ i ] ].kind_id == kind_id ) return i;
552 0 : }
553 0 : return ULONG_MAX;
554 0 : }
555 :
556 : /* Find the id of the tile which is a producer for the given link. If
557 : no tile is a producer for the link, returns ULONG_MAX. This should
558 : not be possible for a well formed and validated topology. */
559 : FD_FN_PURE static inline ulong
560 : fd_topo_find_link_producer( fd_topo_t const * topo,
561 0 : fd_topo_link_t const * link ) {
562 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
563 0 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
564 :
565 0 : for( ulong j=0; j<tile->out_cnt; j++ ) {
566 0 : if( FD_UNLIKELY( tile->out_link_id[ j ] == link->id ) ) return i;
567 0 : }
568 0 : }
569 0 : return ULONG_MAX;
570 0 : }
571 :
572 : /* Given a link, count the number of consumers of that link among all
573 : the tiles in the topology. */
574 : FD_FN_PURE static inline ulong
575 : fd_topo_link_consumer_cnt( fd_topo_t const * topo,
576 117 : fd_topo_link_t const * link ) {
577 117 : ulong cnt = 0;
578 2925 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
579 2808 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
580 9360 : for( ulong j=0; j<tile->in_cnt; j++ ) {
581 6552 : if( FD_UNLIKELY( tile->in_link_id[ j ] == link->id ) ) cnt++;
582 6552 : }
583 2808 : }
584 :
585 117 : return cnt;
586 117 : }
587 :
588 : /* Given a link, count the number of reliable consumers of that link
589 : among all the tiles in the topology. */
590 : FD_FN_PURE static inline ulong
591 : fd_topo_link_reliable_consumer_cnt( fd_topo_t const * topo,
592 0 : fd_topo_link_t const * link ) {
593 0 : ulong cnt = 0;
594 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
595 0 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
596 0 : for( ulong j=0; j<tile->in_cnt; j++ ) {
597 0 : if( FD_UNLIKELY( tile->in_link_id[ j ] == link->id && tile->in_link_reliable[ j ] ) ) cnt++;
598 0 : }
599 0 : }
600 0 :
601 0 : return cnt;
602 0 : }
603 :
/* Join (map into the process) all shared memory (huge/gigantic pages)
   needed by the tile, in the given topology.  All memory associated
   with the tile (aka. used by links that the tile either produces to or
   consumes from, or used by the tile itself for its cnc) will be
   attached (mapped into the process).

   This is needed to play nicely with the sandbox.  Once a process is
   sandboxed we can no longer map any memory. */
void
fd_topo_join_tile_workspaces( fd_topo_t *      topo,
                              fd_topo_tile_t * tile );

/* Join (map into the process) the shared memory (huge/gigantic pages)
   for the given workspace.  Mode is one of
   FD_SHMEM_JOIN_MODE_READ_WRITE or FD_SHMEM_JOIN_MODE_READ_ONLY and
   determines the prot argument that will be passed to mmap when mapping
   the pages in (PROT_WRITE or PROT_READ respectively). */
void
fd_topo_join_workspace( fd_topo_t *      topo,
                        fd_topo_wksp_t * wksp,
                        int              mode );

/* Join (map into the process) all shared memory (huge/gigantic pages)
   needed by all tiles in the topology.  Mode is one of
   FD_SHMEM_JOIN_MODE_READ_WRITE or FD_SHMEM_JOIN_MODE_READ_ONLY and
   determines the prot argument that will be passed to mmap when
   mapping the pages in (PROT_WRITE or PROT_READ respectively). */
void
fd_topo_join_workspaces( fd_topo_t * topo,
                         int         mode );

/* Leave (unmap from the process) the shared memory needed for the
   given workspace in the topology, if it was previously mapped.

   topo and wksp are assumed non-NULL.  It is OK if the workspace
   has not been previously joined, in which case this is a no-op. */

void
fd_topo_leave_workspace( fd_topo_t *      topo,
                         fd_topo_wksp_t * wksp );

/* Leave (unmap from the process) all shared memory needed by all
   tiles in the topology, if each of them was mapped.

   topo is assumed non-NULL.  Only workspaces which were previously
   joined are unmapped. */

void
fd_topo_leave_workspaces( fd_topo_t * topo );
653 :
/* Create the given workspace needed by the topology on the system.
   This does not "join" the workspaces (map their memory into the
   process), but only creates the .wksp file and formats it correctly
   as a workspace.

   Returns 0 on success and -1 on failure, with errno set to the error.
   The only reason for failure currently that will be returned is
   ENOMEM, as other unexpected errors will cause the program to exit.

   If update_existing is 1, the workspace will not be created from
   scratch but it will be assumed that it already exists from a prior
   run and needs to be maybe resized and then have the header
   structures reinitialized.  This can save a very expensive operation
   of zeroing all of the workspace pages.  This is dangerous in
   production because it can leave stray memory from prior runs around,
   and should only be used in development environments. */

int
fd_topo_create_workspace( fd_topo_t *      topo,
                          fd_topo_wksp_t * wksp,
                          int              update_existing );

/* Join the standard IPC objects needed by the topology of this particular
   tile */

void
fd_topo_fill_tile( fd_topo_t *      topo,
                   fd_topo_tile_t * tile );

/* Same as fd_topo_fill_tile but fills in all the objects for a
   particular workspace with the given mode. */
void
fd_topo_workspace_fill( fd_topo_t *      topo,
                        fd_topo_wksp_t * wksp );

/* Apply a function to every object that is resident in the given
   workspace in the topology. */

void
fd_topo_wksp_apply( fd_topo_t const *      topo,
                    fd_topo_wksp_t const * wksp,
                    void (* fn )( fd_topo_t const * topo, fd_topo_obj_t const * obj ) );

/* Same as fd_topo_fill_tile but fills in all tiles in the topology. */

void
fd_topo_fill( fd_topo_t * topo );
701 :
/* fd_topo_tile_stack_join joins a huge page optimized stack for the
   provided tile.  The stack is assumed to already exist at a known
   path in the hugetlbfs mount, derived from the app name and the
   (tile_name, tile_kind_id) pair.  Returns a pointer to the mapped
   stack. */

void *
fd_topo_tile_stack_join( char const * app_name,
                         char const * tile_name,
                         ulong        tile_kind_id );

/* Install the XDP program needed by the net tiles into the local device
   and return the xsk_map_fd. */

fd_xdp_fds_t
fd_topo_install_xdp( fd_topo_t const * topo );
716 :
/* fd_topo_run_single_process runs all the tiles in a single process
   (the calling process).  This spawns a thread for each tile, switches
   that thread to the given UID and GID and then runs the tile in it.
   Each thread will never exit, as tiles are expected to run forever.
   An error is logged and the application will exit if a tile exits.
   The function itself does return after spawning all the threads.

   The threads will not be sandboxed in any way, except switching to the
   provided UID and GID, so they will share the same address space, and
   not have any seccomp restrictions or use any Linux namespaces.  The
   calling thread will also switch to the provided UID and GID before
   it returns.

   In production, when running with an Agave child process this is
   used for spawning certain tiles inside the Agave address space.
   It's also useful for tooling and debugging, but is not how the main
   production Firedancer process runs.  For production, each tile is run
   in its own address space with a separate process and full security
   sandbox.

   The agave argument determines which tiles are started.  If the
   argument is 0 or 1, only non-agave (or only agave) tiles are started.
   If the argument is any other value, all tiles in the topology are
   started regardless of if they are Agave tiles or not. */

void
fd_topo_run_single_process( fd_topo_t * topo,
                            int         agave,
                            uint        uid,
                            uint        gid,
                            fd_topo_run_tile_t (* tile_run )( fd_topo_tile_t const * tile ),
                            int *       done_futex );

/* fd_topo_run_tile runs the given tile directly within the current
   process (and thread).  The function will never return, as tiles are
   expected to run forever.  An error is logged and the application will
   exit if the tile exits.

   The sandbox argument determines if the current process will be
   sandboxed fully before starting the tile.  The thread will switch to
   the UID and GID provided before starting the tile, even if the thread
   is not being sandboxed.  Although POSIX specifies that all threads in
   a process must share a UID and GID, this is not the case on Linux.
   The thread will switch to the provided UID and GID without switching
   the other threads in the process.

   If keep_controlling_terminal is set to 0, and the sandbox is enabled
   the controlling terminal will be detached as an additional sandbox
   measure, but you will not be able to send Ctrl+C or other signals
   from the terminal.  See fd_sandbox.h for more information.

   The allow_fd argument is only used if sandbox is true, and is a file
   descriptor which will be allowed to exist in the process.  Normally
   the sandbox code rejects and aborts if there is an unexpected file
   descriptor present on boot.  This is helpful to allow a parent
   process to be notified on termination of the tile by waiting for a
   pipe file descriptor to get closed.

   wait and debugger are both used in debugging.  If wait is non-NULL,
   the runner will wait until the value pointed to by wait is non-zero
   before launching the tile.  Likewise, if debugger is non-NULL, the
   runner will wait until a debugger is attached before setting the
   value pointed to by debugger to non-zero.  These are intended to be
   used as a pair, where many tiles share a waiting reference, and then
   one of the tiles (a tile you want to attach the debugger to) has the
   same reference provided as the debugger, so all tiles will stop and
   wait for the debugger to attach to it before proceeding. */

void
fd_topo_run_tile( fd_topo_t *          topo,
                  fd_topo_tile_t *     tile,
                  int                  sandbox,
                  int                  keep_controlling_terminal,
                  int                  dumpable,
                  uint                 uid,
                  uint                 gid,
                  int                  allow_fd,
                  volatile int *       wait,
                  volatile int *       debugger,
                  fd_topo_run_tile_t * tile_run );
797 :
/* This is for determining the value of RLIMIT_MLOCK that we need to
   successfully run all tiles in separate processes.  The value returned
   is the maximum amount of memory that will be locked with mlock() by
   any individual process in the tree.  Specifically, if we have three
   tile processes, and they each need to lock 5, 9, and 2 MiB of memory
   respectively, RLIMIT_MLOCK needs to be 9 MiB to allow all three
   process mlock() calls to succeed.

   Tiles lock memory in three ways.  Any workspace they are using, they
   lock the entire workspace.  Then each tile uses huge pages for the
   stack which are also locked, and finally some tiles use private
   locked mmaps outside the workspace for storing key material.  The
   results here include all of this memory together.

   The result is not necessarily the amount of memory used by the tile
   process, although it will be quite close.  Tiles could potentially
   allocate memory (eg, with brk) without needing to lock it, which
   would not need to be included, and some kernel memory that tiles cause
   to be allocated (for example XSK buffers) is also not included.  The
   actual amount of memory used will not be less than this value. */
FD_FN_PURE ulong
fd_topo_mlock_max_tile( fd_topo_t const * topo );

/* Same as fd_topo_mlock_max_tile, but for loading the entire topology
   into one process, rather than a separate process per tile.  This is
   used, for example, by the configuration code when it creates all the
   workspaces, or the monitor that maps the entire system into one
   address space. */
FD_FN_PURE ulong
fd_topo_mlock( fd_topo_t const * topo );

/* This returns the number of gigantic pages needed by the topology on
   the provided numa node.  It includes pages needed by the workspaces,
   as well as additional allocations like huge pages for process stacks
   and private key storage. */

FD_FN_PURE ulong
fd_topo_gigantic_page_cnt( fd_topo_t const * topo,
                           ulong             numa_idx );

/* This returns the number of huge pages in the application needed by
   the topology on the provided numa node.  It includes pages needed by
   things placed in the hugetlbfs (workspaces, process stacks).  If
   include_anonymous is true, it also includes anonymous hugepages which
   are needed but are not placed in the hugetlbfs. */

FD_FN_PURE ulong
fd_topo_huge_page_cnt( fd_topo_t const * topo,
                       ulong             numa_idx,
                       int               include_anonymous );

/* Check all invariants of the given topology to make sure it is valid.
   An invalid topology will cause the program to abort with an error
   message. */
void
fd_topo_validate( fd_topo_t const * topo );
854 :
855 : /* Prints a message describing the topology to an output stream. If
856 : stdout is true, will be written to stdout, otherwise will be written
857 : as a NOTICE log message to the log file. */
858 : void
859 : fd_topo_print_log( int stdout,
860 : fd_topo_t * topo );
861 :
862 : FD_PROTOTYPES_END
863 :
864 : #endif /* HEADER_fd_src_disco_topo_fd_topo_h */
|