Line data Source code
1 : #ifndef HEADER_fd_src_disco_topo_fd_topo_h
2 : #define HEADER_fd_src_disco_topo_fd_topo_h
3 :
4 : #include "../stem/fd_stem.h"
5 : #include "../../tango/fd_tango.h"
6 : #include "../../waltz/xdp/fd_xdp1.h"
7 : #include "../../ballet/base58/fd_base58.h"
8 : #include "../../util/net/fd_net_headers.h"
9 :
10 : /* Maximum number of workspaces that may be present in a topology. */
11 : #define FD_TOPO_MAX_WKSPS (256UL)
12 : /* Maximum number of links that may be present in a topology. */
13 : #define FD_TOPO_MAX_LINKS (256UL)
14 : /* Maximum number of tiles that may be present in a topology. */
15 : #define FD_TOPO_MAX_TILES (256UL)
16 : /* Maximum number of objects that may be present in a topology. */
17 : #define FD_TOPO_MAX_OBJS (4096UL)
18 : /* Maximum number of links that may go into any one tile in the
19 : topology. */
20 : #define FD_TOPO_MAX_TILE_IN_LINKS ( 128UL)
21 : /* Maximum number of links that a tile may write to. */
22 : #define FD_TOPO_MAX_TILE_OUT_LINKS ( 32UL)
23 : /* Maximum number of objects that a tile can use. */
24 : #define FD_TOPO_MAX_TILE_OBJS ( 256UL)
25 :
26 : /* Maximum number of additional IP addresses. */
27 : #define FD_NET_MAX_SRC_ADDR 4
28 :
29 : /* Maximum number of additional destinations for leader shreds and for retransmitted shreds */
30 : #define FD_TOPO_ADTL_DESTS_MAX ( 32UL)
31 :
32 :
33 : /* A workspace is a Firedancer specific memory management structure that
34 : sits on top of 1 or more memory mapped gigantic or huge pages mounted
35 : to the hugetlbfs. */
36 : typedef struct {
37 : ulong id; /* The ID of this workspace. Indexed from [0, wksp_cnt). When placed in a topology, the ID must be the index of the workspace in the workspaces list. */
38 : char name[ 13UL ]; /* The name of this workspace, like "pack". There can be at most one of each workspace name in a topology. */
39 :
40 : ulong numa_idx; /* The index of the NUMA node on the system that this workspace should be allocated from. */
41 :
42 : int is_locked; /* If the workspace should use pages locked and pinned to a specific numa node. */
43 :
44 : /* Computed fields. These are not supplied as configuration but calculated as needed. */
45 : struct {
46 : ulong page_sz; /* The size of the pages that this workspace is backed by. One of FD_PAGE_SIZE_*. */
47 : ulong page_cnt; /* The number of pages that must be mapped to this workspace to store all the data needed by consumers. */
48 : ulong part_max; /* The maximum number of partitions in the underlying workspace. There can only be this many allocations made at any one time. */
49 :
50 : fd_wksp_t * wksp; /* The workspace memory in the local process. */
51 : ulong known_footprint; /* Total size in bytes of all data in Firedancer that will be stored in this workspace at startup. */
52 : ulong total_footprint; /* Total size in bytes of all data in Firedancer that could be stored in this workspace, includes known data and loose data. */
53 : };
54 : } fd_topo_wksp_t;
55 :
56 : /* A link is an mcache in a workspace that has one producer and one or
57 :    more consumers. A link may optionally also have a dcache, which holds
58 : fragments referred to by the mcache entries.
59 :
60 : A link belongs to exactly one workspace. A link has exactly one
61 :    producer, and one or more consumers. Each consumer is either reliable
62 :    or unreliable. A link has a depth and an MTU, which correspond to
63 :    the depth and MTU of the mcache and dcache respectively. An MTU of
64 : zero means no dcache is needed, as there is no data. */
65 : typedef struct {
66 : ulong id; /* The ID of this link. Indexed from [0, link_cnt). When placed in a topology, the ID must be the index of the link in the links list. */
67 : char name[ 13UL ]; /* The name of this link, like "pack_bank". There can be multiple of each link name in a topology. */
68 : ulong kind_id; /* The ID of this link within its name. If there are N links of a particular name, they have IDs [0, N). The pair (name, kind_id) uniquely identifies a link, as does "id" on its own. */
69 :
70 : ulong depth; /* The depth of the mcache representing the link. */
71 : ulong mtu; /* The MTU of data fragments in the mcache. A value of 0 means there is no dcache. */
72 : ulong burst; /* The maximum number of MTU sized data fragments that might be burst to the dcache. */
73 :
74 : ulong mcache_obj_id;
75 : ulong dcache_obj_id;
76 :
77 : /* Computed fields. These are not supplied as configuration but calculated as needed. */
78 : struct {
79 : fd_frag_meta_t * mcache; /* The mcache of this link. */
80 : void * dcache; /* The dcache of this link, if it has one. */
81 : };
82 :
83 : uint permit_no_consumers : 1; /* Permit a topology where this link has no consumers */
84 : uint permit_no_producers : 1; /* Permit a topology where this link has no producers */
85 : } fd_topo_link_t;
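/* Sizing note (a sketch, not part of the original header): the dcache
   backing a link is typically sized from (mtu, depth, burst), e.g.
   with the fd_tango helper:

     ulong data_sz = fd_dcache_req_data_sz( link->mtu, link->depth, link->burst, 1 );

   The final argument requests a compact layout.  Whether the topology
   code sizes dcaches exactly this way is an assumption; fd_tango's
   dcache documentation has the authoritative rules. */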
86 :
87 : /* Be careful: ip and port are in different byte orders */
88 : typedef struct {
89 : uint ip; /* in network byte order */
90 : ushort port; /* in host byte order */
91 : } fd_topo_ip_port_t;
92 :
93 : struct fd_topo_net_tile {
94 : ulong umem_dcache_obj_id; /* dcache for XDP UMEM frames */
95 : uint bind_address;
96 :
97 : ushort shred_listen_port;
98 : ushort quic_transaction_listen_port;
99 : ushort legacy_transaction_listen_port;
100 : ushort gossip_listen_port;
101 : ushort repair_intake_listen_port;
102 : ushort repair_serve_listen_port;
103 : ushort send_src_port;
104 : };
105 : typedef struct fd_topo_net_tile fd_topo_net_tile_t;
106 :
107 : /* A tile is a unique process that is spawned by Firedancer to represent
108 : one thread of execution. Firedancer sandboxes all tiles to their own
109 : process for security reasons.
110 :
111 : A tile belongs to exactly one workspace. A tile is a consumer of 0
112 :    or more links, its inputs. A tile is a producer of 0 or more output
113 : links.
114 :
115 : All input links will be automatically polled by the tile
116 : infrastructure, and output links will automatically source and manage
117 : credits from consumers. */
118 : struct fd_topo_tile {
119 : ulong id; /* The ID of this tile. Indexed from [0, tile_cnt). When placed in a topology, the ID must be the index of the tile in the tiles list. */
120 : char name[ 7UL ]; /* The name of this tile. There can be multiple of each tile name in a topology. */
121 : char metrics_name[ 10UL ]; /* The name of this tile for looking up metrics. This is used so tiles can share a name but report different metrics, for Frankendancer and Firedancer. */
122 : ulong kind_id; /* The ID of this tile within its name. If there are N tiles of a particular name, they have IDs [0, N). The pair (name, kind_id) uniquely identifies a tile, as does "id" on its own. */
123 : int is_agave; /* If the tile needs to run in the Agave (Anza) address space or not. */
124 : int allow_shutdown; /* If the tile is allowed to shut down gracefully. If false, when the tile exits it will tear down the entire application. */
125 :
126 : ulong cpu_idx; /* The CPU index to pin the tile on. A value of ULONG_MAX or more indicates the tile should be floating and not pinned to a core. */
127 :
128 : ulong in_cnt; /* The number of links that this tile reads from. */
129 : ulong in_link_id[ FD_TOPO_MAX_TILE_IN_LINKS ]; /* The link_id of each link that this tile reads from, indexed in [0, in_cnt). */
130 : int in_link_reliable[ FD_TOPO_MAX_TILE_IN_LINKS ]; /* If each link that this tile reads from is a reliable or unreliable consumer, indexed in [0, in_cnt). */
131 : int in_link_poll[ FD_TOPO_MAX_TILE_IN_LINKS ]; /* If each link that this tile reads from should be polled by the tile infrastructure, indexed in [0, in_cnt).
132 : If the link is not polled, the tile will not receive frags for it and the tile writer is responsible for
133 : reading from the link. The link must be marked as unreliable as it is not flow controlled. */
134 :
135 : ulong out_cnt; /* The number of links that this tile writes to. */
136 : ulong out_link_id[ FD_TOPO_MAX_TILE_OUT_LINKS ]; /* The link_id of each link that this tile writes to, indexed in [0, out_cnt). */
137 :
138 : ulong tile_obj_id;
139 : ulong metrics_obj_id;
140 : ulong keyswitch_obj_id;
141 : ulong in_link_fseq_obj_id[ FD_TOPO_MAX_TILE_IN_LINKS ];
142 :
143 : ulong uses_obj_cnt;
144 : ulong uses_obj_id[ FD_TOPO_MAX_TILE_OBJS ];
145 : int uses_obj_mode[ FD_TOPO_MAX_TILE_OBJS ];
146 :
147 : /* Computed fields. These are not supplied as configuration but calculated as needed. */
148 : struct {
149 : ulong * metrics; /* The shared memory for metrics that this tile should write. Consumed by monitoring and metrics writing tiles. */
150 :
151 : /* The fseq of each link that this tile reads from. Multiple fseqs
152 : may point to the link, if there are multiple consumers. An fseq
153 : can be uniquely identified via (link_id, tile_id), or (link_kind,
154 : link_kind_id, tile_kind, tile_kind_id) */
155 : ulong * in_link_fseq[ FD_TOPO_MAX_TILE_IN_LINKS ];
156 : };
157 :
158 : /* Configuration fields. These are required to be known by the topology so it can determine the
159 : total size of Firedancer in memory. */
160 : union {
161 : fd_topo_net_tile_t net;
162 :
163 : struct {
164 : fd_topo_net_tile_t net;
165 : char interface[ 16 ];
166 :
167 : /* xdp specific options */
168 : ulong xdp_rx_queue_size;
169 : ulong xdp_tx_queue_size;
170 : ulong free_ring_depth;
171 : long tx_flush_timeout_ns;
172 : char xdp_mode[8];
173 : int zero_copy;
174 :
175 : ulong netdev_dbl_buf_obj_id; /* dbl_buf containing netdev_tbl */
176 : ulong fib4_main_obj_id; /* fib4 containing main route table */
177 : ulong fib4_local_obj_id; /* fib4 containing local route table */
178 : ulong neigh4_obj_id; /* neigh4 hash map header */
179 : ulong neigh4_ele_obj_id; /* neigh4 hash map slots */
180 : } xdp;
181 :
182 : struct {
183 : fd_topo_net_tile_t net;
184 : /* sock specific options */
185 : int so_sndbuf;
186 : int so_rcvbuf;
187 : } sock;
188 :
189 : struct {
190 : ulong netdev_dbl_buf_obj_id; /* dbl_buf containing netdev_tbl */
191 : ulong fib4_main_obj_id; /* fib4 containing main route table */
192 : ulong fib4_local_obj_id; /* fib4 containing local route table */
193 : char neigh_if[ 16 ]; /* neigh4 interface name */
194 : ulong neigh4_obj_id; /* neigh4 hash map header */
195 : ulong neigh4_ele_obj_id; /* neigh4 hash map slots */
196 : } netlink;
197 :
198 : #define FD_TOPO_GOSSIP_ENTRYPOINTS_MAX 16UL
199 :
200 : struct {
201 : char identity_key_path[ PATH_MAX ];
202 :
203 : ulong entrypoints_cnt;
204 : fd_ip4_port_t entrypoints[ FD_TOPO_GOSSIP_ENTRYPOINTS_MAX ];
205 :
206 : long boot_timestamp_nanos;
207 :
208 : ulong tcache_depth;
209 :
210 : ushort shred_version;
211 : int allow_private_address;
212 : } gossvf;
213 :
214 : struct {
215 : char identity_key_path[ PATH_MAX ];
216 :
217 : ulong entrypoints_cnt;
218 : fd_ip4_port_t entrypoints[ FD_TOPO_GOSSIP_ENTRYPOINTS_MAX ];
219 :
220 : long boot_timestamp_nanos;
221 :
222 : uint ip_addr;
223 : ushort shred_version;
224 :
225 : ulong max_entries;
226 : ulong max_purged;
227 : ulong max_failed;
228 :
229 : struct {
230 : ushort gossip;
231 : ushort tvu;
232 : ushort tvu_quic;
233 : ushort tpu;
234 : ushort tpu_quic;
235 : ushort repair;
236 : } ports;
237 : } gossip;
238 :
239 : struct {
240 : uint out_depth;
241 : uint reasm_cnt;
242 : ulong max_concurrent_connections;
243 : ulong max_concurrent_handshakes;
244 : ushort quic_transaction_listen_port;
245 : long idle_timeout_millis;
246 : uint ack_delay_millis;
247 : int retry;
248 : char key_log_path[ PATH_MAX ];
249 : } quic;
250 :
251 : struct {
252 : ulong tcache_depth;
253 : } verify;
254 :
255 : struct {
256 : ulong tcache_depth;
257 : } dedup;
258 :
259 : struct {
260 : char url[ 256 ];
261 : ulong url_len;
262 : char sni[ 256 ];
263 : ulong sni_len;
264 : char identity_key_path[ PATH_MAX ];
265 : char key_log_path[ PATH_MAX ];
266 : ulong buf_sz;
267 : ulong ssl_heap_sz;
268 : ulong keepalive_interval_nanos;
269 : uchar tls_cert_verify : 1;
270 : } bundle;
271 :
272 : struct {
273 : ulong max_pending_transactions;
274 : ulong bank_tile_count;
275 : int larger_max_cost_per_block;
276 : int larger_shred_limits_per_block;
277 : int use_consumed_cus;
278 : int schedule_strategy;
279 : struct {
280 : int enabled;
281 : uchar tip_distribution_program_addr[ 32 ];
282 : uchar tip_payment_program_addr[ 32 ];
283 : uchar tip_distribution_authority[ 32 ];
284 : ulong commission_bps;
285 : char identity_key_path[ PATH_MAX ];
286 : char vote_account_path[ PATH_MAX ]; /* or pubkey is okay */
287 : } bundle;
288 : } pack;
289 :
290 : struct {
291 : int lagged_consecutive_leader_start;
292 : int plugins_enabled;
293 : ulong bank_cnt;
294 : char identity_key_path[ PATH_MAX ];
295 : struct {
296 : int enabled;
297 : uchar tip_payment_program_addr[ 32 ];
298 : uchar tip_distribution_program_addr[ 32 ];
299 : char vote_account_path[ PATH_MAX ];
300 : } bundle;
301 : } poh;
302 :
303 : struct {
304 : ulong depth;
305 : ulong fec_resolver_depth;
306 : char identity_key_path[ PATH_MAX ];
307 : ushort shred_listen_port;
308 : int larger_shred_limits_per_block;
309 : ushort expected_shred_version;
310 : ulong adtl_dests_retransmit_cnt;
311 : fd_topo_ip_port_t adtl_dests_retransmit[ FD_TOPO_ADTL_DESTS_MAX ];
312 : ulong adtl_dests_leader_cnt;
313 : fd_topo_ip_port_t adtl_dests_leader[ FD_TOPO_ADTL_DESTS_MAX ];
314 : } shred;
315 :
316 : struct {
317 : ulong disable_blockstore_from_slot;
318 : } store;
319 :
320 : struct {
321 : char identity_key_path[ PATH_MAX ];
322 : } sign;
323 :
324 : struct {
325 : uint listen_addr;
326 : ushort listen_port;
327 :
328 : int is_voting;
329 :
330 : char cluster[ 32 ];
331 : char identity_key_path[ PATH_MAX ];
332 : char vote_key_path[ PATH_MAX ];
333 :
334 : ulong max_http_connections;
335 : ulong max_websocket_connections;
336 : ulong max_http_request_length;
337 : ulong send_buffer_size_mb;
338 : int schedule_strategy;
339 :
340 : int websocket_compression;
341 : int frontend_release_channel;
342 : } gui;
343 :
344 : struct {
345 : uint prometheus_listen_addr;
346 : ushort prometheus_listen_port;
347 : } metric;
348 :
349 : struct {
350 : ulong fec_max;
351 : ulong max_vote_accounts;
352 :
353 : int tx_metadata_storage;
354 : ulong funk_obj_id;
355 : ulong txncache_obj_id;
356 :
357 : char shred_cap[ PATH_MAX ];
358 : char cluster_version[ 32 ];
359 :
360 : char identity_key_path[ PATH_MAX ];
361 : uint ip_addr;
362 : char vote_account_path[ PATH_MAX ];
363 :
364 : ulong heap_size_gib;
365 : ulong max_live_slots;
366 :
367 : /* not specified in TOML */
368 :
369 : ulong enable_features_cnt;
370 : char enable_features[ 16 ][ FD_BASE58_ENCODED_32_SZ ];
371 :
372 : ulong enable_bank_hash_cmp;
373 :
374 : ulong capture_start_slot;
375 : char solcap_capture[ PATH_MAX ];
376 : char dump_proto_dir[ PATH_MAX ];
377 : int dump_block_to_pb;
378 :
379 : } replay;
380 :
381 : struct {
382 : ulong funk_obj_id;
383 : ulong txncache_obj_id;
384 :
385 : ulong max_live_slots;
386 :
387 : ulong capture_start_slot;
388 : char solcap_capture[ PATH_MAX ];
389 : char dump_proto_dir[ PATH_MAX ];
390 : int dump_instr_to_pb;
391 : int dump_txn_to_pb;
392 : int dump_syscall_to_pb;
393 : int dump_elf_to_pb;
394 : } exec;
395 :
396 : struct {
397 : ushort send_to_port;
398 : uint send_to_ip_addr;
399 : ulong conn_cnt;
400 : int no_quic;
401 : } benchs;
402 :
403 : struct {
404 : ushort rpc_port;
405 : uint rpc_ip_addr;
406 : } bencho;
407 :
408 : struct {
409 : ulong accounts_cnt;
410 : int mode;
411 : float contending_fraction;
412 : float cu_price_spread;
413 : } benchg;
414 :
415 : struct {
416 : ushort repair_intake_listen_port;
417 : ushort repair_serve_listen_port;
418 : char identity_key_path[ PATH_MAX ];
419 : ulong max_pending_shred_sets;
420 : ulong slot_max;
421 :
422 : /* non-config */
423 :
424 : ulong repair_sign_depth;
425 : ulong repair_sign_cnt;
426 : } repair;
427 :
428 : struct {
429 : char slots_pending[PATH_MAX];
430 :
431 : ulong expected_shred_version;
432 :
433 : /* non-config */
434 :
435 : char identity_key_path[ PATH_MAX ];
436 : char shred_cap_archive[ PATH_MAX ];
437 : char shred_cap_replay[ PATH_MAX ];
438 : ulong shred_cap_end_slot;
439 :
440 : char blockstore_file[ PATH_MAX ];
441 : char blockstore_restore[ PATH_MAX ];
442 : } store_int;
443 :
444 : struct {
445 : ushort send_src_port;
446 :
447 : /* non-config */
448 :
449 : uint ip_addr;
450 : char identity_key_path[ PATH_MAX ];
451 : } send;
452 :
453 : struct {
454 : ulong funk_obj_id;
455 : ulong store_obj_id;
456 : ushort rpc_port;
457 : ushort tpu_port;
458 : uint tpu_ip_addr;
459 : char identity_key_path[ PATH_MAX ];
460 : uint block_index_max;
461 : uint txn_index_max;
462 : uint acct_index_max;
463 : char history_file[ PATH_MAX ];
464 : } rpcserv;
465 :
466 : struct {
467 : uint fake_dst_ip;
468 : } pktgen;
469 :
470 : struct {
471 : ulong end_slot;
472 : char rocksdb_path[ PATH_MAX ];
473 : char shredcap_path[ PATH_MAX ];
474 : char bank_hash_path[ PATH_MAX ];
475 : char ingest_mode[ 32 ];
476 :
477 : /* Set internally by the archiver tile */
478 : int archive_fd;
479 : } archiver;
480 :
481 : struct {
482 : ulong funk_obj_id;
483 : char identity_key_path[ PATH_MAX ];
484 : char vote_acc_path[ PATH_MAX ];
485 : char ledger_path[PATH_MAX];
486 : } tower;
487 : struct {
488 : char folder_path[ PATH_MAX ];
489 : ushort repair_intake_listen_port;
490 : ulong write_buffer_size; /* Size of the write buffer for the capture tile */
491 : int enable_publish_stake_weights;
492 : char manifest_path[ PATH_MAX ];
493 :
494 : /* Set internally by the capture tile */
495 : int shreds_fd;
496 : int requests_fd;
497 : int fecs_fd;
498 : int peers_fd;
499 : int bank_hashes_fd;
500 : int slices_fd;
501 : } shredcap;
502 :
503 : struct {
504 : char snapshots_path[ PATH_MAX ];
505 : int incremental_snapshot_fetch;
506 : int do_download;
507 : uint maximum_local_snapshot_age;
508 : uint minimum_download_speed_mib;
509 : uint maximum_download_retry_abort;
510 : uint max_full_snapshots_to_keep;
511 : uint max_incremental_snapshots_to_keep;
512 :
513 : int entrypoints_enabled;
514 : int gossip_peers_enabled;
515 :
516 : ulong gossip_entrypoints_cnt;
517 : fd_ip4_port_t gossip_entrypoints[ FD_TOPO_GOSSIP_ENTRYPOINTS_MAX ];
518 :
519 : struct {
520 : ulong peers_cnt;
521 : fd_ip4_port_t peers[ 16UL ];
522 : } http;
523 : } snaprd;
524 :
525 : struct {
526 : ulong max_live_slots;
527 : ulong funk_obj_id;
528 : ulong txncache_obj_id;
529 : } snapin;
530 :
531 : struct {
532 :
533 : uint bind_address;
534 : ushort bind_port;
535 :
536 : ushort expected_shred_version;
537 : ulong entrypoints_cnt;
538 : fd_ip4_port_t entrypoints[ FD_TOPO_GOSSIP_ENTRYPOINTS_MAX ];
539 : } ipecho;
540 :
541 : struct {
542 : ulong max_live_slots;
543 :
544 : ulong txncache_obj_id;
545 : ulong funk_obj_id;
546 : } bank;
547 :
548 : struct {
549 : ulong funk_obj_id;
550 : } resolv;
551 :
552 : struct {
553 : ulong funk_obj_id;
554 :
555 : int allow_download;
556 :
557 : ushort expected_shred_version;
558 : ulong entrypoints_cnt;
559 : fd_ip4_port_t entrypoints[ FD_TOPO_GOSSIP_ENTRYPOINTS_MAX ];
560 :
561 : char genesis_path[ PATH_MAX ];
562 : } genesi;
563 : };
564 : };
565 :
566 : typedef struct fd_topo_tile fd_topo_tile_t;
567 :
568 : typedef struct {
569 : ulong id;
570 : char name[ 13UL ];
571 : ulong wksp_id;
572 :
573 : ulong offset;
574 : ulong footprint;
575 : } fd_topo_obj_t;
576 :
577 : /* An fd_topo_t represents the overall structure of a Firedancer
578 : configuration, describing all the workspaces, tiles, and links
579 : between them. */
580 : struct fd_topo {
581 : char app_name[ 256UL ];
582 : uchar props[ 16384UL ];
583 :
584 : ulong wksp_cnt;
585 : ulong link_cnt;
586 : ulong tile_cnt;
587 : ulong obj_cnt;
588 :
589 : fd_topo_wksp_t workspaces[ FD_TOPO_MAX_WKSPS ];
590 : fd_topo_link_t links[ FD_TOPO_MAX_LINKS ];
591 : fd_topo_tile_t tiles[ FD_TOPO_MAX_TILES ];
592 : fd_topo_obj_t objs[ FD_TOPO_MAX_OBJS ];
593 :
594 : ulong agave_affinity_cnt;
595 : ulong agave_affinity_cpu_idx[ FD_TILE_MAX ];
596 :
597 : ulong max_page_size; /* 2^21 or 2^30 */
598 : ulong gigantic_page_threshold; /* see [hugetlbfs.gigantic_page_threshold_mib] */
599 : };
600 : typedef struct fd_topo fd_topo_t;
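/* As noted on the id fields above, each workspace, link, tile, and obj
   id is required to equal its index in the corresponding array.  A
   minimal sanity-check sketch over a populated topology:

     for( ulong i=0UL; i<topo->wksp_cnt; i++ ) FD_TEST( topo->workspaces[ i ].id==i );
     for( ulong i=0UL; i<topo->link_cnt; i++ ) FD_TEST( topo->links[ i ].id==i );
     for( ulong i=0UL; i<topo->tile_cnt; i++ ) FD_TEST( topo->tiles[ i ].id==i );
     for( ulong i=0UL; i<topo->obj_cnt;  i++ ) FD_TEST( topo->objs[ i ].id==i ); */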
601 :
602 : typedef struct {
603 : char const * name;
604 :
605 : int keep_host_networking;
606 : int allow_connect;
607 : int allow_renameat;
608 : ulong rlimit_file_cnt;
609 : ulong rlimit_address_space;
610 : ulong rlimit_data;
611 : int for_tpool;
612 :
613 : ulong (*populate_allowed_seccomp)( fd_topo_t const * topo, fd_topo_tile_t const * tile, ulong out_cnt, struct sock_filter * out );
614 : ulong (*populate_allowed_fds )( fd_topo_t const * topo, fd_topo_tile_t const * tile, ulong out_fds_sz, int * out_fds );
615 : ulong (*scratch_align )( void );
616 : ulong (*scratch_footprint )( fd_topo_tile_t const * tile );
617 : ulong (*loose_footprint )( fd_topo_tile_t const * tile );
618 : void (*privileged_init )( fd_topo_t * topo, fd_topo_tile_t * tile );
619 : void (*unprivileged_init )( fd_topo_t * topo, fd_topo_tile_t * tile );
620 : void (*run )( fd_topo_t * topo, fd_topo_tile_t * tile );
621 : ulong (*rlimit_file_cnt_fn )( fd_topo_t const * topo, fd_topo_tile_t const * tile );
622 : } fd_topo_run_tile_t;
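/* A hedged sketch of a run descriptor for a hypothetical "examp" tile.
   The examp_* callbacks are illustrative, not part of this header, and
   which members may be left zero depends on the topology runner:

     fd_topo_run_tile_t fd_tile_examp = {
       .name              = "examp",
       .scratch_align     = examp_scratch_align,
       .scratch_footprint = examp_scratch_footprint,
       .unprivileged_init = examp_unprivileged_init,
       .run               = examp_run,
     };
*/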
623 :
624 : struct fd_topo_obj_callbacks {
625 : char const * name;
626 : ulong (* footprint )( fd_topo_t const * topo, fd_topo_obj_t const * obj );
627 : ulong (* align )( fd_topo_t const * topo, fd_topo_obj_t const * obj );
628 : ulong (* loose )( fd_topo_t const * topo, fd_topo_obj_t const * obj );
629 : void (* new )( fd_topo_t const * topo, fd_topo_obj_t const * obj );
630 : };
631 :
632 : typedef struct fd_topo_obj_callbacks fd_topo_obj_callbacks_t;
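/* A hedged sketch of an object callback table for a hypothetical
   "myobj" object kind (names and sizes are illustrative only):

     static ulong
     myobj_footprint( fd_topo_t const * topo, fd_topo_obj_t const * obj ) {
       (void)topo; (void)obj;
       return 4096UL;   /* bytes this object reserves in its workspace */
     }

     static ulong
     myobj_align( fd_topo_t const * topo, fd_topo_obj_t const * obj ) {
       (void)topo; (void)obj;
       return 4096UL;   /* should not exceed fd_topo_workspace_align() */
     }

     fd_topo_obj_callbacks_t fd_obj_cb_myobj = {
       .name      = "myobj",
       .footprint = myobj_footprint,
       .align     = myobj_align,
     };

   Whether loose and new may be left NULL is an assumption here; consult
   the topology object code for the actual requirements. */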
633 :
634 : FD_PROTOTYPES_BEGIN
635 :
636 : FD_FN_CONST static inline ulong
637 1389 : fd_topo_workspace_align( void ) {
638 : /* This needs to be the max( align ) of all the child members that
639 : could be aligned into this workspace, otherwise our footprint
640 : calculation will not be correct. For now just set to 4096 but this
641 : should probably be calculated dynamically, or we should reduce
642 : those child aligns if we can. */
643 1389 : return 4096UL;
644 1389 : }
645 :
646 : static inline void *
647 : fd_topo_obj_laddr( fd_topo_t const * topo,
648 1788 : ulong obj_id ) {
649 1788 : fd_topo_obj_t const * obj = &topo->objs[ obj_id ];
650 1788 : FD_TEST( obj_id<FD_TOPO_MAX_OBJS );
651 1788 : FD_TEST( obj->id == obj_id );
652 1788 : FD_TEST( obj->offset );
653 1788 : return (void *)((ulong)topo->workspaces[ obj->wksp_id ].wksp + obj->offset);
654 1788 : }
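/* Usage sketch (index 0 is illustrative): a tile resolving the local
   address of an object it uses.  This assumes the containing workspace
   has already been joined, e.g. via fd_topo_join_tile_workspaces, so
   that topo->workspaces[ obj->wksp_id ].wksp is the joined local
   mapping:

     ulong  obj_id = tile->uses_obj_id[ 0 ];
     void * laddr  = fd_topo_obj_laddr( topo, obj_id ); */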
655 :
656 : /* Returns a pointer in the local address space to the base address of
657 : the workspace out of which the given object was allocated. */
658 :
659 : static inline void *
660 : fd_topo_obj_wksp_base( fd_topo_t const * topo,
661 0 : ulong obj_id ) {
662 0 : FD_TEST( obj_id<FD_TOPO_MAX_OBJS );
663 0 : fd_topo_obj_t const * obj = &topo->objs[ obj_id ];
664 0 : FD_TEST( obj->id == obj_id );
665 0 : ulong const wksp_id = obj->wksp_id;
666 :
667 0 : FD_TEST( wksp_id<FD_TOPO_MAX_WKSPS );
668 0 : fd_topo_wksp_t const * wksp = &topo->workspaces[ wksp_id ];
669 0 : FD_TEST( wksp->id == wksp_id );
670 0 : return wksp->wksp;
671 0 : }
672 :
673 : FD_FN_PURE static inline ulong
674 : fd_topo_tile_name_cnt( fd_topo_t const * topo,
675 6 : char const * name ) {
676 6 : ulong cnt = 0;
677 114 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
678 108 : if( FD_UNLIKELY( !strcmp( topo->tiles[ i ].name, name ) ) ) cnt++;
679 108 : }
680 6 : return cnt;
681 6 : }
682 :
683 : /* Finds the workspace of a given name in the topology. Returns
684 : ULONG_MAX if there is no such workspace. There can be at most one
685 : workspace of a given name. */
686 :
687 : FD_FN_PURE static inline ulong
688 : fd_topo_find_wksp( fd_topo_t const * topo,
689 1602 : char const * name ) {
690 36057 : for( ulong i=0; i<topo->wksp_cnt; i++ ) {
691 36057 : if( FD_UNLIKELY( !strcmp( topo->workspaces[ i ].name, name ) ) ) return i;
692 36057 : }
693 0 : return ULONG_MAX;
694 1602 : }
695 :
696 : /* Find the tile of a given name and kind_id in the topology. There
697 :    will be at most one such tile, since kind_id is unique within a name.
698 : Returns ULONG_MAX if there is no such tile. */
699 :
700 : FD_FN_PURE static inline ulong
701 : fd_topo_find_tile( fd_topo_t const * topo,
702 : char const * name,
703 1011 : ulong kind_id ) {
704 15654 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
705 15654 : if( FD_UNLIKELY( !strcmp( topo->tiles[ i ].name, name ) ) && topo->tiles[ i ].kind_id == kind_id ) return i;
706 15654 : }
707 0 : return ULONG_MAX;
708 1011 : }
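/* Usage sketch (the "pack" tile name is illustrative): look up a tile
   by (name, kind_id) and fail loudly if it is missing.

     ulong tile_id = fd_topo_find_tile( topo, "pack", 0UL );
     if( FD_UNLIKELY( tile_id==ULONG_MAX ) ) FD_LOG_ERR(( "pack tile not found" ));
     fd_topo_tile_t const * tile = &topo->tiles[ tile_id ];

   fd_topo_find_link below works the same way for links. */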
709 :
710 : /* Find the link of a given name and kind_id in the topology. There
711 :    will be at most one such link, since kind_id is unique within a name.
712 : Returns ULONG_MAX if there is no such link. */
713 :
714 : FD_FN_PURE static inline ulong
715 : fd_topo_find_link( fd_topo_t const * topo,
716 : char const * name,
717 819 : ulong kind_id ) {
718 19152 : for( ulong i=0; i<topo->link_cnt; i++ ) {
719 19152 : if( FD_UNLIKELY( !strcmp( topo->links[ i ].name, name ) ) && topo->links[ i ].kind_id == kind_id ) return i;
720 19152 : }
721 0 : return ULONG_MAX;
722 819 : }
723 :
724 : FD_FN_PURE static inline ulong
725 : fd_topo_find_tile_in_link( fd_topo_t const * topo,
726 : fd_topo_tile_t const * tile,
727 : char const * name,
728 33 : ulong kind_id ) {
729 264 : for( ulong i=0; i<tile->in_cnt; i++ ) {
730 264 : if( FD_UNLIKELY( !strcmp( topo->links[ tile->in_link_id[ i ] ].name, name ) )
731 264 : && topo->links[ tile->in_link_id[ i ] ].kind_id == kind_id ) return i;
732 264 : }
733 0 : return ULONG_MAX;
734 33 : }
735 :
736 : FD_FN_PURE static inline ulong
737 : fd_topo_find_tile_out_link( fd_topo_t const * topo,
738 : fd_topo_tile_t const * tile,
739 : char const * name,
740 33 : ulong kind_id ) {
741 99 : for( ulong i=0; i<tile->out_cnt; i++ ) {
742 99 : if( FD_UNLIKELY( !strcmp( topo->links[ tile->out_link_id[ i ] ].name, name ) )
743 99 : && topo->links[ tile->out_link_id[ i ] ].kind_id == kind_id ) return i;
744 99 : }
745 0 : return ULONG_MAX;
746 33 : }
747 :
748 : /* Find the id of the tile which is a producer for the given link. If
749 : no tile is a producer for the link, returns ULONG_MAX. This should
750 : not be possible for a well formed and validated topology. */
751 : FD_FN_PURE static inline ulong
752 : fd_topo_find_link_producer( fd_topo_t const * topo,
753 0 : fd_topo_link_t const * link ) {
754 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
755 0 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
756 :
757 0 : for( ulong j=0; j<tile->out_cnt; j++ ) {
758 0 : if( FD_UNLIKELY( tile->out_link_id[ j ] == link->id ) ) return i;
759 0 : }
760 0 : }
761 0 : return ULONG_MAX;
762 0 : }
763 :
764 : /* Given a link, count the number of consumers of that link among all
765 : the tiles in the topology. */
766 : FD_FN_PURE static inline ulong
767 : fd_topo_link_consumer_cnt( fd_topo_t const * topo,
768 291 : fd_topo_link_t const * link ) {
769 291 : ulong cnt = 0;
770 9123 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
771 8832 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
772 33225 : for( ulong j=0; j<tile->in_cnt; j++ ) {
773 24393 : if( FD_UNLIKELY( tile->in_link_id[ j ] == link->id ) ) cnt++;
774 24393 : }
775 8832 : }
776 :
777 291 : return cnt;
778 291 : }
779 :
780 : /* Given a link, count the number of reliable consumers of that link
781 : among all the tiles in the topology. */
782 : FD_FN_PURE static inline ulong
783 : fd_topo_link_reliable_consumer_cnt( fd_topo_t const * topo,
784 0 : fd_topo_link_t const * link ) {
785 0 : ulong cnt = 0;
786 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
787 0 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
788 0 : for( ulong j=0; j<tile->in_cnt; j++ ) {
789 0 : if( FD_UNLIKELY( tile->in_link_id[ j ] == link->id && tile->in_link_reliable[ j ] ) ) cnt++;
790 0 : }
791 0 : }
792 0 :
793 0 : return cnt;
794 0 : }
795 :
796 : FD_FN_PURE static inline ulong
797 : fd_topo_tile_consumer_cnt( fd_topo_t const * topo,
798 0 : fd_topo_tile_t const * tile ) {
799 0 : (void)topo;
800 0 : return tile->out_cnt;
801 0 : }
802 :
803 : FD_FN_PURE static inline ulong
804 : fd_topo_tile_reliable_consumer_cnt( fd_topo_t const * topo,
805 0 : fd_topo_tile_t const * tile ) {
806 0 : ulong reliable_cons_cnt = 0UL;
807 0 : for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
808 0 : fd_topo_tile_t const * consumer_tile = &topo->tiles[ i ];
809 0 : for( ulong j=0UL; j<consumer_tile->in_cnt; j++ ) {
810 0 : for( ulong k=0UL; k<tile->out_cnt; k++ ) {
811 0 : if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) {
812 0 : reliable_cons_cnt++;
813 0 : }
814 0 : }
815 0 : }
816 0 : }
817 0 : return reliable_cons_cnt;
818 0 : }
819 :
820 : FD_FN_PURE static inline ulong
821 : fd_topo_tile_producer_cnt( fd_topo_t const * topo,
822 0 : fd_topo_tile_t const * tile ) {
823 0 : (void)topo;
824 0 : ulong in_cnt = 0UL;
825 0 : for( ulong i=0UL; i<tile->in_cnt; i++ ) {
826 0 : if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue;
827 0 : in_cnt++;
828 0 : }
829 0 : return in_cnt;
830 0 : }
831 :
832 : /* Join (map into the process) all shared memory (huge/gigantic pages)
833 : needed by the tile, in the given topology. All memory associated
834 : with the tile (aka. used by links that the tile either produces to or
835 : consumes from, or used by the tile itself for its cnc) will be
836 : attached (mapped into the process).
837 :
838 : This is needed to play nicely with the sandbox. Once a process is
839 : sandboxed we can no longer map any memory. */
840 : void
841 : fd_topo_join_tile_workspaces( fd_topo_t * topo,
842 : fd_topo_tile_t * tile );
843 :
844 : /* Join (map into the process) the shared memory (huge/gigantic pages)
845 : for the given workspace. Mode is one of
846 : FD_SHMEM_JOIN_MODE_READ_WRITE or FD_SHMEM_JOIN_MODE_READ_ONLY and
847 : determines the prot argument that will be passed to mmap when mapping
848 : the pages in (PROT_WRITE or PROT_READ respectively). */
849 : void
850 : fd_topo_join_workspace( fd_topo_t * topo,
851 : fd_topo_wksp_t * wksp,
852 : int mode );
853 :
854 : /* Join (map into the process) all shared memory (huge/gigantic pages)
855 : needed by all tiles in the topology. Mode is one of
856 : FD_SHMEM_JOIN_MODE_READ_WRITE or FD_SHMEM_JOIN_MODE_READ_ONLY and
857 : determines the prot argument that will be passed to mmap when
858 : mapping the pages in (PROT_WRITE or PROT_READ respectively). */
859 : void
860 : fd_topo_join_workspaces( fd_topo_t * topo,
861 : int mode );
862 :
863 : /* Leave (unmap from the process) the shared memory needed for the
864 : given workspace in the topology, if it was previously mapped.
865 :
866 : topo and wksp are assumed non-NULL. It is OK if the workspace
867 : has not been previously joined, in which case this is a no-op. */
868 :
869 : void
870 : fd_topo_leave_workspace( fd_topo_t * topo,
871 : fd_topo_wksp_t * wksp );
872 :
873 : /* Leave (unmap from the process) all shared memory needed by all
874 : tiles in the topology, if each of them was mapped.
875 :
876 : topo is assumed non-NULL. Only workspaces which were previously
877 : joined are unmapped. */
878 :
879 : void
880 : fd_topo_leave_workspaces( fd_topo_t * topo );
881 :
882 : /* Create the given workspace needed by the topology on the system.
883 : This does not "join" the workspaces (map their memory into the
884 : process), but only creates the .wksp file and formats it correctly
885 : as a workspace.
886 :
887 : Returns 0 on success and -1 on failure, with errno set to the error.
888 :    Currently the only failure reason that will be returned is
889 : ENOMEM, as other unexpected errors will cause the program to exit.
890 :
891 : If update_existing is 1, the workspace will not be created from
892 : scratch but it will be assumed that it already exists from a prior
893 : run and needs to be maybe resized and then have the header
894 : structures reinitialized. This can save a very expensive operation
895 : of zeroing all of the workspace pages. This is dangerous in
896 : production because it can leave stray memory from prior runs around,
897 : and should only be used in development environments. */
898 :
899 : int
900 : fd_topo_create_workspace( fd_topo_t * topo,
901 : fd_topo_wksp_t * wksp,
902 : int update_existing );
903 :
904 : /* Join the standard IPC objects needed by this particular tile in the
905 :    topology. */
906 :
907 : void
908 : fd_topo_fill_tile( fd_topo_t * topo,
909 : fd_topo_tile_t * tile );
910 :
911 : /* Same as fd_topo_fill_tile but fills in all the objects for a
912 :    particular workspace. */
913 : void
914 : fd_topo_workspace_fill( fd_topo_t * topo,
915 : fd_topo_wksp_t * wksp );
916 :
917 : /* Apply the new() callback to every object that is resident in the
918 :    given workspace in the topology. */
919 :
920 : void
921 : fd_topo_wksp_new( fd_topo_t const * topo,
922 : fd_topo_wksp_t const * wksp,
923 : fd_topo_obj_callbacks_t ** callbacks );
924 :
925 : /* Same as fd_topo_fill_tile but fills in all tiles in the topology. */
926 :
927 : void
928 : fd_topo_fill( fd_topo_t * topo );
929 :
930 : /* fd_topo_tile_stack_join joins a huge page optimized stack for the
931 : provided tile. The stack is assumed to already exist at a known
932 : path in the hugetlbfs mount. */
933 :
934 : void *
935 : fd_topo_tile_stack_join( char const * app_name,
936 : char const * tile_name,
937 : ulong tile_kind_id );
938 :
939 : /* Install the XDP program needed by the net tiles into the local device
940 : and return the xsk_map_fd. bind_addr is an optional IPv4 address to
941 :    use for filtering by dst IP. */
942 :
943 : fd_xdp_fds_t
944 : fd_topo_install_xdp( fd_topo_t const * topo,
945 : uint bind_addr );
946 :
947 : /* fd_topo_run_single_process runs all the tiles in a single process
948 : (the calling process). This spawns a thread for each tile, switches
949 : that thread to the given UID and GID and then runs the tile in it.
950 : Each thread will never exit, as tiles are expected to run forever.
951 : An error is logged and the application will exit if a tile exits.
952 : The function itself does return after spawning all the threads.
953 :
954 : The threads will not be sandboxed in any way, except switching to the
955 : provided UID and GID, so they will share the same address space, and
956 : not have any seccomp restrictions or use any Linux namespaces. The
957 : calling thread will also switch to the provided UID and GID before
958 : it returns.
959 :
960 : In production, when running with an Agave child process this is
961 : used for spawning certain tiles inside the Agave address space.
962 : It's also useful for tooling and debugging, but is not how the main
963 : production Firedancer process runs. For production, each tile is run
964 : in its own address space with a separate process and full security
965 : sandbox.
966 :
967 : The agave argument determines which tiles are started. If the
968 : argument is 0 or 1, only non-agave (or only agave) tiles are started.
969 : If the argument is any other value, all tiles in the topology are
970 :    started regardless of whether they are Agave tiles or not. */
971 :
972 : void
973 : fd_topo_run_single_process( fd_topo_t * topo,
974 : int agave,
975 : uint uid,
976 : uint gid,
977 : fd_topo_run_tile_t (* tile_run )( fd_topo_tile_t const * tile ) );
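/* Usage sketch: tile_run maps each tile to its run descriptor.  The
   names my_tile_run and fd_tile_examp are illustrative, not part of
   this header:

     static fd_topo_run_tile_t
     my_tile_run( fd_topo_tile_t const * tile ) {
       if( FD_UNLIKELY( strcmp( tile->name, "examp" ) ) )
         FD_LOG_ERR(( "unknown tile name %s", tile->name ));
       return fd_tile_examp;
     }

     ...
     fd_topo_run_single_process( topo, -1, uid, gid, my_tile_run );

   Passing -1 for agave starts every tile regardless of is_agave, per
   the comment above. */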
978 :
979 : /* fd_topo_run_tile runs the given tile directly within the current
980 : process (and thread). The function will never return, as tiles are
981 : expected to run forever. An error is logged and the application will
982 : exit if the tile exits.
983 :
984 : The sandbox argument determines if the current process will be
985 : sandboxed fully before starting the tile. The thread will switch to
986 : the UID and GID provided before starting the tile, even if the thread
987 : is not being sandboxed. Although POSIX specifies that all threads in
988 : a process must share a UID and GID, this is not the case on Linux.
989 : The thread will switch to the provided UID and GID without switching
990 : the other threads in the process.
991 :
992 :    If keep_controlling_terminal is set to 0 and the sandbox is enabled,
993 : the controlling terminal will be detached as an additional sandbox
994 : measure, but you will not be able to send Ctrl+C or other signals
995 : from the terminal. See fd_sandbox.h for more information.
996 :
997 : The allow_fd argument is only used if sandbox is true, and is a file
998 : descriptor which will be allowed to exist in the process. Normally
999 : the sandbox code rejects and aborts if there is an unexpected file
1000 : descriptor present on boot. This is helpful to allow a parent
1001 : process to be notified on termination of the tile by waiting for a
1002 : pipe file descriptor to get closed.
1003 :
1004 : wait and debugger are both used in debugging. If wait is non-NULL,
1005 : the runner will wait until the value pointed to by wait is non-zero
1006 : before launching the tile. Likewise, if debugger is non-NULL, the
1007 : runner will wait until a debugger is attached before setting the
1008 : value pointed to by debugger to non-zero. These are intended to be
1009 : used as a pair, where many tiles share a waiting reference, and then
1010 : one of the tiles (a tile you want to attach the debugger to) has the
1011 : same reference provided as the debugger, so all tiles will stop and
1012 : wait for the debugger to attach to it before proceeding. */
1013 :
1014 : void
1015 : fd_topo_run_tile( fd_topo_t * topo,
1016 : fd_topo_tile_t * tile,
1017 : int sandbox,
1018 : int keep_controlling_terminal,
1019 : int dumpable,
1020 : uint uid,
1021 : uint gid,
1022 : int allow_fd,
1023 : volatile int * wait,
1024 : volatile int * debugger,
1025 : fd_topo_run_tile_t * tile_run );
1026 :
1027 : /* This is for determining the value of RLIMIT_MLOCK that we need to
1028 : successfully run all tiles in separate processes. The value returned
1029 : is the maximum amount of memory that will be locked with mlock() by
1030 : any individual process in the tree. Specifically, if we have three
1031 : tile processes, and they each need to lock 5, 9, and 2 MiB of memory
1032 : respectively, RLIMIT_MLOCK needs to be 9 MiB to allow all three
1033 :    processes' mlock() calls to succeed.
1034 :
1035 : Tiles lock memory in three ways. Any workspace they are using, they
1036 : lock the entire workspace. Then each tile uses huge pages for the
1037 : stack which are also locked, and finally some tiles use private
1038 : locked mmaps outside the workspace for storing key material. The
1039 : results here include all of this memory together.
1040 :
1041 : The result is not necessarily the amount of memory used by the tile
1042 : process, although it will be quite close. Tiles could potentially
1043 :    allocate memory (e.g., with brk) without needing to lock it, which
1044 :    would not need to be included, and some kernel memory that tiles cause
1045 : to be allocated (for example XSK buffers) is also not included. The
1046 : actual amount of memory used will not be less than this value. */
1047 : FD_FN_PURE ulong
1048 : fd_topo_mlock_max_tile( fd_topo_t const * topo );
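/* Usage sketch (not part of this header): raising RLIMIT_MEMLOCK (the
   kernel name for what the comment above calls RLIMIT_MLOCK) before
   spawning tile processes, assuming the caller has permission to do so:

     #include <sys/resource.h>

     ulong need = fd_topo_mlock_max_tile( topo );
     struct rlimit rlim = { .rlim_cur = need, .rlim_max = need };
     if( FD_UNLIKELY( -1==setrlimit( RLIMIT_MEMLOCK, &rlim ) ) )
       FD_LOG_ERR(( "setrlimit(RLIMIT_MEMLOCK) failed" ));
*/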
1049 :
1050 : /* Same as fd_topo_mlock_max_tile, but for loading the entire topology
1051 : into one process, rather than a separate process per tile. This is
1052 : used, for example, by the configuration code when it creates all the
1053 : workspaces, or the monitor that maps the entire system into one
1054 : address space. */
1055 : FD_FN_PURE ulong
1056 : fd_topo_mlock( fd_topo_t const * topo );
1057 :
1058 : /* This returns the number of gigantic pages needed by the topology on
1059 : the provided numa node. It includes pages needed by the workspaces,
1060 : as well as additional allocations like huge pages for process stacks
1061 : and private key storage. */
1062 :
1063 : FD_FN_PURE ulong
1064 : fd_topo_gigantic_page_cnt( fd_topo_t const * topo,
1065 : ulong numa_idx );
1066 :
1067 : /* This returns the number of huge pages in the application needed by
1068 : the topology on the provided numa node. It includes pages needed by
1069 : things placed in the hugetlbfs (workspaces, process stacks). If
1070 : include_anonymous is true, it also includes anonymous hugepages which
1071 : are needed but are not placed in the hugetlbfs. */
1072 :
1073 : FD_FN_PURE ulong
1074 : fd_topo_huge_page_cnt( fd_topo_t const * topo,
1075 : ulong numa_idx,
1076 : int include_anonymous );
1077 :
1078 : /* Prints a message describing the topology to an output stream. If
1079 :    stdout is true, it will be written to stdout; otherwise it will be written
1080 : as a NOTICE log message to the log file. */
1081 : void
1082 : fd_topo_print_log( int stdout,
1083 : fd_topo_t * topo );
1084 :
1085 : FD_PROTOTYPES_END
1086 :
1087 : #endif /* HEADER_fd_src_disco_topo_fd_topo_h */
|