Line data Source code
1 : #ifndef HEADER_fd_src_disco_topo_fd_topo_h
2 : #define HEADER_fd_src_disco_topo_fd_topo_h
3 :
4 : #include "../stem/fd_stem.h"
5 : #include "../../tango/fd_tango.h"
6 : #include "../../waltz/xdp/fd_xdp1.h"
7 : #include "../../ballet/base58/fd_base58.h"
8 : #include "../../util/net/fd_net_headers.h"
9 :
10 : /* Maximum number of workspaces that may be present in a topology. */
11 : #define FD_TOPO_MAX_WKSPS (256UL)
12 : /* Maximum number of links that may be present in a topology. */
13 : #define FD_TOPO_MAX_LINKS (256UL)
14 : /* Maximum number of tiles that may be present in a topology. */
15 : #define FD_TOPO_MAX_TILES (256UL)
16 : /* Maximum number of objects that may be present in a topology. */
17 : #define FD_TOPO_MAX_OBJS (4096UL)
18 : /* Maximum number of links that may go into any one tile in the
19 : topology. */
20 : #define FD_TOPO_MAX_TILE_IN_LINKS ( 128UL)
21 : /* Maximum number of links that a tile may write to. */
22 : #define FD_TOPO_MAX_TILE_OUT_LINKS ( 32UL)
23 : /* Maximum number of objects that a tile can use. */
24 : #define FD_TOPO_MAX_TILE_OBJS ( 256UL)
25 :
26 : /* Maximum number of additional ip addresses */
27 : #define FD_NET_MAX_SRC_ADDR 4
28 :
29 : /* Maximum number of additional destinations for leader shreds and for retransmitted shreds */
30 : #define FD_TOPO_ADTL_DESTS_MAX ( 32UL)
31 :
32 :
33 : /* A workspace is a Firedancer specific memory management structure that
34 : sits on top of 1 or more memory mapped gigantic or huge pages mounted
35 : to the hugetlbfs. */
typedef struct {
  ulong id;           /* The ID of this workspace.  Indexed from [0, wksp_cnt).  When placed in a topology, the ID must be the index of the workspace in the workspaces list. */
  char  name[ 13UL ]; /* The name of this workspace, like "pack".  There can be at most one of each workspace name in a topology. */

  ulong numa_idx; /* The index of the NUMA node on the system that this workspace should be allocated from. */

  int is_locked; /* If the workspace should use pages locked and pinned to a specific numa node. */

  /* Computed fields.  These are not supplied as configuration but
     calculated as needed.  Grouped in an anonymous struct purely for
     readability; members are accessed directly on the workspace. */
  struct {
    ulong page_sz;  /* The size of the pages that this workspace is backed by.  One of FD_PAGE_SIZE_*. */
    ulong page_cnt; /* The number of pages that must be mapped to this workspace to store all the data needed by consumers. */
    ulong part_max; /* The maximum number of partitions in the underlying workspace.  There can only be this many allocations made at any one time. */

    fd_wksp_t * wksp;            /* The workspace memory in the local process.  NULL until the workspace is joined (mapped in). */
    ulong       known_footprint; /* Total size in bytes of all data in Firedancer that will be stored in this workspace at startup. */
    ulong       total_footprint; /* Total size in bytes of all data in Firedancer that could be stored in this workspace, includes known data and loose data. */
  };
} fd_topo_wksp_t;
55 :
56 : /* A link is an mcache in a workspace that has one producer and one or
57 : more consumers. A link may optionally also have a dcache, that holds
58 : fragments referred to by the mcache entries.
59 :
60 : A link belongs to exactly one workspace. A link has exactly one
61 : producer, and 1 or more consumers. Each consumer is either reliable
62 : or not reliable. A link has a depth and a MTU, which correspond to
63 : the depth and MTU of the mcache and dcache respectively. A MTU of
64 : zero means no dcache is needed, as there is no data. */
typedef struct {
  ulong id;           /* The ID of this link.  Indexed from [0, link_cnt).  When placed in a topology, the ID must be the index of the link in the links list. */
  char  name[ 13UL ]; /* The name of this link, like "pack_bank".  There can be multiple of each link name in a topology. */
  ulong kind_id;      /* The ID of this link within its name.  If there are N links of a particular name, they have IDs [0, N).  The pair (name, kind_id) uniquely identifies a link, as does "id" on its own. */

  ulong depth; /* The depth of the mcache representing the link. */
  ulong mtu;   /* The MTU of data fragments in the mcache.  A value of 0 means there is no dcache. */
  ulong burst; /* The max amount of MTU sized data fragments that might be bursted to the dcache. */

  ulong mcache_obj_id; /* Topology object ID of the mcache backing this link. */
  ulong dcache_obj_id; /* Topology object ID of the dcache backing this link, if mtu is nonzero. */

  /* Computed fields.  These are not supplied as configuration but
     calculated as needed (filled in once the workspaces are joined). */
  struct {
    fd_frag_meta_t * mcache; /* The mcache of this link. */
    void *           dcache; /* The dcache of this link, if it has one. */
  };

  uint permit_no_consumers : 1; /* Permit a topology where this link has no consumers */
  uint permit_no_producers : 1; /* Permit a topology where this link has no producers */
} fd_topo_link_t;
86 :
/* An (address, port) pair.  Be careful: ip and port are stored in
   DIFFERENT byte orders. */
typedef struct {
  uint   ip;   /* in network byte order */
  ushort port; /* in host byte order */
} fd_topo_ip_port_t;
92 :
/* Configuration shared by the network driver tiles.  Embedded as the
   leading member of both the xdp and sock variants in the tile config
   union below, so it can be accessed uniformly via the union's net
   member. */
struct fd_topo_net_tile {
  ulong umem_dcache_obj_id; /* dcache for XDP UMEM frames */
  uint  bind_address;       /* Address to bind to.  NOTE(review): presumably an IPv4 address; byte order not stated here — confirm against the net tile. */

  /* UDP ports the network tile listens on, one per downstream consumer
     protocol.  A port of 0 presumably disables that listener — confirm
     against the net tile implementation. */
  ushort shred_listen_port;
  ushort quic_transaction_listen_port;
  ushort legacy_transaction_listen_port;
  ushort gossip_listen_port;
  ushort repair_intake_listen_port;
  ushort repair_serve_listen_port;
  ushort send_src_port;
};
typedef struct fd_topo_net_tile fd_topo_net_tile_t;
106 :
107 : /* A tile is a unique process that is spawned by Firedancer to represent
108 : one thread of execution. Firedancer sandboxes all tiles to their own
109 : process for security reasons.
110 :
111 : A tile belongs to exactly one workspace. A tile is a consumer of 0
112 : or more links, it's inputs. A tile is a producer of 0 or more output
113 : links.
114 :
115 : All input links will be automatically polled by the tile
116 : infrastructure, and output links will automatically source and manage
117 : credits from consumers. */
struct fd_topo_tile {
  ulong id;                  /* The ID of this tile.  Indexed from [0, tile_cnt).  When placed in a topology, the ID must be the index of the tile in the tiles list. */
  char  name[ 7UL ];         /* The name of this tile.  There can be multiple of each tile name in a topology. */
  char  metrics_name[ 10UL ]; /* The name of this tile for looking up metrics.  This is used so tiles can share a name but report different metrics, for Frankendancer and Firedancer. */
  ulong kind_id;             /* The ID of this tile within its name.  If there are N tiles of a particular name, they have IDs [0, N).  The pair (name, kind_id) uniquely identifies a tile, as does "id" on its own. */
  int   is_agave;            /* If the tile needs to run in the Agave (Anza) address space or not. */
  int   allow_shutdown;      /* If the tile is allowed to shutdown gracefully.  If false, when the tile exits it will tear down the entire application. */

  ulong cpu_idx; /* The CPU index to pin the tile on.  A value of ULONG_MAX or more indicates the tile should be floating and not pinned to a core. */

  ulong in_cnt;                                        /* The number of links that this tile reads from. */
  ulong in_link_id[ FD_TOPO_MAX_TILE_IN_LINKS ];       /* The link_id of each link that this tile reads from, indexed in [0, in_cnt). */
  int   in_link_reliable[ FD_TOPO_MAX_TILE_IN_LINKS ]; /* If each link that this tile reads from is a reliable or unreliable consumer, indexed in [0, in_cnt). */
  int   in_link_poll[ FD_TOPO_MAX_TILE_IN_LINKS ];     /* If each link that this tile reads from should be polled by the tile infrastructure, indexed in [0, in_cnt).
                                                          If the link is not polled, the tile will not receive frags for it and the tile writer is responsible for
                                                          reading from the link.  The link must be marked as unreliable as it is not flow controlled. */

  ulong out_cnt;                                  /* The number of links that this tile writes to. */
  ulong out_link_id[ FD_TOPO_MAX_TILE_OUT_LINKS ]; /* The link_id of each link that this tile writes to, indexed in [0, out_cnt). */

  /* Topology object IDs for the tile's own shared memory objects. */
  ulong tile_obj_id;
  ulong metrics_obj_id;
  ulong keyswitch_obj_id;
  ulong in_link_fseq_obj_id[ FD_TOPO_MAX_TILE_IN_LINKS ];

  /* Objects this tile maps in, and with what access mode. */
  ulong uses_obj_cnt;
  ulong uses_obj_id[ FD_TOPO_MAX_TILE_OBJS ];
  int   uses_obj_mode[ FD_TOPO_MAX_TILE_OBJS ];

  /* Computed fields.  These are not supplied as configuration but calculated as needed. */
  struct {
    ulong * metrics; /* The shared memory for metrics that this tile should write.  Consumed by monitoring and metrics writing tiles. */

    /* The fseq of each link that this tile reads from.  Multiple fseqs
       may point to the link, if there are multiple consumers.  An fseq
       can be uniquely identified via (link_id, tile_id), or (link_kind,
       link_kind_id, tile_kind, tile_kind_id) */
    ulong * in_link_fseq[ FD_TOPO_MAX_TILE_IN_LINKS ];
  };

  /* Configuration fields.  These are required to be known by the topology so it can determine the
     total size of Firedancer in memory.  Exactly one union member is
     meaningful per tile; which one is presumably selected by the
     tile's name (members are named after tile kinds) — confirm against
     the topology construction code. */
  union {
    fd_topo_net_tile_t net;

    struct {
      fd_topo_net_tile_t net; /* must stay first so it aliases the bare net member above */
      char interface[ 16 ];

      /* xdp specific options */
      ulong xdp_rx_queue_size;
      ulong xdp_tx_queue_size;
      ulong free_ring_depth;
      long  tx_flush_timeout_ns;
      char  xdp_mode[8];
      int   zero_copy;

      ulong netdev_dbl_buf_obj_id; /* dbl_buf containing netdev_tbl */
      ulong fib4_main_obj_id;      /* fib4 containing main route table */
      ulong fib4_local_obj_id;     /* fib4 containing local route table */
      ulong neigh4_obj_id;         /* neigh4 hash map header */
      ulong neigh4_ele_obj_id;     /* neigh4 hash map slots */
    } xdp;

    struct {
      fd_topo_net_tile_t net; /* must stay first so it aliases the bare net member above */
      /* sock specific options */
      int so_sndbuf;
      int so_rcvbuf;
    } sock;

    struct {
      ulong netdev_dbl_buf_obj_id; /* dbl_buf containing netdev_tbl */
      ulong fib4_main_obj_id;      /* fib4 containing main route table */
      ulong fib4_local_obj_id;     /* fib4 containing local route table */
      char  neigh_if[ 16 ];        /* neigh4 interface name */
      ulong neigh4_obj_id;         /* neigh4 hash map header */
      ulong neigh4_ele_obj_id;     /* neigh4 hash map slots */
    } netlink;

#define FD_TOPO_GOSSIP_ENTRYPOINTS_MAX 16UL

    struct {
      char identity_key_path[ PATH_MAX ];

      ulong         entrypoints_cnt;
      fd_ip4_port_t entrypoints[ FD_TOPO_GOSSIP_ENTRYPOINTS_MAX ];

      long boot_timestamp_nanos;

      ulong tcache_depth;

      ushort shred_version;
      int    allow_private_address;
    } gossvf;

    struct {
      char identity_key_path[ PATH_MAX ];

      ulong         entrypoints_cnt;
      fd_ip4_port_t entrypoints[ FD_TOPO_GOSSIP_ENTRYPOINTS_MAX ];

      long boot_timestamp_nanos;

      uint   ip_addr;
      ushort shred_version;

      ulong max_entries;
      ulong max_purged;
      ulong max_failed;

      struct {
        ushort gossip;
        ushort tvu;
        ushort tvu_quic;
        ushort tpu;
        ushort tpu_quic;
        ushort repair;
      } ports;
    } gossip;

    struct {
      uint   out_depth;
      uint   reasm_cnt;
      ulong  max_concurrent_connections;
      ulong  max_concurrent_handshakes;
      ushort quic_transaction_listen_port;
      ulong  idle_timeout_millis;
      uint   ack_delay_millis;
      int    retry;
      char   key_log_path[ PATH_MAX ];
    } quic;

    struct {
      ulong tcache_depth;
    } verify;

    struct {
      ulong tcache_depth;
    } dedup;

    struct {
      char  url[ 256 ];
      ulong url_len;
      char  sni[ 256 ];
      ulong sni_len;
      char  identity_key_path[ PATH_MAX ];
      char  key_log_path[ PATH_MAX ];
      ulong buf_sz;
      ulong ssl_heap_sz;
      ulong keepalive_interval_nanos;
      uchar tls_cert_verify : 1;
    } bundle;

    struct {
      ulong max_pending_transactions;
      ulong bank_tile_count;
      int   larger_max_cost_per_block;
      int   larger_shred_limits_per_block;
      int   use_consumed_cus;
      int   schedule_strategy;
      struct {
        int   enabled;
        uchar tip_distribution_program_addr[ 32 ];
        uchar tip_payment_program_addr[ 32 ];
        uchar tip_distribution_authority[ 32 ];
        ulong commission_bps;
        char  identity_key_path[ PATH_MAX ];
        char  vote_account_path[ PATH_MAX ]; /* or pubkey is okay */
      } bundle;
    } pack;

    struct {
      int   lagged_consecutive_leader_start;
      int   plugins_enabled;
      ulong bank_cnt;
      char  identity_key_path[ PATH_MAX ];
      struct {
        int   enabled;
        uchar tip_payment_program_addr[ 32 ];
        uchar tip_distribution_program_addr[ 32 ];
        char  vote_account_path[ PATH_MAX ];
      } bundle;
    } poh;

    struct {
      ulong             depth;
      ulong             fec_resolver_depth;
      char              identity_key_path[ PATH_MAX ];
      ushort            shred_listen_port;
      int               larger_shred_limits_per_block;
      ushort            expected_shred_version;
      ulong             adtl_dests_retransmit_cnt;
      fd_topo_ip_port_t adtl_dests_retransmit[ FD_TOPO_ADTL_DESTS_MAX ];
      ulong             adtl_dests_leader_cnt;
      fd_topo_ip_port_t adtl_dests_leader[ FD_TOPO_ADTL_DESTS_MAX ];
    } shred;

    struct {
      ulong disable_blockstore_from_slot;
    } store;

    struct {
      char identity_key_path[ PATH_MAX ];
    } sign;

    struct {
      uint   listen_addr;
      ushort listen_port;

      int is_voting;

      char cluster[ 32 ];
      char identity_key_path[ PATH_MAX ];
      char vote_key_path[ PATH_MAX ];

      ulong max_http_connections;
      ulong max_websocket_connections;
      ulong max_http_request_length;
      ulong send_buffer_size_mb;
      int   schedule_strategy;

      int websocket_compression;
      int frontend_release_channel;
    } gui;

    struct {
      uint   prometheus_listen_addr;
      ushort prometheus_listen_port;
    } metric;

    struct {
      ulong fec_max;
      ulong max_vote_accounts;

      int   tx_metadata_storage;
      ulong funk_obj_id;

      int  bootstrap;
      char genesis_path[ PATH_MAX ];

      char shred_cap[ PATH_MAX ];
      char cluster_version[ 32 ];
      char tower_checkpt[ PATH_MAX ];

      char identity_key_path[ PATH_MAX ];
      uint ip_addr;
      char vote_account_path[ PATH_MAX ];

      char blockstore_file[ PATH_MAX ];
      char blockstore_checkpt[ PATH_MAX ];

      /* not specified in TOML */

      ulong enable_features_cnt;
      char  enable_features[ 16 ][ FD_BASE58_ENCODED_32_SZ ];

      ulong enable_bank_hash_cmp;

      ulong capture_start_slot;
      char  solcap_capture[ PATH_MAX ];
      char  dump_proto_dir[ PATH_MAX ];
      int   dump_block_to_pb;

      ulong manifest_dcache_obj_id;

      ulong heap_size_gib;
    } replay;

    struct {
      ulong funk_obj_id;

      ulong capture_start_slot;
      char  dump_proto_dir[ PATH_MAX ];
      int   dump_instr_to_pb;
      int   dump_txn_to_pb;
      int   dump_syscall_to_pb;
      int   dump_elf_to_pb;
    } exec;

    struct {
      ulong funk_obj_id;
      char  solcap_capture[ PATH_MAX ];
      ulong capture_start_slot;
    } writer;

    struct {
      ushort send_to_port;
      uint   send_to_ip_addr;
      ulong  conn_cnt;
      int    no_quic;
    } benchs;

    struct {
      ushort rpc_port;
      uint   rpc_ip_addr;
    } bencho;

    struct {
      ulong accounts_cnt;
      int   mode;
      float contending_fraction;
      float cu_price_spread;
    } benchg;

    struct {
      ushort repair_intake_listen_port;
      ushort repair_serve_listen_port;

      /* non-config */

      char  identity_key_path[ PATH_MAX ];
      ulong max_pending_shred_sets;
      ulong slot_max;
    } repair;

    struct {
      char slots_pending[PATH_MAX];

      ulong expected_shred_version;

      /* non-config */

      char  identity_key_path[ PATH_MAX ];
      char  shred_cap_archive[ PATH_MAX ];
      char  shred_cap_replay[ PATH_MAX ];
      ulong shred_cap_end_slot;

      char blockstore_file[ PATH_MAX ];
      char blockstore_restore[ PATH_MAX ];
    } store_int;

    struct {
      ushort send_src_port;

      /* non-config */

      uint ip_addr;
      char identity_key_path[ PATH_MAX ];
    } send;

    struct {
      ulong  funk_obj_id;
      ulong  store_obj_id;
      ushort rpc_port;
      ushort tpu_port;
      uint   tpu_ip_addr;
      char   identity_key_path[ PATH_MAX ];
      uint   block_index_max;
      uint   txn_index_max;
      uint   acct_index_max;
      char   history_file[ PATH_MAX ];
    } rpcserv;

    struct {
      uint fake_dst_ip;
    } pktgen;

    struct {
      ulong end_slot;
      char  rocksdb_path[ PATH_MAX ];
      char  shredcap_path[ PATH_MAX ];
      char  bank_hash_path[ PATH_MAX ];
      char  ingest_mode[ 32 ];

      /* Set internally by the archiver tile */
      int archive_fd;
    } archiver;

    struct {
      ulong funk_obj_id;
      char  identity_key_path[ PATH_MAX ];
      char  vote_acc_path[ PATH_MAX ];
      char  ledger_path[PATH_MAX];
    } tower;

    struct {
      char   folder_path[ PATH_MAX ];
      ushort repair_intake_listen_port;
      ulong  write_buffer_size; /* Size of the write buffer for the capture tile */
      int    enable_publish_stake_weights;
      char   manifest_path[ PATH_MAX ];

      /* Set internally by the capture tile */
      int shreds_fd;
      int requests_fd;
      int fecs_fd;
      int peers_fd;
      int bank_hashes_fd;
      int slices_fd;
    } shredcap;

    struct {
      char snapshots_path[ PATH_MAX ];
      int  incremental_snapshot_fetch;
      int  do_download;
      uint maximum_local_snapshot_age;
      uint minimum_download_speed_mib;
      uint maximum_download_retry_abort;
      uint max_full_snapshots_to_keep;
      uint max_incremental_snapshots_to_keep;

      struct {
        ulong         peers_cnt;
        fd_ip4_port_t peers[ 16UL ];
      } http;

      int diagnostics;
    } snaprd;

    struct {
      ulong funk_obj_id;
    } snapin;

    struct {

      uint   bind_address;
      ushort bind_port;

      ushort        expected_shred_version;
      ulong         entrypoints_cnt;
      fd_ip4_port_t entrypoints[ FD_TOPO_GOSSIP_ENTRYPOINTS_MAX ];
    } ipecho;

    struct {
      ulong funk_obj_id;
    } bank;

    struct {
      ulong funk_obj_id;
    } resolv;

    struct {
      ulong funk_obj_id;

      int allow_download;

      ushort        expected_shred_version;
      ulong         entrypoints_cnt;
      fd_ip4_port_t entrypoints[ FD_TOPO_GOSSIP_ENTRYPOINTS_MAX ];

      char genesis_path[ PATH_MAX ];
    } genesi;
  };
};

typedef struct fd_topo_tile fd_topo_tile_t;
564 :
/* An object is a named, sized allocation carved out of one workspace. */
typedef struct {
  ulong id;           /* The ID of this object.  Indexed from [0, obj_cnt).  When placed in a topology, the ID must be the index of the object in the objs list. */
  char  name[ 13UL ]; /* The name of this object, e.g. "mcache". */
  ulong wksp_id;      /* The ID of the workspace this object is allocated from. */

  ulong offset;    /* Byte offset of the object from the base of its workspace (see fd_topo_obj_laddr).  Computed; zero means not yet laid out. */
  ulong footprint; /* Size of the object in bytes.  NOTE(review): presumably computed via the footprint callback below — confirm against layout code. */
} fd_topo_obj_t;
573 :
574 : /* An fd_topo_t represents the overall structure of a Firedancer
575 : configuration, describing all the workspaces, tiles, and links
576 : between them. */
struct fd_topo {
  char  app_name[ 256UL ];  /* Name of the application instance this topology belongs to. */
  uchar props[ 16384UL ];   /* Opaque property blob.  NOTE(review): format not defined in this header — see topology property helpers. */

  /* Counts of the valid leading entries in the arrays below. */
  ulong wksp_cnt;
  ulong link_cnt;
  ulong tile_cnt;
  ulong obj_cnt;

  fd_topo_wksp_t workspaces[ FD_TOPO_MAX_WKSPS ];
  fd_topo_link_t links[ FD_TOPO_MAX_LINKS ];
  fd_topo_tile_t tiles[ FD_TOPO_MAX_TILES ];
  fd_topo_obj_t  objs[ FD_TOPO_MAX_OBJS ];

  /* CPU cores reserved for Agave threads (only meaningful for
     Frankendancer style deployments). */
  ulong agave_affinity_cnt;
  ulong agave_affinity_cpu_idx[ FD_TILE_MAX ];

  ulong max_page_size;           /* 2^21 or 2^30 */
  ulong gigantic_page_threshold; /* see [hugetlbfs.gigantic_page_threshold_mib]*/
};
typedef struct fd_topo fd_topo_t;
598 :
/* fd_topo_run_tile_t describes how to run one kind of tile: its
   sandbox requirements (rlimits, allowed syscalls and fds) and its
   lifecycle callbacks.  Callbacks may be NULL where a tile does not
   need them — confirm against the tile runner. */
typedef struct {
  char const * name; /* Tile name this runner handles, matching fd_topo_tile_t::name. */

  int   keep_host_networking;  /* If the tile keeps access to the host network namespace. */
  int   allow_connect;         /* If the tile is allowed to make outbound connections. */
  ulong rlimit_file_cnt;       /* RLIMIT_NOFILE to apply to the tile process. */
  ulong rlimit_address_space;  /* RLIMIT_AS to apply. */
  ulong rlimit_data;           /* RLIMIT_DATA to apply. */
  int   for_tpool;             /* NOTE(review): presumably marks runners used for thread-pool workers — confirm. */

  /* Writes up to out_cnt/out_fds_sz entries describing the seccomp
     filter / file descriptors the sandboxed tile is allowed, returning
     the number written. */
  ulong (*populate_allowed_seccomp)( fd_topo_t const * topo, fd_topo_tile_t const * tile, ulong out_cnt, struct sock_filter * out );
  ulong (*populate_allowed_fds    )( fd_topo_t const * topo, fd_topo_tile_t const * tile, ulong out_fds_sz, int * out_fds );
  /* Alignment / footprint of the tile's scratch memory region. */
  ulong (*scratch_align           )( void );
  ulong (*scratch_footprint       )( fd_topo_tile_t const * tile );
  ulong (*loose_footprint         )( fd_topo_tile_t const * tile );
  /* Init hooks: privileged_init runs before the sandbox is applied,
     unprivileged_init after, then run enters the tile main loop. */
  void  (*privileged_init         )( fd_topo_t * topo, fd_topo_tile_t * tile );
  void  (*unprivileged_init       )( fd_topo_t * topo, fd_topo_tile_t * tile );
  void  (*run                     )( fd_topo_t * topo, fd_topo_tile_t * tile );
  ulong (*rlimit_file_cnt_fn      )( fd_topo_t const * topo, fd_topo_tile_t const * tile ); /* Dynamic override for rlimit_file_cnt. */
} fd_topo_run_tile_t;
619 :
/* Per-object-kind callbacks used when laying out and initializing
   topology objects (see fd_topo_wksp_new). */
struct fd_topo_obj_callbacks {
  char const * name; /* Object kind name, matching fd_topo_obj_t::name. */
  ulong (* footprint )( fd_topo_t const * topo, fd_topo_obj_t const * obj ); /* Required size in bytes of the object. */
  ulong (* align     )( fd_topo_t const * topo, fd_topo_obj_t const * obj ); /* Required alignment in bytes. */
  ulong (* loose     )( fd_topo_t const * topo, fd_topo_obj_t const * obj ); /* Additional loose (heap style) space the object needs. */
  void  (* new       )( fd_topo_t const * topo, fd_topo_obj_t const * obj ); /* Formats the object's memory in place. */
};

typedef struct fd_topo_obj_callbacks fd_topo_obj_callbacks_t;
629 :
630 : FD_PROTOTYPES_BEGIN
631 :
632 : FD_FN_CONST static inline ulong
633 1473 : fd_topo_workspace_align( void ) {
634 : /* This needs to be the max( align ) of all the child members that
635 : could be aligned into this workspace, otherwise our footprint
636 : calculation will not be correct. For now just set to 4096 but this
637 : should probably be calculated dynamically, or we should reduce
638 : those child aligns if we can. */
639 1473 : return 4096UL;
640 1473 : }
641 :
642 : static inline void *
643 : fd_topo_obj_laddr( fd_topo_t const * topo,
644 1800 : ulong obj_id ) {
645 1800 : fd_topo_obj_t const * obj = &topo->objs[ obj_id ];
646 1800 : FD_TEST( obj_id<FD_TOPO_MAX_OBJS );
647 1800 : FD_TEST( obj->id == obj_id );
648 1800 : FD_TEST( obj->offset );
649 1800 : return (void *)((ulong)topo->workspaces[ obj->wksp_id ].wksp + obj->offset);
650 1800 : }
651 :
652 : /* Returns a pointer in the local address space to the base address of
653 : the workspace out of which the given object was allocated. */
654 :
655 : static inline void *
656 : fd_topo_obj_wksp_base( fd_topo_t const * topo,
657 0 : ulong obj_id ) {
658 0 : FD_TEST( obj_id<FD_TOPO_MAX_OBJS );
659 0 : fd_topo_obj_t const * obj = &topo->objs[ obj_id ];
660 0 : FD_TEST( obj->id == obj_id );
661 0 : ulong const wksp_id = obj->wksp_id;
662 :
663 0 : FD_TEST( wksp_id<FD_TOPO_MAX_WKSPS );
664 0 : fd_topo_wksp_t const * wksp = &topo->workspaces[ wksp_id ];
665 0 : FD_TEST( wksp->id == wksp_id );
666 0 : return wksp->wksp;
667 0 : }
668 :
669 : FD_FN_PURE static inline ulong
670 : fd_topo_tile_name_cnt( fd_topo_t const * topo,
671 6 : char const * name ) {
672 6 : ulong cnt = 0;
673 120 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
674 114 : if( FD_UNLIKELY( !strcmp( topo->tiles[ i ].name, name ) ) ) cnt++;
675 114 : }
676 6 : return cnt;
677 6 : }
678 :
679 : /* Finds the workspace of a given name in the topology. Returns
680 : ULONG_MAX if there is no such workspace. There can be at most one
681 : workspace of a given name. */
682 :
683 : FD_FN_PURE static inline ulong
684 : fd_topo_find_wksp( fd_topo_t const * topo,
685 1704 : char const * name ) {
686 41046 : for( ulong i=0; i<topo->wksp_cnt; i++ ) {
687 41046 : if( FD_UNLIKELY( !strcmp( topo->workspaces[ i ].name, name ) ) ) return i;
688 41046 : }
689 0 : return ULONG_MAX;
690 1704 : }
691 :
692 : /* Find the tile of a given name and kind_id in the topology, there will
693 : be at most one such tile, since kind_id is unique among the name.
694 : Returns ULONG_MAX if there is no such tile. */
695 :
696 : FD_FN_PURE static inline ulong
697 : fd_topo_find_tile( fd_topo_t const * topo,
698 : char const * name,
699 1083 : ulong kind_id ) {
700 16968 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
701 16968 : if( FD_UNLIKELY( !strcmp( topo->tiles[ i ].name, name ) ) && topo->tiles[ i ].kind_id == kind_id ) return i;
702 16968 : }
703 0 : return ULONG_MAX;
704 1083 : }
705 :
706 : /* Find the link of a given name and kind_id in the topology, there will
707 : be at most one such link, since kind_id is unique among the name.
708 : Returns ULONG_MAX if there is no such link. */
709 :
710 : FD_FN_PURE static inline ulong
711 : fd_topo_find_link( fd_topo_t const * topo,
712 : char const * name,
713 864 : ulong kind_id ) {
714 23565 : for( ulong i=0; i<topo->link_cnt; i++ ) {
715 23565 : if( FD_UNLIKELY( !strcmp( topo->links[ i ].name, name ) ) && topo->links[ i ].kind_id == kind_id ) return i;
716 23565 : }
717 0 : return ULONG_MAX;
718 864 : }
719 :
720 : FD_FN_PURE static inline ulong
721 : fd_topo_find_tile_in_link( fd_topo_t const * topo,
722 : fd_topo_tile_t const * tile,
723 : char const * name,
724 33 : ulong kind_id ) {
725 264 : for( ulong i=0; i<tile->in_cnt; i++ ) {
726 264 : if( FD_UNLIKELY( !strcmp( topo->links[ tile->in_link_id[ i ] ].name, name ) )
727 264 : && topo->links[ tile->in_link_id[ i ] ].kind_id == kind_id ) return i;
728 264 : }
729 0 : return ULONG_MAX;
730 33 : }
731 :
732 : FD_FN_PURE static inline ulong
733 : fd_topo_find_tile_out_link( fd_topo_t const * topo,
734 : fd_topo_tile_t const * tile,
735 : char const * name,
736 33 : ulong kind_id ) {
737 99 : for( ulong i=0; i<tile->out_cnt; i++ ) {
738 99 : if( FD_UNLIKELY( !strcmp( topo->links[ tile->out_link_id[ i ] ].name, name ) )
739 99 : && topo->links[ tile->out_link_id[ i ] ].kind_id == kind_id ) return i;
740 99 : }
741 0 : return ULONG_MAX;
742 33 : }
743 :
744 : /* Find the id of the tile which is a producer for the given link. If
745 : no tile is a producer for the link, returns ULONG_MAX. This should
746 : not be possible for a well formed and validated topology. */
747 : FD_FN_PURE static inline ulong
748 : fd_topo_find_link_producer( fd_topo_t const * topo,
749 0 : fd_topo_link_t const * link ) {
750 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
751 0 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
752 :
753 0 : for( ulong j=0; j<tile->out_cnt; j++ ) {
754 0 : if( FD_UNLIKELY( tile->out_link_id[ j ] == link->id ) ) return i;
755 0 : }
756 0 : }
757 0 : return ULONG_MAX;
758 0 : }
759 :
760 : /* Given a link, count the number of consumers of that link among all
761 : the tiles in the topology. */
762 : FD_FN_PURE static inline ulong
763 : fd_topo_link_consumer_cnt( fd_topo_t const * topo,
764 336 : fd_topo_link_t const * link ) {
765 336 : ulong cnt = 0;
766 11169 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
767 10833 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
768 39816 : for( ulong j=0; j<tile->in_cnt; j++ ) {
769 28983 : if( FD_UNLIKELY( tile->in_link_id[ j ] == link->id ) ) cnt++;
770 28983 : }
771 10833 : }
772 :
773 336 : return cnt;
774 336 : }
775 :
776 : /* Given a link, count the number of reliable consumers of that link
777 : among all the tiles in the topology. */
778 : FD_FN_PURE static inline ulong
779 : fd_topo_link_reliable_consumer_cnt( fd_topo_t const * topo,
780 0 : fd_topo_link_t const * link ) {
781 0 : ulong cnt = 0;
782 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
783 0 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
784 0 : for( ulong j=0; j<tile->in_cnt; j++ ) {
785 0 : if( FD_UNLIKELY( tile->in_link_id[ j ] == link->id && tile->in_link_reliable[ j ] ) ) cnt++;
786 0 : }
787 0 : }
788 0 :
789 0 : return cnt;
790 0 : }
791 :
792 : FD_FN_PURE static inline ulong
793 : fd_topo_tile_consumer_cnt( fd_topo_t const * topo,
794 0 : fd_topo_tile_t const * tile ) {
795 0 : (void)topo;
796 0 : return tile->out_cnt;
797 0 : }
798 :
799 : FD_FN_PURE static inline ulong
800 : fd_topo_tile_reliable_consumer_cnt( fd_topo_t const * topo,
801 0 : fd_topo_tile_t const * tile ) {
802 0 : ulong reliable_cons_cnt = 0UL;
803 0 : for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
804 0 : fd_topo_tile_t const * consumer_tile = &topo->tiles[ i ];
805 0 : for( ulong j=0UL; j<consumer_tile->in_cnt; j++ ) {
806 0 : for( ulong k=0UL; k<tile->out_cnt; k++ ) {
807 0 : if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) {
808 0 : reliable_cons_cnt++;
809 0 : }
810 0 : }
811 0 : }
812 0 : }
813 0 : return reliable_cons_cnt;
814 0 : }
815 :
816 : FD_FN_PURE static inline ulong
817 : fd_topo_tile_producer_cnt( fd_topo_t const * topo,
818 0 : fd_topo_tile_t const * tile ) {
819 0 : (void)topo;
820 0 : ulong in_cnt = 0UL;
821 0 : for( ulong i=0UL; i<tile->in_cnt; i++ ) {
822 0 : if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue;
823 0 : in_cnt++;
824 0 : }
825 0 : return in_cnt;
826 0 : }
827 :
828 : /* Join (map into the process) all shared memory (huge/gigantic pages)
829 : needed by the tile, in the given topology. All memory associated
830 : with the tile (aka. used by links that the tile either produces to or
831 : consumes from, or used by the tile itself for its cnc) will be
832 : attached (mapped into the process).
833 :
834 : This is needed to play nicely with the sandbox. Once a process is
835 : sandboxed we can no longer map any memory. */
836 : void
837 : fd_topo_join_tile_workspaces( fd_topo_t * topo,
838 : fd_topo_tile_t * tile );
839 :
840 : /* Join (map into the process) the shared memory (huge/gigantic pages)
841 : for the given workspace. Mode is one of
842 : FD_SHMEM_JOIN_MODE_READ_WRITE or FD_SHMEM_JOIN_MODE_READ_ONLY and
843 : determines the prot argument that will be passed to mmap when mapping
844 : the pages in (PROT_WRITE or PROT_READ respectively). */
845 : void
846 : fd_topo_join_workspace( fd_topo_t * topo,
847 : fd_topo_wksp_t * wksp,
848 : int mode );
849 :
850 : /* Join (map into the process) all shared memory (huge/gigantic pages)
851 : needed by all tiles in the topology. Mode is one of
852 : FD_SHMEM_JOIN_MODE_READ_WRITE or FD_SHMEM_JOIN_MODE_READ_ONLY and
853 : determines the prot argument that will be passed to mmap when
854 : mapping the pages in (PROT_WRITE or PROT_READ respectively). */
855 : void
856 : fd_topo_join_workspaces( fd_topo_t * topo,
857 : int mode );
858 :
859 : /* Leave (unmap from the process) the shared memory needed for the
860 : given workspace in the topology, if it was previously mapped.
861 :
862 : topo and wksp are assumed non-NULL. It is OK if the workspace
863 : has not been previously joined, in which case this is a no-op. */
864 :
865 : void
866 : fd_topo_leave_workspace( fd_topo_t * topo,
867 : fd_topo_wksp_t * wksp );
868 :
869 : /* Leave (unmap from the process) all shared memory needed by all
870 : tiles in the topology, if each of them was mapped.
871 :
872 : topo is assumed non-NULL. Only workspaces which were previously
873 : joined are unmapped. */
874 :
875 : void
876 : fd_topo_leave_workspaces( fd_topo_t * topo );
877 :
878 : /* Create the given workspace needed by the topology on the system.
879 : This does not "join" the workspaces (map their memory into the
880 : process), but only creates the .wksp file and formats it correctly
881 : as a workspace.
882 :
883 : Returns 0 on success and -1 on failure, with errno set to the error.
884 : The only reason for failure currently that will be returned is
885 : ENOMEM, as other unexpected errors will cause the program to exit.
886 :
887 : If update_existing is 1, the workspace will not be created from
888 : scratch but it will be assumed that it already exists from a prior
889 : run and needs to be maybe resized and then have the header
890 : structures reinitialized. This can save a very expensive operation
891 : of zeroing all of the workspace pages. This is dangerous in
892 : production because it can leave stray memory from prior runs around,
893 : and should only be used in development environments. */
894 :
895 : int
896 : fd_topo_create_workspace( fd_topo_t * topo,
897 : fd_topo_wksp_t * wksp,
898 : int update_existing );
899 :
900 : /* Join the standard IPC objects needed by the topology of this particular
901 : tile */
902 :
903 : void
904 : fd_topo_fill_tile( fd_topo_t * topo,
905 : fd_topo_tile_t * tile );
906 :
/* Same as fd_topo_fill_tile but fills in all the objects for a
   particular workspace. */
909 : void
910 : fd_topo_workspace_fill( fd_topo_t * topo,
911 : fd_topo_wksp_t * wksp );
912 :
/* Apply the `new` callback function to every object that is resident
   in the given workspace in the topology. */
915 :
916 : void
917 : fd_topo_wksp_new( fd_topo_t const * topo,
918 : fd_topo_wksp_t const * wksp,
919 : fd_topo_obj_callbacks_t ** callbacks );
920 :
921 : /* Same as fd_topo_fill_tile but fills in all tiles in the topology. */
922 :
923 : void
924 : fd_topo_fill( fd_topo_t * topo );
925 :
926 : /* fd_topo_tile_stack_join joins a huge page optimized stack for the
927 : provided tile. The stack is assumed to already exist at a known
928 : path in the hugetlbfs mount. */
929 :
930 : void *
931 : fd_topo_tile_stack_join( char const * app_name,
932 : char const * tile_name,
933 : ulong tile_kind_id );
934 :
/* Install the XDP program needed by the net tiles into the local device
   and return the xsk_map_fd.  bind_addr is an optional IPv4 address
   used for filtering by dst IP. */
938 :
939 : fd_xdp_fds_t
940 : fd_topo_install_xdp( fd_topo_t const * topo,
941 : uint bind_addr );
942 :
943 : /* fd_topo_run_single_process runs all the tiles in a single process
944 : (the calling process). This spawns a thread for each tile, switches
945 : that thread to the given UID and GID and then runs the tile in it.
946 : Each thread will never exit, as tiles are expected to run forever.
947 : An error is logged and the application will exit if a tile exits.
948 : The function itself does return after spawning all the threads.
949 :
950 : The threads will not be sandboxed in any way, except switching to the
951 : provided UID and GID, so they will share the same address space, and
952 : not have any seccomp restrictions or use any Linux namespaces. The
953 : calling thread will also switch to the provided UID and GID before
954 : it returns.
955 :
956 : In production, when running with an Agave child process this is
957 : used for spawning certain tiles inside the Agave address space.
958 : It's also useful for tooling and debugging, but is not how the main
959 : production Firedancer process runs. For production, each tile is run
960 : in its own address space with a separate process and full security
961 : sandbox.
962 :
963 : The agave argument determines which tiles are started. If the
964 : argument is 0 or 1, only non-agave (or only agave) tiles are started.
965 : If the argument is any other value, all tiles in the topology are
966 : started regardless of if they are Agave tiles or not. */
967 :
968 : void
969 : fd_topo_run_single_process( fd_topo_t * topo,
970 : int agave,
971 : uint uid,
972 : uint gid,
973 : fd_topo_run_tile_t (* tile_run )( fd_topo_tile_t const * tile ) );
974 :
975 : /* fd_topo_run_tile runs the given tile directly within the current
976 : process (and thread). The function will never return, as tiles are
977 : expected to run forever. An error is logged and the application will
978 : exit if the tile exits.
979 :
980 : The sandbox argument determines if the current process will be
981 : sandboxed fully before starting the tile. The thread will switch to
982 : the UID and GID provided before starting the tile, even if the thread
983 : is not being sandboxed. Although POSIX specifies that all threads in
984 : a process must share a UID and GID, this is not the case on Linux.
985 : The thread will switch to the provided UID and GID without switching
986 : the other threads in the process.
987 :
988 : If keep_controlling_terminal is set to 0, and the sandbox is enabled
989 : the controlling terminal will be detached as an additional sandbox
990 : measure, but you will not be able to send Ctrl+C or other signals
991 : from the terminal. See fd_sandbox.h for more information.
992 :
993 : The allow_fd argument is only used if sandbox is true, and is a file
994 : descriptor which will be allowed to exist in the process. Normally
995 : the sandbox code rejects and aborts if there is an unexpected file
996 : descriptor present on boot. This is helpful to allow a parent
997 : process to be notified on termination of the tile by waiting for a
998 : pipe file descriptor to get closed.
999 :
1000 : wait and debugger are both used in debugging. If wait is non-NULL,
1001 : the runner will wait until the value pointed to by wait is non-zero
1002 : before launching the tile. Likewise, if debugger is non-NULL, the
1003 : runner will wait until a debugger is attached before setting the
1004 : value pointed to by debugger to non-zero. These are intended to be
1005 : used as a pair, where many tiles share a waiting reference, and then
1006 : one of the tiles (a tile you want to attach the debugger to) has the
1007 : same reference provided as the debugger, so all tiles will stop and
1008 : wait for the debugger to attach to it before proceeding. */
1009 :
1010 : void
1011 : fd_topo_run_tile( fd_topo_t * topo,
1012 : fd_topo_tile_t * tile,
1013 : int sandbox,
1014 : int keep_controlling_terminal,
1015 : int dumpable,
1016 : uint uid,
1017 : uint gid,
1018 : int allow_fd,
1019 : volatile int * wait,
1020 : volatile int * debugger,
1021 : fd_topo_run_tile_t * tile_run );
1022 :
1023 : /* This is for determining the value of RLIMIT_MLOCK that we need to
1024 : successfully run all tiles in separate processes. The value returned
1025 : is the maximum amount of memory that will be locked with mlock() by
1026 : any individual process in the tree. Specifically, if we have three
1027 : tile processes, and they each need to lock 5, 9, and 2 MiB of memory
1028 : respectively, RLIMIT_MLOCK needs to be 9 MiB to allow all three
1029 : process mlock() calls to succeed.
1030 :
1031 : Tiles lock memory in three ways. Any workspace they are using, they
1032 : lock the entire workspace. Then each tile uses huge pages for the
1033 : stack which are also locked, and finally some tiles use private
1034 : locked mmaps outside the workspace for storing key material. The
1035 : results here include all of this memory together.
1036 :
1037 : The result is not necessarily the amount of memory used by the tile
1038 : process, although it will be quite close. Tiles could potentially
   allocate memory (e.g., with brk) without needing to lock it, which
   would not need to be included, and some kernel memory that tiles cause
1041 : to be allocated (for example XSK buffers) is also not included. The
1042 : actual amount of memory used will not be less than this value. */
1043 : FD_FN_PURE ulong
1044 : fd_topo_mlock_max_tile( fd_topo_t const * topo );
1045 :
1046 : /* Same as fd_topo_mlock_max_tile, but for loading the entire topology
1047 : into one process, rather than a separate process per tile. This is
1048 : used, for example, by the configuration code when it creates all the
1049 : workspaces, or the monitor that maps the entire system into one
1050 : address space. */
1051 : FD_FN_PURE ulong
1052 : fd_topo_mlock( fd_topo_t const * topo );
1053 :
1054 : /* This returns the number of gigantic pages needed by the topology on
1055 : the provided numa node. It includes pages needed by the workspaces,
1056 : as well as additional allocations like huge pages for process stacks
1057 : and private key storage. */
1058 :
1059 : FD_FN_PURE ulong
1060 : fd_topo_gigantic_page_cnt( fd_topo_t const * topo,
1061 : ulong numa_idx );
1062 :
1063 : /* This returns the number of huge pages in the application needed by
1064 : the topology on the provided numa node. It includes pages needed by
1065 : things placed in the hugetlbfs (workspaces, process stacks). If
1066 : include_anonymous is true, it also includes anonymous hugepages which
1067 : are needed but are not placed in the hugetlbfs. */
1068 :
1069 : FD_FN_PURE ulong
1070 : fd_topo_huge_page_cnt( fd_topo_t const * topo,
1071 : ulong numa_idx,
1072 : int include_anonymous );
1073 :
1074 : /* Prints a message describing the topology to an output stream. If
1075 : stdout is true, will be written to stdout, otherwise will be written
1076 : as a NOTICE log message to the log file. */
1077 : void
1078 : fd_topo_print_log( int stdout,
1079 : fd_topo_t * topo );
1080 :
1081 : FD_PROTOTYPES_END
1082 :
1083 : #endif /* HEADER_fd_src_disco_topo_fd_topo_h */
|