Line data Source code
1 : #ifndef HEADER_fd_src_disco_topo_fd_topo_h
2 : #define HEADER_fd_src_disco_topo_fd_topo_h
3 :
4 : #include "../stem/fd_stem.h"
5 : #include "../../tango/fd_tango.h"
6 : #include "../../waltz/xdp/fd_xdp1.h"
7 : #include "../../ballet/base58/fd_base58.h"
8 : #include "../../util/net/fd_net_headers.h"
9 :
10 : /* Maximum number of workspaces that may be present in a topology. */
11 : #define FD_TOPO_MAX_WKSPS (256UL)
12 : /* Maximum number of links that may be present in a topology. */
13 : #define FD_TOPO_MAX_LINKS (256UL)
14 : /* Maximum number of tiles that may be present in a topology. */
15 : #define FD_TOPO_MAX_TILES (256UL)
16 : /* Maximum number of objects that may be present in a topology. */
17 : #define FD_TOPO_MAX_OBJS (4096UL)
18 : /* Maximum number of links that may go into any one tile in the
19 : topology. */
20 : #define FD_TOPO_MAX_TILE_IN_LINKS ( 128UL)
21 : /* Maximum number of links that a tile may write to. */
22 : #define FD_TOPO_MAX_TILE_OUT_LINKS ( 32UL)
23 : /* Maximum number of objects that a tile can use. */
24 : #define FD_TOPO_MAX_TILE_OBJS ( 256UL)
25 :
26 : /* Maximum number of additional ip addresses */
27 : #define FD_NET_MAX_SRC_ADDR 4
28 :
29 : /* Maximum number of additional destinations for leader shreds and for retransmitted shreds */
30 : #define FD_TOPO_ADTL_DESTS_MAX ( 32UL)
31 :
32 :
33 : /* A workspace is a Firedancer specific memory management structure that
34 : sits on top of 1 or more memory mapped gigantic or huge pages mounted
35 : to the hugetlbfs. */
typedef struct {
  ulong id;           /* The ID of this workspace. Indexed from [0, wksp_cnt). When placed in a topology, the ID must be the index of the workspace in the workspaces list. */
  char  name[ 13UL ]; /* The name of this workspace, like "pack". There can be at most one of each workspace name in a topology. At most 12 characters plus the NUL terminator. */

  ulong numa_idx;     /* The index of the NUMA node on the system that this workspace should be allocated from. */

  int   is_locked;    /* If the workspace should use pages locked and pinned to a specific numa node. */

  /* Computed fields. These are not supplied as configuration but
     calculated as needed. (Anonymous struct member: fields are accessed
     directly on the workspace, e.g. wksp->page_sz.) */
  struct {
    ulong page_sz;  /* The size of the pages that this workspace is backed by. One of FD_PAGE_SIZE_*. */
    ulong page_cnt; /* The number of pages that must be mapped to this workspace to store all the data needed by consumers. */
    ulong part_max; /* The maximum number of partitions in the underlying workspace. There can only be this many allocations made at any one time. */

    fd_wksp_t * wksp;            /* The workspace memory in the local process. NULL until joined (see fd_topo_join_workspace). */
    ulong       known_footprint; /* Total size in bytes of all data in Firedancer that will be stored in this workspace at startup. */
    ulong       total_footprint; /* Total size in bytes of all data in Firedancer that could be stored in this workspace, includes known data and loose data. */
  };
} fd_topo_wksp_t;
55 :
56 : /* A link is an mcache in a workspace that has one producer and one or
57 : more consumers. A link may optionally also have a dcache, that holds
58 : fragments referred to by the mcache entries.
59 :
60 : A link belongs to exactly one workspace. A link has exactly one
61 : producer, and 1 or more consumers. Each consumer is either reliable
62 : or not reliable. A link has a depth and a MTU, which correspond to
62 : the depth and MTU of the mcache and dcache respectively. An MTU of
64 : zero means no dcache is needed, as there is no data. */
typedef struct {
  ulong id;           /* The ID of this link. Indexed from [0, link_cnt). When placed in a topology, the ID must be the index of the link in the links list. */
  char  name[ 13UL ]; /* The name of this link, like "pack_bank". There can be multiple of each link name in a topology. At most 12 characters plus the NUL terminator. */
  ulong kind_id;      /* The ID of this link within its name. If there are N links of a particular name, they have IDs [0, N). The pair (name, kind_id) uniquely identifies a link, as does "id" on its own. */

  ulong depth;        /* The depth of the mcache representing the link. */
  ulong mtu;          /* The MTU of data fragments in the mcache. A value of 0 means there is no dcache. */
  ulong burst;        /* The max amount of MTU sized data fragments that might be bursted to the dcache. */

  ulong mcache_obj_id; /* Object ID (index into the topology objs list) of the mcache backing this link. */
  ulong dcache_obj_id; /* Object ID of the dcache backing this link, if it has one (mtu!=0). */

  /* Computed fields. These are not supplied as configuration but
     calculated as needed. (Anonymous struct member.) */
  struct {
    fd_frag_meta_t * mcache; /* The mcache of this link. */
    void *           dcache; /* The dcache of this link, if it has one. */
  };

  uint permit_no_consumers : 1; /* Permit a topology where this link has no consumers */
  uint permit_no_producers : 1; /* Permit a topology where this link has no producers */
} fd_topo_link_t;
86 :
/* An (IPv4 address, UDP/TCP port) pair. Be careful: ip and port are in
   different byte orders. */
typedef struct {
  uint   ip;   /* in network byte order */
  ushort port; /* in host byte order */
} fd_topo_ip_port_t;
92 :
/* Configuration shared by the network-facing tile variants (see the
   net/xdp/sock members of the fd_topo_tile config union below). */
struct fd_topo_net_tile {
  ulong umem_dcache_obj_id; /* dcache for XDP UMEM frames */
  uint  bind_address;       /* IPv4 address to bind to; assumed network byte order — TODO confirm against callers */

  /* UDP ports this validator listens on for each protocol.  Assumed
     host byte order, matching fd_topo_ip_port_t.port — TODO confirm. */
  ushort shred_listen_port;
  ushort quic_transaction_listen_port;
  ushort legacy_transaction_listen_port;
  ushort gossip_listen_port;
  ushort repair_intake_listen_port;
  ushort repair_serve_listen_port;
  ushort send_src_port;
};
typedef struct fd_topo_net_tile fd_topo_net_tile_t;
106 :
107 : /* A tile is a unique process that is spawned by Firedancer to represent
108 : one thread of execution. Firedancer sandboxes all tiles to their own
109 : process for security reasons.
110 :
111 : A tile belongs to exactly one workspace. A tile is a consumer of 0
111 : or more links, its inputs. A tile is a producer of 0 or more output
113 : links.
114 :
115 : All input links will be automatically polled by the tile
116 : infrastructure, and output links will automatically source and manage
117 : credits from consumers. */
struct fd_topo_tile {
  ulong id;             /* The ID of this tile. Indexed from [0, tile_cnt). When placed in a topology, the ID must be the index of the tile in the tiles list. */
  char  name[ 7UL ];    /* The name of this tile. There can be multiple of each tile name in a topology. At most 6 characters plus the NUL terminator. */
  ulong kind_id;        /* The ID of this tile within its name. If there are N tiles of a particular name, they have IDs [0, N). The pair (name, kind_id) uniquely identifies a tile, as does "id" on its own. */
  int   is_agave;       /* If the tile needs to run in the Agave (Anza) address space or not. */
  int   allow_shutdown; /* If the tile is allowed to shutdown gracefully. If false, when the tile exits it will tear down the entire application. */

  ulong cpu_idx;        /* The CPU index to pin the tile on. A value of ULONG_MAX or more indicates the tile should be floating and not pinned to a core. */

  ulong in_cnt;                                        /* The number of links that this tile reads from. */
  ulong in_link_id[ FD_TOPO_MAX_TILE_IN_LINKS ];       /* The link_id of each link that this tile reads from, indexed in [0, in_cnt). */
  int   in_link_reliable[ FD_TOPO_MAX_TILE_IN_LINKS ]; /* If each link that this tile reads from is a reliable or unreliable consumer, indexed in [0, in_cnt). */
  int   in_link_poll[ FD_TOPO_MAX_TILE_IN_LINKS ];     /* If each link that this tile reads from should be polled by the tile infrastructure, indexed in [0, in_cnt).
                                                          If the link is not polled, the tile will not receive frags for it and the tile writer is responsible for
                                                          reading from the link. The link must be marked as unreliable as it is not flow controlled. */

  ulong out_cnt;                                   /* The number of links that this tile writes to. */
  ulong out_link_id[ FD_TOPO_MAX_TILE_OUT_LINKS ]; /* The link_id of each link that this tile writes to, indexed in [0, out_cnt). */

  ulong tile_obj_id;      /* Object ID (index into the topology objs list) backing this tile; presumably the tile's scratch memory — TODO confirm */
  ulong metrics_obj_id;   /* Object ID of the shared metrics memory (see the computed metrics field below). */
  ulong keyswitch_obj_id; /* Object ID for the tile's keyswitch; semantics defined elsewhere — TODO confirm */
  ulong in_link_fseq_obj_id[ FD_TOPO_MAX_TILE_IN_LINKS ]; /* Object ID of the fseq for each in link, indexed in [0, in_cnt) (see in_link_fseq below). */

  ulong uses_obj_cnt;                        /* Number of topology objects this tile uses, in [0, FD_TOPO_MAX_TILE_OBJS]. */
  ulong uses_obj_id[ FD_TOPO_MAX_TILE_OBJS ];   /* Object ID of each object this tile uses, indexed in [0, uses_obj_cnt). */
  int   uses_obj_mode[ FD_TOPO_MAX_TILE_OBJS ]; /* Access mode for each used object; presumably FD_SHMEM_JOIN_MODE_* (cf. fd_topo_join_workspace) — TODO confirm */

  /* Computed fields. These are not supplied as configuration but
     calculated as needed. (Anonymous struct member.) */
  struct {
    ulong * metrics; /* The shared memory for metrics that this tile should write. Consumed by monitoring and metrics writing tiles. */

    /* The fseq of each link that this tile reads from. Multiple fseqs
       may point to the link, if there are multiple consumers. An fseq
       can be uniquely identified via (link_id, tile_id), or (link_kind,
       link_kind_id, tile_kind, tile_kind_id) */
    ulong * in_link_fseq[ FD_TOPO_MAX_TILE_IN_LINKS ];
  };

  /* Configuration fields. These are required to be known by the
     topology so it can determine the total size of Firedancer in
     memory. Exactly one union member is meaningful for a given tile,
     selected by the tile's name. */
  union {
    fd_topo_net_tile_t net;

    struct {
      fd_topo_net_tile_t net;
      char interface[ 16 ];

      /* xdp specific options */
      ulong xdp_rx_queue_size;
      ulong xdp_tx_queue_size;
      ulong free_ring_depth;
      long  tx_flush_timeout_ns;
      char  xdp_mode[8];
      int   zero_copy;

      ulong netdev_dbl_buf_obj_id; /* dbl_buf containing netdev_tbl */
      ulong fib4_main_obj_id;      /* fib4 containing main route table */
      ulong fib4_local_obj_id;     /* fib4 containing local route table */
      ulong neigh4_obj_id;         /* neigh4 hash map header */
      ulong neigh4_ele_obj_id;     /* neigh4 hash map slots */
    } xdp;

    struct {
      fd_topo_net_tile_t net;
      /* sock specific options */
      int so_sndbuf; /* SO_SNDBUF socket option value — TODO confirm */
      int so_rcvbuf; /* SO_RCVBUF socket option value — TODO confirm */
    } sock;

    struct {
      ulong netdev_dbl_buf_obj_id; /* dbl_buf containing netdev_tbl */
      ulong fib4_main_obj_id;      /* fib4 containing main route table */
      ulong fib4_local_obj_id;     /* fib4 containing local route table */
      char  neigh_if[ 16 ];        /* neigh4 interface name */
      ulong neigh4_obj_id;         /* neigh4 hash map header */
      ulong neigh4_ele_obj_id;     /* neigh4 hash map slots */
    } netlink;

    struct {
      uint   out_depth;
      uint   reasm_cnt;
      ulong  max_concurrent_connections;
      ulong  max_concurrent_handshakes;
      ushort quic_transaction_listen_port;
      ulong  idle_timeout_millis;
      uint   ack_delay_millis;
      int    retry;
      char   key_log_path[ PATH_MAX ];
    } quic;

    struct {
      ulong tcache_depth;
    } verify;

    struct {
      ulong tcache_depth;
    } dedup;

    struct {
      char  url[ 256 ];
      ulong url_len;
      char  sni[ 256 ];
      ulong sni_len;
      char  identity_key_path[ PATH_MAX ];
      char  key_log_path[ PATH_MAX ];
      ulong buf_sz;
      ulong ssl_heap_sz;
      ulong keepalive_interval_nanos;
      uchar tls_cert_verify : 1;
    } bundle;

    struct {
      ulong max_pending_transactions;
      ulong bank_tile_count;
      int   larger_max_cost_per_block;
      int   larger_shred_limits_per_block;
      int   use_consumed_cus;
      int   schedule_strategy;
      struct {
        int   enabled;
        uchar tip_distribution_program_addr[ 32 ];
        uchar tip_payment_program_addr[ 32 ];
        uchar tip_distribution_authority[ 32 ];
        ulong commission_bps;
        char  identity_key_path[ PATH_MAX ];
        char  vote_account_path[ PATH_MAX ]; /* or pubkey is okay */
      } bundle;
    } pack;

    struct {
      int   lagged_consecutive_leader_start;
      int   plugins_enabled;
      ulong bank_cnt;
      char  identity_key_path[ PATH_MAX ];
      struct {
        int   enabled;
        uchar tip_payment_program_addr[ 32 ];
        uchar tip_distribution_program_addr[ 32 ];
        char  vote_account_path[ PATH_MAX ];
      } bundle;
    } poh;

    struct {
      ulong  depth;
      ulong  fec_resolver_depth;
      char   identity_key_path[ PATH_MAX ];
      ushort shred_listen_port;
      int    larger_shred_limits_per_block;
      ulong  expected_shred_version;
      ulong             adtl_dests_retransmit_cnt; /* in [0, FD_TOPO_ADTL_DESTS_MAX] */
      fd_topo_ip_port_t adtl_dests_retransmit[ FD_TOPO_ADTL_DESTS_MAX ];
      ulong             adtl_dests_leader_cnt;     /* in [0, FD_TOPO_ADTL_DESTS_MAX] */
      fd_topo_ip_port_t adtl_dests_leader[ FD_TOPO_ADTL_DESTS_MAX ];
    } shred;

    struct {
      ulong disable_blockstore_from_slot;
    } store;

    struct {
      char identity_key_path[ PATH_MAX ];
    } sign;

    struct {
      uint   listen_addr;
      ushort listen_port;

      int is_voting;

      char cluster[ 32 ];
      char identity_key_path[ PATH_MAX ];
      char vote_key_path[ PATH_MAX ];

      ulong max_http_connections;
      ulong max_websocket_connections;
      ulong max_http_request_length;
      ulong send_buffer_size_mb;
      int   schedule_strategy;
    } gui;

    struct {
      uint   prometheus_listen_addr;
      ushort prometheus_listen_port;
    } metric;

    struct {
      ulong fec_max;
      ulong max_vote_accounts;

      int   tx_metadata_storage;
      ulong funk_obj_id;
      char  funk_checkpt[ PATH_MAX ];
      char  genesis[ PATH_MAX ];
      char  slots_replayed[ PATH_MAX ];
      char  shred_cap[ PATH_MAX ];
      char  status_cache[ PATH_MAX ];
      char  cluster_version[ 32 ];
      char  tower_checkpt[ PATH_MAX ];
      int   plugins_enabled;

      char identity_key_path[ PATH_MAX ];
      uint ip_addr;
      char vote_account_path[ PATH_MAX ];

      char blockstore_file[ PATH_MAX ];
      char blockstore_checkpt[ PATH_MAX ];

      /* not specified in TOML */

      ulong enable_features_cnt;
      char  enable_features[ 16 ][ FD_BASE58_ENCODED_32_SZ ];

      ulong enable_bank_hash_cmp;

      ulong max_exec_slices;

      ulong capture_start_slot;
      char  solcap_capture[ PATH_MAX ];
      char  dump_proto_dir[ PATH_MAX ];
      int   dump_block_to_pb;

      ulong manifest_dcache_obj_id;
    } replay;

    struct {
      ulong funk_obj_id;

      ulong capture_start_slot;
      char  dump_proto_dir[ PATH_MAX ];
      int   dump_instr_to_pb;
      int   dump_txn_to_pb;
      int   dump_syscall_to_pb;
    } exec;

    struct {
      ulong funk_obj_id;
    } writer;

    struct {
      ushort send_to_port;
      uint   send_to_ip_addr;
      ulong  conn_cnt;
      int    no_quic;
    } benchs;

    struct {
      ushort rpc_port;
      uint   rpc_ip_addr;
    } bencho;

    struct {
      ulong accounts_cnt;
      int   mode;
      float contending_fraction;
      float cu_price_spread;
    } benchg;

    struct {
      ushort gossip_listen_port;
#     define FD_TOPO_GOSSIP_ENTRYPOINTS_MAX 16
      ulong         entrypoints_cnt; /* in [0, FD_TOPO_GOSSIP_ENTRYPOINTS_MAX] */
      fd_ip4_port_t entrypoints[ FD_TOPO_GOSSIP_ENTRYPOINTS_MAX ];
      uint   ip_addr;
      char   identity_key_path[ PATH_MAX ];
      ushort tvu_port;
      ushort tpu_port;
      ushort tpu_quic_port;
      ushort tpu_vote_port;
      ushort repair_serve_port;
      ulong  expected_shred_version;
    } gossip;

    struct {
      ushort repair_intake_listen_port;
      ushort repair_serve_listen_port;
      char   good_peer_cache_file[ PATH_MAX ];

      /* non-config */

      int   good_peer_cache_file_fd;
      char  identity_key_path[ PATH_MAX ];
      ulong max_pending_shred_sets;
      ulong slot_max;
    } repair;

    struct {
      char slots_pending[PATH_MAX];

      ulong expected_shred_version;

      /* non-config */

      char  identity_key_path[ PATH_MAX ];
      char  shred_cap_archive[ PATH_MAX ];
      char  shred_cap_replay[ PATH_MAX ];
      ulong shred_cap_end_slot;

      char blockstore_file[ PATH_MAX ];
      char blockstore_restore[ PATH_MAX ];
    } store_int;

    struct {
      ushort send_src_port;

      /* non-config */

      uint ip_addr;
      char identity_key_path[ PATH_MAX ];
    } send;

    struct {
      ulong  funk_obj_id;
      ushort rpc_port;
      ushort tpu_port;
      uint   tpu_ip_addr;
      char   identity_key_path[ PATH_MAX ];
      uint   block_index_max;
      uint   txn_index_max;
      uint   acct_index_max;
      char   history_file[ PATH_MAX ];
    } rpcserv;

    struct {
      uint fake_dst_ip;
    } pktgen;

    struct {
      ulong end_slot;
      char  rocksdb_path[ PATH_MAX ];
      char  shredcap_path[ PATH_MAX ];
      char  bank_hash_path[ PATH_MAX ];
      char  ingest_mode[ 32 ];

      /* Set internally by the archiver tile */
      int archive_fd;
    } archiver;

    struct {
      ulong funk_obj_id;
      char  identity_key_path[ PATH_MAX ];
      char  vote_acc_path[ PATH_MAX ];
    } tower;
    struct {
      char   folder_path[ PATH_MAX ];
      ushort repair_intake_listen_port;
      ulong  write_buffer_size; /* Size of the write buffer for the capture tile */
      int    enable_publish_stake_weights;
      char   manifest_path[ PATH_MAX ];

      /* Set internally by the capture tile */
      int shreds_fd;
      int requests_fd;
      int fecs_fd;
      int peers_fd;
      int bank_hashes_fd;
      int slices_fd;
    } shredcap;

    struct {
      char snapshots_path[ PATH_MAX ];
      char cluster[ 8UL ];
      int  incremental_snapshot_fetch;
      int  do_download;
      uint maximum_local_snapshot_age;
      uint minimum_download_speed_mib;
      uint maximum_download_retry_abort;
    } snaprd;

    struct {
      ulong funk_obj_id;
    } snapin;

  };
};

typedef struct fd_topo_tile fd_topo_tile_t;
495 :
/* An object is a named, sized region of memory placed inside one of the
   topology's workspaces. */
typedef struct {
  ulong id;           /* The ID of this object. Indexed from [0, obj_cnt). */
  char  name[ 13UL ]; /* The name of this object. At most 12 characters plus the NUL terminator. */
  ulong wksp_id;      /* The ID of the workspace this object lives in. */

  ulong offset;       /* Byte offset of the object from the start of its workspace (see fd_topo_obj_laddr). */
  ulong footprint;    /* Size of the object in bytes. */
} fd_topo_obj_t;
504 :
505 : /* An fd_topo_t represents the overall structure of a Firedancer
506 : configuration, describing all the workspaces, tiles, and links
507 : between them. */
struct fd_topo {
  char  app_name[ 256UL ];  /* Name of this application instance; presumably used to namespace shared memory paths (cf. fd_topo_tile_stack_join) — TODO confirm */
  uchar props[ 16384UL ];   /* Opaque property blob; format not visible in this header — TODO confirm against writers */

  ulong wksp_cnt;           /* Number of valid entries in workspaces, in [0, FD_TOPO_MAX_WKSPS]. */
  ulong link_cnt;           /* Number of valid entries in links, in [0, FD_TOPO_MAX_LINKS]. */
  ulong tile_cnt;           /* Number of valid entries in tiles, in [0, FD_TOPO_MAX_TILES]. */
  ulong obj_cnt;            /* Number of valid entries in objs, in [0, FD_TOPO_MAX_OBJS]. */

  fd_topo_wksp_t workspaces[ FD_TOPO_MAX_WKSPS ];
  fd_topo_link_t links[ FD_TOPO_MAX_LINKS ];
  fd_topo_tile_t tiles[ FD_TOPO_MAX_TILES ];
  fd_topo_obj_t  objs[ FD_TOPO_MAX_OBJS ];

  ulong agave_affinity_cnt;                    /* Number of valid entries in agave_affinity_cpu_idx. */
  ulong agave_affinity_cpu_idx[ FD_TILE_MAX ]; /* CPU indices reserved for the Agave child process — TODO confirm */

  ulong max_page_size;           /* 2^21 or 2^30 */
  ulong gigantic_page_threshold; /* see [hugetlbfs.gigantic_page_threshold_mib]*/
};
typedef struct fd_topo fd_topo_t;
529 :
/* Callbacks and settings describing how to run one kind of tile. The
   function pointers are invoked by the tile runner; NULL-ness
   conventions are defined by the callers, not visible here. */
typedef struct {
  char const * name;              /* Tile name this runner handles; matches fd_topo_tile_t.name. */

  int   keep_host_networking;     /* If the sandbox should keep access to host networking — TODO confirm */
  int   allow_connect;            /* If the sandboxed tile may call connect(2) — TODO confirm */
  ulong rlimit_file_cnt;          /* RLIMIT_NOFILE value to set for the tile process — TODO confirm */
  ulong rlimit_address_space;     /* RLIMIT_AS value — TODO confirm */
  ulong rlimit_data;              /* RLIMIT_DATA value — TODO confirm */
  int   for_tpool;                /* NOTE(review): purpose not visible in this header — TODO confirm */

  /* Fill out with the seccomp filter instructions allowed for this tile;
     returns the number of entries written to out (capacity out_cnt). */
  ulong (*populate_allowed_seccomp)( fd_topo_t const * topo, fd_topo_tile_t const * tile, ulong out_cnt, struct sock_filter * out );
  /* Fill out_fds with the file descriptors the tile is allowed to keep;
     returns the number written (capacity out_fds_sz). */
  ulong (*populate_allowed_fds    )( fd_topo_t const * topo, fd_topo_tile_t const * tile, ulong out_fds_sz, int * out_fds );
  ulong (*scratch_align           )( void );                           /* Alignment of the tile's scratch region. */
  ulong (*scratch_footprint       )( fd_topo_tile_t const * tile );    /* Size in bytes of the tile's scratch region. */
  ulong (*loose_footprint         )( fd_topo_tile_t const * tile );    /* Additional loose (unstructured) footprint in bytes. */
  void  (*privileged_init         )( fd_topo_t * topo, fd_topo_tile_t * tile ); /* Init run before sandboxing (may open files, sockets, ...). */
  void  (*unprivileged_init       )( fd_topo_t * topo, fd_topo_tile_t * tile ); /* Init run after sandboxing. */
  void  (*run                     )( fd_topo_t * topo, fd_topo_tile_t * tile ); /* Tile main loop; expected to never return. */
  ulong (*rlimit_file_cnt_fn      )( fd_topo_t const * topo, fd_topo_tile_t const * tile ); /* Dynamic RLIMIT_NOFILE override — TODO confirm precedence over rlimit_file_cnt */
} fd_topo_run_tile_t;
550 :
/* Callbacks describing how to size and construct one kind of topology
   object. The "new" callback is applied to resident objects by
   fd_topo_wksp_new (declared below). */
struct fd_topo_obj_callbacks {
  char const * name;                                                   /* Object name these callbacks apply to. */
  ulong (* footprint )( fd_topo_t const * topo, fd_topo_obj_t const * obj ); /* Size of the object in bytes. */
  ulong (* align     )( fd_topo_t const * topo, fd_topo_obj_t const * obj ); /* Required alignment of the object in bytes. */
  ulong (* loose     )( fd_topo_t const * topo, fd_topo_obj_t const * obj ); /* Additional loose (unstructured) space in bytes — TODO confirm */
  void  (* new       )( fd_topo_t const * topo, fd_topo_obj_t const * obj ); /* Format the object's memory in its workspace. */
};

typedef struct fd_topo_obj_callbacks fd_topo_obj_callbacks_t;
560 :
561 : FD_PROTOTYPES_BEGIN
562 :
563 : FD_FN_CONST static inline ulong
564 0 : fd_topo_workspace_align( void ) {
565 : /* This needs to be the max( align ) of all the child members that
566 : could be aligned into this workspace, otherwise our footprint
567 : calculation will not be correct. For now just set to 4096 but this
568 : should probably be calculated dynamically, or we should reduce
569 : those child aligns if we can. */
570 0 : return 4096UL;
571 0 : }
572 :
573 : static inline void *
574 : fd_topo_obj_laddr( fd_topo_t const * topo,
575 24 : ulong obj_id ) {
576 24 : fd_topo_obj_t const * obj = &topo->objs[ obj_id ];
577 24 : FD_TEST( obj_id<FD_TOPO_MAX_OBJS );
578 24 : FD_TEST( obj->id == obj_id );
579 24 : FD_TEST( obj->offset );
580 24 : return (void *)((ulong)topo->workspaces[ obj->wksp_id ].wksp + obj->offset);
581 24 : }
582 :
583 : FD_FN_PURE static inline ulong
584 : fd_topo_tile_name_cnt( fd_topo_t const * topo,
585 0 : char const * name ) {
586 0 : ulong cnt = 0;
587 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
588 0 : if( FD_UNLIKELY( !strcmp( topo->tiles[ i ].name, name ) ) ) cnt++;
589 0 : }
590 0 : return cnt;
591 0 : }
592 :
593 : /* Finds the workspace of a given name in the topology. Returns
594 : ULONG_MAX if there is no such workspace. There can be at most one
595 : workspace of a given name. */
596 :
597 : FD_FN_PURE static inline ulong
598 : fd_topo_find_wksp( fd_topo_t const * topo,
599 39 : char const * name ) {
600 39 : for( ulong i=0; i<topo->wksp_cnt; i++ ) {
601 39 : if( FD_UNLIKELY( !strcmp( topo->workspaces[ i ].name, name ) ) ) return i;
602 39 : }
603 0 : return ULONG_MAX;
604 39 : }
605 :
606 : /* Find the tile of a given name and kind_id in the topology, there will
607 : be at most one such tile, since kind_id is unique among the name.
608 : Returns ULONG_MAX if there is no such tile. */
609 :
610 : FD_FN_PURE static inline ulong
611 : fd_topo_find_tile( fd_topo_t const * topo,
612 : char const * name,
613 6 : ulong kind_id ) {
614 6 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
615 6 : if( FD_UNLIKELY( !strcmp( topo->tiles[ i ].name, name ) ) && topo->tiles[ i ].kind_id == kind_id ) return i;
616 6 : }
617 0 : return ULONG_MAX;
618 6 : }
619 :
620 : /* Find the link of a given name and kind_id in the topology, there will
621 : be at most one such link, since kind_id is unique among the name.
622 : Returns ULONG_MAX if there is no such link. */
623 :
624 : FD_FN_PURE static inline ulong
625 : fd_topo_find_link( fd_topo_t const * topo,
626 : char const * name,
627 6 : ulong kind_id ) {
628 9 : for( ulong i=0; i<topo->link_cnt; i++ ) {
629 9 : if( FD_UNLIKELY( !strcmp( topo->links[ i ].name, name ) ) && topo->links[ i ].kind_id == kind_id ) return i;
630 9 : }
631 0 : return ULONG_MAX;
632 6 : }
633 :
634 : FD_FN_PURE static inline ulong
635 : fd_topo_find_tile_in_link( fd_topo_t const * topo,
636 : fd_topo_tile_t const * tile,
637 : char const * name,
638 0 : ulong kind_id ) {
639 0 : for( ulong i=0; i<tile->in_cnt; i++ ) {
640 0 : if( FD_UNLIKELY( !strcmp( topo->links[ tile->in_link_id[ i ] ].name, name ) )
641 0 : && topo->links[ tile->in_link_id[ i ] ].kind_id == kind_id ) return i;
642 0 : }
643 0 : return ULONG_MAX;
644 0 : }
645 :
646 : FD_FN_PURE static inline ulong
647 : fd_topo_find_tile_out_link( fd_topo_t const * topo,
648 : fd_topo_tile_t const * tile,
649 : char const * name,
650 0 : ulong kind_id ) {
651 0 : for( ulong i=0; i<tile->out_cnt; i++ ) {
652 0 : if( FD_UNLIKELY( !strcmp( topo->links[ tile->out_link_id[ i ] ].name, name ) )
653 0 : && topo->links[ tile->out_link_id[ i ] ].kind_id == kind_id ) return i;
654 0 : }
655 0 : return ULONG_MAX;
656 0 : }
657 :
658 : /* Find the id of the tile which is a producer for the given link. If
659 : no tile is a producer for the link, returns ULONG_MAX. This should
660 : not be possible for a well formed and validated topology. */
661 : FD_FN_PURE static inline ulong
662 : fd_topo_find_link_producer( fd_topo_t const * topo,
663 0 : fd_topo_link_t const * link ) {
664 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
665 0 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
666 :
667 0 : for( ulong j=0; j<tile->out_cnt; j++ ) {
668 0 : if( FD_UNLIKELY( tile->out_link_id[ j ] == link->id ) ) return i;
669 0 : }
670 0 : }
671 0 : return ULONG_MAX;
672 0 : }
673 :
674 : /* Given a link, count the number of consumers of that link among all
675 : the tiles in the topology. */
676 : FD_FN_PURE static inline ulong
677 : fd_topo_link_consumer_cnt( fd_topo_t const * topo,
678 0 : fd_topo_link_t const * link ) {
679 0 : ulong cnt = 0;
680 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
681 0 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
682 0 : for( ulong j=0; j<tile->in_cnt; j++ ) {
683 0 : if( FD_UNLIKELY( tile->in_link_id[ j ] == link->id ) ) cnt++;
684 0 : }
685 0 : }
686 :
687 0 : return cnt;
688 0 : }
689 :
690 : /* Given a link, count the number of reliable consumers of that link
691 : among all the tiles in the topology. */
692 : FD_FN_PURE static inline ulong
693 : fd_topo_link_reliable_consumer_cnt( fd_topo_t const * topo,
694 0 : fd_topo_link_t const * link ) {
695 0 : ulong cnt = 0;
696 0 : for( ulong i=0; i<topo->tile_cnt; i++ ) {
697 0 : fd_topo_tile_t const * tile = &topo->tiles[ i ];
698 0 : for( ulong j=0; j<tile->in_cnt; j++ ) {
699 0 : if( FD_UNLIKELY( tile->in_link_id[ j ] == link->id && tile->in_link_reliable[ j ] ) ) cnt++;
700 0 : }
701 0 : }
702 0 :
703 0 : return cnt;
704 0 : }
705 :
706 : FD_FN_PURE static inline ulong
707 : fd_topo_tile_consumer_cnt( fd_topo_t const * topo,
708 0 : fd_topo_tile_t const * tile ) {
709 0 : (void)topo;
710 0 : return tile->out_cnt;
711 0 : }
712 :
713 : FD_FN_PURE static inline ulong
714 : fd_topo_tile_reliable_consumer_cnt( fd_topo_t const * topo,
715 0 : fd_topo_tile_t const * tile ) {
716 0 : ulong reliable_cons_cnt = 0UL;
717 0 : for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
718 0 : fd_topo_tile_t const * consumer_tile = &topo->tiles[ i ];
719 0 : for( ulong j=0UL; j<consumer_tile->in_cnt; j++ ) {
720 0 : for( ulong k=0UL; k<tile->out_cnt; k++ ) {
721 0 : if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) {
722 0 : reliable_cons_cnt++;
723 0 : }
724 0 : }
725 0 : }
726 0 : }
727 0 : return reliable_cons_cnt;
728 0 : }
729 :
730 : FD_FN_PURE static inline ulong
731 : fd_topo_tile_producer_cnt( fd_topo_t const * topo,
732 0 : fd_topo_tile_t const * tile ) {
733 0 : (void)topo;
734 0 : ulong in_cnt = 0UL;
735 0 : for( ulong i=0UL; i<tile->in_cnt; i++ ) {
736 0 : if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue;
737 0 : in_cnt++;
738 0 : }
739 0 : return in_cnt;
740 0 : }
741 :
742 : /* Join (map into the process) all shared memory (huge/gigantic pages)
743 : needed by the tile, in the given topology. All memory associated
744 : with the tile (aka. used by links that the tile either produces to or
745 : consumes from, or used by the tile itself for its cnc) will be
746 : attached (mapped into the process).
747 :
748 : This is needed to play nicely with the sandbox. Once a process is
749 : sandboxed we can no longer map any memory. */
750 : void
751 : fd_topo_join_tile_workspaces( fd_topo_t * topo,
752 : fd_topo_tile_t * tile );
753 :
754 : /* Join (map into the process) the shared memory (huge/gigantic pages)
755 : for the given workspace. Mode is one of
756 : FD_SHMEM_JOIN_MODE_READ_WRITE or FD_SHMEM_JOIN_MODE_READ_ONLY and
757 : determines the prot argument that will be passed to mmap when mapping
758 : the pages in (PROT_WRITE or PROT_READ respectively). */
759 : void
760 : fd_topo_join_workspace( fd_topo_t * topo,
761 : fd_topo_wksp_t * wksp,
762 : int mode );
763 :
764 : /* Join (map into the process) all shared memory (huge/gigantic pages)
765 : needed by all tiles in the topology. Mode is one of
766 : FD_SHMEM_JOIN_MODE_READ_WRITE or FD_SHMEM_JOIN_MODE_READ_ONLY and
767 : determines the prot argument that will be passed to mmap when
768 : mapping the pages in (PROT_WRITE or PROT_READ respectively). */
769 : void
770 : fd_topo_join_workspaces( fd_topo_t * topo,
771 : int mode );
772 :
773 : /* Leave (unmap from the process) the shared memory needed for the
774 : given workspace in the topology, if it was previously mapped.
775 :
776 : topo and wksp are assumed non-NULL. It is OK if the workspace
777 : has not been previously joined, in which case this is a no-op. */
778 :
779 : void
780 : fd_topo_leave_workspace( fd_topo_t * topo,
781 : fd_topo_wksp_t * wksp );
782 :
783 : /* Leave (unmap from the process) all shared memory needed by all
784 : tiles in the topology, if each of them was mapped.
785 :
786 : topo is assumed non-NULL. Only workspaces which were previously
787 : joined are unmapped. */
788 :
789 : void
790 : fd_topo_leave_workspaces( fd_topo_t * topo );
791 :
792 : /* Create the given workspace needed by the topology on the system.
793 : This does not "join" the workspaces (map their memory into the
794 : process), but only creates the .wksp file and formats it correctly
795 : as a workspace.
796 :
797 : Returns 0 on success and -1 on failure, with errno set to the error.
798 : The only reason for failure currently that will be returned is
799 : ENOMEM, as other unexpected errors will cause the program to exit.
800 :
801 : If update_existing is 1, the workspace will not be created from
802 : scratch but it will be assumed that it already exists from a prior
803 : run and needs to be maybe resized and then have the header
804 : structures reinitialized. This can save a very expensive operation
805 : of zeroing all of the workspace pages. This is dangerous in
806 : production because it can leave stray memory from prior runs around,
807 : and should only be used in development environments. */
808 :
809 : int
810 : fd_topo_create_workspace( fd_topo_t * topo,
811 : fd_topo_wksp_t * wksp,
812 : int update_existing );
813 :
814 : /* Join the standard IPC objects needed by the topology of this particular
815 : tile */
816 :
817 : void
818 : fd_topo_fill_tile( fd_topo_t * topo,
819 : fd_topo_tile_t * tile );
820 :
821 : /* Same as fd_topo_fill_tile but fills in all the objects for a
822 : particular workspace with the given mode. */
823 : void
824 : fd_topo_workspace_fill( fd_topo_t * topo,
825 : fd_topo_wksp_t * wksp );
826 :
827 : /* Apply a new function to every object that is resident in the given
828 : workspace in the topology. */
829 :
830 : void
831 : fd_topo_wksp_new( fd_topo_t const * topo,
832 : fd_topo_wksp_t const * wksp,
833 : fd_topo_obj_callbacks_t ** callbacks );
834 :
835 : /* Same as fd_topo_fill_tile but fills in all tiles in the topology. */
836 :
837 : void
838 : fd_topo_fill( fd_topo_t * topo );
839 :
840 : /* fd_topo_tile_stack_join joins a huge page optimized stack for the
841 : provided tile. The stack is assumed to already exist at a known
842 : path in the hugetlbfs mount. */
843 :
844 : void *
845 : fd_topo_tile_stack_join( char const * app_name,
846 : char const * tile_name,
847 : ulong tile_kind_id );
848 :
849 : /* Install the XDP program needed by the net tiles into the local device
850 : and return the xsk_map_fd. bind_addr is an optional IPv4 address to
851 : used for filtering by dst IP. */
852 :
853 : fd_xdp_fds_t
854 : fd_topo_install_xdp( fd_topo_t const * topo,
855 : uint bind_addr );
856 :
857 : /* fd_topo_run_single_process runs all the tiles in a single process
858 : (the calling process). This spawns a thread for each tile, switches
859 : that thread to the given UID and GID and then runs the tile in it.
860 : Each thread will never exit, as tiles are expected to run forever.
861 : An error is logged and the application will exit if a tile exits.
862 : The function itself does return after spawning all the threads.
863 :
864 : The threads will not be sandboxed in any way, except switching to the
865 : provided UID and GID, so they will share the same address space, and
866 : not have any seccomp restrictions or use any Linux namespaces. The
867 : calling thread will also switch to the provided UID and GID before
868 : it returns.
869 :
870 : In production, when running with an Agave child process this is
871 : used for spawning certain tiles inside the Agave address space.
872 : It's also useful for tooling and debugging, but is not how the main
873 : production Firedancer process runs. For production, each tile is run
874 : in its own address space with a separate process and full security
875 : sandbox.
876 :
877 : The agave argument determines which tiles are started. If the
878 : argument is 0 or 1, only non-agave (or only agave) tiles are started.
879 : If the argument is any other value, all tiles in the topology are
880 : started regardless of whether they are Agave tiles or not. */
881 :
882 : void
883 : fd_topo_run_single_process( fd_topo_t * topo,
884 : int agave,
885 : uint uid,
886 : uint gid,
887 : fd_topo_run_tile_t (* tile_run )( fd_topo_tile_t const * tile ) );
888 :
889 : /* fd_topo_run_tile runs the given tile directly within the current
890 : process (and thread). The function will never return, as tiles are
891 : expected to run forever. An error is logged and the application will
892 : exit if the tile exits.
893 :
894 : The sandbox argument determines if the current process will be
895 : sandboxed fully before starting the tile. The thread will switch to
896 : the UID and GID provided before starting the tile, even if the thread
897 : is not being sandboxed. Although POSIX specifies that all threads in
898 : a process must share a UID and GID, this is not the case on Linux.
899 : The thread will switch to the provided UID and GID without switching
900 : the other threads in the process.
901 :
902 : If keep_controlling_terminal is set to 0, and the sandbox is enabled
903 : the controlling terminal will be detached as an additional sandbox
904 : measure, but you will not be able to send Ctrl+C or other signals
905 : from the terminal. See fd_sandbox.h for more information.
906 :
907 : The allow_fd argument is only used if sandbox is true, and is a file
908 : descriptor which will be allowed to exist in the process. Normally
909 : the sandbox code rejects and aborts if there is an unexpected file
910 : descriptor present on boot. This is helpful to allow a parent
911 : process to be notified on termination of the tile by waiting for a
912 : pipe file descriptor to get closed.
913 :
914 : wait and debugger are both used in debugging. If wait is non-NULL,
915 : the runner will wait until the value pointed to by wait is non-zero
916 : before launching the tile. Likewise, if debugger is non-NULL, the
917 : runner will wait until a debugger is attached before setting the
918 : value pointed to by debugger to non-zero. These are intended to be
919 : used as a pair, where many tiles share a waiting reference, and then
920 : one of the tiles (a tile you want to attach the debugger to) has the
921 : same reference provided as the debugger, so all tiles will stop and
922 : wait for the debugger to attach to it before proceeding. */
923 :
924 : void
925 : fd_topo_run_tile( fd_topo_t * topo,
926 : fd_topo_tile_t * tile,
927 : int sandbox,
928 : int keep_controlling_terminal,
929 : int dumpable,
930 : uint uid,
931 : uint gid,
932 : int allow_fd,
933 : volatile int * wait,
934 : volatile int * debugger,
935 : fd_topo_run_tile_t * tile_run );
936 :
937 : /* This is for determining the value of RLIMIT_MLOCK that we need to
938 : successfully run all tiles in separate processes. The value returned
939 : is the maximum amount of memory that will be locked with mlock() by
940 : any individual process in the tree. Specifically, if we have three
941 : tile processes, and they each need to lock 5, 9, and 2 MiB of memory
942 : respectively, RLIMIT_MLOCK needs to be 9 MiB to allow all three
943 : process mlock() calls to succeed.
944 :
945 : Tiles lock memory in three ways. Any workspace they are using, they
946 : lock the entire workspace. Then each tile uses huge pages for the
947 : stack which are also locked, and finally some tiles use private
948 : locked mmaps outside the workspace for storing key material. The
949 : results here include all of this memory together.
950 :
951 : The result is not necessarily the amount of memory used by the tile
952 : process, although it will be quite close. Tiles could potentially
953 : allocate memory (eg, with brk) without needing to lock it, which
954 : would not need to be included, and some kernel memory that tiles cause
955 : to be allocated (for example XSK buffers) is also not included. The
956 : actual amount of memory used will not be less than this value. */
957 : FD_FN_PURE ulong
958 : fd_topo_mlock_max_tile( fd_topo_t const * topo );
959 :
960 : /* Same as fd_topo_mlock_max_tile, but for loading the entire topology
961 : into one process, rather than a separate process per tile. This is
962 : used, for example, by the configuration code when it creates all the
963 : workspaces, or the monitor that maps the entire system into one
964 : address space. */
965 : FD_FN_PURE ulong
966 : fd_topo_mlock( fd_topo_t const * topo );
967 :
968 : /* This returns the number of gigantic pages needed by the topology on
969 : the provided numa node. It includes pages needed by the workspaces,
970 : as well as additional allocations like huge pages for process stacks
971 : and private key storage. */
972 :
973 : FD_FN_PURE ulong
974 : fd_topo_gigantic_page_cnt( fd_topo_t const * topo,
975 : ulong numa_idx );
976 :
977 : /* This returns the number of huge pages in the application needed by
978 : the topology on the provided numa node. It includes pages needed by
979 : things placed in the hugetlbfs (workspaces, process stacks). If
980 : include_anonymous is true, it also includes anonymous hugepages which
981 : are needed but are not placed in the hugetlbfs. */
982 :
983 : FD_FN_PURE ulong
984 : fd_topo_huge_page_cnt( fd_topo_t const * topo,
985 : ulong numa_idx,
986 : int include_anonymous );
987 :
988 : /* Prints a message describing the topology to an output stream. If
989 : stdout is true, will be written to stdout, otherwise will be written
990 : as a NOTICE log message to the log file. */
991 : void
992 : fd_topo_print_log( int stdout,
993 : fd_topo_t * topo );
994 :
995 : FD_PROTOTYPES_END
996 :
997 : #endif /* HEADER_fd_src_disco_topo_fd_topo_h */
|