Line data Source code
1 : /* The repair command spawns a smaller topology for profiling the repair
2 : tile. This is a standalone application, and it can be run in mainnet,
3 : testnet and/or a private cluster. */
4 :
5 : #include "../../../disco/net/fd_net_tile.h"
6 : #include "../../../disco/tiles.h"
7 : #include "../../../disco/topo/fd_topob.h"
8 : #include "../../../disco/topo/fd_cpu_topo.h"
9 : #include "../../../util/pod/fd_pod_format.h"
10 : #include "../../../util/tile/fd_tile_private.h"
11 :
12 : #include "../../firedancer/topology.h"
13 : #include "../../shared/commands/configure/configure.h"
14 : #include "../../shared/commands/run/run.h" /* initialize_workspaces */
15 : #include "../../shared/fd_config.h" /* config_t */
16 : #include "../../shared_dev/commands/dev.h"
17 : #include "../../../disco/tiles.h"
18 : #include "../../../disco/topo/fd_topob.h"
19 : #include "../../../util/pod/fd_pod_format.h"
20 : #include "../../../waltz/resolv/fd_io_readline.h"
21 : #include "../../platform/fd_sys_util.h"
22 : #include "../../shared/commands/monitor/helper.h"
23 : #include "../../../disco/metrics/fd_metrics.h"
24 : #include "../../../discof/restore/utils/fd_ssmanifest_parser.h"
25 : #include "../../../flamenco/runtime/sysvar/fd_sysvar_epoch_schedule.h"
26 : #include "../../../flamenco/stakes/fd_stake_weight.h"
27 : #include "../../../flamenco/leaders/fd_leaders_base.h"
28 : #include "../../../discof/repair/fd_repair_tile.c"
29 :
30 : #include "gossip.h"
31 : #include "core_subtopo.h"
32 :
33 : #include <unistd.h> /* pause */
34 : #include <fcntl.h>
35 : #include <stdio.h>
36 : #include <termios.h>
37 : #include <errno.h>
38 :
39 : extern action_t fd_action_repair;
40 :
41 : struct fd_location_info {
42 : ulong ip4_addr; /* for map key convenience */
43 : char location[ 128 ];
44 : };
45 : typedef struct fd_location_info fd_location_info_t;
46 :
47 : #define MAP_NAME fd_location_table
48 0 : #define MAP_T fd_location_info_t
49 0 : #define MAP_KEY ip4_addr
50 0 : #define MAP_LG_SLOT_CNT 16
51 : #define MAP_MEMOIZE 0
52 : #include "../../../util/tmpl/fd_map.c"
53 :
54 : uchar __attribute__((aligned(alignof(fd_location_info_t)))) location_table_mem[ sizeof(fd_location_info_t) * (1 << 16 ) ];
55 :
/* Terminal attributes that restore_terminal re-applies to stdin.
   NOTE(review): presumably captured with tcgetattr before the terminal
   mode is changed; the capture is not in this part of the file. */
static struct termios termios_backup;

/* restore_terminal applies termios_backup to stdin immediately
   (TCSANOW).  Best effort: the tcsetattr result is deliberately
   ignored. */
static void
restore_terminal( void ) {
  int rc = tcsetattr( STDIN_FILENO, TCSANOW, &termios_backup );
  (void)rc;
}
62 :
/* Topology object callback table, defined outside this file.  It is
   handed to fd_topob_finish at the end of repair_topo. */
63 : extern fd_topo_obj_callbacks_t * CALLBACKS[];
64 :
/* Defined outside this file.  NOTE(review): presumably returns the run
   descriptor for the given topology tile - confirm at the definition. */
65 : fd_topo_run_tile_t
66 : fdctl_tile_run( fd_topo_tile_t const * tile );
67 :
/* Defined outside this file.  Invoked on the config as the first step
   of repair_topo. */
68 : void
69 : resolve_gossip_entrypoints( config_t * config );
70 :
/* Size in bytes of the heap buffer the snapshot manifest file is read
   into.  At most MANIFEST_LOAD_MAX_SZ-1 bytes of the file are read. */
71 0 : #define MANIFEST_LOAD_MAX_SZ (2UL * FD_SHMEM_GIGANTIC_PAGE_SZ)
72 :
73 : /* https://github.com/anza-xyz/agave/blob/v3.1.8/runtime/src/snapshot_bank_utils.rs#L632 */
74 : static int
75 0 : repair_verify_epoch_stakes( fd_snapshot_manifest_t const * manifest ) {
76 0 : fd_epoch_schedule_t epoch_schedule = (fd_epoch_schedule_t){
77 0 : .slots_per_epoch = manifest->epoch_schedule_params.slots_per_epoch,
78 0 : .leader_schedule_slot_offset = manifest->epoch_schedule_params.leader_schedule_slot_offset,
79 0 : .warmup = manifest->epoch_schedule_params.warmup,
80 0 : .first_normal_epoch = manifest->epoch_schedule_params.first_normal_epoch,
81 0 : .first_normal_slot = manifest->epoch_schedule_params.first_normal_slot,
82 0 : };
83 :
84 0 : ulong min_required_epoch = fd_slot_to_epoch( &epoch_schedule, manifest->slot, NULL );
85 0 : ulong max_required_epoch = fd_slot_to_leader_schedule_epoch( &epoch_schedule, manifest->slot );
86 :
87 0 : for( ulong i=min_required_epoch; i<=max_required_epoch; i++ ) {
88 0 : int found = 0;
89 0 : for( ulong j=0UL; j<FD_EPOCH_STAKES_LEN; j++ ) {
90 0 : if( manifest->epoch_stakes[j].epoch==i ) {
91 0 : found = 1;
92 0 : break;
93 0 : }
94 0 : }
95 0 : if( FD_UNLIKELY( !found ) ) {
96 0 : FD_LOG_WARNING(( "stakes not found for epoch %lu in manifest", i ));
97 0 : return -1;
98 0 : }
99 0 : }
100 0 : return 0;
101 0 : }
102 :
103 : static inline ulong
104 : repair_generate_epoch_info_msg( ulong epoch,
105 : fd_epoch_schedule_t const * epoch_schedule,
106 : fd_snapshot_manifest_epoch_stakes_t const * epoch_stakes,
107 0 : ulong * epoch_info_msg_out ) {
108 0 : fd_epoch_info_msg_t * epoch_info_msg = (fd_epoch_info_msg_t *)fd_type_pun( epoch_info_msg_out );
109 0 : fd_vote_stake_weight_t * stake_weights = fd_epoch_info_msg_stake_weights( epoch_info_msg );
110 :
111 0 : epoch_info_msg->epoch = epoch;
112 0 : epoch_info_msg->start_slot = fd_epoch_slot0( epoch_schedule, epoch );
113 0 : epoch_info_msg->slot_cnt = fd_epoch_slot_cnt( epoch_schedule, epoch );
114 0 : epoch_info_msg->excluded_id_stake = 0UL;
115 :
116 0 : fd_memset( &epoch_info_msg->features, 0xFF, sizeof(fd_features_t) );
117 :
118 0 : ulong idx = 0UL;
119 0 : for( ulong i=0UL; i<epoch_stakes->vote_stakes_len; i++ ) {
120 0 : ulong stake = epoch_stakes->vote_stakes[ i ].stake;
121 0 : if( FD_UNLIKELY( !stake ) ) continue;
122 0 : stake_weights[ idx ].stake = stake;
123 0 : memcpy( stake_weights[ idx ].id_key.uc, epoch_stakes->vote_stakes[ i ].identity, sizeof(fd_pubkey_t) );
124 0 : memcpy( stake_weights[ idx ].vote_key.uc, epoch_stakes->vote_stakes[ i ].vote, sizeof(fd_pubkey_t) );
125 0 : idx++;
126 0 : }
127 0 : epoch_info_msg->staked_vote_cnt = idx;
128 0 : sort_vote_weights_by_stake_vote_inplace( stake_weights, idx );
129 :
130 0 : fd_stake_weight_t * id_weights = fd_epoch_info_msg_id_weights( epoch_info_msg );
131 0 : epoch_info_msg->staked_id_cnt = compute_id_weights_from_vote_weights( id_weights, stake_weights, epoch_info_msg->staked_vote_cnt );
132 0 : FD_TEST( idx<=MAX_SHRED_DESTS );
133 :
134 0 : epoch_info_msg->epoch_schedule = *epoch_schedule;
135 0 : return fd_epoch_info_msg_sz( epoch_info_msg->staked_vote_cnt, epoch_info_msg->staked_id_cnt );
136 0 : }
137 :
138 : /* repair_load_manifest loads the snapshot manifest from disk and
139 : pre-populates the snapin_manif and replay_epoch dcache links so
140 : that consumer tiles see the data on their first poll cycle. */
141 : static void
142 : repair_load_manifest( fd_topo_t * topo,
143 0 : char const * manifest_path ) {
144 0 : if( FD_UNLIKELY( !manifest_path || !manifest_path[0] ) ) return;
145 :
146 : /* Parse manifest */
147 :
148 0 : int fd = open( manifest_path, O_RDONLY );
149 0 : if( FD_UNLIKELY( fd<0 ) ) FD_LOG_ERR(( "open(%s) failed (%d-%s)", manifest_path, errno, fd_io_strerror( errno ) ));
150 :
151 0 : fd_snapshot_manifest_t * manifest = aligned_alloc( alignof(fd_snapshot_manifest_t), sizeof(fd_snapshot_manifest_t) );
152 0 : FD_TEST( manifest );
153 0 : for( ulong i=0UL; i<FD_EPOCH_STAKES_LEN; i++ ) manifest->epoch_stakes[i].epoch = ULONG_MAX;
154 :
155 0 : uchar * buf = aligned_alloc( 128UL, MANIFEST_LOAD_MAX_SZ );
156 0 : FD_TEST( buf );
157 0 : ulong buf_sz = 0;
158 0 : FD_TEST( !fd_io_read( fd, buf, 0UL, MANIFEST_LOAD_MAX_SZ-1UL, &buf_sz ) );
159 0 : close( fd );
160 :
161 0 : fd_ssmanifest_parser_t * parser = fd_ssmanifest_parser_join( fd_ssmanifest_parser_new(
162 0 : aligned_alloc( fd_ssmanifest_parser_align(), fd_ssmanifest_parser_footprint() ) ) );
163 0 : FD_TEST( parser );
164 0 : fd_ssmanifest_parser_init( parser, manifest );
165 0 : int parser_err = fd_ssmanifest_parser_consume( parser, buf, buf_sz );
166 0 : FD_TEST( parser_err!=FD_SSMANIFEST_PARSER_ADVANCE_ERROR );
167 0 : FD_TEST( fd_ssmanifest_parser_fini( parser )==FD_SSMANIFEST_PARSER_ADVANCE_DONE );
168 0 : free( parser );
169 0 : free( buf );
170 :
171 0 : FD_LOG_NOTICE(( "manifest bank slot %lu", manifest->slot ));
172 0 : FD_TEST( !repair_verify_epoch_stakes( manifest ) );
173 :
174 : /* Update root_slot fseq */
175 :
176 0 : ulong root_slot_obj_id = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "root_slot" );
177 0 : if( FD_LIKELY( root_slot_obj_id!=ULONG_MAX ) ) {
178 0 : ulong * root_fseq = fd_fseq_join( fd_topo_obj_laddr( topo, root_slot_obj_id ) );
179 0 : FD_TEST( root_fseq );
180 0 : fd_fseq_update( root_fseq, manifest->slot );
181 0 : }
182 :
183 : /* Publish manifest to snapin_manif dcache */
184 :
185 0 : ulong snap_link_idx = fd_topo_find_link( topo, "snapin_manif", 0UL );
186 0 : FD_TEST( snap_link_idx!=ULONG_MAX );
187 0 : fd_topo_link_t * snap_link = &topo->links[ snap_link_idx ];
188 0 : fd_wksp_t * snap_mem = topo->workspaces[ topo->objs[ snap_link->dcache_obj_id ].wksp_id ].wksp;
189 0 : ulong snap_chunk0 = fd_dcache_compact_chunk0( snap_mem, snap_link->dcache );
190 0 : ulong snap_wmark = fd_dcache_compact_wmark ( snap_mem, snap_link->dcache, snap_link->mtu );
191 0 : ulong snap_chunk = snap_chunk0;
192 :
193 0 : uchar * snap_dst = fd_chunk_to_laddr( snap_mem, snap_chunk );
194 0 : memcpy( snap_dst, manifest, sizeof(fd_snapshot_manifest_t) );
195 0 : fd_mcache_publish( snap_link->mcache, snap_link->depth, 0UL,
196 0 : fd_ssmsg_sig( FD_SSMSG_MANIFEST_INCREMENTAL ),
197 0 : snap_chunk, sizeof(fd_snapshot_manifest_t), 0UL, 0UL, 0UL );
198 0 : snap_chunk = fd_dcache_compact_next( snap_chunk, sizeof(fd_snapshot_manifest_t), snap_chunk0, snap_wmark );
199 :
200 0 : fd_mcache_publish( snap_link->mcache, snap_link->depth, 1UL,
201 0 : fd_ssmsg_sig( FD_SSMSG_DONE ), 0UL, 0UL, 0UL, 0UL, 0UL );
202 :
203 : /* Publish epoch stake weights to replay_epoch dcache */
204 :
205 0 : ulong epoch_link_idx = fd_topo_find_link( topo, "replay_epoch", 0UL );
206 0 : FD_TEST( epoch_link_idx!=ULONG_MAX );
207 0 : fd_topo_link_t * epoch_link = &topo->links[ epoch_link_idx ];
208 0 : fd_wksp_t * epoch_mem = topo->workspaces[ topo->objs[ epoch_link->dcache_obj_id ].wksp_id ].wksp;
209 0 : ulong epoch_chunk0 = fd_dcache_compact_chunk0( epoch_mem, epoch_link->dcache );
210 0 : ulong epoch_wmark = fd_dcache_compact_wmark ( epoch_mem, epoch_link->dcache, epoch_link->mtu );
211 0 : ulong epoch_chunk = epoch_chunk0;
212 0 : ulong epoch_seq = 0UL;
213 :
214 : /* Construct fd_epoch_schedule_t field-by-field rather than type-punning
215 : from the unpacked manifest struct (fd_epoch_schedule_t is packed). */
216 0 : fd_epoch_schedule_t schedule_local;
217 0 : schedule_local.slots_per_epoch = manifest->epoch_schedule_params.slots_per_epoch;
218 0 : schedule_local.leader_schedule_slot_offset = manifest->epoch_schedule_params.leader_schedule_slot_offset;
219 0 : schedule_local.warmup = manifest->epoch_schedule_params.warmup;
220 0 : schedule_local.first_normal_epoch = manifest->epoch_schedule_params.first_normal_epoch;
221 0 : schedule_local.first_normal_slot = manifest->epoch_schedule_params.first_normal_slot;
222 0 : fd_epoch_schedule_t const * schedule = &schedule_local;
223 0 : ulong epoch = fd_slot_to_epoch( schedule, manifest->slot, NULL );
224 :
225 0 : ulong epoch_stakes_base = epoch > 0UL ? epoch - 1UL : 0UL;
226 0 : ulong leader_schedule_epoch = fd_slot_to_leader_schedule_epoch( schedule, manifest->slot );
227 0 : ulong cur_idx = epoch - epoch_stakes_base;
228 0 : FD_TEST( cur_idx < FD_EPOCH_STAKES_LEN );
229 :
230 0 : ulong * epoch_dst = fd_chunk_to_laddr( epoch_mem, epoch_chunk );
231 0 : ulong epoch_sz = repair_generate_epoch_info_msg( epoch, schedule, &manifest->epoch_stakes[cur_idx], epoch_dst );
232 0 : fd_mcache_publish( epoch_link->mcache, epoch_link->depth, epoch_seq,
233 0 : 4UL, epoch_chunk, epoch_sz, 0UL, 0UL, fd_frag_meta_ts_comp( fd_tickcount() ) );
234 0 : epoch_chunk = fd_dcache_compact_next( epoch_chunk, epoch_sz, epoch_chunk0, epoch_wmark );
235 0 : epoch_seq++;
236 0 : FD_LOG_NOTICE(( "sending current epoch stake weights - epoch: %lu", epoch ));
237 :
238 0 : if( leader_schedule_epoch >= epoch + 1UL ) {
239 0 : ulong next_idx = epoch + 1UL - epoch_stakes_base;
240 0 : FD_TEST( next_idx < FD_EPOCH_STAKES_LEN );
241 :
242 0 : epoch_dst = fd_chunk_to_laddr( epoch_mem, epoch_chunk );
243 0 : epoch_sz = repair_generate_epoch_info_msg( epoch + 1UL, schedule, &manifest->epoch_stakes[next_idx], epoch_dst );
244 0 : fd_mcache_publish( epoch_link->mcache, epoch_link->depth, epoch_seq,
245 0 : 4UL, epoch_chunk, epoch_sz, 0UL, 0UL, fd_frag_meta_ts_comp( fd_tickcount() ) );
246 0 : epoch_chunk = fd_dcache_compact_next( epoch_chunk, epoch_sz, epoch_chunk0, epoch_wmark );
247 0 : epoch_seq++;
248 0 : FD_LOG_NOTICE(( "sending next epoch stake weights - epoch: %lu", epoch + 1UL ));
249 0 : }
250 0 : (void)epoch_chunk;
251 :
252 0 : free( manifest );
253 0 : }
254 :
/* repair_topo builds the reduced topology used by the repair command
   in config->topo.  It starts a fresh topology with fd_topob_new on
   config->topo, adds the shared core and gossip sub-topologies, then
   declares the workspaces, links, shred and repair tiles, shared
   objects and tile in/out wiring listed below, and finally finishes
   the topology and stores it back into config->topo.  config is both
   input (layout, tiles, net, hugetlbfs and development settings) and
   output (config->topo, and config->tiles.txsend.txsend_src_port is
   forced to 0).  Configuration errors are reported with FD_LOG_ERR. */
255 : /* repair_topo is a subset of "src/app/firedancer/topology.c" at commit
256 : 0d8386f4f305bb15329813cfe4a40c3594249e96, slightly modified to work
257 : as a repair catchup. TODO ideally, one should invoke the firedancer
258 : topology first, and exclude the parts that are not needed, instead of
259 : manually generating new topologies for every command. This would
260 : also guarantee that the catchup is replicating (as close as possible)
261 : the full topology. */
262 : static void
263 0 : repair_topo( config_t * config ) {
264 0 : resolve_gossip_entrypoints( config );
265 :
266 0 : ulong net_tile_cnt = config->layout.net_tile_count;
267 0 : ulong shred_tile_cnt = config->layout.shred_tile_count;
268 0 : ulong quic_tile_cnt = config->layout.quic_tile_count;
269 0 : ulong sign_tile_cnt = config->firedancer.layout.sign_tile_count;
270 0 : ulong gossvf_tile_cnt = config->firedancer.layout.gossvf_tile_count;
271 :
272 0 : fd_topo_t * topo = { fd_topob_new( &config->topo, config->name ) };
273 0 : topo->max_page_size = fd_cstr_to_shmem_page_sz( config->hugetlbfs.max_page_size );
/* The threshold is configured in MiB; << 20 converts it to bytes. */
274 0 : topo->gigantic_page_threshold = config->hugetlbfs.gigantic_page_threshold_mib << 20;
275 :
/* tile_to_cpu[ i ] is the CPU handed to the i-th tile created
   (indexed by topo->tile_cnt at creation time below). */
276 0 : ulong tile_to_cpu[ FD_TILE_MAX ] = {0};
277 0 : ushort parsed_tile_to_cpu[ FD_TILE_MAX ];
278 : /* Unassigned tiles will be floating, unless auto topology is enabled. */
279 0 : for( ulong i=0UL; i<FD_TILE_MAX; i++ ) parsed_tile_to_cpu[ i ] = USHORT_MAX;
280 :
281 0 : int is_auto_affinity = !strcmp( config->layout.affinity, "auto" );
282 0 : int is_bench_auto_affinity = !strcmp( config->development.bench.affinity, "auto" );
283 :
284 0 : if( FD_UNLIKELY( is_auto_affinity != is_bench_auto_affinity ) ) {
285 0 : FD_LOG_ERR(( "The CPU affinity string in the configuration file under [layout.affinity] and [development.bench.affinity] must all be set to 'auto' or all be set to a specific CPU affinity string." ));
286 0 : }
287 :
/* cpus is only consulted below for cpus->cpu_cnt, to range check the
   parsed affinity. */
288 0 : fd_topo_cpus_t cpus[1];
289 0 : fd_topo_cpus_init( cpus );
290 :
291 0 : ulong affinity_tile_cnt = 0UL;
292 0 : if( FD_LIKELY( !is_auto_affinity ) ) affinity_tile_cnt = fd_tile_private_cpus_parse( config->layout.affinity, parsed_tile_to_cpu );
293 :
/* Validate each parsed CPU index and widen it into tile_to_cpu;
   USHORT_MAX (unassigned) becomes ULONG_MAX. */
294 0 : for( ulong i=0UL; i<affinity_tile_cnt; i++ ) {
295 0 : if( FD_UNLIKELY( parsed_tile_to_cpu[ i ]!=USHORT_MAX && parsed_tile_to_cpu[ i ]>=cpus->cpu_cnt ) )
296 0 : FD_LOG_ERR(( "The CPU affinity string in the configuration file under [layout.affinity] specifies a CPU index of %hu, but the system "
297 0 : "only has %lu CPUs. You should either change the CPU allocations in the affinity string, or increase the number of CPUs "
298 0 : "in the system.",
299 0 : parsed_tile_to_cpu[ i ], cpus->cpu_cnt ));
300 0 : tile_to_cpu[ i ] = fd_ulong_if( parsed_tile_to_cpu[ i ]==USHORT_MAX, ULONG_MAX, (ulong)parsed_tile_to_cpu[ i ] );
301 0 : }
302 :
/* NOTE(review): presumably these two sub-topologies create the tiles
   and links that are referenced below but never created in this
   function (e.g. the "gossip", "gossvf", "sign" and "ipecho" tiles,
   the "metric_in" workspace, the "gossip_out", "ipecho_out" and
   "sign_gossip" links) - confirm in core_subtopo.h / gossip.h. */
303 0 : fd_core_subtopo( config, tile_to_cpu );
304 0 : fd_gossip_subtopo( config, tile_to_cpu );
305 :
306 : /* topo, name */
307 0 : fd_topob_wksp( topo, "net_shred" );
308 0 : fd_topob_wksp( topo, "net_repair" );
309 0 : fd_topob_wksp( topo, "net_quic" );
310 :
311 0 : fd_topob_wksp( topo, "shred_out" );
312 0 : fd_topob_wksp( topo, "replay_epoch" );
313 :
314 0 : fd_topob_wksp( topo, "poh_shred" );
315 :
316 0 : fd_topob_wksp( topo, "shred_sign" );
317 0 : fd_topob_wksp( topo, "sign_shred" );
318 :
319 0 : fd_topob_wksp( topo, "repair_sign" );
320 0 : fd_topob_wksp( topo, "sign_repair" );
321 0 : fd_topob_wksp( topo, "rnonce" );
322 0 : fd_topob_wksp( topo, "repair_out" );
323 :
324 0 : fd_topob_wksp( topo, "txsend_out" );
325 :
326 0 : fd_topob_wksp( topo, "shred" );
327 0 : fd_topob_wksp( topo, "repair" );
328 0 : fd_topob_wksp( topo, "fec_sets" );
329 0 : fd_topob_wksp( topo, "snapin_manif" );
330 :
331 0 : fd_topob_wksp( topo, "genesi_out" ); /* mock genesi_out for ipecho */
332 :
333 0 : fd_topob_wksp( topo, "tower_out" ); /* mock tower_out for confirmation msgs. Not needed for any topo except eqvoc. */
334 :
/* FOR(cnt) repeats the following statement for i in [0,cnt).  It is
   not #undef'd inside this function, so it remains defined for the
   rest of the file. */
335 0 : #define FOR(cnt) for( ulong i=0UL; i<cnt; i++ )
336 :
337 0 : ulong pending_fec_shreds_depth = fd_ulong_min( fd_ulong_pow2_up( config->tiles.shred.max_pending_shred_sets * FD_REEDSOL_DATA_SHREDS_MAX ), USHORT_MAX + 1 /* dcache max */ );
338 :
339 : /* topo, link_name, wksp_name, depth, mtu, burst */
340 0 : FOR(quic_tile_cnt) fd_topob_link( topo, "quic_net", "net_quic", config->net.ingress_buffer_size, FD_NET_MTU, 1UL );
341 0 : FOR(shred_tile_cnt) fd_topob_link( topo, "shred_net", "net_shred", config->net.ingress_buffer_size, FD_NET_MTU, 1UL );
342 :
343 0 : /**/ fd_topob_link( topo, "replay_epoch", "replay_epoch", 128UL, FD_EPOCH_OUT_MTU, 1UL );
344 :
345 0 : FOR(shred_tile_cnt) fd_topob_link( topo, "shred_sign", "shred_sign", 128UL, 32UL, 1UL );
346 0 : FOR(shred_tile_cnt) fd_topob_link( topo, "sign_shred", "sign_shred", 128UL, 64UL, 1UL );
347 :
348 0 : /**/ fd_topob_link( topo, "repair_net", "net_repair", config->net.ingress_buffer_size, FD_NET_MTU, 1UL );
349 :
350 0 : FOR(shred_tile_cnt) fd_topob_link( topo, "shred_out", "shred_out", pending_fec_shreds_depth, sizeof(fd_shred_message_t), 2UL /* at most 2 msgs per after_frag */ );
/* One repair_sign / sign_repair link pair per sign tile other than
   sign tile 0.  NOTE(review): sign_tile_cnt-1 wraps around if
   sign_tile_cnt is 0; presumably the config is validated elsewhere -
   confirm. */
351 0 : FOR(sign_tile_cnt-1) fd_topob_link( topo, "repair_sign", "repair_sign", 256UL, FD_REPAIR_MAX_PREIMAGE_SZ, 1UL );
352 0 : FOR(sign_tile_cnt-1) fd_topob_link( topo, "sign_repair", "sign_repair", 128UL, sizeof(fd_ed25519_sig_t), 1UL );
353 :
354 : /**/ fd_topob_link( topo, "repair_out", "repair_out", 128UL, sizeof(fd_fec_complete_t), 1UL );
355 :
356 0 : /**/ fd_topob_link( topo, "poh_shred", "poh_shred", 16384UL, USHORT_MAX, 1UL );
357 :
358 0 : /**/ fd_topob_link( topo, "txsend_out", "txsend_out", 128UL, FD_TXN_MTU, 1UL );
359 :
/* Depth 2: repair_load_manifest publishes exactly two frags on this
   link (the manifest and a DONE message). */
360 : /**/ fd_topob_link( topo, "snapin_manif", "snapin_manif", 2UL, sizeof(fd_snapshot_manifest_t),1UL );
361 :
362 0 : /**/ fd_topob_link( topo, "genesi_out", "genesi_out", 1UL, FD_GENESIS_TILE_MTU, 1UL );
363 0 : /**/ fd_topob_link( topo, "tower_out", "tower_out", 1024UL, sizeof(fd_tower_msg_t), 1UL );
364 :
365 0 : FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_repair", i, config->net.ingress_buffer_size );
366 0 : FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_quic", i, config->net.ingress_buffer_size );
367 0 : FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_shred", i, config->net.ingress_buffer_size );
368 :
369 : /* topo, tile_name, tile_wksp, metrics_wksp, cpu_idx, is_agave, uses_id_keyswitch, uses_av_keyswitch */
370 0 : FOR(shred_tile_cnt) fd_topob_tile( topo, "shred", "shred", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 1, 0 );
371 0 : fd_topo_tile_t * repair_tile = fd_topob_tile( topo, "repair", "repair", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 1, 0 );
372 :
373 : /* Setup a shared wksp object for fec sets. */
374 :
375 0 : ulong shred_depth = 65536UL; /* from fdctl/topology.c shred_store link. MAKE SURE TO KEEP IN SYNC. */
376 0 : ulong fec_set_cnt = 2UL*shred_depth + config->tiles.shred.max_pending_shred_sets + 6UL;
377 0 : ulong fec_sets_sz = fec_set_cnt*sizeof(fd_fec_set_t); /* mirrors # of dcache entires in frankendancer */
378 0 : fd_topo_obj_t * fec_sets_obj = setup_topo_fec_sets( topo, "fec_sets", shred_tile_cnt*fec_sets_sz );
/* Shred tiles join the fec sets read-write, the repair tile joins
   read-only. */
379 0 : for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
380 0 : fd_topo_tile_t * shred_tile = &topo->tiles[ fd_topo_find_tile( topo, "shred", i ) ];
381 0 : fd_topob_tile_uses( topo, shred_tile, fec_sets_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
382 0 : }
383 0 : fd_topob_tile_uses( topo, repair_tile, fec_sets_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
384 0 : FD_TEST( fd_pod_insertf_ulong( topo->props, fec_sets_obj->id, "fec_sets" ) );
385 :
386 : /* There's another special fseq that's used to communicate the shred
387 : version from the Agave boot path to the shred tile. */
388 0 : fd_topo_obj_t * poh_shred_obj = fd_topob_obj( topo, "fseq", "poh_shred" );
/* Despite its name, poh_tile is the "gossip" tile here: it is the tile
   given read-write access to the fseq in this topology. */
389 0 : fd_topo_tile_t * poh_tile = &topo->tiles[ fd_topo_find_tile( topo, "gossip", 0UL ) ];
390 0 : fd_topob_tile_uses( topo, poh_tile, poh_shred_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
391 :
392 0 : for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
393 0 : fd_topo_tile_t * shred_tile = &topo->tiles[ fd_topo_find_tile( topo, "shred", i ) ];
394 0 : fd_topob_tile_uses( topo, shred_tile, poh_shred_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
395 0 : }
396 0 : FD_TEST( fd_pod_insertf_ulong( topo->props, poh_shred_obj->id, "poh_shred" ) );
397 :
/* With an explicit affinity string, too few entries for the tiles
   created so far is fatal; surplus entries only produce a warning. */
398 0 : if( FD_LIKELY( !is_auto_affinity ) ) {
399 0 : if( FD_UNLIKELY( affinity_tile_cnt<topo->tile_cnt ) )
400 0 : FD_LOG_ERR(( "The topology you are using has %lu tiles, but the CPU affinity specified in the config tile as [layout.affinity] only provides for %lu cores. "
401 0 : "You should either increase the number of cores dedicated to Firedancer in the affinity string, or decrease the number of cores needed by reducing "
402 0 : "the total tile count. You can reduce the tile count by decreasing individual tile counts in the [layout] section of the configuration file.",
403 0 : topo->tile_cnt, affinity_tile_cnt ));
404 0 : if( FD_UNLIKELY( affinity_tile_cnt>topo->tile_cnt ) )
405 0 : FD_LOG_WARNING(( "The topology you are using has %lu tiles, but the CPU affinity specified in the config tile as [layout.affinity] provides for %lu cores. "
406 0 : "Not all cores in the affinity will be used by Firedancer. You may wish to increase the number of tiles in the system by increasing "
407 0 : "individual tile counts in the [layout] section of the configuration file.",
408 0 : topo->tile_cnt, affinity_tile_cnt ));
409 0 : }
410 :
411 : /* topo, tile_name, tile_kind_id, fseq_wksp, link_name, link_kind_id, reliable, polled */
412 0 : for( ulong j=0UL; j<shred_tile_cnt; j++ )
413 0 : fd_topos_tile_in_net( topo, "metric_in", "shred_net", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
414 0 : for( ulong j=0UL; j<quic_tile_cnt; j++ )
415 0 : {fd_topos_tile_in_net( topo, "metric_in", "quic_net", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );} /* No reliable consumers of networking fragments, may be dropped or overrun */
416 :
417 0 : /**/ fd_topob_tile_in( topo, "gossip", 0UL, "metric_in", "txsend_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
418 :
419 0 : /**/ fd_topos_tile_in_net( topo, "metric_in", "repair_net", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
420 :
421 0 : FOR(shred_tile_cnt) for( ulong j=0UL; j<net_tile_cnt; j++ )
422 0 : fd_topob_tile_in( topo, "shred", i, "metric_in", "net_shred", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
423 0 : FOR(shred_tile_cnt) fd_topob_tile_in( topo, "shred", i, "metric_in", "poh_shred", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
424 0 : FOR(shred_tile_cnt) fd_topob_tile_in( topo, "shred", i, "metric_in", "replay_epoch", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
425 0 : FOR(shred_tile_cnt) fd_topob_tile_in( topo, "shred", i, "metric_in", "gossip_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
426 0 : FOR(shred_tile_cnt) fd_topob_tile_out( topo, "shred", i, "shred_out", i );
427 0 : FOR(shred_tile_cnt) fd_topob_tile_out( topo, "shred", i, "shred_net", i );
428 0 : FOR(shred_tile_cnt) fd_topob_tile_in ( topo, "shred", i, "metric_in", "ipecho_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
429 :
430 : /**/ fd_topob_tile_out( topo, "repair", 0UL, "repair_net", 0UL );
431 :
432 : /* Sign links don't need to be reliable because they are synchronous,
433 : so there's at most one fragment in flight at a time anyway. The
434 : sign links are also not polled by the mux, instead the tiles will
435 : read the sign responses out of band in a dedicated spin loop. */
/* NOTE(review): as written, only the shred tile's sign_shred input is
   FD_TOPOB_UNPOLLED.  The sign tile's shred_sign input here, and the
   repair tile's sign_repair input further down, are FD_TOPOB_POLLED.
   Sign tile 0 serves the shred tiles. */
436 0 : for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
437 0 : /**/ fd_topob_tile_in( topo, "sign", 0UL, "metric_in", "shred_sign", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
438 0 : /**/ fd_topob_tile_out( topo, "shred", i, "shred_sign", i );
439 0 : /**/ fd_topob_tile_in( topo, "shred", i, "metric_in", "sign_shred", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_UNPOLLED );
440 0 : /**/ fd_topob_tile_out( topo, "sign", 0UL, "sign_shred", i );
441 0 : }
442 0 : FOR(gossvf_tile_cnt) fd_topob_tile_in ( topo, "gossvf", i, "metric_in", "replay_epoch", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
443 :
444 0 : /**/ fd_topob_tile_in ( topo, "gossip", 0UL, "metric_in", "replay_epoch", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
445 :
446 0 : FOR(net_tile_cnt) fd_topob_tile_in( topo, "repair", 0UL, "metric_in", "net_repair", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
447 0 : /**/ fd_topob_tile_in( topo, "repair", 0UL, "metric_in", "gossip_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
448 0 : fd_topob_tile_in( topo, "repair", 0UL, "metric_in", "snapin_manif", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
449 0 : FOR(shred_tile_cnt) fd_topob_tile_in( topo, "repair", 0UL, "metric_in", "shred_out", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
/* Repair signing link pair i is wired between the repair tile and sign
   tile i+1. */
450 0 : FOR(sign_tile_cnt-1) fd_topob_tile_out( topo, "repair", 0UL, "repair_sign", i );
451 0 : FOR(sign_tile_cnt-1) fd_topob_tile_in ( topo, "sign", i+1, "metric_in", "repair_sign", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
452 0 : FOR(sign_tile_cnt-1) fd_topob_tile_out( topo, "sign", i+1, "sign_repair", i );
453 0 : FOR(sign_tile_cnt-1) fd_topob_tile_in ( topo, "repair", 0UL, "metric_in", "sign_repair", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
454 :
455 : /**/ fd_topob_tile_out( topo, "repair", 0UL, "repair_out", 0UL );
456 0 : /**/ fd_topob_tile_in ( topo, "gossip", 0UL, "metric_in", "sign_gossip", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_UNPOLLED );
457 0 : /**/ fd_topob_tile_in ( topo, "ipecho", 0UL, "metric_in", "genesi_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
458 0 : /**/ fd_topob_tile_in ( topo, "repair", 0UL, "metric_in", "tower_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
459 :
460 : /* Repair and shred share a secret they use to generate the nonces.
461 : It's not super security sensitive, but for good hygiene, we make it
462 : an object. */
463 0 : if( 1 /* just restrict the scope for these variables in this big function */ ) {
464 0 : fd_topo_obj_t * rnonce_ss_obj = fd_topob_obj( topo, "rnonce_ss", "rnonce" );
/* This repair_tile shadows the outer repair_tile declared when the
   repair tile was created above. */
465 0 : fd_topo_tile_t * repair_tile = &topo->tiles[ fd_topo_find_tile( topo, "repair", 0UL ) ];
466 0 : fd_topob_tile_uses( topo, repair_tile, rnonce_ss_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
467 0 : for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
468 0 : fd_topo_tile_t * shred_tile = &topo->tiles[ fd_topo_find_tile( topo, "shred", i ) ];
469 0 : fd_topob_tile_uses( topo, shred_tile, rnonce_ss_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
470 0 : }
471 0 : FD_TEST( fd_pod_insertf_ulong( topo->props, rnonce_ss_obj->id, "rnonce_ss" ) );
472 0 : }
473 :
/* Links that intentionally have no producer or no consumer tile in
   this reduced topology.  Each call's return value is checked against
   the number of links expected under that name. */
474 0 : FD_TEST( fd_link_permit_no_producers( topo, "quic_net" ) == quic_tile_cnt );
475 0 : FD_TEST( fd_link_permit_no_producers( topo, "poh_shred" ) == 1UL );
476 0 : FD_TEST( fd_link_permit_no_producers( topo, "txsend_out" ) == 1UL );
477 0 : FD_TEST( fd_link_permit_no_producers( topo, "genesi_out" ) == 1UL );
478 0 : FD_TEST( fd_link_permit_no_producers( topo, "tower_out" ) == 1UL );
479 0 : FD_TEST( fd_link_permit_no_producers( topo, "replay_epoch" ) == 1UL );
480 0 : FD_TEST( fd_link_permit_no_producers( topo, "snapin_manif" ) == 1UL );
481 0 : FD_TEST( fd_link_permit_no_consumers( topo, "net_quic" ) == net_tile_cnt );
482 0 : FD_TEST( fd_link_permit_no_consumers( topo, "repair_out" ) == 1UL );
483 :
484 0 : config->tiles.txsend.txsend_src_port = 0; /* disable txsend */
485 :
486 0 : FOR(net_tile_cnt) fd_topos_net_tile_finish( topo, i );
487 :
/* Apply the per tile configuration to every tile in the topology,
   including tiles added by the sub-topologies. */
488 0 : for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
489 0 : fd_topo_tile_t * tile = &topo->tiles[ i ];
490 0 : fd_topo_configure_tile( tile, config );
491 0 : }
492 :
493 0 : if( FD_UNLIKELY( is_auto_affinity ) ) fd_topob_auto_layout( topo, 0 );
494 :
495 0 : fd_topob_finish( topo, CALLBACKS );
496 :
/* Store the finished topology back into the config. */
497 0 : config->topo = *topo;
498 0 : }
499 :
500 : static char *
501 0 : fmt_count( char buf[ static 64 ], ulong count ) {
502 0 : char tmp[ 64 ];
503 0 : if( FD_LIKELY( count<1000UL ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%lu", count ) );
504 0 : else if( FD_LIKELY( count<1000000UL ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%.1f K", (double)count/1000.0 ) );
505 0 : else if( FD_LIKELY( count<1000000000UL ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%.1f M", (double)count/1000000.0 ) );
506 :
507 0 : FD_TEST( fd_cstr_printf_check( buf, 64UL, NULL, "%12s", tmp ) );
508 0 : return buf;
509 0 : }
510 :
511 : static void
512 : print_histogram_buckets( volatile ulong * metrics,
513 : ulong offset,
514 : int converter,
515 : double histmin,
516 : double histmax,
517 0 : char * title ) {
518 0 : fd_histf_t hist[1];
519 :
520 : /* Create histogram structure only to get bucket edges for display */
521 0 : if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
522 : /* For SLOT_COMPLETE_TIME: min=0.2, max=2.0 seconds */
523 0 : FD_TEST( fd_histf_new( hist, fd_metrics_convert_seconds_to_ticks( histmin ), fd_metrics_convert_seconds_to_ticks( histmax ) ) );
524 0 : } else if( FD_LIKELY( converter == FD_METRICS_CONVERTER_NONE ) ) {
525 : /* For non-time histograms, we'd need the actual min/max values */
526 0 : FD_TEST( fd_histf_new( hist, (ulong)histmin, (ulong)histmax ) );
527 0 : } else {
528 0 : FD_LOG_ERR(( "unknown converter %i", converter ));
529 0 : }
530 :
531 0 : printf( " +---------------------+--------------------+--------------+\n" );
532 0 : printf( " | %-19s | | Count |\n", title );
533 0 : printf( " +---------------------+--------------------+--------------+\n" );
534 :
535 0 : ulong total_count = 0;
536 0 : for( ulong k = 0; k < FD_HISTF_BUCKET_CNT; k++ ) {
537 0 : ulong bucket_count = metrics[ offset + k ];
538 0 : total_count += bucket_count;
539 0 : }
540 :
541 0 : for( ulong k = 0; k < FD_HISTF_BUCKET_CNT; k++ ) {
542 : /* Get individual bucket count directly from metrics array */
543 0 : ulong bucket_count = metrics[ offset + k ];
544 :
545 0 : char * le_str;
546 0 : char le_buf[ 64 ];
547 0 : if( FD_UNLIKELY( k == FD_HISTF_BUCKET_CNT - 1UL ) ) {
548 0 : le_str = "+Inf";
549 0 : } else {
550 0 : ulong edge = fd_histf_right( hist, k );
551 0 : if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
552 0 : double edgef = fd_metrics_convert_ticks_to_seconds( edge - 1 );
553 0 : FD_TEST( fd_cstr_printf_check( le_buf, sizeof( le_buf ), NULL, "%.3f", edgef ) );
554 0 : } else {
555 0 : FD_TEST( fd_cstr_printf_check( le_buf, sizeof( le_buf ), NULL, "%.3f", (double)(edge - 1) / 1000000.0 ) );
556 0 : }
557 0 : le_str = le_buf;
558 0 : }
559 :
560 0 : char count_buf[ 64 ];
561 0 : fmt_count( count_buf, bucket_count );
562 :
563 : /* Match visual bar length to the %-18s display column width. */
564 0 : char bar_buf[ 19 ];
565 0 : ulong bar_max = sizeof( bar_buf ) - 1UL;
566 0 : if( bucket_count > 0 && total_count > 0 ) {
567 0 : ulong bar_length = (bucket_count * bar_max) / total_count;
568 0 : if( bar_length == 0 ) bar_length = 1;
569 0 : if( bar_length > bar_max ) bar_length = bar_max;
570 0 : for( ulong i = 0; i < bar_length; i++ ) { bar_buf[ i ] = '|'; }
571 0 : bar_buf[ bar_length ] = '\0';
572 0 : } else {
573 0 : bar_buf[ 0 ] = '\0';
574 0 : }
575 :
576 0 : printf( " | %-19s | %-18s | %s |\n", le_str, bar_buf, count_buf );
577 0 : }
578 :
579 : /* Print sum and total count */
580 0 : char sum_buf[ 64 ];
581 0 : char avg_buf[ 64 ];
582 0 : if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
583 0 : double sumf = fd_metrics_convert_ticks_to_seconds( metrics[ offset + FD_HISTF_BUCKET_CNT ] );
584 0 : FD_TEST( fd_cstr_printf_check( sum_buf, sizeof( sum_buf ), NULL, "%.6f", sumf ) );
585 0 : double avg = sumf / (double)total_count;
586 0 : FD_TEST( fd_cstr_printf_check( avg_buf, sizeof( avg_buf ), NULL, "%.6f", avg ) );
587 0 : } else {
588 0 : FD_TEST( fd_cstr_printf_check( sum_buf, sizeof( sum_buf ), NULL, "%lu", metrics[ offset + FD_HISTF_BUCKET_CNT ] ));
589 0 : }
590 :
591 0 : printf( " +---------------------+--------------------+---------------+\n" );
592 0 : printf( " | Sum: %-14s | Count: %-11lu | Avg: %-8s |\n", sum_buf, total_count, avg_buf );
593 0 : printf( " +---------------------+--------------------+---------------+\n" );
594 0 : }
595 :
596 : static fd_slot_metrics_t temp_slots[ FD_CATCHUP_METRICS_MAX ];
597 :
598 : static void
599 0 : print_catchup_slots( fd_wksp_t * repair_tile_wksp, ctx_t * repair_ctx, int verbose, int sort_by_slot ) {
600 0 : fd_repair_metrics_t * catchup = repair_ctx->slot_metrics;
601 0 : ulong catchup_gaddr = fd_wksp_gaddr_fast( repair_ctx->wksp, catchup );
602 0 : fd_repair_metrics_t * catchup_table = (fd_repair_metrics_t *)fd_wksp_laddr( repair_tile_wksp, catchup_gaddr );
603 0 : if( FD_LIKELY( sort_by_slot ) ) {
604 0 : fd_repair_metrics_print_sorted( catchup_table, verbose, temp_slots );
605 0 : } else {
606 0 : fd_repair_metrics_print( catchup_table, verbose );
607 0 : }
608 0 : }
609 :
610 : static fd_location_info_t * location_table;
611 : static fd_pubkey_t peers_copy[ FD_REPAIR_PEER_MAX];
612 :
613 : static ulong
614 0 : sort_peers_by_latency( fd_policy_peer_map_t * active_table, fd_policy_peer_dlist_t * peers_dlist, fd_policy_peer_dlist_t * peers_wlist, fd_policy_peer_t * peers_arr ) {
615 0 : ulong i = 0;
616 0 : fd_policy_peer_dlist_iter_t iter = fd_policy_peer_dlist_iter_fwd_init( peers_dlist, peers_arr );
617 0 : while( !fd_policy_peer_dlist_iter_done( iter, peers_dlist, peers_arr ) ) {
618 0 : fd_policy_peer_t * peer = fd_policy_peer_dlist_iter_ele( iter, peers_dlist, peers_arr );
619 0 : if( FD_UNLIKELY( !peer ) ) break;
620 0 : peers_copy[ i++ ] = peer->key;
621 0 : if( FD_UNLIKELY( i >= FD_REPAIR_PEER_MAX ) ) break;
622 0 : iter = fd_policy_peer_dlist_iter_fwd_next( iter, peers_dlist, peers_arr );
623 0 : }
624 0 : ulong fast_cnt = i;
625 0 : iter = fd_policy_peer_dlist_iter_fwd_init( peers_wlist, peers_arr );
626 0 : while( !fd_policy_peer_dlist_iter_done( iter, peers_wlist, peers_arr ) ) {
627 0 : fd_policy_peer_t * peer = fd_policy_peer_dlist_iter_ele( iter, peers_wlist, peers_arr );
628 0 : if( FD_UNLIKELY( !peer ) ) break;
629 0 : peers_copy[ i++ ] = peer->key;
630 0 : if( FD_UNLIKELY( i >= FD_REPAIR_PEER_MAX ) ) break;
631 0 : iter = fd_policy_peer_dlist_iter_fwd_next( iter, peers_wlist, peers_arr );
632 0 : }
633 0 : FD_LOG_NOTICE(( "Fast peers cnt: %lu. Slow peers cnt: %lu.", fast_cnt, i - fast_cnt ));
634 :
635 0 : ulong peer_cnt = i;
636 0 : for( uint i = 0; i < peer_cnt - 1; i++ ) {
637 0 : int swapped = 0;
638 0 : for( uint j = 0; j < peer_cnt - 1 - i; j++ ) {
639 0 : fd_policy_peer_t const * active_j = fd_policy_peer_map_ele_query( active_table, &peers_copy[ j ], NULL, peers_arr );
640 0 : fd_policy_peer_t const * active_j1 = fd_policy_peer_map_ele_query( active_table, &peers_copy[ j + 1 ], NULL, peers_arr );
641 :
642 : /* Skip peers with no responses */
643 0 : double latency_j = 10e9;
644 0 : double latency_j1 = 10e9;
645 0 : if( FD_LIKELY( active_j && active_j->res_cnt > 0 ) ) latency_j = ((double)active_j->total_lat / (double)active_j->res_cnt);
646 0 : if( FD_LIKELY( active_j1 && active_j1->res_cnt > 0 ) ) latency_j1 = ((double)active_j1->total_lat / (double)active_j1->res_cnt);
647 :
648 : /* Swap if j has higher latency than j+1 */
649 0 : if( latency_j > latency_j1 ) {
650 0 : fd_pubkey_t temp = peers_copy[ j ];
651 0 : peers_copy[ j ] = peers_copy[ j + 1 ];
652 0 : peers_copy[ j + 1 ] = temp;
653 0 : swapped = 1;
654 0 : }
655 0 : }
656 0 : if( !swapped ) break;
657 0 : }
658 0 : return peer_cnt;
659 0 : }
660 :
661 : static void
662 0 : print_peer_location_latency( fd_wksp_t * repair_tile_wksp, ctx_t * tile_ctx ) {
663 0 : ulong policy_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, tile_ctx->policy );
664 0 : fd_policy_t * policy = fd_wksp_laddr ( repair_tile_wksp, policy_gaddr );
665 0 : ulong peermap_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.map );
666 0 : ulong peerarr_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.pool );
667 0 : ulong peerlst_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.fast );
668 0 : ulong peerwst_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.slow );
669 0 : fd_policy_peer_map_t * peers_map = (fd_policy_peer_map_t *) fd_wksp_laddr( repair_tile_wksp, peermap_gaddr );
670 0 : fd_policy_peer_dlist_t * peers_dlist = (fd_policy_peer_dlist_t *)fd_wksp_laddr( repair_tile_wksp, peerlst_gaddr );
671 0 : fd_policy_peer_dlist_t * peers_wlist = (fd_policy_peer_dlist_t *)fd_wksp_laddr( repair_tile_wksp, peerwst_gaddr );
672 0 : fd_policy_peer_t * peers_arr = (fd_policy_peer_t *) fd_wksp_laddr( repair_tile_wksp, peerarr_gaddr );
673 :
674 0 : ulong peer_cnt = sort_peers_by_latency( peers_map, peers_dlist, peers_wlist, peers_arr );
675 0 : printf("\nPeer Location/Latency Information\n");
676 0 : printf( " | %-46s | %-7s | %-8s | %-8s | %-7s | %-7s | %-12s | %s\n", "Pubkey", "Req Cnt", "Req B/s", "Rx B/s", "Rx Rate", "Avg Latency", "Ewma Latency", "Location Info" );
677 0 : for( uint i = 0; i < peer_cnt; i++ ) {
678 0 : fd_policy_peer_t const * active = fd_policy_peer_map_ele_query( peers_map, &peers_copy[ i ], NULL, peers_arr );
679 0 : if( FD_LIKELY( active && active->res_cnt > 0 ) ) {
680 0 : fd_location_info_t * info = fd_location_table_query( location_table, active->ip4, NULL );
681 0 : char * geolocation = info ? info->location : "";
682 0 : double peer_bps = (double)(active->res_cnt * FD_SHRED_MIN_SZ) / ((double)(active->last_resp_ts - active->first_resp_ts) / 1e9);
683 0 : double req_bps = (double)active->req_cnt * 202 / ((double)(active->last_req_ts - active->first_req_ts) / 1e9);
684 0 : FD_BASE58_ENCODE_32_BYTES( active->key.key, key_b58 );
685 0 : printf( "%-5u | %-46s | %-7lu | %-8.2f | %-8.2f | %-7.2f | %10.3fms | %10.3fms | %s\n", i, key_b58, active->req_cnt, req_bps, peer_bps, (double)active->res_cnt / (double)active->req_cnt, ((double)active->total_lat / (double)active->res_cnt) / 1e6, (double)active->ewma_lat / 1e6, geolocation );
686 0 : }
687 0 : }
688 0 : printf("\n");
689 0 : fflush( stdout );
690 0 : }
691 :
692 : static void
693 0 : read_iptable( char * iptable_path, fd_location_info_t * location_table ) {
694 0 : int iptable_fd = open( iptable_path, O_RDONLY );
695 0 : if( FD_UNLIKELY( iptable_fd<0 ) ) return;
696 :
697 : /* read iptable line by line */
698 0 : if( FD_LIKELY( iptable_fd>=0 ) ) {
699 0 : char line[ 256 ];
700 0 : uchar istream_buf[256];
701 0 : fd_io_buffered_istream_t istream[1];
702 0 : fd_io_buffered_istream_init( istream, iptable_fd, istream_buf, sizeof(istream_buf) );
703 0 : for(;;) {
704 0 : int err;
705 0 : if( !fd_io_fgets( line, sizeof(line), istream, &err ) ) break;
706 0 : fd_location_info_t location_info;
707 0 : sscanf( line, "%lu %[^\n]", &location_info.ip4_addr, location_info.location );
708 0 : fd_location_info_t * info = fd_location_table_insert( location_table, location_info.ip4_addr );
709 0 : if( FD_UNLIKELY( info==NULL ) ) break;
710 0 : memcpy( info->location, location_info.location, sizeof(info->location) );
711 0 : }
712 0 : }
713 0 : }
714 :
715 : static void
716 : print_tile_metrics( volatile ulong * shred_metrics,
717 : volatile ulong * repair_metrics,
718 : volatile ulong * repair_metrics_prev, /* for diffing metrics */
719 : volatile ulong ** repair_net_links,
720 : volatile ulong ** net_shred_links,
721 : ulong net_tile_cnt,
722 : ulong * last_sent_cnt,
723 : long last_print_ts,
724 0 : long now ) {
725 0 : char buf2[ 64 ];
726 0 : ulong rcvd = shred_metrics [ MIDX( COUNTER, SHRED, SHRED_REPAIR_RX ) ];
727 0 : ulong sent = repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_TX_NEEDED_WINDOW ) ] +
728 0 : repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_TX_NEEDED_HIGHEST_WINDOW ) ] +
729 0 : repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_TX_NEEDED_ORPHAN ) ];
730 0 : printf(" Requests received: (%lu/%lu) %.1f%% \n", rcvd, sent, (double)rcvd / (double)sent * 100.0 );
731 0 : printf( " +---------------+--------------+\n" );
732 0 : printf( " | Request Type | Count |\n" );
733 0 : printf( " +---------------+--------------+\n" );
734 0 : printf( " | Orphan | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_TX_NEEDED_ORPHAN ) ] ) );
735 0 : printf( " | HighestWindow | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_TX_NEEDED_HIGHEST_WINDOW ) ] ) );
736 0 : printf( " | Index | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_TX_NEEDED_WINDOW ) ] ) );
737 0 : printf( " +---------------+--------------+\n" );
738 0 : printf( " Send Pkt Rate: %s pps\n", fmt_count( buf2, (ulong)((sent - *last_sent_cnt)*1e9L / (now - last_print_ts) ) ) );
739 0 : *last_sent_cnt = sent;
740 :
741 : /* Sum overrun across all net tiles connected to repair_net */
742 0 : ulong total_overrun = repair_net_links[0][ MIDX( COUNTER, LINK, FRAG_POLLING_OVERRUN ) ]; /* coarse double counting prevention */
743 0 : ulong total_consumed = 0UL;
744 0 : for( ulong i = 0UL; i < net_tile_cnt; i++ ) {
745 0 : volatile ulong * ovar_net_metrics = repair_net_links[i];
746 0 : total_overrun += ovar_net_metrics[ MIDX( COUNTER, LINK, FRAG_READING_OVERRUN ) ];
747 0 : total_consumed += ovar_net_metrics[ MIDX( COUNTER, LINK, FRAG_CONSUMED ) ]; /* consumed is incremented after after_frag is called */
748 0 : }
749 0 : printf( " Outgoing requests overrun: %s\n", fmt_count( buf2, total_overrun ) );
750 0 : printf( " Outgoing requests consumed: %s\n", fmt_count( buf2, total_consumed ) );
751 :
752 0 : total_overrun = net_shred_links[0][ MIDX( COUNTER, LINK, FRAG_READING_OVERRUN ) ];
753 0 : total_consumed = 0UL;
754 0 : for( ulong i = 0UL; i < net_tile_cnt; i++ ) {
755 0 : volatile ulong * ovar_net_metrics = net_shred_links[i];
756 0 : total_overrun += ovar_net_metrics[ MIDX( COUNTER, LINK, FRAG_READING_OVERRUN ) ];
757 0 : total_consumed += ovar_net_metrics[ MIDX( COUNTER, LINK, FRAG_CONSUMED ) ]; /* shred frag filtering happens manually in after_frag, so no need to index every shred_tile. */
758 0 : }
759 :
760 0 : printf( " Incoming shreds overrun: %s\n", fmt_count( buf2, total_overrun ) );
761 0 : printf( " Incoming shreds consumed: %s\n", fmt_count( buf2, total_consumed ) );
762 :
763 0 : print_histogram_buckets( repair_metrics,
764 0 : MIDX( HISTOGRAM, REPAIR, RESPONSE_LATENCY_NANOS ),
765 0 : FD_METRICS_CONVERTER_NONE,
766 0 : FD_METRICS_HISTOGRAM_REPAIR_RESPONSE_LATENCY_NANOS_MIN,
767 0 : FD_METRICS_HISTOGRAM_REPAIR_RESPONSE_LATENCY_NANOS_MAX,
768 0 : "Response Latency" );
769 :
770 0 : printf(" Repair Peers: %lu\n", repair_metrics[ MIDX( COUNTER, REPAIR, PEER_REQUESTED ) ] );
771 0 : printf(" Shreds rejected (no stakes): %lu\n", shred_metrics[ MIDX( COUNTER, SHRED, SHRED_PROCESSED ) ] );
772 : /* Print histogram buckets similar to Prometheus format */
773 0 : print_histogram_buckets( repair_metrics,
774 0 : MIDX( HISTOGRAM, REPAIR, SLOT_COMPLETE_DURATION_SECONDS ),
775 0 : FD_METRICS_CONVERTER_SECONDS,
776 0 : FD_METRICS_HISTOGRAM_REPAIR_SLOT_COMPLETE_DURATION_SECONDS_MIN,
777 0 : FD_METRICS_HISTOGRAM_REPAIR_SLOT_COMPLETE_DURATION_SECONDS_MAX,
778 0 : "Slot Complete Time" );
779 :
780 0 : #define DIFFX(METRIC) repair_metrics[ MIDX( COUNTER, TILE, METRIC ) ] - repair_metrics_prev[ MIDX( COUNTER, TILE, METRIC ) ]
781 0 : ulong hkeep_ticks = DIFFX(REGIME_DURATION_NANOS_CAUGHT_UP_HOUSEKEEPING) + DIFFX(REGIME_DURATION_NANOS_PROCESSING_HOUSEKEEPING) + DIFFX(REGIME_DURATION_NANOS_BACKPRESSURE_HOUSEKEEPING);
782 0 : ulong busy_ticks = DIFFX(REGIME_DURATION_NANOS_PROCESSING_PREFRAG) + DIFFX(REGIME_DURATION_NANOS_PROCESSING_POSTFRAG ) + DIFFX(REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG);
783 0 : ulong caught_up_ticks = DIFFX(REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG);
784 0 : ulong backpressure_ticks = DIFFX(REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG);
785 0 : ulong total_ticks = hkeep_ticks + busy_ticks + caught_up_ticks + backpressure_ticks;
786 :
787 0 : printf( " Repair Hkeep: %.1f %% Busy: %.1f %% Idle: %.1f %% Backp: %0.1f %%\n",
788 0 : (double)hkeep_ticks/(double)total_ticks*100.0,
789 0 : (double)busy_ticks/(double)total_ticks*100.0,
790 0 : (double)caught_up_ticks/(double)total_ticks*100.0,
791 0 : (double)backpressure_ticks/(double)total_ticks*100.0 );
792 0 : #undef DIFFX
793 0 : fflush( stdout );
794 :
795 0 : printf( " Block failed insert: %lu\n", repair_metrics[ MIDX( COUNTER, REPAIR, BLOCK_INSERT_FAILED ) ] );
796 0 : printf( " Block evicted: %lu\n", repair_metrics[ MIDX( COUNTER, REPAIR, BLOCK_EVICTED ) ] );
797 0 : printf( " slot evicted: %lu\n", repair_metrics[ MIDX( GAUGE, REPAIR, SLOT_LAST_EVICTED ) ] );
798 0 : printf( " slot evicted by: %lu\n", repair_metrics[ MIDX( GAUGE, REPAIR, SLOT_LAST_EVICTION_CAUSE ) ] );
799 0 : printf( " slot failed insert: %lu\n", repair_metrics[ MIDX( GAUGE, REPAIR, SLOT_LAST_INSERT_FAILED ) ] );
800 0 : for( ulong i=0UL; i<FD_METRICS_TOTAL_SZ/sizeof(ulong); i++ ) repair_metrics_prev[ i ] = repair_metrics[ i ];
801 0 : }
802 :
803 : static void
804 : repair_ctx_wksp( args_t * args,
805 : config_t * config,
806 : ctx_t ** repair_ctx,
807 0 : fd_topo_wksp_t ** repair_wksp ) {
808 0 : (void)args;
809 :
810 0 : fd_topo_t * topo = &config->topo;
811 0 : ulong wksp_id = fd_topo_find_wksp( topo, "repair" );
812 0 : if( FD_UNLIKELY( wksp_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair workspace not found" ));
813 :
814 0 : fd_topo_wksp_t * _repair_wksp = &topo->workspaces[ wksp_id ];
815 :
816 0 : ulong tile_id = fd_topo_find_tile( topo, "repair", 0UL );
817 0 : if( FD_UNLIKELY( tile_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair tile not found" ));
818 :
819 0 : fd_topo_join_workspace( topo, _repair_wksp, FD_SHMEM_JOIN_MODE_READ_ONLY, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );
820 :
821 : /* Access the repair tile scratch memory where repair_tile_ctx is stored */
822 0 : fd_topo_tile_t * tile = &topo->tiles[ tile_id ];
823 0 : void * scratch = fd_topo_obj_laddr( &config->topo, tile->tile_obj_id );
824 0 : if( FD_UNLIKELY( !scratch ) ) FD_LOG_ERR(( "Failed to access repair tile scratch memory" ));
825 :
826 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
827 0 : ctx_t * _repair_ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(ctx_t), sizeof(ctx_t) );
828 :
829 0 : *repair_ctx = _repair_ctx;
830 0 : *repair_wksp = _repair_wksp;
831 0 : }
832 :
833 : static void
834 : repair_cmd_fn_catchup( args_t * args,
835 0 : config_t * config ) {
836 :
837 0 : memset( &config->topo, 0, sizeof(config->topo) );
838 0 : repair_topo( config );
839 :
840 0 : fd_topo_print_log( 1, &config->topo );
841 :
842 0 : args_t configure_args = {
843 0 : .configure.command = CONFIGURE_CMD_INIT,
844 0 : };
845 0 : for( ulong i=0UL; STAGES[ i ]; i++ ) {
846 0 : configure_args.configure.stages[ i ] = STAGES[ i ];
847 0 : }
848 0 : configure_cmd_fn( &configure_args, config );
849 0 : if( 0==strcmp( config->net.provider, "xdp" ) ) {
850 0 : fd_topo_install_xdp_simple( &config->topo, config->net.bind_address_parsed );
851 0 : }
852 0 : run_firedancer_init( config, 1, 0 );
853 :
854 0 : fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_WRITE, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );
855 :
856 0 : fd_topo_fill( &config->topo );
857 :
858 0 : repair_load_manifest( &config->topo, args->repair.manifest_path );
859 :
860 : /* Access repair workspace memory and metrics */
861 :
862 0 : ulong repair_tile_idx = fd_topo_find_tile( &config->topo, "repair", 0UL );
863 0 : ulong shred_tile_idx = fd_topo_find_tile( &config->topo, "shred", 0UL );
864 0 : FD_TEST( repair_tile_idx!=ULONG_MAX );
865 0 : FD_TEST( shred_tile_idx !=ULONG_MAX );
866 0 : fd_topo_tile_t * repair_tile = &config->topo.tiles[ repair_tile_idx ];
867 0 : fd_topo_tile_t * shred_tile = &config->topo.tiles[ shred_tile_idx ];
868 :
869 0 : fd_topo_wksp_t * repair_wksp;
870 0 : ctx_t * repair_ctx;
871 0 : repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
872 :
873 0 : volatile ulong * shred_metrics = fd_metrics_tile( shred_tile->metrics );
874 0 : volatile ulong * repair_metrics = fd_metrics_tile( repair_tile->metrics );
875 0 : FD_TEST( repair_metrics );
876 0 : ulong * repair_metrics_prev = aligned_alloc( 8UL, sizeof(ulong) * FD_METRICS_TOTAL_SZ );
877 0 : FD_TEST( repair_metrics_prev );
878 0 : memset( repair_metrics_prev, 0, sizeof(ulong) * FD_METRICS_TOTAL_SZ );
879 :
880 : /* Collect link metrics */
881 :
882 : /* Collect all net tiles and their repair_net link metrics */
883 0 : ulong net_cnt = config->layout.net_tile_count;
884 0 : volatile ulong ** repair_net_links = aligned_alloc( 8UL, net_cnt * sizeof(volatile ulong*) );
885 0 : volatile ulong ** net_shred_links = aligned_alloc( 8UL, net_cnt * sizeof(volatile ulong*) );
886 0 : FD_TEST( repair_net_links );
887 0 : FD_TEST( net_shred_links );
888 :
889 0 : for( ulong i = 0UL; i < net_cnt; i++ ) {
890 0 : ulong tile_idx = fd_topo_find_tile( &config->topo, "net", i );
891 0 : if( FD_UNLIKELY( tile_idx == ULONG_MAX ) ) FD_LOG_ERR(( "net tile %lu not found", i ));
892 0 : fd_topo_tile_t * tile = &config->topo.tiles[ tile_idx ];
893 :
894 0 : ulong repair_net_in_idx = fd_topo_find_tile_in_link( &config->topo, tile, "repair_net", 0UL );
895 0 : if( FD_UNLIKELY( repair_net_in_idx == ULONG_MAX ) ) FD_LOG_ERR(( "repair_net link not found for net tile %lu", i ));
896 0 : FD_TEST( tile->metrics );
897 0 : repair_net_links[i] = fd_metrics_link_in( tile->metrics, repair_net_in_idx );
898 0 : FD_TEST( repair_net_links[i] );
899 :
900 : /* process all net_shred links */
901 0 : ulong shred_tile_idx = fd_topo_find_tile( &config->topo, "shred", 0 );
902 0 : if( FD_UNLIKELY( shred_tile_idx == ULONG_MAX ) ) FD_LOG_ERR(( "shred tile 0 not found" ));
903 0 : fd_topo_tile_t * shred_tile = &config->topo.tiles[ shred_tile_idx ];
904 :
905 0 : ulong shred_out_in_idx = fd_topo_find_tile_in_link( &config->topo, shred_tile, "net_shred", i );
906 0 : if( FD_UNLIKELY( shred_out_in_idx == ULONG_MAX ) ) FD_LOG_ERR(( "net_shred link not found for shred tile 0" ));
907 0 : FD_TEST( shred_tile->metrics );
908 0 : net_shred_links[i] = fd_metrics_link_in( shred_tile->metrics, shred_out_in_idx );
909 0 : FD_TEST( net_shred_links[i] );
910 0 : }
911 :
912 0 : FD_LOG_NOTICE(( "Repair catchup run" ));
913 :
914 0 : ulong shred_out_link_idx = fd_topo_find_link( &config->topo, "shred_out", 0UL );
915 0 : FD_TEST( shred_out_link_idx!=ULONG_MAX );
916 0 : fd_topo_link_t * shred_out_link = &config->topo.links[ shred_out_link_idx ];
917 0 : fd_frag_meta_t * shred_out_mcache = shred_out_link->mcache;
918 0 : void * shred_out_dcache = config->topo.workspaces[ config->topo.objs[ shred_out_link->dcache_obj_id ].wksp_id ].wksp;
919 :
920 0 : ulong turbine_slot0 = 0;
921 0 : long last_print = fd_log_wallclock();
922 0 : ulong last_sent = 0UL;
923 :
924 0 : fd_topo_run_single_process( &config->topo, 0, config->uid, config->gid, fdctl_tile_run );
925 0 : for(;;) {
926 :
927 0 : if( FD_UNLIKELY( !turbine_slot0 ) ) {
928 0 : fd_frag_meta_t * frag = &shred_out_mcache[0]; /* hack to get first frag */
929 0 : if ( frag->sz > 0 ) {
930 0 : uchar * shred_out_chunk = fd_chunk_to_laddr( shred_out_dcache, frag->chunk );
931 0 : fd_shred_base_t * shred_out_shred = (fd_shred_base_t *)fd_type_pun( shred_out_chunk );
932 0 : turbine_slot0 = shred_out_shred->shred.slot;
933 0 : FD_LOG_NOTICE(("turbine_slot0: %lu", turbine_slot0));
934 0 : }
935 0 : }
936 :
937 : /* print metrics */
938 :
939 0 : long now = fd_log_wallclock();
940 0 : int catchup_finished = 0;
941 0 : if( FD_UNLIKELY( now - last_print > 1e9L ) ) {
942 0 : print_tile_metrics( shred_metrics, repair_metrics, repair_metrics_prev, repair_net_links, net_shred_links, net_cnt, &last_sent, last_print, now );
943 0 : ulong slots_behind = turbine_slot0 > repair_metrics[ MIDX( GAUGE, REPAIR, SLOT_HIGHEST_REPAIRED ) ] ? turbine_slot0 - repair_metrics[ MIDX( GAUGE, REPAIR, SLOT_HIGHEST_REPAIRED ) ] : 0;
944 0 : printf(" Repaired slots: %lu/%lu (slots behind: %lu)\n", repair_metrics[ MIDX( GAUGE, REPAIR, SLOT_HIGHEST_REPAIRED ) ], turbine_slot0, slots_behind );
945 0 : if( turbine_slot0 && !slots_behind ) {
946 0 : catchup_finished = 1;
947 0 : }
948 0 : printf("\n");
949 0 : fflush( stdout );
950 0 : last_print = now;
951 0 : }
952 :
953 0 : if( FD_UNLIKELY( catchup_finished ) ) {
954 : /* repair cmd owned memory */
955 0 : location_table = fd_location_table_join( fd_location_table_new( location_table_mem ) );
956 0 : read_iptable( args->repair.iptable_path, location_table );
957 0 : print_peer_location_latency( repair_wksp->wksp, repair_ctx );
958 0 : print_catchup_slots( repair_wksp->wksp, repair_ctx, 0, 1 );
959 0 : FD_LOG_NOTICE(("Catchup to slot %lu completed successfully", turbine_slot0));
960 0 : fd_sys_util_exit_group( 0 );
961 0 : }
962 0 : }
963 0 : }
964 :
965 : /* Tests equivocation detection & repair path. */
966 : static void
967 : repair_cmd_fn_eqvoc( args_t * args,
968 0 : config_t * config ) {
969 0 : (void)args;
970 0 : memset( &config->topo, 0, sizeof(config->topo) );
971 0 : repair_topo( config );
972 :
973 0 : FD_LOG_NOTICE(( "Repair eqvoc testing init" ));
974 0 : fd_topo_print_log( 1, &config->topo );
975 :
976 0 : args_t configure_args = { .configure.command = CONFIGURE_CMD_INIT, };
977 0 : for( ulong i=0UL; STAGES[ i ]; i++ ) configure_args.configure.stages[ i ] = STAGES[ i ];
978 0 : configure_cmd_fn( &configure_args, config );
979 0 : if( 0==strcmp( config->net.provider, "xdp" ) ) fd_topo_install_xdp_simple( &config->topo, config->net.bind_address_parsed );
980 :
981 0 : run_firedancer_init( config, 1, 0 );
982 0 : fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_WRITE, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );
983 0 : fd_topo_fill( &config->topo );
984 :
985 0 : repair_load_manifest( &config->topo, args->repair.manifest_path );
986 :
987 0 : ulong repair_tile_idx = fd_topo_find_tile( &config->topo, "repair", 0UL );
988 0 : fd_topo_tile_t * repair_tile = &config->topo.tiles[ repair_tile_idx ];
989 0 : volatile ulong * repair_metrics = fd_metrics_tile( repair_tile->metrics );
990 :
991 0 : void * scratch = fd_topo_obj_laddr( &config->topo, repair_tile->tile_obj_id );
992 0 : if( FD_UNLIKELY( !scratch ) ) FD_LOG_ERR(( "Failed to access repair tile scratch memory" ));
993 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
994 0 : ctx_t * repair_ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(ctx_t), sizeof(ctx_t) );
995 0 : (void)repair_ctx;
996 :
997 : /* read tower_out mcache dcache */
998 0 : ulong tower_out_link_idx = fd_topo_find_link( &config->topo, "tower_out", 0UL );
999 0 : FD_TEST( tower_out_link_idx!=ULONG_MAX );
1000 0 : fd_topo_link_t * tower_out_link = &config->topo.links[ tower_out_link_idx ];
1001 0 : fd_frag_meta_t * tower_out_mcache = tower_out_link->mcache;
1002 0 : fd_wksp_t * tower_out_mem = config->topo.workspaces[ config->topo.objs[ tower_out_link->dcache_obj_id ].wksp_id ].wksp;
1003 0 : ulong tower_out_chunk0 = fd_dcache_compact_chunk0( tower_out_mem, tower_out_link->dcache );
1004 0 : ulong tower_out_wmark = fd_dcache_compact_wmark( tower_out_mem, tower_out_link->dcache, tower_out_link->mtu );
1005 0 : ulong tower_out_chunk = tower_out_chunk0;
1006 :
1007 0 : fd_topo_run_single_process( &config->topo, 0, config->uid, config->gid, fdctl_tile_run );
1008 0 : int confirmed = 0;
1009 0 : for(;;) {
1010 : /* publish a confirmation on tower_out */
1011 0 : if( FD_UNLIKELY( !confirmed && repair_metrics[ MIDX( GAUGE, REPAIR, SLOT_HIGHEST_REPAIRED ) ] != 0 ) ) {
1012 0 : fd_tower_slot_confirmed_t * msg = fd_chunk_to_laddr( tower_out_mem, tower_out_chunk );
1013 0 : FD_LOG_NOTICE(( "publishing confirmation for slot %lu", msg->slot ));
1014 0 : fd_mcache_publish( tower_out_mcache, tower_out_link->depth, 0, FD_TOWER_SIG_SLOT_CONFIRMED, tower_out_chunk, sizeof(fd_tower_slot_confirmed_t), 0, 0, 0 );
1015 0 : tower_out_chunk = fd_dcache_compact_next( tower_out_chunk, sizeof(fd_tower_slot_confirmed_t), tower_out_chunk0, tower_out_wmark );
1016 0 : confirmed = 1;
1017 0 : }
1018 0 : sleep( 1 );
1019 0 : }
1020 0 : }
1021 :
1022 : static void
1023 : repair_cmd_fn_metrics( args_t * args,
1024 0 : config_t * config ) {
1025 : //memset( &config->topo, 0, sizeof(config->topo) );
1026 :
1027 0 : fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_ONLY, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );
1028 0 : fd_topo_fill( &config->topo );
1029 :
1030 0 : ctx_t * repair_ctx;
1031 0 : fd_topo_wksp_t * repair_wksp;
1032 0 : repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
1033 :
1034 0 : ulong shred_tile_idx = fd_topo_find_tile( &config->topo, "shred", 0UL );
1035 0 : ulong repair_tile_idx = fd_topo_find_tile( &config->topo, "repair", 0UL );
1036 0 : FD_TEST( shred_tile_idx != ULONG_MAX );
1037 0 : FD_TEST( repair_tile_idx!= ULONG_MAX );
1038 0 : fd_topo_tile_t * shred_tile = &config->topo.tiles[ shred_tile_idx ];
1039 0 : fd_topo_tile_t * repair_tile = &config->topo.tiles[ repair_tile_idx ];
1040 :
1041 0 : volatile ulong * shred_metrics = fd_metrics_tile( shred_tile->metrics );
1042 0 : FD_TEST( shred_metrics );
1043 :
1044 0 : volatile ulong * repair_metrics = fd_metrics_tile( repair_tile->metrics );
1045 0 : FD_TEST( repair_metrics );
1046 0 : ulong * repair_metrics_prev = aligned_alloc( 8UL, sizeof(ulong) * FD_METRICS_TOTAL_SZ );
1047 0 : FD_TEST( repair_metrics_prev );
1048 0 : memset( repair_metrics_prev, 0, sizeof(ulong) * FD_METRICS_TOTAL_SZ );
1049 :
1050 :
1051 0 : ulong net_tile_cnt = config->layout.net_tile_count;
1052 0 : volatile ulong ** repair_net_links = aligned_alloc( 8UL, net_tile_cnt * sizeof(volatile ulong*) );
1053 0 : volatile ulong ** net_shred_links = aligned_alloc( 8UL, net_tile_cnt * sizeof(volatile ulong*) );
1054 0 : FD_TEST( repair_net_links );
1055 0 : FD_TEST( net_shred_links );
1056 :
1057 0 : for( ulong i = 0UL; i < net_tile_cnt; i++ ) {
1058 : /* process all repair_net links */
1059 0 : ulong tile_idx = fd_topo_find_tile( &config->topo, "net", i );
1060 0 : if( FD_UNLIKELY( tile_idx == ULONG_MAX ) ) FD_LOG_ERR(( "net tile %lu not found", i ));
1061 0 : fd_topo_tile_t * tile = &config->topo.tiles[ tile_idx ];
1062 :
1063 0 : ulong repair_net_in_idx = fd_topo_find_tile_in_link( &config->topo, tile, "repair_net", 0UL );
1064 0 : if( FD_UNLIKELY( repair_net_in_idx == ULONG_MAX ) ) FD_LOG_ERR(( "repair_net link not found for net tile %lu", i ));
1065 0 : repair_net_links[i] = fd_metrics_link_in( tile->metrics, repair_net_in_idx );
1066 0 : FD_TEST( repair_net_links[i] );
1067 :
1068 : /* process all net_shred links */
1069 0 : tile_idx = fd_topo_find_tile( &config->topo, "shred", 0 );
1070 0 : if( FD_UNLIKELY( tile_idx == ULONG_MAX ) ) FD_LOG_ERR(( "shred tile 0 not found" ));
1071 0 : fd_topo_tile_t * shred_tile = &config->topo.tiles[ tile_idx ];
1072 :
1073 0 : ulong shred_out_in_idx = fd_topo_find_tile_in_link( &config->topo, shred_tile, "net_shred", i );
1074 0 : if( FD_UNLIKELY( shred_out_in_idx == ULONG_MAX ) ) FD_LOG_ERR(( "net_shred link not found for shred tile 0" ));
1075 0 : net_shred_links[i] = fd_metrics_link_in( shred_tile->metrics, shred_out_in_idx );
1076 0 : FD_TEST( net_shred_links[i] );
1077 0 : }
1078 :
1079 0 : long last_print_ts = fd_log_wallclock();
1080 0 : ulong last_sent = 0UL;
1081 0 : for(;;) {
1082 0 : long now = fd_log_wallclock();
1083 0 : if( FD_UNLIKELY( now - last_print_ts > 1e9L ) ) {
1084 0 : print_tile_metrics( shred_metrics, repair_metrics, repair_metrics_prev, repair_net_links, net_shred_links, net_tile_cnt, &last_sent, last_print_ts, now );
1085 0 : last_print_ts = now;
1086 0 : }
1087 0 : }
1088 0 : }
1089 :
1090 : static void
1091 : repair_cmd_fn_forest( args_t * args,
1092 0 : config_t * config ) {
1093 0 : ctx_t * repair_ctx;
1094 0 : fd_topo_wksp_t * repair_wksp;
1095 0 : repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
1096 :
1097 0 : ulong forest_gaddr = fd_wksp_gaddr_fast( repair_ctx->wksp, repair_ctx->forest );
1098 0 : fd_forest_t * forest = (fd_forest_t *)fd_wksp_laddr( repair_wksp->wksp, forest_gaddr );
1099 :
1100 0 : for( ;; ) {
1101 0 : fd_forest_print( forest );
1102 0 : sleep( 1 );
1103 0 : }
1104 0 : }
1105 :
1106 : static void
1107 : repair_cmd_fn_inflight( args_t * args,
1108 0 : config_t * config ) {
1109 0 : ctx_t * repair_ctx;
1110 0 : fd_topo_wksp_t * repair_wksp;
1111 0 : repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
1112 :
1113 0 : ulong inflights_gaddr = fd_wksp_gaddr_fast( repair_ctx->wksp, repair_ctx->inflights );
1114 0 : fd_inflights_t * inflights = (fd_inflights_t *)fd_wksp_laddr( repair_wksp->wksp, inflights_gaddr );
1115 :
1116 0 : ulong inflight_pool_off = (ulong)inflights->pool - (ulong)repair_ctx->inflights;
1117 0 : fd_inflight_t * inflight_pool = (fd_inflight_t *)fd_wksp_laddr( repair_wksp->wksp, inflights_gaddr + inflight_pool_off );
1118 :
1119 0 : for( ;; ) {
1120 0 : fd_inflights_print( inflights->outstanding_dl, inflight_pool );
1121 0 : printf("popped count: %lu\n", inflights->popped_cnt);
1122 0 : fd_inflights_print( inflights->popped_dl, inflight_pool );
1123 0 : sleep( 1 );
1124 0 : }
1125 0 : }
1126 :
1127 : static void
1128 : repair_cmd_fn_requests( args_t * args,
1129 0 : config_t * config ) {
1130 0 : ctx_t * repair_ctx;
1131 0 : fd_topo_wksp_t * repair_wksp;
1132 0 : repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
1133 :
1134 0 : fd_forest_t * forest = fd_forest_join( fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, repair_ctx->forest ) ) );
1135 0 : fd_forest_reqslist_t * dlist = fd_forest_reqslist( forest );
1136 0 : fd_forest_ref_t * pool = fd_forest_reqspool( forest );
1137 :
1138 0 : fd_forest_reqslist_t * orphlist = fd_forest_orphlist( forest );
1139 :
1140 0 : for( ;; ) {
1141 0 : printf("%-15s %-12s %-12s %-12s %-20s %-12s\n",
1142 0 : "Slot", "Buffered Idx", "Complete Idx", "First Shred ts", "Turbine Cnt", "Repair Cnt");
1143 0 : printf("%-15s %-12s %-12s %-12s %-20s %-12s\n",
1144 0 : "---------------", "------------", "------------", "------------",
1145 0 : "--------------------", "------------");
1146 0 : for( fd_forest_reqslist_iter_t iter = fd_forest_reqslist_iter_fwd_init( dlist, pool );
1147 0 : !fd_forest_reqslist_iter_done( iter, dlist, pool );
1148 0 : iter = fd_forest_reqslist_iter_fwd_next( iter, dlist, pool ) ) {
1149 0 : fd_forest_ref_t * req = fd_forest_reqslist_iter_ele( iter, dlist, pool );
1150 0 : fd_forest_blk_t * blk = fd_forest_pool_ele( fd_forest_pool( forest ), req->idx );
1151 :
1152 0 : printf("%-15lu %-12u %-12u %-20ld %-12u %-10u\n",
1153 0 : blk->slot,
1154 0 : blk->buffered_idx,
1155 0 : blk->complete_idx,
1156 0 : blk->first_shred_ts,
1157 0 : blk->turbine_cnt,
1158 0 : blk->repair_cnt);
1159 0 : }
1160 0 : printf("\n");
1161 :
1162 : /* now lets print the orphreqs */
1163 :
1164 0 : printf("Orphan Requests:\n");
1165 0 : printf("%-15s %-12s %-12s %-12s %-20s %-12s %-10s\n",
1166 0 : "Slot", "Consumed Idx", "Buffered Idx", "Complete Idx",
1167 0 : "First Shred Timestamp", "Turbine Cnt", "Repair Cnt");
1168 0 : printf("%-15s %-12s %-12s %-12s %-20s %-12s %-10s\n",
1169 0 : "---------------", "------------", "------------", "------------",
1170 0 : "--------------------", "------------", "----------");
1171 :
1172 0 : for( fd_forest_reqslist_iter_t iter = fd_forest_reqslist_iter_fwd_init( orphlist, pool );
1173 0 : !fd_forest_reqslist_iter_done( iter, orphlist, pool );
1174 0 : iter = fd_forest_reqslist_iter_fwd_next( iter, orphlist, pool ) ) {
1175 0 : fd_forest_ref_t * req = fd_forest_reqslist_iter_ele( iter, orphlist, pool );
1176 0 : fd_forest_blk_t * blk = fd_forest_pool_ele( fd_forest_pool( forest ), req->idx );
1177 0 : printf("%-15lu %-12u %-12u %-20ld %-12u %-10u\n",
1178 0 : blk->slot,
1179 0 : blk->buffered_idx,
1180 0 : blk->complete_idx,
1181 0 : blk->first_shred_ts,
1182 0 : blk->turbine_cnt,
1183 0 : blk->repair_cnt);
1184 0 : }
1185 0 : sleep( 1 );
1186 0 : }
1187 0 : }
1188 :
1189 : static void
1190 : repair_cmd_fn_waterfall( args_t * args,
1191 0 : config_t * config ) {
1192 :
1193 0 : fd_topo_t * topo = &config->topo;
1194 0 : ulong wksp_id = fd_topo_find_wksp( topo, "repair" );
1195 0 : if( FD_UNLIKELY( wksp_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair workspace not found" ));
1196 0 : fd_topo_wksp_t * repair_wksp = &topo->workspaces[ wksp_id ];
1197 0 : fd_topo_join_workspace( topo, repair_wksp, FD_SHMEM_JOIN_MODE_READ_ONLY, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );
1198 :
1199 : /* Access the repair tile scratch memory where repair_tile_ctx is stored */
1200 0 : ulong tile_id = fd_topo_find_tile( topo, "repair", 0UL );
1201 0 : if( FD_UNLIKELY( tile_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair tile not found" ));
1202 0 : fd_topo_tile_t * tile = &topo->tiles[ tile_id ];
1203 0 : void * scratch = fd_topo_obj_laddr( &config->topo, tile->tile_obj_id );
1204 0 : if( FD_UNLIKELY( !scratch ) ) FD_LOG_ERR(( "Failed to access repair tile scratch memory" ));
1205 :
1206 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1207 0 : ctx_t * repair_ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(ctx_t), sizeof(ctx_t) );
1208 :
1209 : /* catchup cmd owned memory */
1210 0 : location_table = fd_location_table_join( fd_location_table_new( location_table_mem ) );
1211 0 : read_iptable( args->repair.iptable_path, location_table );
1212 :
1213 : // Add terminal setup here - same as monitor.c
1214 0 : atexit( restore_terminal );
1215 0 : if( FD_UNLIKELY( 0!=tcgetattr( STDIN_FILENO, &termios_backup ) ) ) {
1216 0 : FD_LOG_ERR(( "tcgetattr(STDIN_FILENO) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
1217 0 : }
1218 :
1219 : /* Disable character echo and line buffering */
1220 0 : struct termios term = termios_backup;
1221 0 : term.c_lflag &= (tcflag_t)~(ICANON | ECHO);
1222 0 : if( FD_UNLIKELY( 0!=tcsetattr( STDIN_FILENO, TCSANOW, &term ) ) ) {
1223 0 : FD_LOG_WARNING(( "tcsetattr(STDIN_FILENO) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
1224 0 : }
1225 :
1226 0 : int catchup_verbose = 0;
1227 0 : long last_print = 0;
1228 0 : for( ;; ) {
1229 0 : int c = fd_getchar();
1230 0 : if( FD_UNLIKELY( c=='i' ) ) catchup_verbose = !catchup_verbose;
1231 0 : if( FD_UNLIKELY( c=='\x04' ) ) break; /* Ctrl-D */
1232 :
1233 0 : long now = fd_log_wallclock();
1234 0 : if( FD_UNLIKELY( now - last_print > 1e9L ) ) {
1235 0 : last_print = now;
1236 0 : print_catchup_slots( repair_wksp->wksp, repair_ctx, catchup_verbose, args->repair.sort_by_slot );
1237 0 : printf( "catchup slots | Use 'i' to toggle extra slot information" TEXT_NEWLINE );
1238 0 : fflush( stdout );
1239 :
1240 : /* Peer location latency is not that useful post catchup, and also
1241 : requires some concurrent dlist iteration, so only print it when
1242 : in catchup mode. */
1243 0 : }
1244 0 : }
1245 0 : }
1246 :
1247 0 : #define PEERS_DISPLAY_MAX 20
1248 :
1249 : static void
1250 : print_peer_dlist( fd_policy_peer_dlist_t * dlist,
1251 : fd_policy_peer_t * pool,
1252 : fd_policy_peer_dlist_iter_t cursor,
1253 0 : char const * label ) {
1254 0 : ulong cnt = 0;
1255 0 : for( fd_policy_peer_dlist_iter_t it = fd_policy_peer_dlist_iter_fwd_init( dlist, pool );
1256 0 : !fd_policy_peer_dlist_iter_done( it, dlist, pool );
1257 0 : it = fd_policy_peer_dlist_iter_fwd_next( it, dlist, pool ) ) cnt++;
1258 :
1259 0 : printf( "%s (%lu peers)\n", label, cnt );
1260 0 : if( !cnt || fd_policy_peer_dlist_iter_done( cursor, dlist, pool ) ) {
1261 0 : printf( " (empty or iterator not initialized)\n\n" );
1262 0 : return;
1263 0 : }
1264 :
1265 0 : printf( " | %-8s | %-12s | %-12s | %-8s | %-8s\n",
1266 0 : "Idx", "Pubkey", "Ewma Lat", "Avg Lat", "Req/Res" );
1267 0 : printf( "-----+----------+--------------+--------------+----------+---------\n" );
1268 :
1269 0 : fd_policy_peer_dlist_iter_t it = cursor;
1270 0 : for( ulong i = 0; i < PEERS_DISPLAY_MAX && i < cnt; i++ ) {
1271 0 : fd_policy_peer_t * peer = fd_policy_peer_dlist_iter_ele( it, dlist, pool );
1272 :
1273 0 : FD_BASE58_ENCODE_32_BYTES( peer->key.key, b58 );
1274 0 : char pubkey_short[13];
1275 0 : fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( pubkey_short ), b58, 12 ) );
1276 :
1277 0 : double avg_lat_ms = peer->res_cnt ? ((double)peer->total_lat / (double)peer->res_cnt) / 1e6 : 0.0;
1278 0 : double ewma_lat_ms = (double)peer->ewma_lat / 1e6;
1279 :
1280 0 : printf( " %s%c%s | %-8lu | %-12s | %9.3fms | %9.3fms | %lu/%lu\n",
1281 0 : i == 0 ? "\033[1;33m" : "",
1282 0 : i == 0 ? '>' : ' ',
1283 0 : i == 0 ? "\033[0m" : "",
1284 0 : fd_policy_peer_pool_idx( pool, peer ),
1285 0 : pubkey_short,
1286 0 : ewma_lat_ms,
1287 0 : avg_lat_ms,
1288 0 : peer->req_cnt,
1289 0 : peer->res_cnt );
1290 :
1291 0 : it = fd_policy_peer_dlist_iter_fwd_next( it, dlist, pool );
1292 0 : if( fd_policy_peer_dlist_iter_done( it, dlist, pool ) ) {
1293 0 : it = fd_policy_peer_dlist_iter_fwd_init( dlist, pool );
1294 0 : }
1295 0 : }
1296 0 : if( cnt > PEERS_DISPLAY_MAX ) printf( " ... (%lu more)\n", cnt - PEERS_DISPLAY_MAX );
1297 0 : printf( "\n" );
1298 0 : }
1299 :
1300 : static void
1301 : repair_cmd_fn_peers( args_t * args,
1302 0 : config_t * config ) {
1303 0 : ctx_t * repair_ctx;
1304 0 : fd_topo_wksp_t * repair_wksp;
1305 0 : repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
1306 :
1307 0 : fd_policy_t * policy = fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, repair_ctx->policy ) );
1308 :
1309 0 : fd_policy_peer_dlist_t * fast_dlist = fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, policy->peers.fast ) );
1310 0 : fd_policy_peer_dlist_t * slow_dlist = fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, policy->peers.slow ) );
1311 0 : fd_policy_peer_t * pool = fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, policy->peers.pool ) );
1312 :
1313 0 : long last_print = 0;
1314 0 : for( ;; ) {
1315 0 : long now = fd_log_wallclock();
1316 0 : if( FD_UNLIKELY( now - last_print > 1e9L ) ) {
1317 0 : last_print = now;
1318 0 : printf( "\033[2J\033[H" );
1319 :
1320 0 : char fast_label[64];
1321 0 : char slow_label[64];
1322 0 : snprintf( fast_label, sizeof(fast_label), "FAST PEERS (ewma < %ldms)", (long)(FD_POLICY_LATENCY_THRESH / 1e6L) );
1323 0 : snprintf( slow_label, sizeof(slow_label), "SLOW PEERS (ewma >= %ldms or no responses)", (long)(FD_POLICY_LATENCY_THRESH / 1e6L) );
1324 0 : print_peer_dlist( fast_dlist, pool, policy->peers.select.fast_iter, fast_label );
1325 0 : print_peer_dlist( slow_dlist, pool, policy->peers.select.slow_iter, slow_label );
1326 :
1327 0 : printf( "select cnt: %u / %u (fast per slow)\n", policy->peers.select.cnt, FD_POLICY_FAST_PER_SLOW );
1328 0 : printf( "pool used: %lu / %lu\n", fd_policy_peer_pool_used( pool ), fd_policy_peer_pool_max( pool ) );
1329 :
1330 0 : fflush( stdout );
1331 0 : }
1332 :
1333 0 : }
1334 0 : }
1335 :
1336 :
1337 :
1338 :
1339 : void
1340 : repair_cmd_args( int * pargc,
1341 : char *** pargv,
1342 0 : args_t * args ) {
1343 :
1344 : /* positional arg */
1345 :
1346 0 : args->repair.pos_arg = (*pargv)[0];
1347 0 : if( FD_UNLIKELY( !args->repair.pos_arg ) ) {
1348 0 : args->repair.help = 1;
1349 0 : return;
1350 0 : }
1351 0 : (*pargc)--;
1352 0 : (*pargv)++;
1353 :
1354 : /* required args */
1355 :
1356 0 : char const * manifest_path = fd_env_strip_cmdline_cstr ( pargc, pargv, "--manifest-path", NULL, NULL );
1357 :
1358 : /* optional args */
1359 :
1360 0 : char const * iptable_path = fd_env_strip_cmdline_cstr ( pargc, pargv, "--iptable", NULL, NULL );
1361 0 : ulong slot = fd_env_strip_cmdline_ulong ( pargc, pargv, "--slot", NULL, ULONG_MAX );
1362 0 : int sort_by_slot = fd_env_strip_cmdline_contains( pargc, pargv, "--sort-by-slot" );
1363 :
1364 0 : if( FD_UNLIKELY( !strcmp( args->repair.pos_arg, "catchup" ) && !manifest_path ) ) {
1365 0 : args->repair.help = 1;
1366 0 : return;
1367 0 : }
1368 :
1369 0 : fd_cstr_fini( fd_cstr_append_cstr_safe( fd_cstr_init( args->repair.manifest_path ), manifest_path, sizeof(args->repair.manifest_path)-1UL ) );
1370 0 : fd_cstr_fini( fd_cstr_append_cstr_safe( fd_cstr_init( args->repair.iptable_path ), iptable_path, sizeof(args->repair.iptable_path )-1UL ) );
1371 0 : args->repair.slot = slot;
1372 0 : args->repair.sort_by_slot = sort_by_slot;
1373 0 : }
1374 :
1375 : static void
1376 : repair_cmd_fn( args_t * args,
1377 0 : config_t * config ) {
1378 :
1379 0 : if( args->repair.help ) {
1380 0 : fd_action_help_print( &fd_action_repair );
1381 0 : return;
1382 0 : }
1383 :
1384 0 : if ( !strcmp( args->repair.pos_arg, "catchup" ) ) repair_cmd_fn_catchup ( args, config );
1385 0 : else if( !strcmp( args->repair.pos_arg, "eqvoc" ) ) repair_cmd_fn_eqvoc ( args, config );
1386 0 : else if( !strcmp( args->repair.pos_arg, "forest" ) ) repair_cmd_fn_forest ( args, config );
1387 0 : else if( !strcmp( args->repair.pos_arg, "inflight" ) ) repair_cmd_fn_inflight ( args, config );
1388 0 : else if( !strcmp( args->repair.pos_arg, "requests" ) ) repair_cmd_fn_requests ( args, config );
1389 0 : else if( !strcmp( args->repair.pos_arg, "waterfall" ) ) repair_cmd_fn_waterfall( args, config );
1390 0 : else if( !strcmp( args->repair.pos_arg, "peers" ) ) repair_cmd_fn_peers ( args, config );
1391 0 : else if( !strcmp( args->repair.pos_arg, "metrics" ) ) repair_cmd_fn_metrics ( args, config );
1392 0 : else fd_action_help_print( &fd_action_repair );
1393 0 : }
1394 :
1395 : static void
1396 0 : repair_args_help( fd_action_help_t * help ) {
1397 0 : fd_action_help_arg( help, "catchup", NULL, "Run a reduced topology that only repairs slots until catchup.\n"
1398 0 : "Requires --manifest-path; accepts --iptable and --sort-by-slot" );
1399 0 : fd_action_help_arg( help, "eqvoc", NULL, "Test equivocation detection and the repair path" );
1400 0 : fd_action_help_arg( help, "forest", NULL, "Print the repair forest. Accepts --slot to drill into a slot" );
1401 0 : fd_action_help_arg( help, "inflight", NULL, "Print the inflight repairs" );
1402 0 : fd_action_help_arg( help, "requests", NULL, "Print the queued repair requests" );
1403 0 : fd_action_help_arg( help, "waterfall", NULL, "Print a waterfall diagram of recent slot completion times and\n"
1404 0 : "response latencies. Accepts --iptable and --sort-by-slot" );
1405 0 : fd_action_help_arg( help, "peers", NULL, "Print the list of slow and fast repair peers" );
1406 0 : fd_action_help_arg( help, "metrics", NULL, "Print repair tile metrics in a digestible format" );
1407 0 : fd_action_help_arg( help, "--manifest-path", "<path>", "Path to manifest file (required by catchup)" );
1408 0 : fd_action_help_arg( help, "--iptable", "<path>", "Path to iptable file (catchup, waterfall)" );
1409 0 : fd_action_help_arg( help, "--slot", "<slot>", "Specific forest slot to drill into (forest)" );
1410 : fd_action_help_arg( help, "--sort-by-slot", NULL, "Sort results by slot (catchup, waterfall)" );
1411 0 : }
1412 :
1413 : action_t fd_action_repair = {
1414 : .name = "repair",
1415 : .args = repair_cmd_args,
1416 : .fn = repair_cmd_fn,
1417 : .perm = dev_cmd_perm,
1418 : .description = "Spawn a reduced topology for inspecting and profiling the repair tile",
1419 : .detail = "Boots a smaller Firedancer topology focused on the repair tile and runs the\n"
1420 : "requested subcommand to drive or inspect repair behavior. Pick one of the\n"
1421 : "subcommands below.",
1422 : .usage = "repair <catchup|eqvoc|forest|inflight|requests|waterfall|peers|metrics> [OPTIONS]",
1423 : .args_help = repair_args_help,
1424 : };
|