LCOV - code coverage report
Current view: top level - app/firedancer-dev/commands - repair.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 0 983 0.0 %
Date: 2026-06-29 05:51:35 Functions: 0 25 0.0 %

          Line data    Source code
       1             : /* The repair command spawns a smaller topology for profiling the repair
       2             :    tile.  This is a standalone application, and it can be run in mainnet,
       3             :    testnet and/or a private cluster. */
       4             : 
       5             : #include "../../../disco/net/fd_net_tile.h"
       6             : #include "../../../disco/tiles.h"
       7             : #include "../../../disco/topo/fd_topob.h"
       8             : #include "../../../disco/topo/fd_cpu_topo.h"
       9             : #include "../../../util/pod/fd_pod_format.h"
      10             : #include "../../../util/tile/fd_tile_private.h"
      11             : 
      12             : #include "../../firedancer/topology.h"
      13             : #include "../../shared/commands/configure/configure.h"
      14             : #include "../../shared/commands/run/run.h" /* initialize_workspaces */
      15             : #include "../../shared/fd_config.h" /* config_t */
      16             : #include "../../shared_dev/commands/dev.h"
      17             : #include "../../../disco/tiles.h"
      18             : #include "../../../disco/topo/fd_topob.h"
      19             : #include "../../../util/pod/fd_pod_format.h"
      20             : #include "../../../waltz/resolv/fd_io_readline.h"
      21             : #include "../../platform/fd_sys_util.h"
      22             : #include "../../shared/commands/monitor/helper.h"
      23             : #include "../../../disco/metrics/fd_metrics.h"
      24             : #include "../../../discof/restore/utils/fd_ssmanifest_parser.h"
      25             : #include "../../../flamenco/runtime/sysvar/fd_sysvar_epoch_schedule.h"
      26             : #include "../../../flamenco/stakes/fd_stake_weight.h"
      27             : #include "../../../flamenco/leaders/fd_leaders_base.h"
      28             : #include "../../../discof/repair/fd_repair_tile.c"
      29             : 
      30             : #include "gossip.h"
      31             : #include "core_subtopo.h"
      32             : 
      33             : #include <unistd.h> /* pause */
      34             : #include <fcntl.h>
      35             : #include <stdio.h>
      36             : #include <termios.h>
      37             : #include <errno.h>
      38             : 
      39             : extern action_t fd_action_repair;
      40             : 
      41             : struct fd_location_info {
      42             :   ulong ip4_addr;         /* for map key convenience */
      43             :   char location[ 128 ];
      44             : };
      45             : typedef struct fd_location_info fd_location_info_t;
      46             : 
      47             : #define MAP_NAME    fd_location_table
      48           0 : #define MAP_T       fd_location_info_t
      49           0 : #define MAP_KEY     ip4_addr
      50           0 : #define MAP_LG_SLOT_CNT 16
      51             : #define MAP_MEMOIZE 0
      52             : #include "../../../util/tmpl/fd_map.c"
      53             : 
      54             : uchar __attribute__((aligned(alignof(fd_location_info_t)))) location_table_mem[ sizeof(fd_location_info_t) * (1 << 16 ) ];
      55             : 
      56             : static struct termios termios_backup;
      57             : 
      58             : static void
      59           0 : restore_terminal( void ) {
      60           0 :   (void)tcsetattr( STDIN_FILENO, TCSANOW, &termios_backup );
      61           0 : }
      62             : 
      63             : extern fd_topo_obj_callbacks_t * CALLBACKS[];
      64             : 
      65             : fd_topo_run_tile_t
      66             : fdctl_tile_run( fd_topo_tile_t const * tile );
      67             : 
      68             : void
      69             : resolve_gossip_entrypoints( config_t * config );
      70             : 
      71           0 : #define MANIFEST_LOAD_MAX_SZ (2UL * FD_SHMEM_GIGANTIC_PAGE_SZ)
      72             : 
      73             : /* https://github.com/anza-xyz/agave/blob/v3.1.8/runtime/src/snapshot_bank_utils.rs#L632 */
      74             : static int
      75           0 : repair_verify_epoch_stakes( fd_snapshot_manifest_t const * manifest ) {
      76           0 :   fd_epoch_schedule_t epoch_schedule = (fd_epoch_schedule_t){
      77           0 :     .slots_per_epoch             = manifest->epoch_schedule_params.slots_per_epoch,
      78           0 :     .leader_schedule_slot_offset = manifest->epoch_schedule_params.leader_schedule_slot_offset,
      79           0 :     .warmup                      = manifest->epoch_schedule_params.warmup,
      80           0 :     .first_normal_epoch          = manifest->epoch_schedule_params.first_normal_epoch,
      81           0 :     .first_normal_slot           = manifest->epoch_schedule_params.first_normal_slot,
      82           0 :   };
      83             : 
      84           0 :   ulong min_required_epoch = fd_slot_to_epoch( &epoch_schedule, manifest->slot, NULL );
      85           0 :   ulong max_required_epoch = fd_slot_to_leader_schedule_epoch( &epoch_schedule, manifest->slot );
      86             : 
      87           0 :   for( ulong i=min_required_epoch; i<=max_required_epoch; i++ ) {
      88           0 :     int found = 0;
      89           0 :     for( ulong j=0UL; j<FD_EPOCH_STAKES_LEN; j++ ) {
      90           0 :       if( manifest->epoch_stakes[j].epoch==i ) {
      91           0 :         found = 1;
      92           0 :         break;
      93           0 :       }
      94           0 :     }
      95           0 :     if( FD_UNLIKELY( !found ) ) {
      96           0 :       FD_LOG_WARNING(( "stakes not found for epoch %lu in manifest", i ));
      97           0 :       return -1;
      98           0 :     }
      99           0 :   }
     100           0 :   return 0;
     101           0 : }
     102             : 
     103             : static inline ulong
     104             : repair_generate_epoch_info_msg( ulong                                       epoch,
     105             :                                 fd_epoch_schedule_t const *                 epoch_schedule,
     106             :                                 fd_snapshot_manifest_epoch_stakes_t const * epoch_stakes,
     107           0 :                                 ulong *                                     epoch_info_msg_out ) {
     108           0 :   fd_epoch_info_msg_t *    epoch_info_msg = (fd_epoch_info_msg_t *)fd_type_pun( epoch_info_msg_out );
     109           0 :   fd_vote_stake_weight_t * stake_weights  = fd_epoch_info_msg_stake_weights( epoch_info_msg );
     110             : 
     111           0 :   epoch_info_msg->epoch             = epoch;
     112           0 :   epoch_info_msg->start_slot        = fd_epoch_slot0( epoch_schedule, epoch );
     113           0 :   epoch_info_msg->slot_cnt          = fd_epoch_slot_cnt( epoch_schedule, epoch );
     114           0 :   epoch_info_msg->excluded_id_stake = 0UL;
     115             : 
     116           0 :   fd_memset( &epoch_info_msg->features, 0xFF, sizeof(fd_features_t) );
     117             : 
     118           0 :   ulong idx = 0UL;
     119           0 :   for( ulong i=0UL; i<epoch_stakes->vote_stakes_len; i++ ) {
     120           0 :     ulong stake = epoch_stakes->vote_stakes[ i ].stake;
     121           0 :     if( FD_UNLIKELY( !stake ) ) continue;
     122           0 :     stake_weights[ idx ].stake = stake;
     123           0 :     memcpy( stake_weights[ idx ].id_key.uc, epoch_stakes->vote_stakes[ i ].identity, sizeof(fd_pubkey_t) );
     124           0 :     memcpy( stake_weights[ idx ].vote_key.uc, epoch_stakes->vote_stakes[ i ].vote, sizeof(fd_pubkey_t) );
     125           0 :     idx++;
     126           0 :   }
     127           0 :   epoch_info_msg->staked_vote_cnt = idx;
     128           0 :   sort_vote_weights_by_stake_vote_inplace( stake_weights, idx );
     129             : 
     130           0 :   fd_stake_weight_t * id_weights = fd_epoch_info_msg_id_weights( epoch_info_msg );
     131           0 :   epoch_info_msg->staked_id_cnt = compute_id_weights_from_vote_weights( id_weights, stake_weights, epoch_info_msg->staked_vote_cnt );
     132           0 :   FD_TEST( idx<=MAX_SHRED_DESTS );
     133             : 
     134           0 :   epoch_info_msg->epoch_schedule = *epoch_schedule;
     135           0 :   return fd_epoch_info_msg_sz( epoch_info_msg->staked_vote_cnt, epoch_info_msg->staked_id_cnt );
     136           0 : }
     137             : 
     138             : /* repair_load_manifest loads the snapshot manifest from disk and
     139             :    pre-populates the snapin_manif and replay_epoch dcache links so
     140             :    that consumer tiles see the data on their first poll cycle. */
     141             : static void
     142             : repair_load_manifest( fd_topo_t *  topo,
     143           0 :                       char const * manifest_path ) {
     144           0 :   if( FD_UNLIKELY( !manifest_path || !manifest_path[0] ) ) return;
     145             : 
     146             :   /* Parse manifest */
     147             : 
     148           0 :   int fd = open( manifest_path, O_RDONLY );
     149           0 :   if( FD_UNLIKELY( fd<0 ) ) FD_LOG_ERR(( "open(%s) failed (%d-%s)", manifest_path, errno, fd_io_strerror( errno ) ));
     150             : 
     151           0 :   fd_snapshot_manifest_t * manifest = aligned_alloc( alignof(fd_snapshot_manifest_t), sizeof(fd_snapshot_manifest_t) );
     152           0 :   FD_TEST( manifest );
     153           0 :   for( ulong i=0UL; i<FD_EPOCH_STAKES_LEN; i++ ) manifest->epoch_stakes[i].epoch = ULONG_MAX;
     154             : 
     155           0 :   uchar * buf = aligned_alloc( 128UL, MANIFEST_LOAD_MAX_SZ );
     156           0 :   FD_TEST( buf );
     157           0 :   ulong buf_sz = 0;
     158           0 :   FD_TEST( !fd_io_read( fd, buf, 0UL, MANIFEST_LOAD_MAX_SZ-1UL, &buf_sz ) );
     159           0 :   close( fd );
     160             : 
     161           0 :   fd_ssmanifest_parser_t * parser = fd_ssmanifest_parser_join( fd_ssmanifest_parser_new(
     162           0 :       aligned_alloc( fd_ssmanifest_parser_align(), fd_ssmanifest_parser_footprint() ) ) );
     163           0 :   FD_TEST( parser );
     164           0 :   fd_ssmanifest_parser_init( parser, manifest );
     165           0 :   int parser_err = fd_ssmanifest_parser_consume( parser, buf, buf_sz );
     166           0 :   FD_TEST( parser_err!=FD_SSMANIFEST_PARSER_ADVANCE_ERROR );
     167           0 :   FD_TEST( fd_ssmanifest_parser_fini( parser )==FD_SSMANIFEST_PARSER_ADVANCE_DONE );
     168           0 :   free( parser );
     169           0 :   free( buf );
     170             : 
     171           0 :   FD_LOG_NOTICE(( "manifest bank slot %lu", manifest->slot ));
     172           0 :   FD_TEST( !repair_verify_epoch_stakes( manifest ) );
     173             : 
     174             :   /* Update root_slot fseq */
     175             : 
     176           0 :   ulong root_slot_obj_id = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "root_slot" );
     177           0 :   if( FD_LIKELY( root_slot_obj_id!=ULONG_MAX ) ) {
     178           0 :     ulong * root_fseq = fd_fseq_join( fd_topo_obj_laddr( topo, root_slot_obj_id ) );
     179           0 :     FD_TEST( root_fseq );
     180           0 :     fd_fseq_update( root_fseq, manifest->slot );
     181           0 :   }
     182             : 
     183             :   /* Publish manifest to snapin_manif dcache */
     184             : 
     185           0 :   ulong snap_link_idx = fd_topo_find_link( topo, "snapin_manif", 0UL );
     186           0 :   FD_TEST( snap_link_idx!=ULONG_MAX );
     187           0 :   fd_topo_link_t * snap_link = &topo->links[ snap_link_idx ];
     188           0 :   fd_wksp_t *      snap_mem  = topo->workspaces[ topo->objs[ snap_link->dcache_obj_id ].wksp_id ].wksp;
     189           0 :   ulong snap_chunk0 = fd_dcache_compact_chunk0( snap_mem, snap_link->dcache );
     190           0 :   ulong snap_wmark  = fd_dcache_compact_wmark ( snap_mem, snap_link->dcache, snap_link->mtu );
     191           0 :   ulong snap_chunk  = snap_chunk0;
     192             : 
     193           0 :   uchar * snap_dst = fd_chunk_to_laddr( snap_mem, snap_chunk );
     194           0 :   memcpy( snap_dst, manifest, sizeof(fd_snapshot_manifest_t) );
     195           0 :   fd_mcache_publish( snap_link->mcache, snap_link->depth, 0UL,
     196           0 :                      fd_ssmsg_sig( FD_SSMSG_MANIFEST_INCREMENTAL ),
     197           0 :                      snap_chunk, sizeof(fd_snapshot_manifest_t), 0UL, 0UL, 0UL );
     198           0 :   snap_chunk = fd_dcache_compact_next( snap_chunk, sizeof(fd_snapshot_manifest_t), snap_chunk0, snap_wmark );
     199             : 
     200           0 :   fd_mcache_publish( snap_link->mcache, snap_link->depth, 1UL,
     201           0 :                      fd_ssmsg_sig( FD_SSMSG_DONE ), 0UL, 0UL, 0UL, 0UL, 0UL );
     202             : 
     203             :   /* Publish epoch stake weights to replay_epoch dcache */
     204             : 
     205           0 :   ulong epoch_link_idx = fd_topo_find_link( topo, "replay_epoch", 0UL );
     206           0 :   FD_TEST( epoch_link_idx!=ULONG_MAX );
     207           0 :   fd_topo_link_t * epoch_link = &topo->links[ epoch_link_idx ];
     208           0 :   fd_wksp_t *      epoch_mem  = topo->workspaces[ topo->objs[ epoch_link->dcache_obj_id ].wksp_id ].wksp;
     209           0 :   ulong epoch_chunk0 = fd_dcache_compact_chunk0( epoch_mem, epoch_link->dcache );
     210           0 :   ulong epoch_wmark  = fd_dcache_compact_wmark ( epoch_mem, epoch_link->dcache, epoch_link->mtu );
     211           0 :   ulong epoch_chunk  = epoch_chunk0;
     212           0 :   ulong epoch_seq    = 0UL;
     213             : 
     214             :   /* Construct fd_epoch_schedule_t field-by-field rather than type-punning
     215             :      from the unpacked manifest struct (fd_epoch_schedule_t is packed). */
     216           0 :   fd_epoch_schedule_t schedule_local;
     217           0 :   schedule_local.slots_per_epoch             = manifest->epoch_schedule_params.slots_per_epoch;
     218           0 :   schedule_local.leader_schedule_slot_offset = manifest->epoch_schedule_params.leader_schedule_slot_offset;
     219           0 :   schedule_local.warmup                      = manifest->epoch_schedule_params.warmup;
     220           0 :   schedule_local.first_normal_epoch          = manifest->epoch_schedule_params.first_normal_epoch;
     221           0 :   schedule_local.first_normal_slot           = manifest->epoch_schedule_params.first_normal_slot;
     222           0 :   fd_epoch_schedule_t const * schedule = &schedule_local;
     223           0 :   ulong epoch = fd_slot_to_epoch( schedule, manifest->slot, NULL );
     224             : 
     225           0 :   ulong epoch_stakes_base      = epoch > 0UL ? epoch - 1UL : 0UL;
     226           0 :   ulong leader_schedule_epoch  = fd_slot_to_leader_schedule_epoch( schedule, manifest->slot );
     227           0 :   ulong cur_idx = epoch - epoch_stakes_base;
     228           0 :   FD_TEST( cur_idx < FD_EPOCH_STAKES_LEN );
     229             : 
     230           0 :   ulong * epoch_dst = fd_chunk_to_laddr( epoch_mem, epoch_chunk );
     231           0 :   ulong epoch_sz = repair_generate_epoch_info_msg( epoch, schedule, &manifest->epoch_stakes[cur_idx], epoch_dst );
     232           0 :   fd_mcache_publish( epoch_link->mcache, epoch_link->depth, epoch_seq,
     233           0 :                      4UL, epoch_chunk, epoch_sz, 0UL, 0UL, fd_frag_meta_ts_comp( fd_tickcount() ) );
     234           0 :   epoch_chunk = fd_dcache_compact_next( epoch_chunk, epoch_sz, epoch_chunk0, epoch_wmark );
     235           0 :   epoch_seq++;
     236           0 :   FD_LOG_NOTICE(( "sending current epoch stake weights - epoch: %lu", epoch ));
     237             : 
     238           0 :   if( leader_schedule_epoch >= epoch + 1UL ) {
     239           0 :     ulong next_idx = epoch + 1UL - epoch_stakes_base;
     240           0 :     FD_TEST( next_idx < FD_EPOCH_STAKES_LEN );
     241             : 
     242           0 :     epoch_dst = fd_chunk_to_laddr( epoch_mem, epoch_chunk );
     243           0 :     epoch_sz = repair_generate_epoch_info_msg( epoch + 1UL, schedule, &manifest->epoch_stakes[next_idx], epoch_dst );
     244           0 :     fd_mcache_publish( epoch_link->mcache, epoch_link->depth, epoch_seq,
     245           0 :                        4UL, epoch_chunk, epoch_sz, 0UL, 0UL, fd_frag_meta_ts_comp( fd_tickcount() ) );
     246           0 :     epoch_chunk = fd_dcache_compact_next( epoch_chunk, epoch_sz, epoch_chunk0, epoch_wmark );
     247           0 :     epoch_seq++;
     248           0 :     FD_LOG_NOTICE(( "sending next epoch stake weights - epoch: %lu", epoch + 1UL ));
     249           0 :   }
     250           0 :   (void)epoch_chunk;
     251             : 
     252           0 :   free( manifest );
     253           0 : }
     254             : 
     255             : /* repair_topo is a subset of "src/app/firedancer/topology.c" at commit
     256             :    0d8386f4f305bb15329813cfe4a40c3594249e96, slightly modified to work
     257             :    as a repair catchup.  TODO ideally, one should invoke the firedancer
     258             :    topology first, and exclude the parts that are not needed, instead of
     259             :    manually generating new topologies for every command.  This would
     260             :    also guarantee that the catchup is replicating (as close as possible)
     261             :    the full topology. */
     262             : static void
     263           0 : repair_topo( config_t * config ) {
     264           0 :   resolve_gossip_entrypoints( config );
     265             : 
     266           0 :   ulong net_tile_cnt    = config->layout.net_tile_count;
     267           0 :   ulong shred_tile_cnt  = config->layout.shred_tile_count;
     268           0 :   ulong quic_tile_cnt   = config->layout.quic_tile_count;
     269           0 :   ulong sign_tile_cnt   = config->firedancer.layout.sign_tile_count;
     270           0 :   ulong gossvf_tile_cnt = config->firedancer.layout.gossvf_tile_count;
     271             : 
     272           0 :   fd_topo_t * topo = { fd_topob_new( &config->topo, config->name ) };
     273           0 :   topo->max_page_size = fd_cstr_to_shmem_page_sz( config->hugetlbfs.max_page_size );
     274           0 :   topo->gigantic_page_threshold = config->hugetlbfs.gigantic_page_threshold_mib << 20;
     275             : 
     276           0 :   ulong tile_to_cpu[ FD_TILE_MAX ] = {0};
     277           0 :   ushort parsed_tile_to_cpu[ FD_TILE_MAX ];
     278             :   /* Unassigned tiles will be floating, unless auto topology is enabled. */
     279           0 :   for( ulong i=0UL; i<FD_TILE_MAX; i++ ) parsed_tile_to_cpu[ i ] = USHORT_MAX;
     280             : 
     281           0 :   int is_auto_affinity = !strcmp( config->layout.affinity, "auto" );
     282           0 :   int is_bench_auto_affinity = !strcmp( config->development.bench.affinity, "auto" );
     283             : 
     284           0 :   if( FD_UNLIKELY( is_auto_affinity != is_bench_auto_affinity ) ) {
     285           0 :     FD_LOG_ERR(( "The CPU affinity string in the configuration file under [layout.affinity] and [development.bench.affinity] must all be set to 'auto' or all be set to a specific CPU affinity string." ));
     286           0 :   }
     287             : 
     288           0 :   fd_topo_cpus_t cpus[1];
     289           0 :   fd_topo_cpus_init( cpus );
     290             : 
     291           0 :   ulong affinity_tile_cnt = 0UL;
     292           0 :   if( FD_LIKELY( !is_auto_affinity ) ) affinity_tile_cnt = fd_tile_private_cpus_parse( config->layout.affinity, parsed_tile_to_cpu );
     293             : 
     294           0 :   for( ulong i=0UL; i<affinity_tile_cnt; i++ ) {
     295           0 :     if( FD_UNLIKELY( parsed_tile_to_cpu[ i ]!=USHORT_MAX && parsed_tile_to_cpu[ i ]>=cpus->cpu_cnt ) )
     296           0 :       FD_LOG_ERR(( "The CPU affinity string in the configuration file under [layout.affinity] specifies a CPU index of %hu, but the system "
     297           0 :                   "only has %lu CPUs. You should either change the CPU allocations in the affinity string, or increase the number of CPUs "
     298           0 :                   "in the system.",
     299           0 :                   parsed_tile_to_cpu[ i ], cpus->cpu_cnt ));
     300           0 :     tile_to_cpu[ i ] = fd_ulong_if( parsed_tile_to_cpu[ i ]==USHORT_MAX, ULONG_MAX, (ulong)parsed_tile_to_cpu[ i ] );
     301           0 :   }
     302             : 
     303           0 :   fd_core_subtopo(   config, tile_to_cpu );
     304           0 :   fd_gossip_subtopo( config, tile_to_cpu );
     305             : 
     306             :   /*             topo, name */
     307           0 :   fd_topob_wksp( topo, "net_shred"    );
     308           0 :   fd_topob_wksp( topo, "net_repair"   );
     309           0 :   fd_topob_wksp( topo, "net_quic"     );
     310             : 
     311           0 :   fd_topob_wksp( topo, "shred_out"    );
     312           0 :   fd_topob_wksp( topo, "replay_epoch" );
     313             : 
     314           0 :   fd_topob_wksp( topo, "poh_shred"    );
     315             : 
     316           0 :   fd_topob_wksp( topo, "shred_sign"   );
     317           0 :   fd_topob_wksp( topo, "sign_shred"   );
     318             : 
     319           0 :   fd_topob_wksp( topo, "repair_sign"  );
     320           0 :   fd_topob_wksp( topo, "sign_repair"  );
     321           0 :   fd_topob_wksp( topo, "rnonce"       );
     322           0 :   fd_topob_wksp( topo, "repair_out"  );
     323             : 
     324           0 :   fd_topob_wksp( topo, "txsend_out"   );
     325             : 
     326           0 :   fd_topob_wksp( topo, "shred"        );
     327           0 :   fd_topob_wksp( topo, "repair"       );
     328           0 :   fd_topob_wksp( topo, "fec_sets"     );
     329           0 :   fd_topob_wksp( topo, "snapin_manif" );
     330             : 
     331           0 :   fd_topob_wksp( topo, "genesi_out"   ); /* mock genesi_out for ipecho */
     332             : 
     333           0 :   fd_topob_wksp( topo, "tower_out"    ); /* mock tower_out for confirmation msgs. Not needed for any topo except eqvoc. */
     334             : 
     335           0 :   #define FOR(cnt) for( ulong i=0UL; i<cnt; i++ )
     336             : 
     337           0 :   ulong pending_fec_shreds_depth = fd_ulong_min( fd_ulong_pow2_up( config->tiles.shred.max_pending_shred_sets * FD_REEDSOL_DATA_SHREDS_MAX ), USHORT_MAX + 1 /* dcache max */ );
     338             : 
     339             :   /*                                  topo, link_name,      wksp_name,      depth,                                    mtu,                           burst */
     340           0 :   FOR(quic_tile_cnt)   fd_topob_link( topo, "quic_net",     "net_quic",     config->net.ingress_buffer_size,          FD_NET_MTU,                    1UL );
     341           0 :   FOR(shred_tile_cnt)  fd_topob_link( topo, "shred_net",    "net_shred",    config->net.ingress_buffer_size,          FD_NET_MTU,                    1UL );
     342             : 
     343           0 :   /**/                 fd_topob_link( topo, "replay_epoch", "replay_epoch", 128UL,                                    FD_EPOCH_OUT_MTU,              1UL );
     344             : 
     345           0 :   FOR(shred_tile_cnt)  fd_topob_link( topo, "shred_sign",   "shred_sign",   128UL,                                    32UL,                          1UL );
     346           0 :   FOR(shred_tile_cnt)  fd_topob_link( topo, "sign_shred",   "sign_shred",   128UL,                                    64UL,                          1UL );
     347             : 
     348           0 :   /**/                 fd_topob_link( topo, "repair_net",   "net_repair",   config->net.ingress_buffer_size,          FD_NET_MTU,                    1UL );
     349             : 
     350           0 :   FOR(shred_tile_cnt)  fd_topob_link( topo, "shred_out",    "shred_out",    pending_fec_shreds_depth,                 sizeof(fd_shred_message_t),    2UL /* at most 2 msgs per after_frag */ );
     351           0 :   FOR(sign_tile_cnt-1) fd_topob_link( topo, "repair_sign",  "repair_sign",  256UL,                                    FD_REPAIR_MAX_PREIMAGE_SZ,     1UL );
     352           0 :   FOR(sign_tile_cnt-1) fd_topob_link( topo, "sign_repair",  "sign_repair",  128UL,                                    sizeof(fd_ed25519_sig_t),      1UL );
     353             : 
     354             :   /**/                 fd_topob_link( topo, "repair_out",   "repair_out",   128UL,                                    sizeof(fd_fec_complete_t),   1UL );
     355             : 
     356           0 :   /**/                 fd_topob_link( topo, "poh_shred",    "poh_shred",    16384UL,                                  USHORT_MAX,                    1UL );
     357             : 
     358           0 :   /**/                 fd_topob_link( topo, "txsend_out",   "txsend_out",   128UL,                                    FD_TXN_MTU,                    1UL );
     359             : 
     360             :   /**/                 fd_topob_link( topo, "snapin_manif", "snapin_manif", 2UL,                                      sizeof(fd_snapshot_manifest_t),1UL );
     361             : 
     362           0 :   /**/                 fd_topob_link( topo, "genesi_out",   "genesi_out",   1UL,                                      FD_GENESIS_TILE_MTU,            1UL );
     363           0 :   /**/                 fd_topob_link( topo, "tower_out",    "tower_out",    1024UL,                                   sizeof(fd_tower_msg_t),         1UL );
     364             : 
     365           0 :   FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_repair", i, config->net.ingress_buffer_size );
     366           0 :   FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_quic",   i, config->net.ingress_buffer_size );
     367           0 :   FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_shred",  i, config->net.ingress_buffer_size );
     368             : 
     369             :   /*                                              topo, tile_name, tile_wksp, metrics_wksp, cpu_idx,                       is_agave, uses_id_keyswitch, uses_av_keyswitch */
     370           0 :   FOR(shred_tile_cnt)              fd_topob_tile( topo, "shred",   "shred",   "metric_in",  tile_to_cpu[ topo->tile_cnt ], 0,        1,                 0 );
     371           0 :   fd_topo_tile_t * repair_tile =   fd_topob_tile( topo, "repair",  "repair",  "metric_in",  tile_to_cpu[ topo->tile_cnt ], 0,        1,                 0 );
     372             : 
     373             :   /* Setup a shared wksp object for fec sets. */
     374             : 
     375           0 :   ulong shred_depth = 65536UL; /* from fdctl/topology.c shred_store link. MAKE SURE TO KEEP IN SYNC. */
     376           0 :   ulong fec_set_cnt = 2UL*shred_depth + config->tiles.shred.max_pending_shred_sets + 6UL;
     377           0 :   ulong fec_sets_sz = fec_set_cnt*sizeof(fd_fec_set_t); /* mirrors # of dcache entires in frankendancer */
     378           0 :   fd_topo_obj_t * fec_sets_obj = setup_topo_fec_sets( topo, "fec_sets", shred_tile_cnt*fec_sets_sz );
     379           0 :   for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
     380           0 :     fd_topo_tile_t * shred_tile = &topo->tiles[ fd_topo_find_tile( topo, "shred", i ) ];
     381           0 :     fd_topob_tile_uses( topo, shred_tile,  fec_sets_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
     382           0 :   }
     383           0 :   fd_topob_tile_uses( topo, repair_tile, fec_sets_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
     384           0 :   FD_TEST( fd_pod_insertf_ulong( topo->props, fec_sets_obj->id, "fec_sets" ) );
     385             : 
     386             :   /* There's another special fseq that's used to communicate the shred
     387             :     version from the Agave boot path to the shred tile. */
     388           0 :   fd_topo_obj_t * poh_shred_obj = fd_topob_obj( topo, "fseq", "poh_shred" );
     389           0 :   fd_topo_tile_t * poh_tile = &topo->tiles[ fd_topo_find_tile( topo, "gossip", 0UL ) ];
     390           0 :   fd_topob_tile_uses( topo, poh_tile, poh_shred_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
     391             : 
     392           0 :   for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
     393           0 :     fd_topo_tile_t * shred_tile = &topo->tiles[ fd_topo_find_tile( topo, "shred", i ) ];
     394           0 :     fd_topob_tile_uses( topo, shred_tile, poh_shred_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
     395           0 :   }
     396           0 :   FD_TEST( fd_pod_insertf_ulong( topo->props, poh_shred_obj->id, "poh_shred" ) );
     397             : 
     398           0 :   if( FD_LIKELY( !is_auto_affinity ) ) {
     399           0 :     if( FD_UNLIKELY( affinity_tile_cnt<topo->tile_cnt ) )
     400           0 :       FD_LOG_ERR(( "The topology you are using has %lu tiles, but the CPU affinity specified in the config tile as [layout.affinity] only provides for %lu cores. "
     401           0 :                   "You should either increase the number of cores dedicated to Firedancer in the affinity string, or decrease the number of cores needed by reducing "
     402           0 :                   "the total tile count. You can reduce the tile count by decreasing individual tile counts in the [layout] section of the configuration file.",
     403           0 :                   topo->tile_cnt, affinity_tile_cnt ));
     404           0 :     if( FD_UNLIKELY( affinity_tile_cnt>topo->tile_cnt ) )
     405           0 :       FD_LOG_WARNING(( "The topology you are using has %lu tiles, but the CPU affinity specified in the config tile as [layout.affinity] provides for %lu cores. "
     406           0 :                       "Not all cores in the affinity will be used by Firedancer. You may wish to increase the number of tiles in the system by increasing "
     407           0 :                       "individual tile counts in the [layout] section of the configuration file.",
     408           0 :                       topo->tile_cnt, affinity_tile_cnt ));
     409           0 :   }
     410             : 
     411             :   /*                                      topo, tile_name, tile_kind_id, fseq_wksp,   link_name,      link_kind_id,  reliable,            polled */
     412           0 :   for( ulong j=0UL; j<shred_tile_cnt; j++ )
     413           0 :                   fd_topos_tile_in_net(  topo,                           "metric_in", "shred_net",     j,            FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
     414           0 :   for( ulong j=0UL; j<quic_tile_cnt; j++ )
     415           0 :                   {fd_topos_tile_in_net(  topo,                          "metric_in", "quic_net",      j,            FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );} /* No reliable consumers of networking fragments, may be dropped or overrun */
     416             : 
     417           0 :   /**/            fd_topob_tile_in(      topo, "gossip",  0UL,           "metric_in", "txsend_out",    0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED );
     418             : 
     419           0 :   /**/            fd_topos_tile_in_net(  topo,                           "metric_in", "repair_net",    0UL,          FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
     420             : 
     421           0 :   FOR(shred_tile_cnt) for( ulong j=0UL; j<net_tile_cnt; j++ )
     422           0 :                        fd_topob_tile_in(  topo, "shred",  i,             "metric_in", "net_shred",     j,            FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
     423           0 :   FOR(shred_tile_cnt)  fd_topob_tile_in(  topo, "shred",  i,             "metric_in", "poh_shred",     0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED );
     424           0 :   FOR(shred_tile_cnt)  fd_topob_tile_in(  topo, "shred",  i,             "metric_in", "replay_epoch",  0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED );
     425           0 :   FOR(shred_tile_cnt)  fd_topob_tile_in(  topo, "shred",  i,             "metric_in", "gossip_out",    0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED );
     426           0 :   FOR(shred_tile_cnt)  fd_topob_tile_out( topo, "shred",  i,                          "shred_out",     i                                                    );
     427           0 :   FOR(shred_tile_cnt)  fd_topob_tile_out( topo, "shred",  i,                          "shred_net",     i                                                    );
     428           0 :   FOR(shred_tile_cnt)  fd_topob_tile_in ( topo, "shred",  i,             "metric_in", "ipecho_out",    0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED );
     429             : 
     430             :   /**/                 fd_topob_tile_out( topo, "repair",  0UL,                       "repair_net",    0UL                                                  );
     431             : 
     432             :   /* Sign links don't need to be reliable because they are synchronous,
     433             :     so there's at most one fragment in flight at a time anyway.  The
     434             :     sign links are also not polled by the mux, instead the tiles will
     435             :     read the sign responses out of band in a dedicated spin loop. */
     436           0 :   for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
     437           0 :     /**/               fd_topob_tile_in(  topo, "sign",   0UL,           "metric_in", "shred_sign",    i,            FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED   );
     438           0 :     /**/               fd_topob_tile_out( topo, "shred",  i,                          "shred_sign",    i                                                    );
     439           0 :     /**/               fd_topob_tile_in(  topo, "shred",  i,             "metric_in", "sign_shred",    i,            FD_TOPOB_UNRELIABLE, FD_TOPOB_UNPOLLED );
     440           0 :     /**/               fd_topob_tile_out( topo, "sign",   0UL,                        "sign_shred",    i                                                    );
     441           0 :   }
     442           0 :   FOR(gossvf_tile_cnt) fd_topob_tile_in ( topo, "gossvf",   i,           "metric_in", "replay_epoch",  0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     443             : 
     444           0 :   /**/                 fd_topob_tile_in ( topo, "gossip",   0UL,         "metric_in", "replay_epoch",  0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     445             : 
     446           0 :   FOR(net_tile_cnt)    fd_topob_tile_in(  topo, "repair",  0UL,          "metric_in", "net_repair",    i,            FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED   ); /* No reliable consumers of networking fragments, may be dropped or overrun */
     447           0 :   /**/                 fd_topob_tile_in(  topo, "repair",  0UL,          "metric_in", "gossip_out",    0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     448           0 :                        fd_topob_tile_in(  topo, "repair",  0UL,          "metric_in", "snapin_manif",  0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     449           0 :   FOR(shred_tile_cnt)  fd_topob_tile_in(  topo, "repair",  0UL,          "metric_in", "shred_out",     i,            FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     450           0 :   FOR(sign_tile_cnt-1) fd_topob_tile_out( topo, "repair", 0UL,                        "repair_sign",   i                                                    );
     451           0 :   FOR(sign_tile_cnt-1) fd_topob_tile_in ( topo, "sign",   i+1,           "metric_in", "repair_sign",   i,            FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     452           0 :   FOR(sign_tile_cnt-1) fd_topob_tile_out( topo, "sign",   i+1,                        "sign_repair",   i                                                    );
     453           0 :   FOR(sign_tile_cnt-1) fd_topob_tile_in ( topo, "repair", 0UL,           "metric_in", "sign_repair",   i,            FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED   );
     454             : 
     455             :   /**/                 fd_topob_tile_out( topo, "repair", 0UL,                        "repair_out",   0UL                                                   );
     456           0 :   /**/                 fd_topob_tile_in ( topo, "gossip", 0UL,           "metric_in", "sign_gossip",   0UL,          FD_TOPOB_UNRELIABLE, FD_TOPOB_UNPOLLED );
     457           0 :   /**/                 fd_topob_tile_in ( topo, "ipecho", 0UL,           "metric_in", "genesi_out",    0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     458           0 :   /**/                 fd_topob_tile_in ( topo, "repair", 0UL,           "metric_in", "tower_out",     0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     459             : 
     460             :   /* Repair and shred share a secret they use to generate the nonces.
     461             :     It's not super security sensitive, but for good hygiene, we make it
     462             :     an object. */
     463           0 :   if( 1 /* just restrict the scope for these variables in this big function */ ) {
     464           0 :     fd_topo_obj_t * rnonce_ss_obj = fd_topob_obj( topo, "rnonce_ss", "rnonce" );
     465           0 :     fd_topo_tile_t * repair_tile = &topo->tiles[ fd_topo_find_tile( topo, "repair", 0UL ) ];
     466           0 :     fd_topob_tile_uses( topo, repair_tile, rnonce_ss_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
     467           0 :     for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
     468           0 :       fd_topo_tile_t * shred_tile = &topo->tiles[ fd_topo_find_tile( topo, "shred", i ) ];
     469           0 :       fd_topob_tile_uses( topo, shred_tile, rnonce_ss_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
     470           0 :     }
     471           0 :     FD_TEST( fd_pod_insertf_ulong( topo->props, rnonce_ss_obj->id, "rnonce_ss" ) );
     472           0 :   }
     473             : 
     474           0 :   FD_TEST( fd_link_permit_no_producers( topo, "quic_net"      ) == quic_tile_cnt );
     475           0 :   FD_TEST( fd_link_permit_no_producers( topo, "poh_shred"     ) == 1UL           );
     476           0 :   FD_TEST( fd_link_permit_no_producers( topo, "txsend_out"    ) == 1UL           );
     477           0 :   FD_TEST( fd_link_permit_no_producers( topo, "genesi_out"    ) == 1UL           );
     478           0 :   FD_TEST( fd_link_permit_no_producers( topo, "tower_out"     ) == 1UL           );
     479           0 :   FD_TEST( fd_link_permit_no_producers( topo, "replay_epoch"  ) == 1UL           );
     480           0 :   FD_TEST( fd_link_permit_no_producers( topo, "snapin_manif"  ) == 1UL           );
     481           0 :   FD_TEST( fd_link_permit_no_consumers( topo, "net_quic"     ) == net_tile_cnt  );
     482           0 :   FD_TEST( fd_link_permit_no_consumers( topo, "repair_out"   ) == 1UL           );
     483             : 
     484           0 :   config->tiles.txsend.txsend_src_port = 0; /* disable txsend */
     485             : 
     486           0 :   FOR(net_tile_cnt) fd_topos_net_tile_finish( topo, i );
     487             : 
     488           0 :   for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
     489           0 :     fd_topo_tile_t * tile = &topo->tiles[ i ];
     490           0 :     fd_topo_configure_tile( tile, config );
     491           0 :   }
     492             : 
     493           0 :   if( FD_UNLIKELY( is_auto_affinity ) ) fd_topob_auto_layout( topo, 0 );
     494             : 
     495           0 :   fd_topob_finish( topo, CALLBACKS );
     496             : 
     497           0 :   config->topo = *topo;
     498           0 : }
     499             : 
     500             : static char *
     501           0 : fmt_count( char buf[ static 64 ], ulong count ) {
     502           0 :   char tmp[ 64 ]; /* Scaled text ("N", "N.N K", "N.N M" or "N.N G"); it is right-justified to 12 characters in buf, which is returned. */
     503           0 :   if(      FD_LIKELY( count<1000UL       ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%lu",    count ) );
     504           0 :   else if( FD_LIKELY( count<1000000UL    ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%.1f K", (double)count/1000.0 ) );
     505           0 :   else if( FD_LIKELY( count<1000000000UL ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%.1f M", (double)count/1000000.0 ) );
     506           0 :   else /* counts >= 1e9 previously left tmp unwritten */ FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%.1f G", (double)count/1000000000.0 ) );
     507           0 :   FD_TEST( fd_cstr_printf_check( buf, 64UL, NULL, "%12s", tmp ) );
     508           0 :   return buf;
     509           0 : }
     510             : 
     511             : static void
     512             : print_histogram_buckets( volatile ulong * metrics,
     513             :                          ulong offset,
     514             :                          int converter,
     515             :                          double histmin,
     516             :                          double histmax,
     517           0 :                          char * title ) {
     518           0 :   fd_histf_t hist[1];
     519             :   /* Prints the FD_HISTF_BUCKET_CNT bucket counts stored at metrics[ offset ... ] as a table with proportional bars, followed by the sum (the word stored right after the buckets), the total count and the average. */
     520             :   /* Create histogram structure only to get bucket edges for display */
     521           0 :   if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
     522             :     /* Time histograms: histmin/histmax are given in seconds and converted to ticks */
     523           0 :     FD_TEST( fd_histf_new( hist, fd_metrics_convert_seconds_to_ticks( histmin ), fd_metrics_convert_seconds_to_ticks( histmax ) ) );
     524           0 :   } else if( FD_LIKELY( converter == FD_METRICS_CONVERTER_NONE ) ) {
     525             :     /* Unconverted histograms: histmin/histmax are used as given */
     526           0 :     FD_TEST( fd_histf_new( hist, (ulong)histmin, (ulong)histmax ) );
     527           0 :   } else {
     528           0 :     FD_LOG_ERR(( "unknown converter %i", converter ));
     529           0 :   }
     530             :
     531           0 :   printf( " +---------------------+--------------------+--------------+\n" );
     532           0 :   printf( " | %-19s |                    | Count        |\n", title );
     533           0 :   printf( " +---------------------+--------------------+--------------+\n" );
     534             :   /* First pass: total across all buckets, used below to scale the bars and to compute the average */
     535           0 :   ulong total_count = 0;
     536           0 :   for( ulong k = 0; k < FD_HISTF_BUCKET_CNT; k++ ) {
     537           0 :     ulong bucket_count = metrics[ offset + k ];
     538           0 :     total_count += bucket_count;
     539           0 :   }
     540             :
     541           0 :   for( ulong k = 0; k < FD_HISTF_BUCKET_CNT; k++ ) {
     542             :     /* Get individual bucket count directly from metrics array */
     543           0 :     ulong bucket_count = metrics[ offset + k ];
     544             :     /* Row label: the bucket's upper edge, or "+Inf" for the last bucket */
     545           0 :     char * le_str;
     546           0 :     char le_buf[ 64 ];
     547           0 :     if( FD_UNLIKELY( k == FD_HISTF_BUCKET_CNT - 1UL ) ) {
     548           0 :       le_str = "+Inf";
     549           0 :     } else {
     550           0 :       ulong edge = fd_histf_right( hist, k );
     551           0 :       if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
     552           0 :         double edgef = fd_metrics_convert_ticks_to_seconds( edge - 1 );
     553           0 :         FD_TEST( fd_cstr_printf_check( le_buf, sizeof( le_buf ), NULL, "%.3f", edgef ) );
     554           0 :       } else {
     555           0 :         FD_TEST( fd_cstr_printf_check( le_buf, sizeof( le_buf ), NULL, "%.3f", (double)(edge - 1) / 1000000.0 ) );
     556           0 :       }
     557           0 :       le_str = le_buf;
     558           0 :     }
     559             :
     560           0 :     char count_buf[ 64 ];
     561           0 :     fmt_count( count_buf, bucket_count );
     562             :
     563             :     /* Match visual bar length to the %-18s display column width. */
     564           0 :     char  bar_buf[ 19 ];
     565           0 :     ulong bar_max = sizeof( bar_buf ) - 1UL;
     566           0 :     if( bucket_count > 0 && total_count > 0 ) {
     567           0 :       ulong bar_length = (bucket_count * bar_max) / total_count;
     568           0 :       if( bar_length == 0 ) bar_length = 1; /* non-empty buckets always show at least one mark */
     569           0 :       if( bar_length > bar_max ) bar_length = bar_max;
     570           0 :       for( ulong i = 0; i < bar_length; i++ ) { bar_buf[ i ] = '|'; }
     571           0 :       bar_buf[ bar_length ] = '\0';
     572           0 :     } else {
     573           0 :       bar_buf[ 0 ] = '\0';
     574           0 :     }
     575             :
     576           0 :     printf( " | %-19s | %-18s | %s |\n", le_str, bar_buf, count_buf );
     577           0 :   }
     578             :
     579             :   /* Print sum, total count and average.  avg_buf is written on both paths (it used to be printed unwritten for unconverted histograms) and an empty histogram reports an average of 0 instead of dividing by zero. */
     580           0 :   char  sum_buf[ 64 ];
     581           0 :   char  avg_buf[ 64 ];
     582           0 :   ulong sum_raw = metrics[ offset + FD_HISTF_BUCKET_CNT ];
     583           0 :   if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
     584           0 :     double sumf = fd_metrics_convert_ticks_to_seconds( sum_raw );
     585           0 :     FD_TEST( fd_cstr_printf_check( sum_buf, sizeof( sum_buf ), NULL, "%.6f", sumf ) );
     586           0 :     FD_TEST( fd_cstr_printf_check( avg_buf, sizeof( avg_buf ), NULL, "%.6f", total_count ? sumf / (double)total_count : 0.0 ) );
     587           0 :   } else {
     588           0 :     FD_TEST( fd_cstr_printf_check( sum_buf, sizeof( sum_buf ), NULL, "%lu", sum_raw ) );
     589           0 :     FD_TEST( fd_cstr_printf_check( avg_buf, sizeof( avg_buf ), NULL, "%.1f", total_count ? (double)sum_raw / (double)total_count : 0.0 ) );
     590           0 :   }
     591           0 :   printf( " +---------------------+--------------------+---------------+\n" );
     592           0 :   printf( " | Sum: %-14s | Count: %-11lu | Avg: %-8s |\n", sum_buf, total_count, avg_buf );
     593           0 :   printf( " +---------------------+--------------------+---------------+\n" );
     594           0 : }
     595             : 
     596             : static fd_slot_metrics_t temp_slots[ FD_CATCHUP_METRICS_MAX ]; /* scratch array handed to fd_repair_metrics_print_sorted */
     597             :
     598             : static void
     599           0 : print_catchup_slots( fd_wksp_t * repair_tile_wksp, ctx_t * repair_ctx, int verbose, int sort_by_slot ) {
     600             :   /* Rebase slot_metrics: take its gaddr relative to repair_ctx->wksp, then resolve that gaddr in repair_tile_wksp. */
     601           0 :   ulong                 table_gaddr = fd_wksp_gaddr_fast( repair_ctx->wksp, repair_ctx->slot_metrics );
     602           0 :   fd_repair_metrics_t * table       = (fd_repair_metrics_t *)fd_wksp_laddr( repair_tile_wksp, table_gaddr );
     603           0 :   if( FD_UNLIKELY( !sort_by_slot ) ) {
     604           0 :     fd_repair_metrics_print( table, verbose );
     605           0 :     return;
     606           0 :   }
     607           0 :   fd_repair_metrics_print_sorted( table, verbose, temp_slots );
     608           0 : }
     609             : 
     610             : static fd_location_info_t * location_table; /* looked up by ip4 in print_peer_location_latency via fd_location_table_query */
     611             : static fd_pubkey_t peers_copy[ FD_REPAIR_PEER_MAX]; /* peer keys gathered and ordered by sort_peers_by_latency */
     612             : 
     613             : static ulong
     614           0 : sort_peers_by_latency( fd_policy_peer_map_t * active_table, fd_policy_peer_dlist_t * peers_dlist, fd_policy_peer_dlist_t * peers_wlist, fd_policy_peer_t * peers_arr ) {
     615             :   /* Copies the keys on peers_dlist (fast) and then peers_wlist (slow) into peers_copy, orders them by ascending average response
     616             :      latency (total_lat/res_cnt, looked up in active_table; peers with no responses sort last) and returns the number of keys stored. */
     617           0 :   ulong cnt = 0UL;
     618             :   /* cnt is bounds checked before every store, so a full fast list can no longer push the slow list one past the end of peers_copy. */
     619           0 :   for( fd_policy_peer_dlist_iter_t iter = fd_policy_peer_dlist_iter_fwd_init( peers_dlist, peers_arr );
     620           0 :        cnt<FD_REPAIR_PEER_MAX && !fd_policy_peer_dlist_iter_done( iter, peers_dlist, peers_arr );
     621           0 :        iter = fd_policy_peer_dlist_iter_fwd_next( iter, peers_dlist, peers_arr ) ) {
     622           0 :     fd_policy_peer_t * peer = fd_policy_peer_dlist_iter_ele( iter, peers_dlist, peers_arr );
     623           0 :     if( FD_UNLIKELY( !peer ) ) break;
     624           0 :     peers_copy[ cnt++ ] = peer->key;
     625           0 :   }
     626           0 :   ulong fast_cnt = cnt;
     627           0 :   for( fd_policy_peer_dlist_iter_t iter = fd_policy_peer_dlist_iter_fwd_init( peers_wlist, peers_arr );
     628           0 :        cnt<FD_REPAIR_PEER_MAX && !fd_policy_peer_dlist_iter_done( iter, peers_wlist, peers_arr );
     629           0 :        iter = fd_policy_peer_dlist_iter_fwd_next( iter, peers_wlist, peers_arr ) ) {
     630           0 :     fd_policy_peer_t * peer = fd_policy_peer_dlist_iter_ele( iter, peers_wlist, peers_arr );
     631           0 :     if( FD_UNLIKELY( !peer ) ) break;
     632           0 :     peers_copy[ cnt++ ] = peer->key;
     633           0 :   }
     634           0 :   FD_LOG_NOTICE(( "Fast peers cnt: %lu. Slow peers cnt: %lu.", fast_cnt, cnt - fast_cnt ));
     635             :   /* Bubble sort.  Fewer than two keys need no ordering, and peer_cnt-1 would wrap around for peer_cnt==0. */
     636           0 :   ulong peer_cnt = cnt;
     637           0 :   if( FD_UNLIKELY( peer_cnt<2UL ) ) return peer_cnt;
     638           0 :   for( ulong pass = 0UL; pass < peer_cnt - 1UL; pass++ ) {
     639           0 :     int swapped = 0;
     640           0 :     for( ulong j = 0UL; j < peer_cnt - 1UL - pass; j++ ) {
     641           0 :       fd_policy_peer_t const * active_j  = fd_policy_peer_map_ele_query( active_table, &peers_copy[ j ], NULL, peers_arr );
     642           0 :       fd_policy_peer_t const * active_j1 = fd_policy_peer_map_ele_query( active_table, &peers_copy[ j + 1UL ], NULL, peers_arr );
     643             :       /* Peers that are missing from the map or have no responses keep this sentinel and therefore sort last */
     644           0 :       double latency_j  = 10e9;
     645           0 :       double latency_j1 = 10e9;
     646           0 :       if( FD_LIKELY( active_j  && active_j->res_cnt > 0  ) ) latency_j  = ((double)active_j->total_lat / (double)active_j->res_cnt);
     647           0 :       if( FD_LIKELY( active_j1 && active_j1->res_cnt > 0 ) ) latency_j1 = ((double)active_j1->total_lat / (double)active_j1->res_cnt);
     648             :       /* Swap if j has higher latency than j+1 */
     649           0 :       if( latency_j > latency_j1 ) {
     650           0 :         fd_pubkey_t temp      = peers_copy[ j ];
     651           0 :         peers_copy[ j ]       = peers_copy[ j + 1UL ];
     652           0 :         peers_copy[ j + 1UL ] = temp;
     653           0 :         swapped               = 1;
     654           0 :       }
     655           0 :     }
     656           0 :     if( !swapped ) break; /* a pass without swaps means the array is already ordered */
     657           0 :   }
     658           0 :   return peer_cnt;
     659           0 : }
     660             : 
     661             : static void
     662           0 : print_peer_location_latency( fd_wksp_t * repair_tile_wksp, ctx_t * tile_ctx ) {
     663             :   /* Each pointer read out of tile_ctx / the policy is turned into a gaddr relative to tile_ctx->wksp and resolved again in repair_tile_wksp before it is used. */
     664           0 :   fd_policy_t *            policy = fd_wksp_laddr( repair_tile_wksp, fd_wksp_gaddr_fast( tile_ctx->wksp, tile_ctx->policy ) );
     665           0 :   fd_policy_peer_map_t *   map    = (fd_policy_peer_map_t *)  fd_wksp_laddr( repair_tile_wksp, fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.map  ) );
     666           0 :   fd_policy_peer_t *       pool   = (fd_policy_peer_t *)      fd_wksp_laddr( repair_tile_wksp, fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.pool ) );
     667           0 :   fd_policy_peer_dlist_t * fast   = (fd_policy_peer_dlist_t *)fd_wksp_laddr( repair_tile_wksp, fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.fast ) );
     668           0 :   fd_policy_peer_dlist_t * slow   = (fd_policy_peer_dlist_t *)fd_wksp_laddr( repair_tile_wksp, fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.slow ) );
     669             :
     670           0 :   ulong peer_cnt = sort_peers_by_latency( map, fast, slow, pool );
     671           0 :   printf("\nPeer Location/Latency Information\n");
     672           0 :   printf( "     | %-46s | %-7s | %-8s | %-8s | %-7s | %-7s | %-12s | %s\n", "Pubkey", "Req Cnt", "Req B/s", "Rx B/s", "Rx Rate", "Avg Latency", "Ewma Latency", "Location Info" );
     673             :   /* peers_copy now holds peer_cnt keys in latency order; only peers with at least one response get a row */
     674           0 :   for( uint rank = 0; rank < peer_cnt; rank++ ) {
     675           0 :     fd_policy_peer_t const * peer = fd_policy_peer_map_ele_query( map, &peers_copy[ rank ], NULL, pool );
     676           0 :     if( !FD_LIKELY( peer && peer->res_cnt > 0 ) ) continue;
     677             :
     678           0 :     fd_location_info_t * info = fd_location_table_query( location_table, peer->ip4, NULL );
     679           0 :     char * geolocation = info ? info->location : "";
     680           0 :     double resp_secs   = (double)(peer->last_resp_ts - peer->first_resp_ts) / 1e9;
     681           0 :     double req_secs    = (double)(peer->last_req_ts - peer->first_req_ts) / 1e9;
     682           0 :     double peer_bps    = (double)(peer->res_cnt * FD_SHRED_MIN_SZ) / resp_secs;
     683           0 :     double req_bps     = (double)peer->req_cnt * 202 / req_secs;
     684           0 :     FD_BASE58_ENCODE_32_BYTES( peer->key.key, key_b58 );
     685           0 :     printf( "%-5u | %-46s | %-7lu | %-8.2f | %-8.2f | %-7.2f | %10.3fms | %10.3fms | %s\n", rank, key_b58, peer->req_cnt, req_bps, peer_bps, (double)peer->res_cnt / (double)peer->req_cnt, ((double)peer->total_lat / (double)peer->res_cnt) / 1e6, (double)peer->ewma_lat / 1e6, geolocation );
     686           0 :   }
     687             :
     688           0 :   printf("\n");
     689           0 :   fflush( stdout );
     690           0 : }
     691             : 
     692             : static void
     693           0 : read_iptable( char * iptable_path, fd_location_info_t * location_table ) {
     694             :   /* Loads "<ip4 as decimal> <location text>" lines from iptable_path into location_table.  If the file cannot be opened the table is left untouched; lines that do not parse are skipped. */
     695           0 :   int iptable_fd = open( iptable_path, O_RDONLY );
     696           0 :   if( FD_UNLIKELY( iptable_fd<0 ) ) return;
     697             :
     698           0 :   char  line[ 256 ];
     699           0 :   char  location[ 256 ]; /* same size as line, so the width-limited %255[ conversion below cannot overflow it */
     700           0 :   uchar istream_buf[256];
     701           0 :   fd_io_buffered_istream_t istream[1];
     702           0 :   fd_io_buffered_istream_init( istream, iptable_fd, istream_buf, sizeof(istream_buf) );
     703           0 :   for(;;) {
     704           0 :     int err;
     705           0 :     if( !fd_io_fgets( line, sizeof(line), istream, &err ) ) break;
     706           0 :     ulong ip4_addr;
     707           0 :     if( FD_UNLIKELY( sscanf( line, "%lu %255[^\n]", &ip4_addr, location )!=2 ) ) continue; /* never insert a key that was not parsed */
     708           0 :     fd_location_info_t * info = fd_location_table_insert( location_table, ip4_addr );
     709           0 :     if( FD_UNLIKELY( info==NULL ) ) break;
     710           0 :     fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( info->location ), location, fd_ulong_min( strlen( location ), sizeof(info->location)-1UL ) ) ); /* truncating, always NUL terminated */
     711           0 :   }
     712           0 :   close( iptable_fd ); /* the descriptor used to be leaked on every return path */
     713           0 : }
     714             : 
     715             : static void
     716             : print_tile_metrics( volatile ulong * shred_metrics,
     717             :                     volatile ulong * repair_metrics,
     718             :                     volatile ulong * repair_metrics_prev, /* for diffing metrics */
     719             :                     volatile ulong ** repair_net_links,
     720             :                     volatile ulong ** net_shred_links,
     721             :                     ulong   net_tile_cnt,
     722             :                     ulong * last_sent_cnt,
     723             :                     long    last_print_ts,
     724           0 :                     long    now ) {
     725             :   /* Prints one refresh of the repair/shred statistics, stores the current request total in *last_sent_cnt and finally copies repair_metrics into repair_metrics_prev so the next call can diff the regime counters. */
     726           0 :   char  buf2[ 64 ];
     727           0 :   ulong rcvd = shred_metrics [ MIDX( COUNTER, SHRED,  SHRED_REPAIR_RX ) ];
     728           0 :   ulong sent = repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_TX_NEEDED_WINDOW ) ] + repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_TX_NEEDED_HIGHEST_WINDOW ) ] +
     729           0 :                repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_TX_NEEDED_ORPHAN ) ];
     730           0 :   printf(" Requests received: (%lu/%lu) %.1f%% \n", rcvd, sent, sent ? (double)rcvd / (double)sent * 100.0 : 0.0 ); /* 0% rather than NaN before the first request */
     731           0 :   printf( " +---------------+--------------+\n" );
     732           0 :   printf( " | Request Type  | Count        |\n" );
     733           0 :   printf( " +---------------+--------------+\n" );
     734           0 :   printf( " | Orphan        | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_TX_NEEDED_ORPHAN         ) ] ) );
     735           0 :   printf( " | HighestWindow | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_TX_NEEDED_HIGHEST_WINDOW ) ] ) );
     736           0 :   printf( " | Index         | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_TX_NEEDED_WINDOW         ) ] ) );
     737           0 :   printf( " +---------------+--------------+\n" );
     738           0 :   long elapsed = now - last_print_ts; /* a zero or negative interval reports a rate of 0 instead of dividing by it */
     739           0 :   printf( " Send Pkt Rate: %s pps\n",  fmt_count( buf2, elapsed>0L ? (ulong)((sent - *last_sent_cnt)*1e9L / elapsed ) : 0UL ) );
     740           0 :   *last_sent_cnt = sent;
     741             :   /* Sum overrun across all net tiles connected to repair_net */
     742           0 :   ulong total_overrun = repair_net_links[0][ MIDX( COUNTER, LINK, FRAG_POLLING_OVERRUN ) ]; /* coarse double counting prevention */
     743           0 :   ulong total_consumed = 0UL;
     744           0 :   for( ulong i = 0UL; i < net_tile_cnt; i++ ) {
     745           0 :     volatile ulong * ovar_net_metrics = repair_net_links[i];
     746           0 :     total_overrun  += ovar_net_metrics[ MIDX( COUNTER, LINK, FRAG_READING_OVERRUN ) ];
     747           0 :     total_consumed += ovar_net_metrics[ MIDX( COUNTER, LINK, FRAG_CONSUMED ) ]; /* consumed is incremented after after_frag is called */
     748           0 :   }
     749           0 :   printf( " Outgoing requests overrun:  %s\n", fmt_count( buf2, total_overrun  ) );
     750           0 :   printf( " Outgoing requests consumed: %s\n", fmt_count( buf2, total_consumed ) );
     751             :   /* Same accounting for net_shred.  This used to start from link 0's FRAG_READING_OVERRUN, which the loop below adds again.  NOTE(review): seeding with FRAG_POLLING_OVERRUN mirrors the repair_net block above - confirm that is the intent. */
     752           0 :   total_overrun  = net_shred_links[0][ MIDX( COUNTER, LINK, FRAG_POLLING_OVERRUN ) ];
     753           0 :   total_consumed = 0UL;
     754           0 :   for( ulong i = 0UL; i < net_tile_cnt; i++ ) {
     755           0 :     volatile ulong * ovar_net_metrics = net_shred_links[i];
     756           0 :     total_overrun  += ovar_net_metrics[ MIDX( COUNTER, LINK, FRAG_READING_OVERRUN ) ];
     757           0 :     total_consumed += ovar_net_metrics[ MIDX( COUNTER, LINK, FRAG_CONSUMED ) ]; /* shred frag filtering happens manually in after_frag, so no need to index every shred_tile. */
     758           0 :   }
     759             :
     760           0 :   printf( " Incoming shreds overrun:    %s\n", fmt_count( buf2, total_overrun ) );
     761           0 :   printf( " Incoming shreds consumed:   %s\n", fmt_count( buf2, total_consumed ) );
     762             :
     763           0 :   print_histogram_buckets( repair_metrics,
     764           0 :                             MIDX( HISTOGRAM, REPAIR, RESPONSE_LATENCY_NANOS ),
     765           0 :                             FD_METRICS_CONVERTER_NONE,
     766           0 :                             FD_METRICS_HISTOGRAM_REPAIR_RESPONSE_LATENCY_NANOS_MIN,
     767           0 :                             FD_METRICS_HISTOGRAM_REPAIR_RESPONSE_LATENCY_NANOS_MAX,
     768           0 :                             "Response Latency" );
     769             :
     770           0 :   printf(" Repair Peers: %lu\n", repair_metrics[ MIDX( COUNTER, REPAIR, PEER_REQUESTED ) ] );
     771           0 :   printf(" Shreds rejected (no stakes): %lu\n", shred_metrics[ MIDX( COUNTER, SHRED, SHRED_PROCESSED ) ] ); /* NOTE(review): the label says "rejected" but the counter read is SHRED_PROCESSED - confirm */
     772             :   /* Print histogram buckets similar to Prometheus format */
     773           0 :   print_histogram_buckets( repair_metrics,
     774           0 :                           MIDX( HISTOGRAM, REPAIR, SLOT_COMPLETE_DURATION_SECONDS ),
     775           0 :                           FD_METRICS_CONVERTER_SECONDS,
     776           0 :                           FD_METRICS_HISTOGRAM_REPAIR_SLOT_COMPLETE_DURATION_SECONDS_MIN,
     777           0 :                           FD_METRICS_HISTOGRAM_REPAIR_SLOT_COMPLETE_DURATION_SECONDS_MAX,
     778           0 :                           "Slot Complete Time" );
     779             :   /* Regime time spent since the previous call; the expansion is parenthesized so DIFFX composes safely inside larger expressions */
     780           0 : #define DIFFX(METRIC) (repair_metrics[ MIDX( COUNTER, TILE, METRIC ) ] - repair_metrics_prev[ MIDX( COUNTER, TILE, METRIC ) ])
     781           0 :   ulong hkeep_ticks        = DIFFX(REGIME_DURATION_NANOS_CAUGHT_UP_HOUSEKEEPING) + DIFFX(REGIME_DURATION_NANOS_PROCESSING_HOUSEKEEPING) + DIFFX(REGIME_DURATION_NANOS_BACKPRESSURE_HOUSEKEEPING);
     782           0 :   ulong busy_ticks         = DIFFX(REGIME_DURATION_NANOS_PROCESSING_PREFRAG) + DIFFX(REGIME_DURATION_NANOS_PROCESSING_POSTFRAG ) + DIFFX(REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG);
     783           0 :   ulong caught_up_ticks    = DIFFX(REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG);
     784           0 :   ulong backpressure_ticks = DIFFX(REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG);
     785           0 :   ulong total_ticks = hkeep_ticks + busy_ticks + caught_up_ticks + backpressure_ticks;
     786           0 :   double tick_denom = total_ticks ? (double)total_ticks : 1.0; /* all shares print as 0.0 instead of NaN when nothing was recorded */
     787           0 :   printf( " Repair Hkeep: %.1f %%  Busy: %.1f %%  Idle: %.1f %%  Backp: %0.1f %%\n",
     788           0 :             (double)hkeep_ticks/tick_denom*100.0,
     789           0 :             (double)busy_ticks/tick_denom*100.0,
     790           0 :             (double)caught_up_ticks/tick_denom*100.0,
     791           0 :             (double)backpressure_ticks/tick_denom*100.0 );
     792           0 : #undef DIFFX
     793           0 :   printf( " Block failed insert: %lu\n", repair_metrics[ MIDX( COUNTER, REPAIR, BLOCK_INSERT_FAILED ) ] );
     794           0 :   printf( " Block evicted: %lu\n", repair_metrics[ MIDX( COUNTER, REPAIR, BLOCK_EVICTED ) ] );
     795           0 :   printf( " slot evicted: %lu\n", repair_metrics[ MIDX( GAUGE, REPAIR, SLOT_LAST_EVICTED ) ] );
     796           0 :   printf( " slot evicted by: %lu\n", repair_metrics[ MIDX( GAUGE, REPAIR, SLOT_LAST_EVICTION_CAUSE ) ] );
     797           0 :   printf( " slot failed insert: %lu\n", repair_metrics[ MIDX( GAUGE, REPAIR, SLOT_LAST_INSERT_FAILED ) ] );
     798           0 :   fflush( stdout ); /* flush after the last line so the whole refresh is written out together */
     799             :   /* Snapshot the current values for the next call's DIFFX computation */
     800           0 :   for( ulong i=0UL; i<FD_METRICS_TOTAL_SZ/sizeof(ulong); i++ ) repair_metrics_prev[ i ] = repair_metrics[ i ];
     801           0 : }
     802             : 
                     : /* repair_ctx_wksp looks up the "repair" workspace and repair tile 0
                     :    in config->topo, joins that workspace read-only, and hands back
                     :    the topology workspace entry (*repair_wksp) together with a
                     :    ctx_t pointer carved from the start of the tile's scratch object
                     :    (*repair_ctx).  A missing workspace, tile or scratch region is
                     :    reported with FD_LOG_ERR.  args is not used. */
     803             : static void
     804             : repair_ctx_wksp( args_t *          args,
     805             :                  config_t *        config,
     806             :                  ctx_t **          repair_ctx,
     807           0 :                  fd_topo_wksp_t ** repair_wksp ) {
     808           0 :   (void)args;
     809             : 
     810           0 :   fd_topo_t * topo = &config->topo;
     811             : 
     812             :   /* The workspace is resolved before the tile so that a missing
     813             :      workspace is the first thing reported. */
     814           0 :   ulong wksp_idx = fd_topo_find_wksp( topo, "repair" );
     815           0 :   if( FD_UNLIKELY( wksp_idx==ULONG_MAX ) ) FD_LOG_ERR(( "repair workspace not found" ));
     816           0 :   fd_topo_wksp_t * wksp = &topo->workspaces[ wksp_idx ];
     817             : 
     818           0 :   ulong tile_idx = fd_topo_find_tile( topo, "repair", 0UL );
     819           0 :   if( FD_UNLIKELY( tile_idx==ULONG_MAX ) ) FD_LOG_ERR(( "repair tile not found" ));
     820           0 :   fd_topo_tile_t * repair_tile = &topo->tiles[ tile_idx ];
     821             : 
     822           0 :   fd_topo_join_workspace( topo, wksp, FD_SHMEM_JOIN_MODE_READ_ONLY, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );
     823             : 
     824             :   /* The repair tile context is the first allocation in the tile scratch object */
     825           0 :   void * tile_mem = fd_topo_obj_laddr( topo, repair_tile->tile_obj_id );
     826           0 :   if( FD_UNLIKELY( !tile_mem ) ) FD_LOG_ERR(( "Failed to access repair tile scratch memory" ));
     827             : 
     828           0 :   FD_SCRATCH_ALLOC_INIT( l, tile_mem );
     829           0 :   *repair_ctx  = FD_SCRATCH_ALLOC_APPEND( l, alignof(ctx_t), sizeof(ctx_t) );
     830           0 :   *repair_wksp = wksp;
     831           0 : }
     832             : 
                     : /* repair_cmd_fn_catchup rebuilds config->topo with repair_topo, runs
                     :    the configure stages listed in STAGES, loads the manifest named by
                     :    args->repair.manifest_path and starts the topology with
                     :    fd_topo_run_single_process.  It then polls forever:
                     : 
                     :    - turbine_slot0 is taken from the shred found in entry 0 of the
                     :      shred_out mcache, as soon as that entry has a non-zero sz.
                     :    - whenever more than 1e9 wallclock units have passed since the last
                     :      report, tile/link metrics and the SLOT_HIGHEST_REPAIRED gauge are
                     :      printed.
                     :    - once turbine_slot0 is known and the gauge has reached it, peer
                     :      latency and per-slot catchup tables are printed and the process
                     :      group exits with status 0.
                     : 
                     :    The loop has no other exit, so this function does not return. */
     833             : static void
     834             : repair_cmd_fn_catchup( args_t *   args,
     835           0 :                        config_t * config ) {
     836             : 
     837           0 :   memset( &config->topo, 0, sizeof(config->topo) );
     838           0 :   repair_topo( config );
     839             : 
     840           0 :   fd_topo_print_log( 1, &config->topo );
     841             : 
     842           0 :   args_t configure_args = {
     843           0 :     .configure.command = CONFIGURE_CMD_INIT,
     844           0 :   };
     845           0 :   for( ulong i=0UL; STAGES[ i ]; i++ ) {
     846           0 :     configure_args.configure.stages[ i ] = STAGES[ i ];
     847           0 :   }
     848           0 :   configure_cmd_fn( &configure_args, config );
     849           0 :   if( 0==strcmp( config->net.provider, "xdp" ) ) {
     850           0 :     fd_topo_install_xdp_simple( &config->topo, config->net.bind_address_parsed );
     851           0 :   }
     852           0 :   run_firedancer_init( config, 1, 0 );
     853             : 
     854           0 :   fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_WRITE, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );
     855             : 
     856           0 :   fd_topo_fill( &config->topo );
     857             : 
     858           0 :   repair_load_manifest( &config->topo, args->repair.manifest_path );
     859             : 
     860             :   /* Access repair workspace memory and metrics */
     861             : 
     862           0 :   ulong repair_tile_idx = fd_topo_find_tile( &config->topo, "repair", 0UL );
     863           0 :   ulong shred_tile_idx  = fd_topo_find_tile( &config->topo, "shred", 0UL );
     864           0 :   FD_TEST( repair_tile_idx!=ULONG_MAX );
     865           0 :   FD_TEST( shred_tile_idx !=ULONG_MAX );
     866           0 :   fd_topo_tile_t * repair_tile = &config->topo.tiles[ repair_tile_idx ];
     867           0 :   fd_topo_tile_t * shred_tile  = &config->topo.tiles[ shred_tile_idx ];
     868             : 
     869           0 :   fd_topo_wksp_t * repair_wksp;
     870           0 :   ctx_t          * repair_ctx;
     871           0 :   repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
     872             : 
                     :   /* Check the metrics regions themselves before deriving any tile or
                     :      link metric pointer from them. */
                     :   FD_TEST( shred_tile->metrics  );
                     :   FD_TEST( repair_tile->metrics );
     873           0 :   volatile ulong * shred_metrics  = fd_metrics_tile( shred_tile->metrics );
     874           0 :   volatile ulong * repair_metrics = fd_metrics_tile( repair_tile->metrics );
                     :   FD_TEST( shred_metrics  );
     875           0 :   FD_TEST( repair_metrics );
     876           0 :   ulong * repair_metrics_prev = aligned_alloc( 8UL, sizeof(ulong) * FD_METRICS_TOTAL_SZ );
     877           0 :   FD_TEST( repair_metrics_prev );
     878           0 :   memset( repair_metrics_prev, 0, sizeof(ulong) * FD_METRICS_TOTAL_SZ );
     879             : 
     880             :   /* Collect link metrics */
     881             : 
     882             :   /* Collect all net tiles and their repair_net link metrics */
     883           0 :   ulong net_cnt  = config->layout.net_tile_count;
     884           0 :   volatile ulong ** repair_net_links = aligned_alloc( 8UL, net_cnt * sizeof(volatile ulong*) );
     885           0 :   volatile ulong ** net_shred_links  = aligned_alloc( 8UL, net_cnt * sizeof(volatile ulong*) );
     886           0 :   FD_TEST( repair_net_links );
     887           0 :   FD_TEST( net_shred_links  );
     888             : 
     889           0 :   for( ulong i = 0UL; i < net_cnt; i++ ) {
     890           0 :     ulong tile_idx = fd_topo_find_tile( &config->topo, "net", i );
     891           0 :     if( FD_UNLIKELY( tile_idx == ULONG_MAX ) ) FD_LOG_ERR(( "net tile %lu not found", i ));
     892           0 :     fd_topo_tile_t * tile = &config->topo.tiles[ tile_idx ];
     893             : 
     894           0 :     ulong repair_net_in_idx = fd_topo_find_tile_in_link( &config->topo, tile, "repair_net", 0UL );
     895           0 :     if( FD_UNLIKELY( repair_net_in_idx == ULONG_MAX ) ) FD_LOG_ERR(( "repair_net link not found for net tile %lu", i ));
     896           0 :     FD_TEST( tile->metrics );
     897           0 :     repair_net_links[i] = fd_metrics_link_in( tile->metrics, repair_net_in_idx );
     898           0 :     FD_TEST( repair_net_links[i] );
     899             : 
     900             :     /* net_shred link i into shred tile 0 (the tile was resolved and checked above) */
     905           0 :     ulong shred_out_in_idx = fd_topo_find_tile_in_link( &config->topo, shred_tile, "net_shred", i );
     906           0 :     if( FD_UNLIKELY( shred_out_in_idx == ULONG_MAX ) ) FD_LOG_ERR(( "net_shred link not found for shred tile 0" ));
     908           0 :     net_shred_links[i] = fd_metrics_link_in( shred_tile->metrics, shred_out_in_idx );
     909           0 :     FD_TEST( net_shred_links[i] );
     910           0 :   }
     911             : 
     912           0 :   FD_LOG_NOTICE(( "Repair catchup run" ));
     913             : 
     914           0 :   ulong    shred_out_link_idx = fd_topo_find_link( &config->topo, "shred_out", 0UL );
     915           0 :   FD_TEST( shred_out_link_idx!=ULONG_MAX );
     916           0 :   fd_topo_link_t * shred_out_link   = &config->topo.links[ shred_out_link_idx  ];
     917           0 :   fd_frag_meta_t * shred_out_mcache = shred_out_link->mcache;
     918           0 :   void * shred_out_dcache = config->topo.workspaces[ config->topo.objs[ shred_out_link->dcache_obj_id ].wksp_id ].wksp;
     919             : 
     920           0 :   ulong turbine_slot0 = 0;
     921           0 :   long  last_print    = fd_log_wallclock();
     922           0 :   ulong last_sent     = 0UL;
     923             : 
     924           0 :   fd_topo_run_single_process( &config->topo, 0, config->uid, config->gid, fdctl_tile_run );
     925           0 :   for(;;) {
     926             : 
     927           0 :     if( FD_UNLIKELY( !turbine_slot0 ) ) {
     928           0 :       fd_frag_meta_t * frag = &shred_out_mcache[0]; /* hack to get first frag */
     929           0 :       if ( frag->sz > 0 ) {
     930           0 :         uchar      * shred_out_chunk = fd_chunk_to_laddr( shred_out_dcache, frag->chunk );
     931           0 :         fd_shred_base_t * shred_out_shred = (fd_shred_base_t *)fd_type_pun( shred_out_chunk );
     932           0 :         turbine_slot0 = shred_out_shred->shred.slot;
     933           0 :         FD_LOG_NOTICE(("turbine_slot0: %lu", turbine_slot0));
     934           0 :       }
     935           0 :     }
     936             : 
     937             :     /* print metrics */
     938             : 
     939           0 :     long now = fd_log_wallclock();
     940           0 :     int catchup_finished = 0;
     941           0 :     if( FD_UNLIKELY( now - last_print > 1e9L ) ) {
     942           0 :       print_tile_metrics( shred_metrics, repair_metrics, repair_metrics_prev, repair_net_links, net_shred_links, net_cnt, &last_sent, last_print, now );
                     :       /* Sample the volatile gauge once so the printed value and the
                     :          catchup decision are based on the same reading. */
                     :       ulong highest_repaired = repair_metrics[ MIDX( GAUGE, REPAIR, SLOT_HIGHEST_REPAIRED ) ];
     943           0 :       ulong slots_behind = turbine_slot0 > highest_repaired ? turbine_slot0 - highest_repaired : 0UL;
     944           0 :       printf(" Repaired slots: %lu/%lu  (slots behind: %lu)\n", highest_repaired, turbine_slot0, slots_behind );
     945           0 :       if( turbine_slot0 && !slots_behind ) {
     946           0 :         catchup_finished = 1;
     947           0 :       }
     948           0 :       printf("\n");
     949           0 :       fflush( stdout );
     950           0 :       last_print = now;
     951           0 :     }
     952             : 
     953           0 :     if( FD_UNLIKELY( catchup_finished ) ) {
     954             :       /* repair cmd owned memory */
     955           0 :       location_table = fd_location_table_join( fd_location_table_new( location_table_mem ) );
     956           0 :       read_iptable( args->repair.iptable_path, location_table );
     957           0 :       print_peer_location_latency( repair_wksp->wksp, repair_ctx );
     958           0 :       print_catchup_slots( repair_wksp->wksp, repair_ctx, 0, 1 );
     959           0 :       FD_LOG_NOTICE(("Catchup to slot %lu completed successfully", turbine_slot0));
     960           0 :       fd_sys_util_exit_group( 0 );
     961           0 :     }
     962           0 :   }
     963           0 : }
     964             : 
     965             : /* Tests equivocation detection & repair path.
                     : 
                     :    repair_cmd_fn_eqvoc rebuilds config->topo with repair_topo, runs
                     :    the configure stages listed in STAGES, loads the manifest named by
                     :    args->repair.manifest_path and starts the topology with
                     :    fd_topo_run_single_process.  The first time the repair tile's
                     :    SLOT_HIGHEST_REPAIRED gauge is observed non-zero, it publishes one
                     :    FD_TOWER_SIG_SLOT_CONFIRMED frag on the tower_out link.  The gauge
                     :    is polled once per second and the loop never exits. */
     966             : static void
     967             : repair_cmd_fn_eqvoc( args_t *   args,
     968           0 :                      config_t * config ) {
     970           0 :   memset( &config->topo, 0, sizeof(config->topo) );
     971           0 :   repair_topo( config );
     972             : 
     973           0 :   FD_LOG_NOTICE(( "Repair eqvoc testing init" ));
     974           0 :   fd_topo_print_log( 1, &config->topo );
     975             : 
     976           0 :   args_t configure_args = { .configure.command = CONFIGURE_CMD_INIT, };
     977           0 :   for( ulong i=0UL; STAGES[ i ]; i++ ) configure_args.configure.stages[ i ] = STAGES[ i ];
     978           0 :   configure_cmd_fn( &configure_args, config );
     979           0 :   if( 0==strcmp( config->net.provider, "xdp" ) ) fd_topo_install_xdp_simple( &config->topo, config->net.bind_address_parsed );
     980             : 
     981           0 :   run_firedancer_init( config, 1, 0 );
     982           0 :   fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_WRITE, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );
     983           0 :   fd_topo_fill( &config->topo );
     984             : 
     985           0 :   repair_load_manifest( &config->topo, args->repair.manifest_path );
     986             : 
                     :   /* Validate the lookup before it is used as an array index, and the
                     :      metrics region before a pointer is derived from it. */
     987           0 :   ulong repair_tile_idx = fd_topo_find_tile( &config->topo, "repair", 0UL );
                     :   FD_TEST( repair_tile_idx!=ULONG_MAX );
     988           0 :   fd_topo_tile_t * repair_tile = &config->topo.tiles[ repair_tile_idx ];
                     :   FD_TEST( repair_tile->metrics );
     989           0 :   volatile ulong * repair_metrics = fd_metrics_tile( repair_tile->metrics );
                     :   FD_TEST( repair_metrics );
     990             : 
     991           0 :   void * scratch = fd_topo_obj_laddr( &config->topo, repair_tile->tile_obj_id );
     992           0 :   if( FD_UNLIKELY( !scratch ) ) FD_LOG_ERR(( "Failed to access repair tile scratch memory" ));
     993           0 :   FD_SCRATCH_ALLOC_INIT( l, scratch );
     994           0 :   ctx_t * repair_ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(ctx_t), sizeof(ctx_t) );
     995           0 :   (void)repair_ctx; /* currently unused */
     996             : 
     997             :   /* read tower_out mcache dcache */
     998           0 :   ulong tower_out_link_idx = fd_topo_find_link( &config->topo, "tower_out", 0UL );
     999           0 :   FD_TEST( tower_out_link_idx!=ULONG_MAX );
    1000           0 :   fd_topo_link_t * tower_out_link = &config->topo.links[ tower_out_link_idx ];
    1001           0 :   fd_frag_meta_t * tower_out_mcache = tower_out_link->mcache;
    1002           0 :   fd_wksp_t * tower_out_mem = config->topo.workspaces[ config->topo.objs[ tower_out_link->dcache_obj_id ].wksp_id ].wksp;
    1003           0 :   ulong tower_out_chunk0 = fd_dcache_compact_chunk0( tower_out_mem, tower_out_link->dcache );
    1004           0 :   ulong tower_out_wmark = fd_dcache_compact_wmark( tower_out_mem, tower_out_link->dcache, tower_out_link->mtu );
    1005           0 :   ulong tower_out_chunk = tower_out_chunk0;
    1006             : 
    1007           0 :   fd_topo_run_single_process( &config->topo, 0, config->uid, config->gid, fdctl_tile_run );
    1008           0 :   int confirmed = 0;
    1009           0 :   for(;;) {
    1010             :     /* publish a confirmation on tower_out */
    1011           0 :     if( FD_UNLIKELY( !confirmed && repair_metrics[ MIDX( GAUGE, REPAIR, SLOT_HIGHEST_REPAIRED ) ] != 0 ) ) {
                     :       /* NOTE(review): nothing in this function writes the message
                     :          payload before it is logged and published, so msg->slot is
                     :          whatever the dcache chunk already holds.  Confirm whether
                     :          the payload is meant to be filled in here. */
    1012           0 :       fd_tower_slot_confirmed_t * msg = fd_chunk_to_laddr( tower_out_mem, tower_out_chunk );
    1013           0 :       FD_LOG_NOTICE(( "publishing confirmation for slot %lu", msg->slot ));
    1014           0 :       fd_mcache_publish( tower_out_mcache, tower_out_link->depth, 0, FD_TOWER_SIG_SLOT_CONFIRMED, tower_out_chunk, sizeof(fd_tower_slot_confirmed_t), 0, 0, 0 );
    1015           0 :       tower_out_chunk = fd_dcache_compact_next( tower_out_chunk, sizeof(fd_tower_slot_confirmed_t), tower_out_chunk0, tower_out_wmark );
    1016           0 :       confirmed = 1;
    1017           0 :     }
    1018           0 :     sleep( 1 );
    1019           0 :   }
    1020           0 : }
    1021             : 
                     : /* repair_cmd_fn_metrics joins the workspaces of config->topo
                     :    read-only (it does not build or start a topology itself), resolves
                     :    the metrics regions of shred tile 0, repair tile 0 and of the
                     :    repair_net / net_shred links for every net tile, and then calls
                     :    print_tile_metrics each time more than 1e9 wallclock units have
                     :    elapsed since the previous report.  The polling loop spins without
                     :    sleeping and never exits. */
    1022             : static void
    1023             : repair_cmd_fn_metrics( args_t *   args,
    1024           0 :                        config_t * config ) {
    1026             : 
    1027           0 :   fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_ONLY, FD_TOPO_CORE_DUMP_LEVEL_DISABLED );
    1028           0 :   fd_topo_fill( &config->topo );
    1029             : 
    1030           0 :   ctx_t *          repair_ctx;
    1031           0 :   fd_topo_wksp_t * repair_wksp;
    1032           0 :   repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
    1033             : 
    1034           0 :   ulong    shred_tile_idx  = fd_topo_find_tile( &config->topo, "shred", 0UL );
    1035           0 :   ulong    repair_tile_idx = fd_topo_find_tile( &config->topo, "repair", 0UL );
    1036           0 :   FD_TEST( shred_tile_idx != ULONG_MAX );
    1037           0 :   FD_TEST( repair_tile_idx!= ULONG_MAX );
    1038           0 :   fd_topo_tile_t * shred_tile  = &config->topo.tiles[ shred_tile_idx ];
    1039           0 :   fd_topo_tile_t * repair_tile = &config->topo.tiles[ repair_tile_idx ];
    1040             : 
                     :   /* Check the metrics regions themselves before deriving any tile or
                     :      link metric pointer from them. */
                     :   FD_TEST( shred_tile->metrics  );
                     :   FD_TEST( repair_tile->metrics );
    1041           0 :   volatile ulong * shred_metrics = fd_metrics_tile( shred_tile->metrics );
    1042           0 :   FD_TEST( shred_metrics );
    1043             : 
    1044           0 :   volatile ulong * repair_metrics = fd_metrics_tile( repair_tile->metrics );
    1045           0 :   FD_TEST( repair_metrics );
    1046           0 :   ulong * repair_metrics_prev = aligned_alloc( 8UL, sizeof(ulong) * FD_METRICS_TOTAL_SZ );
    1047           0 :   FD_TEST( repair_metrics_prev );
    1048           0 :   memset( repair_metrics_prev, 0, sizeof(ulong) * FD_METRICS_TOTAL_SZ );
    1049             : 
    1051           0 :   ulong net_tile_cnt = config->layout.net_tile_count;
    1052           0 :   volatile ulong ** repair_net_links = aligned_alloc( 8UL, net_tile_cnt * sizeof(volatile ulong*) );
    1053           0 :   volatile ulong ** net_shred_links  = aligned_alloc( 8UL, net_tile_cnt * sizeof(volatile ulong*) );
    1054           0 :   FD_TEST( repair_net_links );
    1055           0 :   FD_TEST( net_shred_links );
    1056             : 
    1057           0 :   for( ulong i = 0UL; i < net_tile_cnt; i++ ) {
    1058             :     /* process all repair_net links */
    1059           0 :     ulong tile_idx = fd_topo_find_tile( &config->topo, "net", i );
    1060           0 :     if( FD_UNLIKELY( tile_idx == ULONG_MAX ) ) FD_LOG_ERR(( "net tile %lu not found", i ));
    1061           0 :     fd_topo_tile_t * tile = &config->topo.tiles[ tile_idx ];
    1062             : 
    1063           0 :     ulong repair_net_in_idx = fd_topo_find_tile_in_link( &config->topo, tile, "repair_net", 0UL );
    1064           0 :     if( FD_UNLIKELY( repair_net_in_idx == ULONG_MAX ) ) FD_LOG_ERR(( "repair_net link not found for net tile %lu", i ));
                     :     FD_TEST( tile->metrics );
    1065           0 :     repair_net_links[i] = fd_metrics_link_in( tile->metrics, repair_net_in_idx );
    1066           0 :     FD_TEST( repair_net_links[i] );
    1067             : 
    1068             :     /* process all net_shred links (shred tile 0 was resolved and checked above) */
    1073           0 :     ulong shred_out_in_idx = fd_topo_find_tile_in_link( &config->topo, shred_tile, "net_shred", i );
    1074           0 :     if( FD_UNLIKELY( shred_out_in_idx == ULONG_MAX ) ) FD_LOG_ERR(( "net_shred link not found for shred tile 0" ));
    1075           0 :     net_shred_links[i] = fd_metrics_link_in( shred_tile->metrics, shred_out_in_idx );
    1076           0 :     FD_TEST( net_shred_links[i] );
    1077           0 :   }
    1078             : 
    1079           0 :   long  last_print_ts = fd_log_wallclock();
    1080           0 :   ulong last_sent     = 0UL;
    1081           0 :   for(;;) {
    1082           0 :     long now = fd_log_wallclock();
    1083           0 :     if( FD_UNLIKELY( now - last_print_ts > 1e9L ) ) {
    1084           0 :       print_tile_metrics( shred_metrics, repair_metrics, repair_metrics_prev, repair_net_links, net_shred_links, net_tile_cnt, &last_sent, last_print_ts, now );
    1085           0 :       last_print_ts = now;
    1086           0 :     }
    1087           0 :   }
    1088           0 : }
    1089             : 
                     : /* repair_cmd_fn_forest resolves the repair tile context with
                     :    repair_ctx_wksp and then prints the tile's forest with
                     :    fd_forest_print once per second, forever. */
    1090             : static void
    1091             : repair_cmd_fn_forest( args_t *   args,
    1092           0 :                       config_t * config ) {
    1093           0 :   fd_topo_wksp_t * wksp_entry;
    1094           0 :   ctx_t *          tile_ctx;
    1095           0 :   repair_ctx_wksp( args, config, &tile_ctx, &wksp_entry );
    1096             : 
    1097             :   /* Rebase the forest pointer stored in the tile context onto this process's join of the workspace */
    1098           0 :   fd_forest_t * forest = (fd_forest_t *)fd_wksp_laddr( wksp_entry->wksp,
    1099           0 :                                                        fd_wksp_gaddr_fast( tile_ctx->wksp, tile_ctx->forest ) );
    1100           0 :   for(;;) {
    1101           0 :     fd_forest_print( forest );
    1102           0 :     sleep( 1 );
    1103           0 :   }
    1104           0 : }
    1105             : 
                     : /* repair_cmd_fn_inflight resolves the repair tile context with
                     :    repair_ctx_wksp and then, once per second and forever, prints the
                     :    outstanding inflight list, the popped count and the popped list.
                     : 
                     :    NOTE(review): inflights->outstanding_dl and inflights->popped_dl
                     :    are handed to fd_inflights_print as stored, whereas the pool
                     :    pointer is rebased first -- confirm those two members are usable
                     :    from this process without translation. */
    1106             : static void
    1107             : repair_cmd_fn_inflight( args_t *   args,
    1108           0 :                         config_t * config ) {
    1109           0 :   fd_topo_wksp_t * wksp_entry;
    1110           0 :   ctx_t *          tile_ctx;
    1111           0 :   repair_ctx_wksp( args, config, &tile_ctx, &wksp_entry );
    1112             : 
    1113           0 :   ulong            base_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, tile_ctx->inflights );
    1114           0 :   fd_inflights_t * inflights  = (fd_inflights_t *)fd_wksp_laddr( wksp_entry->wksp, base_gaddr );
    1115             : 
    1116             :   /* The pool is located by its byte distance from the inflights object as recorded in the tile context */
    1117           0 :   ulong           pool_gaddr = base_gaddr + ( (ulong)inflights->pool - (ulong)tile_ctx->inflights );
    1118           0 :   fd_inflight_t * pool       = (fd_inflight_t *)fd_wksp_laddr( wksp_entry->wksp, pool_gaddr );
    1119           0 :   for(;;) {
    1120           0 :     fd_inflights_print( inflights->outstanding_dl, pool );
    1121           0 :     printf("popped count: %lu\n", inflights->popped_cnt);
    1122           0 :     fd_inflights_print( inflights->popped_dl, pool );
    1123           0 :     sleep( 1 );
    1124           0 :   }
    1125           0 : }
    1126             : 
    1127             : static void
    1128             : repair_cmd_fn_requests( args_t *   args,
    1129           0 :                         config_t * config ) {
    1130           0 :   ctx_t *          repair_ctx;
    1131           0 :   fd_topo_wksp_t * repair_wksp;
    1132           0 :   repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
    1133             : 
    1134           0 :   fd_forest_t *          forest = fd_forest_join( fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, repair_ctx->forest ) ) );
    1135           0 :   fd_forest_reqslist_t * dlist  = fd_forest_reqslist( forest );
    1136           0 :   fd_forest_ref_t *      pool   = fd_forest_reqspool( forest );
    1137             : 
    1138           0 :   fd_forest_reqslist_t * orphlist = fd_forest_orphlist( forest );
    1139             : 
    1140           0 :   for( ;; ) {
    1141           0 :     printf("%-15s %-12s %-12s %-12s %-20s %-12s\n",
    1142           0 :             "Slot", "Buffered Idx", "Complete Idx", "First Shred ts", "Turbine Cnt", "Repair Cnt");
    1143           0 :     printf("%-15s %-12s %-12s %-12s %-20s %-12s\n",
    1144           0 :             "---------------", "------------", "------------", "------------",
    1145           0 :             "--------------------", "------------");
    1146           0 :     for( fd_forest_reqslist_iter_t iter = fd_forest_reqslist_iter_fwd_init( dlist, pool );
    1147           0 :         !fd_forest_reqslist_iter_done( iter, dlist, pool );
    1148           0 :         iter = fd_forest_reqslist_iter_fwd_next( iter, dlist, pool ) ) {
    1149           0 :       fd_forest_ref_t * req = fd_forest_reqslist_iter_ele( iter, dlist, pool );
    1150           0 :       fd_forest_blk_t * blk = fd_forest_pool_ele( fd_forest_pool( forest ), req->idx );
    1151             : 
    1152           0 :       printf("%-15lu %-12u %-12u %-20ld %-12u %-10u\n",
    1153           0 :               blk->slot,
    1154           0 :               blk->buffered_idx,
    1155           0 :               blk->complete_idx,
    1156           0 :               blk->first_shred_ts,
    1157           0 :               blk->turbine_cnt,
    1158           0 :               blk->repair_cnt);
    1159           0 :     }
    1160           0 :     printf("\n");
    1161             : 
    1162             :     /* now lets print the orphreqs */
    1163             : 
    1164           0 :     printf("Orphan Requests:\n");
    1165           0 :     printf("%-15s %-12s %-12s %-12s %-20s %-12s %-10s\n",
    1166           0 :       "Slot", "Consumed Idx", "Buffered Idx", "Complete Idx",
    1167           0 :       "First Shred Timestamp", "Turbine Cnt", "Repair Cnt");
    1168           0 : printf("%-15s %-12s %-12s %-12s %-20s %-12s %-10s\n",
    1169           0 :       "---------------", "------------", "------------", "------------",
    1170           0 :       "--------------------", "------------", "----------");
    1171             : 
    1172           0 :     for( fd_forest_reqslist_iter_t iter = fd_forest_reqslist_iter_fwd_init( orphlist, pool );
    1173           0 :                                          !fd_forest_reqslist_iter_done( iter, orphlist, pool );
    1174           0 :                                    iter = fd_forest_reqslist_iter_fwd_next( iter, orphlist, pool ) ) {
    1175           0 :       fd_forest_ref_t * req = fd_forest_reqslist_iter_ele( iter, orphlist, pool );
    1176           0 :       fd_forest_blk_t * blk = fd_forest_pool_ele( fd_forest_pool( forest ), req->idx );
    1177           0 :       printf("%-15lu %-12u %-12u %-20ld %-12u %-10u\n",
    1178           0 :               blk->slot,
    1179           0 :               blk->buffered_idx,
    1180           0 :               blk->complete_idx,
    1181           0 :               blk->first_shred_ts,
    1182           0 :               blk->turbine_cnt,
    1183           0 :               blk->repair_cnt);
    1184           0 :     }
    1185           0 :     sleep( 1 );
    1186           0 :   }
    1187           0 : }
    1188             : 
                     : /* repair_cmd_fn_waterfall resolves the repair tile context, loads
                     :    the IP location table from args->repair.iptable_path, puts stdin
                     :    into non-canonical, no-echo mode and then loops:
                     : 
                     :    - 'i' toggles verbose slot output, Ctrl-D (0x04) leaves the loop;
                     :    - whenever more than 1e9 wallclock units have passed since the last
                     :      report, print_catchup_slots is called, ordered according to
                     :      args->repair.sort_by_slot.
                     : 
                     :    The saved terminal settings are restored by restore_terminal, which
                     :    is registered with atexit. */
    1189             : static void
    1190             : repair_cmd_fn_waterfall( args_t *   args,
    1191           0 :                          config_t * config ) {
    1192             : 
                     :   /* Same lookup every other sub-command uses: joins the repair
                     :      workspace read-only and locates the tile context. */
                     :   ctx_t *          repair_ctx;
                     :   fd_topo_wksp_t * repair_wksp;
                     :   repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
    1208             : 
    1209             :   /* repair cmd owned memory */
    1210           0 :   location_table = fd_location_table_join( fd_location_table_new( location_table_mem ) );
    1211           0 :   read_iptable( args->repair.iptable_path, location_table );
    1212             : 
    1213             :   /* Terminal setup, same as monitor.c, except that the exit handler is
                     :      only registered once termios_backup holds the real settings, so
                     :      a failed tcgetattr cannot lead to restoring a bogus state. */
    1215           0 :   if( FD_UNLIKELY( 0!=tcgetattr( STDIN_FILENO, &termios_backup ) ) ) {
    1216           0 :     FD_LOG_ERR(( "tcgetattr(STDIN_FILENO) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
    1217           0 :   }
    1214           0 :   atexit( restore_terminal );
    1218             : 
    1219             :   /* Disable character echo and line buffering */
    1220           0 :   struct termios term = termios_backup;
    1221           0 :   term.c_lflag &= (tcflag_t)~(ICANON | ECHO);
    1222           0 :   if( FD_UNLIKELY( 0!=tcsetattr( STDIN_FILENO, TCSANOW, &term ) ) ) {
    1223           0 :     FD_LOG_WARNING(( "tcsetattr(STDIN_FILENO) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
    1224           0 :   }
    1225             : 
    1226           0 :   int  catchup_verbose = 0;
    1227           0 :   long last_print = 0;
    1228           0 :   for( ;; ) {
    1229           0 :     int c = fd_getchar();
    1230           0 :     if( FD_UNLIKELY( c=='i'    ) ) catchup_verbose = !catchup_verbose;
    1231           0 :     if( FD_UNLIKELY( c=='\x04' ) ) break; /* Ctrl-D */
    1232             : 
    1233           0 :     long now = fd_log_wallclock();
    1234           0 :     if( FD_UNLIKELY( now - last_print > 1e9L ) ) {
    1235           0 :       last_print = now;
    1236           0 :       print_catchup_slots( repair_wksp->wksp, repair_ctx, catchup_verbose, args->repair.sort_by_slot );
    1237           0 :       printf( "catchup slots | Use 'i' to toggle extra slot information" TEXT_NEWLINE );
    1238           0 :       fflush( stdout );
    1239             : 
    1240             :       /* Peer location latency is not that useful post catchup, and also
    1241             :          requires some concurrent dlist iteration, so only print it when
    1242             :          in catchup mode. */
    1243           0 :     }
    1244           0 :   }
    1245           0 : }
    1246             : 
                     : /* Upper bound on the number of peer rows print_peer_dlist prints. */
    1247           0 : #define PEERS_DISPLAY_MAX 20
    1248             : 
                     : /* print_peer_dlist prints `label` with the number of peers in dlist,
                     :    followed by a table of up to PEERS_DISPLAY_MAX peers.  The table
                     :    starts at `cursor` (that row is marked with a highlighted '>') and
                     :    wraps around to the head of the list when the end is reached.  If
                     :    the list is empty or `cursor` is already at the done position, a
                     :    placeholder line is printed instead.  Latencies are shown divided
                     :    by 1e6 with an "ms" suffix. */
    1249             : static void
    1250             : print_peer_dlist( fd_policy_peer_dlist_t *     dlist,
    1251             :                   fd_policy_peer_t *           pool,
    1252             :                   fd_policy_peer_dlist_iter_t  cursor,
    1253           0 :                   char const *                 label ) {
    1254             :   /* First pass: count the peers in the list */
    1255           0 :   ulong peer_cnt = 0UL;
    1256           0 :   fd_policy_peer_dlist_iter_t walk = fd_policy_peer_dlist_iter_fwd_init( dlist, pool );
    1257           0 :   while( !fd_policy_peer_dlist_iter_done( walk, dlist, pool ) ) {
    1258           0 :     peer_cnt++;
    1259           0 :     walk = fd_policy_peer_dlist_iter_fwd_next( walk, dlist, pool );
    1260           0 :   }
    1261             : 
    1262           0 :   printf( "%s (%lu peers)\n", label, peer_cnt );
    1263           0 :   if( peer_cnt==0UL || fd_policy_peer_dlist_iter_done( cursor, dlist, pool ) ) {
    1264           0 :     printf( "  (empty or iterator not initialized)\n\n" );
    1265           0 :     return;
    1266           0 :   }
    1267             : 
    1268           0 :   printf( "     | %-8s | %-12s | %-12s | %-8s | %-8s\n",
    1269           0 :           "Idx", "Pubkey", "Ewma Lat", "Avg Lat", "Req/Res" );
    1270           0 :   printf( "-----+----------+--------------+--------------+----------+---------\n" );
    1271             : 
    1272             :   /* Second pass: print at most PEERS_DISPLAY_MAX rows starting at the cursor */
    1273           0 :   ulong row_cnt = peer_cnt<PEERS_DISPLAY_MAX ? peer_cnt : (ulong)PEERS_DISPLAY_MAX;
    1274           0 :   fd_policy_peer_dlist_iter_t cur = cursor;
    1275           0 :   for( ulong row=0UL; row<row_cnt; row++ ) {
    1276           0 :     int                is_cursor = (row==0UL);
    1277           0 :     fd_policy_peer_t * peer      = fd_policy_peer_dlist_iter_ele( cur, dlist, pool );
    1278             : 
    1279             :     /* Only the first 12 base58 characters of the key are shown */
    1280           0 :     FD_BASE58_ENCODE_32_BYTES( peer->key.key, b58 );
    1281           0 :     char pubkey_short[13];
    1282           0 :     fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( pubkey_short ), b58, 12 ) );
    1283             : 
    1284           0 :     double ewma_lat_ms = (double)peer->ewma_lat / 1e6;
    1285           0 :     double avg_lat_ms  = 0.0;
    1286           0 :     if( peer->res_cnt ) avg_lat_ms = ((double)peer->total_lat / (double)peer->res_cnt) / 1e6;
    1287             : 
    1288           0 :     printf( " %s%c%s | %-8lu | %-12s | %9.3fms | %9.3fms | %lu/%lu\n",
    1289           0 :             is_cursor ? "\033[1;33m" : "",
    1290           0 :             is_cursor ? '>' : ' ',
    1291           0 :             is_cursor ? "\033[0m"    : "",
    1292           0 :             fd_policy_peer_pool_idx( pool, peer ),
    1293           0 :             pubkey_short, ewma_lat_ms, avg_lat_ms,
    1294           0 :             peer->req_cnt, peer->res_cnt );
    1295             : 
    1296             :     /* Advance, wrapping back to the head of the list at the end */
    1297           0 :     cur = fd_policy_peer_dlist_iter_fwd_next( cur, dlist, pool );
    1298           0 :     if( fd_policy_peer_dlist_iter_done( cur, dlist, pool ) ) cur = fd_policy_peer_dlist_iter_fwd_init( dlist, pool );
                   0 :   }
                   0 :   if( peer_cnt > PEERS_DISPLAY_MAX ) printf( "  ... (%lu more)\n", peer_cnt - PEERS_DISPLAY_MAX );
                   0 :   printf( "\n" );
                   0 : }
    1299             : 
    1300             : static void
    1301             : repair_cmd_fn_peers( args_t *   args,
    1302           0 :                      config_t * config ) {
    1303           0 :   ctx_t *          repair_ctx;
    1304           0 :   fd_topo_wksp_t * repair_wksp;
    1305           0 :   repair_ctx_wksp( args, config, &repair_ctx, &repair_wksp );
    1306             : 
    1307           0 :   fd_policy_t * policy = fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, repair_ctx->policy ) );
    1308             : 
    1309           0 :   fd_policy_peer_dlist_t * fast_dlist = fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, policy->peers.fast ) );
    1310           0 :   fd_policy_peer_dlist_t * slow_dlist = fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, policy->peers.slow ) );
    1311           0 :   fd_policy_peer_t *       pool       = fd_wksp_laddr( repair_wksp->wksp, fd_wksp_gaddr_fast( repair_ctx->wksp, policy->peers.pool ) );
    1312             : 
    1313           0 :   long last_print = 0;
    1314           0 :   for( ;; ) {
    1315           0 :     long now = fd_log_wallclock();
    1316           0 :     if( FD_UNLIKELY( now - last_print > 1e9L ) ) {
    1317           0 :       last_print = now;
    1318           0 :       printf( "\033[2J\033[H" );
    1319             : 
    1320           0 :       char fast_label[64];
    1321           0 :       char slow_label[64];
    1322           0 :       snprintf( fast_label, sizeof(fast_label), "FAST PEERS (ewma < %ldms)", (long)(FD_POLICY_LATENCY_THRESH / 1e6L) );
    1323           0 :       snprintf( slow_label, sizeof(slow_label), "SLOW PEERS (ewma >= %ldms or no responses)", (long)(FD_POLICY_LATENCY_THRESH / 1e6L) );
    1324           0 :       print_peer_dlist( fast_dlist, pool, policy->peers.select.fast_iter, fast_label );
    1325           0 :       print_peer_dlist( slow_dlist, pool, policy->peers.select.slow_iter, slow_label );
    1326             : 
    1327           0 :       printf( "select cnt: %u / %u (fast per slow)\n", policy->peers.select.cnt, FD_POLICY_FAST_PER_SLOW );
    1328           0 :       printf( "pool used: %lu / %lu\n", fd_policy_peer_pool_used( pool ), fd_policy_peer_pool_max( pool ) );
    1329             : 
    1330           0 :       fflush( stdout );
    1331           0 :     }
    1332             : 
    1333           0 :   }
    1334           0 : }
    1335             : 
    1336             : 
    1337             : 
    1338             : 
    1339             : void
    1340             : repair_cmd_args( int *    pargc,
    1341             :                  char *** pargv,
    1342           0 :                  args_t * args ) {
    1343             : 
    1344             :   /* positional arg */
    1345             : 
    1346           0 :   args->repair.pos_arg = (*pargv)[0];
    1347           0 :   if( FD_UNLIKELY( !args->repair.pos_arg ) ) {
    1348           0 :     args->repair.help = 1;
    1349           0 :     return;
    1350           0 :   }
    1351           0 :   (*pargc)--;
    1352           0 :   (*pargv)++;
    1353             : 
    1354             :   /* required args */
    1355             : 
    1356           0 :   char const * manifest_path = fd_env_strip_cmdline_cstr    ( pargc, pargv, "--manifest-path", NULL, NULL      );
    1357             : 
    1358             :   /* optional args */
    1359             : 
    1360           0 :   char const * iptable_path  = fd_env_strip_cmdline_cstr    ( pargc, pargv, "--iptable",       NULL, NULL      );
    1361           0 :   ulong        slot          = fd_env_strip_cmdline_ulong   ( pargc, pargv, "--slot",          NULL, ULONG_MAX );
    1362           0 :   int          sort_by_slot  = fd_env_strip_cmdline_contains( pargc, pargv, "--sort-by-slot"                   );
    1363             : 
    1364           0 :   if( FD_UNLIKELY( !strcmp( args->repair.pos_arg, "catchup" ) && !manifest_path ) ) {
    1365           0 :     args->repair.help = 1;
    1366           0 :     return;
    1367           0 :   }
    1368             : 
    1369           0 :   fd_cstr_fini( fd_cstr_append_cstr_safe( fd_cstr_init( args->repair.manifest_path ), manifest_path, sizeof(args->repair.manifest_path)-1UL ) );
    1370           0 :   fd_cstr_fini( fd_cstr_append_cstr_safe( fd_cstr_init( args->repair.iptable_path ),  iptable_path,  sizeof(args->repair.iptable_path )-1UL ) );
    1371           0 :   args->repair.slot         = slot;
    1372           0 :   args->repair.sort_by_slot = sort_by_slot;
    1373           0 : }
    1374             : 
    1375             : static void
    1376             : repair_cmd_fn( args_t *   args,
    1377           0 :               config_t * config ) {
    1378             : 
    1379           0 :   if( args->repair.help ) {
    1380           0 :     fd_action_help_print( &fd_action_repair );
    1381           0 :     return;
    1382           0 :   }
    1383             : 
    1384           0 :   if     ( !strcmp( args->repair.pos_arg, "catchup"   ) ) repair_cmd_fn_catchup  ( args, config );
    1385           0 :   else if( !strcmp( args->repair.pos_arg, "eqvoc"     ) ) repair_cmd_fn_eqvoc    ( args, config );
    1386           0 :   else if( !strcmp( args->repair.pos_arg, "forest"    ) ) repair_cmd_fn_forest   ( args, config );
    1387           0 :   else if( !strcmp( args->repair.pos_arg, "inflight"  ) ) repair_cmd_fn_inflight ( args, config );
    1388           0 :   else if( !strcmp( args->repair.pos_arg, "requests"  ) ) repair_cmd_fn_requests ( args, config );
    1389           0 :   else if( !strcmp( args->repair.pos_arg, "waterfall" ) ) repair_cmd_fn_waterfall( args, config );
    1390           0 :   else if( !strcmp( args->repair.pos_arg, "peers"     ) ) repair_cmd_fn_peers    ( args, config );
    1391           0 :   else if( !strcmp( args->repair.pos_arg, "metrics"   ) ) repair_cmd_fn_metrics  ( args, config );
    1392           0 :   else                                                    fd_action_help_print( &fd_action_repair );
    1393           0 : }
    1394             : 
    1395             : static void
    1396           0 : repair_args_help( fd_action_help_t * help ) {
    1397           0 :   fd_action_help_arg( help, "catchup",         NULL,     "Run a reduced topology that only repairs slots until catchup.\n"
    1398           0 :                                                          "Requires --manifest-path; accepts --iptable and --sort-by-slot" );
    1399           0 :   fd_action_help_arg( help, "eqvoc",           NULL,     "Test equivocation detection and the repair path" );
    1400           0 :   fd_action_help_arg( help, "forest",          NULL,     "Print the repair forest.  Accepts --slot to drill into a slot" );
    1401           0 :   fd_action_help_arg( help, "inflight",        NULL,     "Print the inflight repairs" );
    1402           0 :   fd_action_help_arg( help, "requests",        NULL,     "Print the queued repair requests" );
    1403           0 :   fd_action_help_arg( help, "waterfall",       NULL,     "Print a waterfall diagram of recent slot completion times and\n"
    1404           0 :                                                          "response latencies.  Accepts --iptable and --sort-by-slot" );
    1405           0 :   fd_action_help_arg( help, "peers",           NULL,     "Print the list of slow and fast repair peers" );
    1406           0 :   fd_action_help_arg( help, "metrics",         NULL,     "Print repair tile metrics in a digestible format" );
    1407           0 :   fd_action_help_arg( help, "--manifest-path", "<path>", "Path to manifest file (required by catchup)" );
    1408           0 :   fd_action_help_arg( help, "--iptable",       "<path>", "Path to iptable file (catchup, waterfall)" );
    1409           0 :   fd_action_help_arg( help, "--slot",          "<slot>", "Specific forest slot to drill into (forest)" );
    1410             :   fd_action_help_arg( help, "--sort-by-slot",  NULL,     "Sort results by slot (catchup, waterfall)" );
    1411           0 : }
    1412             : 
    1413             : action_t fd_action_repair = {
    1414             :   .name        = "repair",
    1415             :   .args        = repair_cmd_args,
    1416             :   .fn          = repair_cmd_fn,
    1417             :   .perm        = dev_cmd_perm,
    1418             :   .description = "Spawn a reduced topology for inspecting and profiling the repair tile",
    1419             :   .detail      = "Boots a smaller Firedancer topology focused on the repair tile and runs the\n"
    1420             :                  "requested subcommand to drive or inspect repair behavior.  Pick one of the\n"
    1421             :                  "subcommands below.",
    1422             :   .usage       = "repair <catchup|eqvoc|forest|inflight|requests|waterfall|peers|metrics> [OPTIONS]",
    1423             :   .args_help   = repair_args_help,
    1424             : };

Generated by: LCOV version 1.14