LCOV - code coverage report
Current view: top level - app/firedancer-dev/commands - repair.c (source / functions)
Test: cov.lcov
Date: 2025-09-19 04:41:14
                     Hit    Total    Coverage
Lines:                 0      481       0.0 %
Functions:             0       11       0.0 %

          Line data    Source code
       1             : /* The repair command spawns a smaller topology for profiling the repair
        2             :    tile.  It is a standalone application and can be run against mainnet,
        3             :    testnet, or a private cluster. */
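
/* For reference, the two modes accepted by this command (see repair_cmd_args
   below for the authoritative usage string) are invoked roughly as follows.
   The binary name is an assumption used only for illustration:

     firedancer-dev repair --manifest-path <manifest_path> [--iptable-path <iptable_path>]   (profiler mode)
     firedancer-dev repair --metrics       [--iptable-path <iptable_path>]                   (metrics mode, joins a live instance)

   --manifest-path and --metrics are mutually exclusive; --iptable-path is
   optional in both modes. */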
       4             : 
       5             : #include "../../../disco/net/fd_net_tile.h"
       6             : #include "../../../disco/tiles.h"
       7             : #include "../../../disco/topo/fd_topob.h"
       8             : #include "../../../disco/topo/fd_cpu_topo.h"
       9             : #include "../../../util/pod/fd_pod_format.h"
      10             : #include "../../../util/tile/fd_tile_private.h"
      11             : 
      12             : #include "../../firedancer/topology.h"
      13             : #include "../../firedancer/topology.c"
      14             : #include "../../shared/commands/configure/configure.h"
      15             : #include "../../shared/commands/run/run.h" /* initialize_workspaces */
      16             : #include "../../shared/fd_config.h" /* config_t */
      17             : #include "../../shared_dev/commands/dev.h"
      18             : #include "../../../disco/tiles.h"
      19             : #include "../../../disco/topo/fd_topob.h"
      20             : #include "../../../util/pod/fd_pod_format.h"
      21             : #include "../../../waltz/resolv/fd_io_readline.h"
      22             : 
      23             : #include "../../../discof/repair/fd_repair_tile.c"
      24             : 
      25             : #include "gossip.h"
      26             : #include "core_subtopo.h"
      27             : 
      28             : #include <unistd.h> /* pause */
      29             : #include <fcntl.h>
      30             : #include <stdio.h>
      31             : 
      32             : struct fd_location_info {
      33             :   ulong ip4_addr;         /* for map key convenience */
      34             :   char location[ 128 ];
      35             : };
      36             : typedef struct fd_location_info fd_location_info_t;
      37             : 
      38             : #define MAP_NAME    fd_location_table
      39           0 : #define MAP_T       fd_location_info_t
      40           0 : #define MAP_KEY     ip4_addr
      41           0 : #define MAP_LG_SLOT_CNT 16
      42             : #define MAP_MEMOIZE 0
      43             : #include "../../../util/tmpl/fd_map.c"
      44             : 
      45             : uchar __attribute__((aligned(alignof(fd_location_info_t)))) location_table_mem[ sizeof(fd_location_info_t) * (1 << 16 ) ];
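
/* A minimal sketch of how the fd_map.c instantiation above is used later in
   this file; the key value and location string are assumptions for
   illustration only:

     fd_location_info_t * table = fd_location_table_join( fd_location_table_new( location_table_mem ) );
     fd_location_info_t * slot  = fd_location_table_insert( table, 3232235777UL );        // key: an IPv4 address as a ulong
     if( slot ) fd_cstr_fini( fd_cstr_append_cstr_safe( fd_cstr_init( slot->location ), "Example City, XX", sizeof(slot->location)-1UL ) );
     fd_location_info_t * found = fd_location_table_query( table, 3232235777UL, NULL );   // NULL if the key is absent
*/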
      46             : 
      47             : fd_topo_run_tile_t
      48             : fdctl_tile_run( fd_topo_tile_t const * tile );
      49             : 
      50             : /* repair_topo is a subset of "src/app/firedancer/topology.c" at commit
      51             :    0d8386f4f305bb15329813cfe4a40c3594249e96, slightly modified to work
       52             :    as a repair profiler.  TODO: ideally, one should construct the full
       53             :    firedancer topology first and exclude the parts that are not needed,
       54             :    instead of manually generating a new topology for every command.  This
       55             :    would also guarantee that the profiler replicates the full topology as
       56             :    closely as possible. */
      57             : static void
      58           0 : repair_topo( config_t * config ) {
      59           0 :   resolve_gossip_entrypoints( config );
      60             : 
      61           0 :   ulong net_tile_cnt    = config->layout.net_tile_count;
      62           0 :   ulong shred_tile_cnt  = config->layout.shred_tile_count;
      63           0 :   ulong quic_tile_cnt   = config->layout.quic_tile_count;
      64           0 :   ulong sign_tile_cnt   = config->firedancer.layout.sign_tile_count;
      65           0 :   ulong gossvf_tile_cnt = config->firedancer.layout.gossvf_tile_count;
      66             : 
       67           0 :   fd_topo_t * topo = fd_topob_new( &config->topo, config->name );
      68           0 :   topo->max_page_size = fd_cstr_to_shmem_page_sz( config->hugetlbfs.max_page_size );
      69           0 :   topo->gigantic_page_threshold = config->hugetlbfs.gigantic_page_threshold_mib << 20;
      70             : 
      71           0 :   ulong tile_to_cpu[ FD_TILE_MAX ] = {0};
      72           0 :   ushort parsed_tile_to_cpu[ FD_TILE_MAX ];
      73             :   /* Unassigned tiles will be floating, unless auto topology is enabled. */
      74           0 :   for( ulong i=0UL; i<FD_TILE_MAX; i++ ) parsed_tile_to_cpu[ i ] = USHORT_MAX;
      75             : 
      76           0 :   int is_auto_affinity = !strcmp( config->layout.affinity, "auto" );
      77           0 :   int is_bench_auto_affinity = !strcmp( config->development.bench.affinity, "auto" );
      78             : 
      79           0 :   if( FD_UNLIKELY( is_auto_affinity != is_bench_auto_affinity ) ) {
      80           0 :     FD_LOG_ERR(( "The CPU affinity string in the configuration file under [layout.affinity] and [development.bench.affinity] must all be set to 'auto' or all be set to a specific CPU affinity string." ));
      81           0 :   }
      82             : 
      83           0 :   fd_topo_cpus_t cpus[1];
      84           0 :   fd_topo_cpus_init( cpus );
      85             : 
      86           0 :   ulong affinity_tile_cnt = 0UL;
      87           0 :   if( FD_LIKELY( !is_auto_affinity ) ) affinity_tile_cnt = fd_tile_private_cpus_parse( config->layout.affinity, parsed_tile_to_cpu );
      88             : 
      89           0 :   for( ulong i=0UL; i<affinity_tile_cnt; i++ ) {
      90           0 :     if( FD_UNLIKELY( parsed_tile_to_cpu[ i ]!=USHORT_MAX && parsed_tile_to_cpu[ i ]>=cpus->cpu_cnt ) )
      91           0 :       FD_LOG_ERR(( "The CPU affinity string in the configuration file under [layout.affinity] specifies a CPU index of %hu, but the system "
      92           0 :                   "only has %lu CPUs. You should either change the CPU allocations in the affinity string, or increase the number of CPUs "
      93           0 :                   "in the system.",
      94           0 :                   parsed_tile_to_cpu[ i ], cpus->cpu_cnt ));
      95           0 :     tile_to_cpu[ i ] = fd_ulong_if( parsed_tile_to_cpu[ i ]==USHORT_MAX, ULONG_MAX, (ulong)parsed_tile_to_cpu[ i ] );
      96           0 :   }
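  /* Illustrative example (the affinity string itself is an assumption, not
     taken from a real config): with [layout.affinity] = "1,2,5",
     fd_tile_private_cpus_parse would fill parsed_tile_to_cpu with { 1, 2, 5 }
     and return 3, so the first three tiles added to the topology get pinned
     to CPUs 1, 2 and 5; any entry left at USHORT_MAX is translated to
     ULONG_MAX above, i.e. that tile floats rather than being pinned.  Whether
     the affinity covers every tile is checked further below, once all tiles
     have been added. */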
      97             : 
      98           0 :   fd_core_subtopo(   config, tile_to_cpu );
      99           0 :   fd_gossip_subtopo( config, tile_to_cpu );
     100             : 
     101             :   /*             topo, name */
     102           0 :   fd_topob_wksp( topo, "net_shred"    );
     103           0 :   fd_topob_wksp( topo, "net_repair"   );
     104           0 :   fd_topob_wksp( topo, "net_quic"     );
     105             : 
     106           0 :   fd_topob_wksp( topo, "shred_repair" );
     107           0 :   fd_topob_wksp( topo, "replay_stake" );
     108             : 
     109           0 :   fd_topob_wksp( topo, "poh_shred"    );
     110             : 
     111           0 :   fd_topob_wksp( topo, "shred_sign"   );
     112           0 :   fd_topob_wksp( topo, "sign_shred"   );
     113             : 
     114           0 :   fd_topob_wksp( topo, "repair_sign"  );
     115           0 :   fd_topob_wksp( topo, "sign_repair"  );
     116             : 
     117           0 :   fd_topob_wksp( topo, "repair_repla" );
     118           0 :   fd_topob_wksp( topo, "send_txns"    );
     119             : 
     120           0 :   fd_topob_wksp( topo, "shred"        );
     121           0 :   fd_topob_wksp( topo, "repair"       );
     122           0 :   fd_topob_wksp( topo, "fec_sets"     );
     123           0 :   fd_topob_wksp( topo, "snap_out"     );
     124             : 
     125           0 :   fd_topob_wksp( topo, "slot_fseqs"   ); /* fseqs for marked slots eg. turbine slot */
     126             : 
     127           0 :   #define FOR(cnt) for( ulong i=0UL; i<cnt; i++ )
     128             : 
     129           0 :   ulong pending_fec_shreds_depth = fd_ulong_min( fd_ulong_pow2_up( config->tiles.shred.max_pending_shred_sets * FD_REEDSOL_DATA_SHREDS_MAX ), USHORT_MAX + 1 /* dcache max */ );
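  /* Worked example of the depth computation above, with assumed values (not
     taken from a real config): if max_pending_shred_sets = 512 and
     FD_REEDSOL_DATA_SHREDS_MAX = 67, the product is 34304, fd_ulong_pow2_up
     rounds it to 65536, and fd_ulong_min caps it at USHORT_MAX+1 = 65536,
     which the code treats as the dcache maximum. */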
     130             : 
     131             :   /*                                  topo, link_name,      wksp_name,      depth,                                    mtu,                           burst */
     132           0 :   FOR(quic_tile_cnt)   fd_topob_link( topo, "quic_net",     "net_quic",     config->net.ingress_buffer_size,          FD_NET_MTU,                    1UL );
     133           0 :   FOR(shred_tile_cnt)  fd_topob_link( topo, "shred_net",    "net_shred",    config->net.ingress_buffer_size,          FD_NET_MTU,                    1UL );
     134             : 
     135             :   /**/                 fd_topob_link( topo, "replay_stake", "replay_stake", 128UL,                                    40UL + 40200UL * 40UL,         1UL );
     136             : 
     137           0 :   FOR(shred_tile_cnt)  fd_topob_link( topo, "shred_sign",   "shred_sign",   128UL,                                    32UL,                          1UL );
     138           0 :   FOR(shred_tile_cnt)  fd_topob_link( topo, "sign_shred",   "sign_shred",   128UL,                                    64UL,                          1UL );
     139             : 
     140           0 :   /**/                 fd_topob_link( topo, "repair_net",   "net_repair",   config->net.ingress_buffer_size,          FD_NET_MTU,                    1UL );
     141             : 
     142           0 :   FOR(shred_tile_cnt)  fd_topob_link( topo, "shred_repair", "shred_repair", pending_fec_shreds_depth,                 FD_SHRED_REPAIR_MTU,           2UL /* at most 2 msgs per after_frag */ );
     143             : 
     144           0 :   FOR(shred_tile_cnt)  fd_topob_link( topo, "repair_shred", "shred_repair", pending_fec_shreds_depth,                 sizeof(fd_ed25519_sig_t),      1UL );
     145             : 
     146             :   /**/                 fd_topob_link( topo, "ping_sign",    "repair_sign",  128UL,                                    2048UL,                        1UL );
     147           0 :   /**/                 fd_topob_link( topo, "sign_ping",    "sign_repair",  128UL,                                    sizeof(fd_ed25519_sig_t),      1UL );
     148           0 :   FOR(sign_tile_cnt-1) fd_topob_link( topo, "repair_sign",  "repair_sign",  128UL,                                    2048UL,                        1UL );
     149           0 :   FOR(sign_tile_cnt-1) fd_topob_link( topo, "sign_repair",  "sign_repair",  1024UL,                                   sizeof(fd_ed25519_sig_t),      1UL );
     150             : 
     151             :   /**/                 fd_topob_link( topo, "repair_repla", "repair_repla", 65536UL,                                  sizeof(fd_reasm_fec_t),    1UL );
     152           0 :   /**/                 fd_topob_link( topo, "poh_shred",    "poh_shred",    16384UL,                                  USHORT_MAX,                    1UL );
     153             : 
     154           0 :   /**/                 fd_topob_link( topo, "send_txns",    "send_txns",    128UL,                                    FD_TXN_MTU,                    1UL );
     155             : 
     156             :   /**/                 fd_topob_link( topo, "snap_out",     "snap_out",     2UL,                                      sizeof(fd_snapshot_manifest_t), 1UL );
     157             : 
     158           0 :   FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_repair", i, config->net.ingress_buffer_size );
     159           0 :   FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_quic",   i, config->net.ingress_buffer_size );
     160           0 :   FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_shred",  i, config->net.ingress_buffer_size );
     161             : 
     162             :   /*                                              topo, tile_name, tile_wksp, metrics_wksp, cpu_idx,                       is_agave, uses_keyswitch */
     163           0 :   FOR(shred_tile_cnt)              fd_topob_tile( topo, "shred",   "shred",   "metric_in",  tile_to_cpu[ topo->tile_cnt ], 0,        1 );
     164           0 :   fd_topo_tile_t * repair_tile =   fd_topob_tile( topo, "repair",  "repair",  "metric_in",  tile_to_cpu[ topo->tile_cnt ], 0,        0 );
     165             : 
     166             :   /* Setup a shared wksp object for fec sets. */
     167             : 
     168           0 :   ulong shred_depth = 65536UL; /* from fdctl/topology.c shred_store link. MAKE SURE TO KEEP IN SYNC. */
     169           0 :   ulong fec_set_cnt = shred_depth + config->tiles.shred.max_pending_shred_sets + 4UL;
      170           0 :   ulong fec_sets_sz = fec_set_cnt*sizeof(fd_shred34_t)*4; /* mirrors # of dcache entries in Frankendancer */
     171           0 :   fd_topo_obj_t * fec_sets_obj = setup_topo_fec_sets( topo, "fec_sets", shred_tile_cnt*fec_sets_sz );
     172           0 :   for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
     173           0 :     fd_topo_tile_t * shred_tile = &topo->tiles[ fd_topo_find_tile( topo, "shred", i ) ];
     174           0 :     fd_topob_tile_uses( topo, shred_tile,  fec_sets_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
     175           0 :   }
     176           0 :   fd_topob_tile_uses( topo, repair_tile, fec_sets_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
     177           0 :   FD_TEST( fd_pod_insertf_ulong( topo->props, fec_sets_obj->id, "fec_sets" ) );
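  /* Worked example of the sizing above (config value assumed for
     illustration): with max_pending_shred_sets = 512, fec_set_cnt =
     65536 + 512 + 4 = 66052 and each shred tile contributes
     66052 * 4 * sizeof(fd_shred34_t) bytes to the shared fec_sets object,
     which the repair tile then joins read-only. */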
     178             : 
     179             :   /* There's another special fseq that's used to communicate the shred
     180             :     version from the Agave boot path to the shred tile. */
     181           0 :   fd_topo_obj_t * poh_shred_obj = fd_topob_obj( topo, "fseq", "poh_shred" );
     182           0 :   fd_topo_tile_t * poh_tile = &topo->tiles[ fd_topo_find_tile( topo, "gossip", 0UL ) ];
     183           0 :   fd_topob_tile_uses( topo, poh_tile, poh_shred_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
     184             : 
     185             :   /* root_slot is an fseq marking the validator's current Tower root. */
     186             : 
     187           0 :   fd_topo_obj_t * root_slot_obj = fd_topob_obj( topo, "fseq", "slot_fseqs" );
     188           0 :   FD_TEST( fd_pod_insertf_ulong( topo->props, root_slot_obj->id, "root_slot" ) );
     189             : 
     190           0 :   for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
     191           0 :     fd_topo_tile_t * shred_tile = &topo->tiles[ fd_topo_find_tile( topo, "shred", i ) ];
     192           0 :     fd_topob_tile_uses( topo, shred_tile, poh_shred_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
     193           0 :   }
     194           0 :   FD_TEST( fd_pod_insertf_ulong( topo->props, poh_shred_obj->id, "poh_shred" ) );
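  /* Sketch of how a consumer tile typically recovers one of these
     pod-published objects at runtime; the query helper and fseq join shown
     here are assumptions based on the wider codebase, not calls made in this
     file:

       ulong   obj_id = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "root_slot" );
       ulong * fseq   = fd_fseq_join( fd_topo_obj_laddr( topo, obj_id ) );
  */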
     195             : 
     196           0 :   if( FD_LIKELY( !is_auto_affinity ) ) {
     197           0 :     if( FD_UNLIKELY( affinity_tile_cnt<topo->tile_cnt ) )
     198           0 :       FD_LOG_ERR(( "The topology you are using has %lu tiles, but the CPU affinity specified in the config tile as [layout.affinity] only provides for %lu cores. "
     199           0 :                   "You should either increase the number of cores dedicated to Firedancer in the affinity string, or decrease the number of cores needed by reducing "
     200           0 :                   "the total tile count. You can reduce the tile count by decreasing individual tile counts in the [layout] section of the configuration file.",
     201           0 :                   topo->tile_cnt, affinity_tile_cnt ));
     202           0 :     if( FD_UNLIKELY( affinity_tile_cnt>topo->tile_cnt ) )
     203           0 :       FD_LOG_WARNING(( "The topology you are using has %lu tiles, but the CPU affinity specified in the config tile as [layout.affinity] provides for %lu cores. "
     204           0 :                       "Not all cores in the affinity will be used by Firedancer. You may wish to increase the number of tiles in the system by increasing "
     205           0 :                       "individual tile counts in the [layout] section of the configuration file.",
     206           0 :                       topo->tile_cnt, affinity_tile_cnt ));
     207           0 :   }
     208             : 
     209             :   /*                                      topo, tile_name, tile_kind_id, fseq_wksp,   link_name,      link_kind_id, reliable,            polled */
     210           0 :   for( ulong j=0UL; j<shred_tile_cnt; j++ )
     211           0 :                   fd_topos_tile_in_net(  topo,                          "metric_in", "shred_net",    j,            FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
     212           0 :   for( ulong j=0UL; j<quic_tile_cnt; j++ )
     213           0 :                   {fd_topos_tile_in_net(  topo,                          "metric_in", "quic_net",     j,            FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );} /* No reliable consumers of networking fragments, may be dropped or overrun */
     214             : 
     215           0 :   /**/            fd_topob_tile_in(      topo, "gossip",  0UL,         "metric_in", "send_txns",    0UL,           FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED );
     216             : 
     217           0 :   /**/            fd_topos_tile_in_net(  topo,                          "metric_in", "repair_net",   0UL,          FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
     218             : 
     219           0 :   FOR(shred_tile_cnt) for( ulong j=0UL; j<net_tile_cnt; j++ )
     220           0 :                        fd_topob_tile_in(  topo, "shred",  i,             "metric_in", "net_shred",     j,            FD_TOPOB_UNRELIABLE,   FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
     221           0 :   FOR(shred_tile_cnt)  fd_topob_tile_in(  topo, "shred",  i,             "metric_in", "poh_shred",     0UL,          FD_TOPOB_RELIABLE,     FD_TOPOB_POLLED );
     222           0 :   FOR(shred_tile_cnt)  fd_topob_tile_in(  topo, "shred",  i,             "metric_in", "replay_stake",  0UL,          FD_TOPOB_RELIABLE,     FD_TOPOB_POLLED );
     223           0 :   FOR(shred_tile_cnt)  fd_topob_tile_in(  topo, "shred",  i,             "metric_in", "gossip_out",    0UL,          FD_TOPOB_RELIABLE,     FD_TOPOB_POLLED );
     224           0 :   FOR(shred_tile_cnt)  fd_topob_tile_out( topo, "shred",  i,                          "shred_repair",  i                                                    );
     225           0 :   FOR(shred_tile_cnt)  fd_topob_tile_out( topo, "shred",  i,                          "shred_net",     i                                                    );
     226           0 :   FOR(shred_tile_cnt)  fd_topob_tile_in ( topo, "shred",  i,             "metric_in", "ipecho_out",    0UL,          FD_TOPOB_RELIABLE,     FD_TOPOB_POLLED );
     227             : 
     228           0 :   FOR(shred_tile_cnt)  fd_topob_tile_in(  topo, "shred",  i,             "metric_in",  "repair_shred", i,            FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     229             : 
     230             :   /**/                 fd_topob_tile_out( topo, "repair",  0UL,                       "repair_net",    0UL                                                  );
     231             : 
     232             :   /* Sign links don't need to be reliable because they are synchronous,
     233             :     so there's at most one fragment in flight at a time anyway.  The
     234             :     sign links are also not polled by the mux, instead the tiles will
     235             :     read the sign responses out of band in a dedicated spin loop. */
     236           0 :   for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
     237           0 :     /**/               fd_topob_tile_in(  topo, "sign",   0UL,           "metric_in", "shred_sign",    i,            FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED   );
     238           0 :     /**/               fd_topob_tile_out( topo, "shred",  i,                          "shred_sign",    i                                                    );
     239           0 :     /**/               fd_topob_tile_in(  topo, "shred",  i,             "metric_in", "sign_shred",    i,            FD_TOPOB_UNRELIABLE, FD_TOPOB_UNPOLLED );
     240           0 :     /**/               fd_topob_tile_out( topo, "sign",   0UL,                        "sign_shred",    i                                                    );
     241           0 :   }
     242             : 
     243           0 :   FOR(gossvf_tile_cnt) fd_topob_tile_in ( topo, "gossvf",   i,            "metric_in", "replay_stake", 0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     244             : 
     245           0 :   /**/                 fd_topob_tile_in ( topo, "gossip",   0UL,          "metric_in", "replay_stake", 0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     246             : 
     247           0 :   FOR(net_tile_cnt)    fd_topob_tile_in(  topo, "repair",  0UL,          "metric_in", "net_repair",    i,            FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED   ); /* No reliable consumers of networking fragments, may be dropped or overrun */
     248           0 :   /**/                 fd_topob_tile_in(  topo, "repair",  0UL,          "metric_in", "gossip_out",    0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     249           0 :   /**/                 fd_topob_tile_in(  topo, "repair",  0UL,          "metric_in", "replay_stake",  0UL,          FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED   );
     250           0 :                        fd_topob_tile_in(  topo, "repair",  0UL,          "metric_in", "snap_out",      0UL,          FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     251           0 :   FOR(shred_tile_cnt)  fd_topob_tile_in(  topo, "repair",  0UL,          "metric_in", "shred_repair",  i,            FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     252             : 
     253           0 :   /**/                 fd_topob_tile_in(  topo, "sign",   0UL,         "metric_in",  "ping_sign",    0UL,    FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     254           0 :   /**/                 fd_topob_tile_out( topo, "repair", 0UL,                       "ping_sign",    0UL                                            );
     255           0 :   /**/                 fd_topob_tile_out( topo, "repair", 0UL,                       "repair_repla", 0UL                                            );
     256           0 :   FOR(shred_tile_cnt)  fd_topob_tile_out( topo, "repair", 0UL,                       "repair_shred", i                                              );
     257           0 :   /**/                 fd_topob_tile_out( topo, "sign",   0UL,                       "sign_ping",    0UL                                            );
     258             : 
     259           0 :   FOR(sign_tile_cnt-1) fd_topob_tile_out( topo, "repair", 0UL,                        "repair_sign",  i                                              );
     260           0 :   FOR(sign_tile_cnt-1) fd_topob_tile_in ( topo, "sign",   i+1,           "metric_in", "repair_sign",  i,      FD_TOPOB_RELIABLE,   FD_TOPOB_POLLED   );
     261           0 :   FOR(sign_tile_cnt-1) fd_topob_tile_out( topo, "sign",   i+1,                        "sign_repair",  i                                              );
     262           0 :   FOR(sign_tile_cnt-1) fd_topob_tile_in ( topo, "repair", 0UL,           "metric_in", "sign_repair",  i,      FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED   );
     263           0 :     /**/               fd_topob_tile_in ( topo, "repair", 0UL,           "metric_in", "sign_ping",    0UL,    FD_TOPOB_UNRELIABLE, FD_TOPOB_UNPOLLED );
     264             : 
     265           0 :     /**/               fd_topob_tile_in ( topo, "gossip", 0UL,           "metric_in", "sign_gossip",  0UL,    FD_TOPOB_UNRELIABLE, FD_TOPOB_UNPOLLED );
     266             : 
     267           0 :   if( 1 ) {
     268           0 :     fd_topob_wksp( topo, "scap" );
     269             : 
     270           0 :     fd_topob_wksp( topo, "repair_scap" );
     271           0 :     fd_topob_wksp( topo, "replay_scap" );
     272             : 
     273           0 :     fd_topo_tile_t * scap_tile = fd_topob_tile( topo, "scap", "scap", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 );
     274             : 
     275           0 :     fd_topob_link( topo, "repair_scap", "repair_scap", 128UL, FD_SLICE_MAX_WITH_HEADERS, 1UL );
     276           0 :     fd_topob_link( topo, "replay_scap", "replay_scap", 128UL, sizeof(fd_hash_t)+sizeof(ulong), 1UL );
     277             : 
     278           0 :     fd_topob_tile_in(  topo, "scap", 0UL, "metric_in", "repair_net", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
     279           0 :     for( ulong j=0UL; j<net_tile_cnt; j++ ) {
     280           0 :       fd_topob_tile_in(  topo, "scap", 0UL, "metric_in", "net_shred", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
     281           0 :     }
     282           0 :     for( ulong j=0UL; j<shred_tile_cnt; j++ ) {
     283           0 :       fd_topob_tile_in(  topo, "scap", 0UL, "metric_in", "shred_repair", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
     284           0 :     }
     285           0 :     fd_topob_tile_in( topo, "scap", 0UL, "metric_in", "gossip_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
     286             : 
     287           0 :     fd_topob_tile_in( topo, "scap", 0UL, "metric_in", "repair_scap", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
     288           0 :     fd_topob_tile_in( topo, "scap", 0UL, "metric_in", "replay_scap", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
     289             : 
     290           0 :     fd_topob_tile_uses( topo, scap_tile, root_slot_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
     291           0 :     fd_topob_tile_out( topo, "scap", 0UL, "replay_stake", 0UL );
     292           0 :     fd_topob_tile_out( topo, "scap", 0UL, "snap_out",     0UL );
     293           0 :   }
     294             : 
     295           0 :   FD_TEST( fd_link_permit_no_producers( topo, "quic_net"     ) == quic_tile_cnt );
     296           0 :   FD_TEST( fd_link_permit_no_producers( topo, "poh_shred"    ) == 1UL           );
     297           0 :   FD_TEST( fd_link_permit_no_producers( topo, "send_txns"    ) == 1UL           );
     298           0 :   FD_TEST( fd_link_permit_no_producers( topo, "repair_scap"  ) == 1UL           );
     299           0 :   FD_TEST( fd_link_permit_no_producers( topo, "replay_scap"  ) == 1UL           );
     300             : 
     301           0 :   FD_TEST( fd_link_permit_no_consumers( topo, "net_quic"     ) == quic_tile_cnt );
     302           0 :   FD_TEST( fd_link_permit_no_consumers( topo, "repair_repla" ) == 1UL           );
     303             : 
     304           0 :   config->tiles.send.send_src_port = 0; /* disable send */
     305             : 
     306           0 :   FOR(net_tile_cnt) fd_topos_net_tile_finish( topo, i );
     307             : 
     308           0 :   for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
     309           0 :     fd_topo_tile_t * tile = &topo->tiles[ i ];
     310           0 :     fd_topo_configure_tile( tile, config );
     311           0 :   }
     312             : 
     313           0 :   if( FD_UNLIKELY( is_auto_affinity ) ) fd_topob_auto_layout( topo, 0 );
     314             : 
     315           0 :   fd_topob_finish( topo, CALLBACKS );
     316             : 
     317           0 :   config->topo = *topo;
     318           0 : }
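
/* repair_topo above follows the general fd_topob builder pattern used across
   the firedancer apps: create the topo, declare workspaces, declare links and
   tiles, wire tiles to links with tile_in/tile_out, then finish.  A minimal
   sketch of that skeleton (the workspace, link and tile names below are
   placeholders, not part of this profiler topology):

     fd_topo_t * topo = fd_topob_new( &config->topo, config->name );
     fd_topob_wksp( topo, "metric_in" );
     fd_topob_wksp( topo, "example"   );
     fd_topob_link( topo, "a_b", "example", 128UL, FD_NET_MTU, 1UL );                  // depth, mtu, burst
     fd_topob_tile( topo, "a", "example", "metric_in", ULONG_MAX, 0, 0 );              // floating, not agave, no keyswitch
     fd_topob_tile( topo, "b", "example", "metric_in", ULONG_MAX, 0, 0 );
     fd_topob_tile_out( topo, "a", 0UL, "a_b", 0UL );
     fd_topob_tile_in ( topo, "b", 0UL, "metric_in", "a_b", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
     fd_topob_finish( topo, CALLBACKS );
*/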
     319             : 
     320             : extern int * fd_log_private_shared_lock;
     321             : 
     322             : void
     323             : repair_cmd_args( int *    pargc,
     324             :                  char *** pargv,
     325           0 :                  args_t * args ) {
     326             : 
     327           0 :   if( FD_UNLIKELY( !*pargc ) )
     328           0 :     FD_LOG_ERR(( "\n \
     329           0 : usage: (1) repair --manifest-path <manifest_path> [--iptable-path <iptable_path>] \n \
     330           0 :        (2) repair --metrics [--iptable-path <iptable_path>] \n\n \
     331           0 :         Passing --manifest-path starts up profiler mode, which runs a reduced topology that tests catchup and repair performance. \n \
     332           0 :         Passing --metrics prints recent slot completion times and response latencies during a live run. These modes are exclusive. \n \
      333           0 :         --iptable-path: optional path to an iptable file that maps IP addresses to locations." ));
     334             : 
     335           0 :   char const * manifest_path = fd_env_strip_cmdline_cstr( pargc, pargv, "--manifest-path", NULL, NULL );
     336           0 :   if( fd_env_strip_cmdline_contains( pargc, pargv, "--metrics" ) ) {
     337           0 :     args->repair.metrics_only = 1;
     338           0 :     if( FD_UNLIKELY( manifest_path ) ) FD_LOG_ERR(( "metrics mode does not support --manifest-path" ));
     339           0 :   } else {
     340           0 :     fd_cstr_fini( fd_cstr_append_cstr_safe( fd_cstr_init( args->repair.manifest_path ), manifest_path, sizeof(args->repair.manifest_path)-1UL ) );
     341           0 :   }
     342             : 
     343           0 :   char const * iptable_path = fd_env_strip_cmdline_cstr( pargc, pargv, "--iptable-path", NULL, NULL );
     344           0 :   if( FD_LIKELY( iptable_path ) ) {
     345           0 :     fd_cstr_fini( fd_cstr_append_cstr_safe( fd_cstr_init( args->repair.iptable_path ), iptable_path, sizeof(args->repair.iptable_path)-1UL ) );
     346           0 :   }
     347           0 : }
     348             : 
     349             : static char *
     350           0 : fmt_count( char buf[ static 64 ], ulong count ) {
     351           0 :   char tmp[ 64 ];
     352           0 :   if( FD_LIKELY( count<1000UL ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%lu", count ) );
     353           0 :   else if( FD_LIKELY( count<1000000UL ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%.1f K", (double)count/1000.0 ) );
      354           0 :   else FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%.1f M", (double)count/1000000.0 ) ); /* also covers counts >= 1e9 so tmp is never left uninitialized */
     355             : 
     356           0 :   FD_TEST( fd_cstr_printf_check( buf, 64UL, NULL, "%12s", tmp ) );
     357           0 :   return buf;
     358           0 : }
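
/* For reference, the formatting above renders e.g. 999 as "999", 12345 as
   "12.3 K" and 7000000 as "7.0 M", each right-aligned into a 12 character
   field by the final "%12s" pass. */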
     359             : 
     360             : static void
     361             : print_histogram_buckets( volatile ulong * metrics,
     362             :                          ulong offset,
     363             :                          int converter,
     364             :                          double histmin,
     365             :                          double histmax,
     366           0 :                          char * title ) {
     367           0 :   fd_histf_t hist[1];
     368             : 
     369             :   /* Create histogram structure only to get bucket edges for display */
     370           0 :   if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
     371             :     /* For SLOT_COMPLETE_TIME: min=0.2, max=2.0 seconds */
     372           0 :     FD_TEST( fd_histf_new( hist, fd_metrics_convert_seconds_to_ticks( histmin ), fd_metrics_convert_seconds_to_ticks( histmax ) ) );
     373           0 :   } else if( FD_LIKELY( converter == FD_METRICS_CONVERTER_NONE ) ) {
     374             :     /* For non-time histograms, we'd need the actual min/max values */
     375           0 :     FD_TEST( fd_histf_new( hist, (ulong)histmin, (ulong)histmax ) );
     376           0 :   } else {
     377           0 :     FD_LOG_ERR(( "unknown converter %i", converter ));
     378           0 :   }
     379             : 
     380           0 :   printf( " +---------------------+--------------------+--------------+\n" );
     381           0 :   printf( " | %-19s |                    | Count        |\n", title );
     382           0 :   printf( " +---------------------+--------------------+--------------+\n" );
     383             : 
     384           0 :   ulong total_count = 0;
     385           0 :   for( ulong k = 0; k < FD_HISTF_BUCKET_CNT; k++ ) {
     386           0 :     ulong bucket_count = metrics[ offset + k ];
     387           0 :     total_count += bucket_count;
     388           0 :   }
     389             : 
     390           0 :   for( ulong k = 0; k < FD_HISTF_BUCKET_CNT; k++ ) {
     391             :     /* Get individual bucket count directly from metrics array */
     392           0 :     ulong bucket_count = metrics[ offset + k ];
     393             : 
     394           0 :     char * le_str;
     395           0 :     char le_buf[ 64 ];
     396           0 :     if( FD_UNLIKELY( k == FD_HISTF_BUCKET_CNT - 1UL ) ) {
     397           0 :       le_str = "+Inf";
     398           0 :     } else {
     399           0 :       ulong edge = fd_histf_right( hist, k );
     400           0 :       if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
     401           0 :         double edgef = fd_metrics_convert_ticks_to_seconds( edge - 1 );
     402           0 :         FD_TEST( fd_cstr_printf_check( le_buf, sizeof( le_buf ), NULL, "%.3f", edgef ) );
     403           0 :       } else {
     404           0 :         FD_TEST( fd_cstr_printf_check( le_buf, sizeof( le_buf ), NULL, "%.3f", (double)(edge - 1) / 1000000.0 ) );
     405           0 :       }
     406           0 :       le_str = le_buf;
     407           0 :     }
     408             : 
     409           0 :     char count_buf[ 64 ];
     410           0 :     fmt_count( count_buf, bucket_count );
     411             : 
     412             :     /* Create visual bar - scale to max 20 characters */
     413           0 :     char bar_buf[ 22 ];
     414           0 :     if( bucket_count > 0 && total_count > 0 ) {
      415           0 :       ulong bar_length = (bucket_count * 20UL) / total_count; /* cap at 20 so the terminating NUL below stays inside bar_buf */
     416           0 :       if( bar_length == 0 ) bar_length = 1;
     417           0 :       for( ulong i = 0; i < bar_length; i++ ) { bar_buf[ i ] = '|'; }
     418           0 :       bar_buf[ bar_length ] = '\0';
     419           0 :     } else {
     420           0 :       bar_buf[ 0 ] = '\0';
     421           0 :     }
     422             : 
     423           0 :     printf( " | %-19s | %-18s | %s |\n", le_str, bar_buf, count_buf );
     424           0 :   }
     425             : 
     426             :   /* Print sum and total count */
     427           0 :   char sum_buf[ 64 ];
     428           0 :   char avg_buf[ 64 ];
     429           0 :   if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
     430           0 :     double sumf = fd_metrics_convert_ticks_to_seconds( metrics[ offset + FD_HISTF_BUCKET_CNT ] );
     431           0 :     FD_TEST( fd_cstr_printf_check( sum_buf, sizeof( sum_buf ), NULL, "%.6f", sumf ) );
     432           0 :     double avg = sumf / (double)total_count;
     433           0 :     FD_TEST( fd_cstr_printf_check( avg_buf, sizeof( avg_buf ), NULL, "%.6f", avg ) );
     434           0 :   } else {
      435           0 :     FD_TEST( fd_cstr_printf_check( sum_buf, sizeof( sum_buf ), NULL, "%lu", metrics[ offset + FD_HISTF_BUCKET_CNT ] ) ); FD_TEST( fd_cstr_printf_check( avg_buf, sizeof( avg_buf ), NULL, "%.1f", (double)metrics[ offset + FD_HISTF_BUCKET_CNT ]/(double)total_count ) ); /* fill avg_buf here too so the summary row below never prints an uninitialized buffer */
     436           0 :   }
     437             : 
     438           0 :   printf( " +---------------------+--------------------+---------------+\n" );
     439           0 :   printf( " | Sum: %-14s | Count: %-11lu | Avg: %-8s |\n", sum_buf, total_count, avg_buf );
     440           0 :   printf( " +---------------------+--------------------+---------------+\n" );
     441           0 : }
     442             : 
     443             : static void
     444           0 : print_catchup_slots( fd_wksp_t * repair_tile_wksp, fd_repair_tile_ctx_t * repair_ctx ) {
     445           0 :   fd_catchup_t * catchup = repair_ctx->catchup;
     446           0 :   ulong catchup_gaddr = fd_wksp_gaddr_fast( repair_ctx->wksp, catchup );
     447           0 :   fd_catchup_t * catchup_table = (fd_catchup_t *)fd_wksp_laddr( repair_tile_wksp, catchup_gaddr );
     448           0 :   fd_catchup_print( catchup_table );
     449           0 : }
     450             : 
     451             : static void
     452           0 : sort_peers_by_latency( fd_active_elem_t * active_table, fd_peer_t * peer_arr, ulong peer_cnt ) {
      453           0 :   for( uint i = 0; i + 1 < peer_cnt; i++ ) { /* phrased as i+1<peer_cnt so peer_cnt==0 cannot underflow */
     454           0 :     int swapped = 0;
     455           0 :     for( uint j = 0; j < peer_cnt - 1 - i; j++ ) {
     456           0 :       fd_active_elem_t const * active_j  = fd_active_table_query_const( active_table, &peer_arr[ j ].key,     NULL );
     457           0 :       fd_active_elem_t const * active_j1 = fd_active_table_query_const( active_table, &peer_arr[ j + 1 ].key, NULL );
     458             : 
     459             :       /* Skip peers with no responses */
     460           0 :       double latency_j  = 10e9;
     461           0 :       double latency_j1 = 10e9;
     462           0 :       if( FD_LIKELY( active_j && active_j->resp_cnt > 0   ) ) latency_j  = ((double)active_j->total_latency / (double)active_j->resp_cnt);
     463           0 :       if( FD_LIKELY( active_j1 && active_j1->resp_cnt > 0 ) ) latency_j1 = ((double)active_j1->total_latency / (double)active_j1->resp_cnt);
     464             : 
     465             :       /* Swap if j has higher latency than j+1 */
     466           0 :       if( latency_j > latency_j1 ) {
     467           0 :         fd_peer_t temp    = peer_arr[ j ];
     468           0 :         peer_arr[ j ]     = peer_arr[ j + 1 ];
     469           0 :         peer_arr[ j + 1 ] = temp;
     470           0 :         swapped           = 1;
     471           0 :       }
     472           0 :     }
     473           0 :     if( !swapped ) break;
     474           0 :   }
     475           0 : }
     476             : 
     477             : 
     478             : 
     479             : fd_peer_t peers_copy[ FD_ACTIVE_KEY_MAX ];
     480             : 
     481             : static void
     482           0 : print_peer_location_latency( fd_wksp_t * repair_tile_wksp, fd_repair_tile_ctx_t * repair_ctx, fd_location_info_t * location_table ) {
     483           0 :   ulong              repair_gaddr = fd_wksp_gaddr_fast( repair_ctx->wksp, repair_ctx->repair );
     484           0 :   fd_repair_t *      repair       = fd_wksp_laddr( repair_tile_wksp, repair_gaddr );
     485           0 :   ulong              active_gaddr = fd_wksp_gaddr_fast( repair_ctx->wksp, repair->actives );
     486           0 :   fd_active_elem_t * active_table = (fd_active_elem_t *)fd_wksp_laddr( repair_tile_wksp, active_gaddr );
     487             : 
     488           0 :   ulong peer_cnt = repair->peer_cnt;
     489           0 :   FD_COMPILER_MFENCE();
     490           0 :   memcpy( peers_copy, repair->peers, peer_cnt * sizeof(fd_peer_t) );
      491             :   /* The assumption is that peer_cnt only ever increases, so it is valid to
      492             :      copy the peers array from the repair tile while it is being modified. */
     493             : 
     494           0 :   sort_peers_by_latency( active_table, peers_copy, peer_cnt );
     495           0 :   printf("\nPeer Location/Latency Information\n");
     496           0 :   printf( "| %-46s | %-7s | %-8s | %-8s | %-7s | %12s | %s\n", "Pubkey", "Req Cnt", "Req B/s", "Rx B/s", "Rx Rate", "Avg Latency", "Location Info" );
     497           0 :   for( uint i = 0; i < peer_cnt; i++ ) {
     498           0 :     fd_active_elem_t const * active = fd_active_table_query_const( active_table, &peers_copy[ i ].key, NULL );
     499           0 :     if( FD_LIKELY( active && active->resp_cnt > 0 ) ) {
     500           0 :       fd_location_info_t * info = fd_location_table_query( location_table, active->addr.addr, NULL );
     501           0 :       char * geolocation = info ? info->location : "Unknown";
     502           0 :       double peer_bps    = (double)(active->resp_cnt * FD_SHRED_MIN_SZ) / ((double)(active->last_resp_ts - active->first_resp_ts) / 1e9);
     503           0 :       double req_bps     = (double)active->req_cnt * 202 / ((double)(active->last_req_ts - active->first_req_ts) / 1e9);
     504           0 :       printf( "| %-46s | %-7lu | %-8.2f | %-8.2f | %-7.2f | %10.3fms | %s\n", FD_BASE58_ENC_32_ALLOCA( &active->key ), active->req_cnt, req_bps, peer_bps, (double)active->resp_cnt / (double)active->req_cnt, ((double)active->total_latency / (double)active->resp_cnt) / 1e6, geolocation );
     505           0 :     }
     506           0 :   }
     507           0 :   fflush( stdout );
     508           0 : }
     509             : 
     510             : static void
     511           0 : read_iptable( char * iptable_path, fd_location_info_t * location_table ) {
     512           0 :   int iptable_fd = open( iptable_path, O_RDONLY );
     513           0 :   if( FD_UNLIKELY( iptable_fd<0 ) ) {
     514           0 :     FD_LOG_NOTICE(( "iptable file: %s", iptable_path ));
     515           0 :     return;
     516           0 :   }
     517             : 
     518             :   /* read iptable line by line */
     519           0 :   if( FD_LIKELY( iptable_fd>=0 ) ) {
     520           0 :     char line[ 256 ];
     521           0 :     uchar istream_buf[256];
     522           0 :     fd_io_buffered_istream_t istream[1];
     523           0 :     fd_io_buffered_istream_init( istream, iptable_fd, istream_buf, sizeof(istream_buf) );
     524           0 :     for(;;) {
     525           0 :       int err;
     526           0 :       if( !fd_io_fgets( line, sizeof(line), istream, &err ) ) break;
     527           0 :       fd_location_info_t location_info;
      528           0 :       if( FD_UNLIKELY( sscanf( line, "%lu %127[^\n]", &location_info.ip4_addr, location_info.location )!=2 ) ) continue; /* bound the location read and skip malformed lines */
     529             :       //FD_LOG_NOTICE(( "inserting location info for ip4_addr %lu, location %s", location_info.ip4_addr, location_info.location ));
     530           0 :       fd_location_info_t * info = fd_location_table_insert( location_table, location_info.ip4_addr );
     531           0 :       if( FD_UNLIKELY( info==NULL ) ) break;
     532           0 :       memcpy( info->location, location_info.location, sizeof(info->location) );
     533           0 :     }
     534           0 :   }
     535           0 : }
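
/* The iptable file parsed above is expected to hold one entry per line in the
   form "<ip4 address as an unsigned decimal integer> <free-form location
   text>", matching the sscanf format string.  An assumed example (addresses
   and locations are illustrative only):

     3232235777 Frankfurt, DE
     167772161 Ashburn, VA, US

   The integer key must use the same representation as active->addr.addr,
   which is what print_peer_location_latency uses to query the table. */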
     536             : 
     537             : static void
     538             : repair_cmd_fn_metrics_mode( args_t *   args,
     539           0 :                             config_t * config ) {
     540           0 :   FD_LOG_NOTICE(( "Attempting to join with running firedancer-dev instance..." ));
     541             : 
     542           0 :   fd_topo_t * topo = &config->topo;
     543           0 :   ulong wksp_id = fd_topo_find_wksp( topo, "repair" );
     544           0 :   if( FD_UNLIKELY( wksp_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair workspace not found" ));
     545             : 
     546           0 :   fd_topo_wksp_t * repair_wksp = &topo->workspaces[ wksp_id ];
     547             : 
     548           0 :   ulong tile_id = fd_topo_find_tile( topo, "repair", 0UL );
     549           0 :   if( FD_UNLIKELY( tile_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair tile not found" ));
     550             : 
     551           0 :   fd_topo_join_workspace( topo, repair_wksp, FD_SHMEM_JOIN_MODE_READ_ONLY );
     552             : 
     553             :   /* Access the repair tile scratch memory where repair_tile_ctx is stored */
     554           0 :   fd_topo_tile_t * tile = &topo->tiles[ tile_id ];
     555           0 :   void * scratch = fd_topo_obj_laddr( &config->topo, tile->tile_obj_id );
     556           0 :   if( FD_UNLIKELY( !scratch ) ) FD_LOG_ERR(( "Failed to access repair tile scratch memory" ));
     557             : 
     558           0 :   FD_SCRATCH_ALLOC_INIT( l, scratch );
     559           0 :   fd_repair_tile_ctx_t * repair_ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_repair_tile_ctx_t), sizeof(fd_repair_tile_ctx_t) );
     560             : 
     561             :   /* catchup cmd owned memory */
     562           0 :   fd_location_info_t * location_table = fd_location_table_join( fd_location_table_new( location_table_mem ) );
     563             : 
     564           0 :   read_iptable( args->repair.iptable_path, location_table );
     565           0 :   print_peer_location_latency( repair_wksp->wksp, repair_ctx, location_table );
     566           0 :   print_catchup_slots( repair_wksp->wksp, repair_ctx );
     567           0 :   printf( "\nCatchup tool completed successfully.\n" );
     568           0 : }
     569             : 
     570             : static void
     571             : repair_cmd_fn_profiler_mode( args_t *   args,
     572           0 :                              config_t * config ) {
     573           0 :   FD_LOG_NOTICE(( "Repair profiler topo" ));
     574             : 
     575           0 :   memset( &config->topo, 0, sizeof(config->topo) );
     576           0 :   repair_topo( config );
     577             : 
     578           0 :   for( ulong i=0UL; i<config->topo.tile_cnt; i++ ) {
     579           0 :     fd_topo_tile_t * tile = &config->topo.tiles[ i ];
     580           0 :     if( FD_UNLIKELY( !strcmp( tile->name, "scap" ) ) ) {
     581             :       /* This is not part of the config, and it must be set manually
     582             :          on purpose as a safety mechanism. */
     583           0 :       tile->shredcap.enable_publish_stake_weights = 1;
     584           0 :       strncpy( tile->shredcap.manifest_path, args->repair.manifest_path, PATH_MAX );
     585           0 :     }
     586           0 :   }
     587             : 
     588           0 :   FD_LOG_NOTICE(( "Repair profiler init" ));
     589           0 :   fd_topo_print_log( 1, &config->topo );
     590             : 
     591           0 :   args_t configure_args = {
     592           0 :     .configure.command = CONFIGURE_CMD_INIT,
     593           0 :   };
     594           0 :   for( ulong i=0UL; STAGES[ i ]; i++ ) {
     595           0 :     configure_args.configure.stages[ i ] = STAGES[ i ];
     596           0 :   }
     597           0 :   configure_cmd_fn( &configure_args, config );
     598           0 :   if( 0==strcmp( config->net.provider, "xdp" ) ) {
     599           0 :     fd_xdp_fds_t fds = fd_topo_install_xdp( &config->topo, config->net.bind_address_parsed );
     600           0 :     (void)fds;
     601           0 :   }
     602             : 
     603           0 :   run_firedancer_init( config, 1, 0 );
     604             : 
     605           0 :   fd_log_private_shared_lock[ 1 ] = 0;
     606           0 :   fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_WRITE );
     607             : 
     608           0 :   fd_topo_fill( &config->topo );
     609             : 
     610           0 :   ulong repair_tile_idx = fd_topo_find_tile( &config->topo, "repair", 0UL );
     611           0 :   FD_TEST( repair_tile_idx!=ULONG_MAX );
     612           0 :   fd_topo_tile_t * repair_tile = &config->topo.tiles[ repair_tile_idx ];
     613             : 
     614           0 :   ulong shred_tile_idx = fd_topo_find_tile( &config->topo, "shred", 0UL );
     615           0 :   FD_TEST( shred_tile_idx!=ULONG_MAX );
     616           0 :   fd_topo_tile_t * shred_tile = &config->topo.tiles[ shred_tile_idx ];
     617             : 
     618           0 :   volatile ulong * shred_metrics = fd_metrics_tile( shred_tile->metrics );
     619           0 :   FD_TEST( shred_metrics );
     620             : 
     621           0 :   volatile ulong * repair_metrics = fd_metrics_tile( repair_tile->metrics );
     622           0 :   FD_TEST( repair_metrics );
     623             : 
     624           0 :   FD_LOG_NOTICE(( "Repair profiler run" ));
     625             : 
     626           0 :   ulong shred_repair_link_idx  = fd_topo_find_link( &config->topo, "shred_repair", 0UL );
     627           0 :   ulong repair_replay_link_idx = fd_topo_find_link( &config->topo, "repair_repla", 0UL );
     628           0 :   FD_TEST( shred_repair_link_idx!=ULONG_MAX );
     629           0 :   FD_TEST( repair_replay_link_idx!=ULONG_MAX );
     630           0 :   fd_topo_link_t * shred_repair_link  = &config->topo.links[ shred_repair_link_idx  ];
     631           0 :   fd_topo_link_t * repair_replay_link = &config->topo.links[ repair_replay_link_idx ];
     632           0 :   FD_TEST( shred_repair_link );
     633           0 :   FD_TEST( repair_replay_link );
     634           0 :   fd_frag_meta_t * shred_repair_mcache = shred_repair_link->mcache;
     635             : 
     636             : 
     637           0 :   ulong turbine_slot0    = 0UL;
      638           0 :   ulong last_checked_seq = 0UL; // Last seq number checked on the repair->replay link
     639           0 :   long  last_print       = fd_log_wallclock();
     640           0 :   fd_topo_run_single_process( &config->topo, 0, config->uid, config->gid, fdctl_tile_run );
     641           0 :   for(;;) {
     642             : 
     643           0 :     if( FD_UNLIKELY( !turbine_slot0 ) ) {
     644           0 :       fd_frag_meta_t * frag = &shred_repair_mcache[1]; /* hack to get first frag */
     645           0 :       if ( frag->sz > 0 ) {
     646           0 :         turbine_slot0 = fd_disco_shred_repair_shred_sig_slot( frag->sig );
     647           0 :         FD_LOG_NOTICE(("turbine_slot0: %lu", turbine_slot0));
     648           0 :       }
     649           0 :     }
     650             : 
      651             :     /* Poll repair_replay_link's mcache for the most recent entry (stem-style polling). */
     652             : 
     653           0 :     if( FD_UNLIKELY( !last_checked_seq ) ) {
     654           0 :       last_checked_seq = fd_mcache_seq0( repair_replay_link->mcache );
     655           0 :     }
     656             : 
     657           0 :     fd_frag_meta_t * mcache_line = repair_replay_link->mcache + fd_mcache_line_idx( last_checked_seq, repair_replay_link->depth );
     658           0 :     __m128i seq_sig = fd_frag_meta_seq_sig_query( mcache_line );
     659           0 :     ulong seq_found = fd_frag_meta_sse0_seq( seq_sig );
     660           0 :     ulong sig       = fd_frag_meta_sse0_sig( seq_sig );
     661             : 
     662             :     // Check if we found a new fragment
     663           0 :     long diff = fd_seq_diff( last_checked_seq, seq_found );
     664           0 :     int catchup_finished = 0;
     665           0 :     if( FD_LIKELY( diff == 0L ) ) {
     666           0 :       if( FD_UNLIKELY( turbine_slot0 && (sig >> 32) > turbine_slot0 ) ) {
     667           0 :         catchup_finished = 1;
     668           0 :       }
     669             : 
     670             :       // Advance to the next sequence number
     671           0 :       last_checked_seq = fd_seq_inc( last_checked_seq, 1UL );
     672           0 :     } else if( FD_UNLIKELY( diff < 0L ) ) {
     673             :       // We're behind - producer has moved ahead
     674           0 :       last_checked_seq = seq_found;
     675           0 :     } else {
     676             :       // we're ahead
     677           0 :     }
     678             : 
     679             :     /* print metrics */
     680             : 
     681           0 :     long now = fd_log_wallclock();
     682           0 :     if( FD_UNLIKELY( catchup_finished || now - last_print > 1e9L ) ) {
     683           0 :       char buf2[ 64 ];
     684           0 :       ulong rcvd = shred_metrics [ MIDX( COUNTER, SHRED,  SHRED_REPAIR_RCV ) ];
     685           0 :       ulong sent = repair_metrics[ MIDX( COUNTER, REPAIR, SHRED_REPAIR_REQ ) ];
     686           0 :       printf(" Requests received: (%lu/%lu) %.1f%% \n", rcvd, sent, (double)rcvd / (double)sent * 100.0 );
     687           0 :       printf( " +---------------+--------------+\n" );
     688           0 :       printf( " | Request Type  | Count        |\n" );
     689           0 :       printf( " +---------------+--------------+\n" );
     690           0 :       printf( " | Orphan        | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_ORPHAN         ) ] ) );
     691           0 :       printf( " | HighestWindow | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_HIGHEST_WINDOW ) ] ) );
     692           0 :       printf( " | Index         | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_WINDOW         ) ] ) );
     693           0 :       printf( " +---------------+--------------+\n" );
     694             : 
     695           0 :       print_histogram_buckets( repair_metrics,
     696           0 :                                MIDX( HISTOGRAM, REPAIR, RESPONSE_LATENCY ),
     697           0 :                                FD_METRICS_CONVERTER_NONE,
     698           0 :                                FD_METRICS_HISTOGRAM_REPAIR_RESPONSE_LATENCY_MIN,
     699           0 :                                FD_METRICS_HISTOGRAM_REPAIR_RESPONSE_LATENCY_MAX,
     700           0 :                                "Response Latency" );
     701             : 
     702           0 :       printf(" Repaired slots: %lu/%lu  (slots behind: %lu)\n", repair_metrics[ MIDX( COUNTER, REPAIR, REPAIRED_SLOTS ) ], turbine_slot0, turbine_slot0 - repair_metrics[ MIDX( COUNTER, REPAIR, REPAIRED_SLOTS ) ] );
     703             :       /* Print histogram buckets similar to Prometheus format */
     704           0 :       print_histogram_buckets( repair_metrics,
     705           0 :                                MIDX( HISTOGRAM, REPAIR, SLOT_COMPLETE_TIME ),
     706           0 :                                FD_METRICS_CONVERTER_SECONDS,
     707           0 :                                FD_METRICS_HISTOGRAM_REPAIR_SLOT_COMPLETE_TIME_MIN,
     708           0 :                                FD_METRICS_HISTOGRAM_REPAIR_SLOT_COMPLETE_TIME_MAX,
     709           0 :                                "Slot Complete Time" );
     710             : 
     711           0 :       printf( " Repair Peers: %lu\n", repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_PEERS ) ] );
     712             : 
     713           0 :       printf("\n");
     714           0 :       fflush( stdout );
     715           0 :       last_print = now;
     716           0 :     }
     717           0 :     if( FD_UNLIKELY( catchup_finished ) ) {
     718           0 :       repair_cmd_fn_metrics_mode( args, config );
     719           0 :       FD_LOG_ERR(("catchup finished. slot %lu", turbine_slot0));
     720           0 :     }
     721           0 :   }
     722           0 : }
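
/* A condensed sketch of the consumer-side mcache polling done in the loop
   above, written against a generic fd_topo_link_t.  This is a restatement for
   reference, not additional functionality; consume() is a placeholder for
   whatever the caller does with the fragment:

     ulong seq = fd_mcache_seq0( link->mcache );                             // link's initial sequence number
     for(;;) {
       fd_frag_meta_t * line = link->mcache + fd_mcache_line_idx( seq, link->depth );
       __m128i seq_sig   = fd_frag_meta_seq_sig_query( line );               // atomically snapshot (seq, sig)
       ulong   seq_found = fd_frag_meta_sse0_seq( seq_sig );
       ulong   sig       = fd_frag_meta_sse0_sig( seq_sig );
       long    diff      = fd_seq_diff( seq, seq_found );
       if(      diff==0L ) { consume( sig ); seq = fd_seq_inc( seq, 1UL ); } // frag `seq` is ready
       else if( diff< 0L ) { seq = seq_found; }                              // overrun by the producer: jump forward
       else                { }                                               // ahead of the producer: nothing new yet
     }
*/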
     723             : 
     724             : static void
     725             : repair_cmd_fn( args_t *   args,
     726           0 :                config_t * config ) {
     727           0 :   if( args->repair.metrics_only ) {
     728           0 :     repair_cmd_fn_metrics_mode( args, config );
     729           0 :   } else {
     730           0 :     repair_cmd_fn_profiler_mode( args, config );
     731           0 :   }
     732           0 : }
     733             : 
     734             : action_t fd_action_repair = {
     735             :   .name = "repair",
     736             :   .args = repair_cmd_args,
     737             :   .fn   = repair_cmd_fn,
     738             :   .perm = dev_cmd_perm,
     739             : };

Generated by: LCOV version 1.14