Line data Source code
1 : /* The repair command spawns a smaller topology for profiling the repair
2 : tile. This is a standalone application, and it can be run in mainnet,
3 : testnet and/or a private cluster. */
4 :
5 : #include "../../../disco/net/fd_net_tile.h"
6 : #include "../../../disco/tiles.h"
7 : #include "../../../disco/topo/fd_topob.h"
8 : #include "../../../disco/topo/fd_cpu_topo.h"
9 : #include "../../../util/pod/fd_pod_format.h"
10 : #include "../../../util/tile/fd_tile_private.h"
11 :
12 : #include "../../firedancer/topology.h"
13 : #include "../../firedancer/topology.c"
14 : #include "../../shared/commands/configure/configure.h"
15 : #include "../../shared/commands/run/run.h" /* initialize_workspaces */
16 : #include "../../shared/fd_config.h" /* config_t */
17 : #include "../../shared_dev/commands/dev.h"
18 : #include "../../../disco/tiles.h"
19 : #include "../../../disco/topo/fd_topob.h"
20 : #include "../../../util/pod/fd_pod_format.h"
21 : #include "../../../waltz/resolv/fd_io_readline.h"
22 : #include "../../shared/commands/monitor/helper.h"
23 : #include "../../../discof/repair/fd_repair_tile.c"
24 :
25 : #include "gossip.h"
26 : #include "core_subtopo.h"
27 :
28 : #include <unistd.h> /* pause */
29 : #include <fcntl.h>
30 : #include <stdio.h>
31 : #include <termios.h>
32 : #include <errno.h>
33 :
/* fd_location_info pairs an IPv4 address (widened to ulong so it can be
   used directly as a map key) with a human-readable location string, as
   loaded from the optional --iptable-path file. */
struct fd_location_info {
  ulong ip4_addr; /* for map key convenience */
  char location[ 128 ]; /* NUL-terminated free-form location description */
};
typedef struct fd_location_info fd_location_info_t;
39 :
/* Instantiate a fixed-capacity hash map (fd_location_table_*) of
   fd_location_info_t keyed by ip4_addr with 2^16 slots; the generated
   API (new/join/insert/query) comes from util/tmpl/fd_map.c. */
#define MAP_NAME fd_location_table
#define MAP_T fd_location_info_t
#define MAP_KEY ip4_addr
#define MAP_LG_SLOT_CNT 16
#define MAP_MEMOIZE 0
#include "../../../util/tmpl/fd_map.c"
46 :
/* Statically allocated backing store for the location table: one
   fd_location_info_t per slot, 2^16 slots to match MAP_LG_SLOT_CNT.
   NOTE(review): assumes the map's footprint is exactly
   slot_cnt*sizeof(ele) at element alignment — confirm against the
   footprint macro generated by fd_map.c. */
uchar __attribute__((aligned(alignof(fd_location_info_t)))) location_table_mem[ sizeof(fd_location_info_t) * (1 << 16 ) ];
48 :
static struct termios termios_backup;

/* Best-effort atexit hook: put the terminal back the way we found it
   (termios_backup is captured before raw mode is enabled).  Failure at
   process exit is not actionable, so the result is deliberately
   ignored. */
static void
restore_terminal( void ) {
  int rc = tcsetattr( STDIN_FILENO, TCSANOW, &termios_backup );
  (void)rc;
}
55 :
56 : fd_topo_run_tile_t
57 : fdctl_tile_run( fd_topo_tile_t const * tile );
58 :
59 : /* repair_topo is a subset of "src/app/firedancer/topology.c" at commit
60 : 0d8386f4f305bb15329813cfe4a40c3594249e96, slightly modified to work
61 : as a repair profiler. TODO ideally, one should invoke the firedancer
62 : topology first, and exclude the parts that are not needed, instead of
63 : manually generating new topologies for every command. This would
64 : also guarantee that the profiler is replicating (as close as possible)
65 : the full topology. */
/* repair_topo builds the reduced repair-profiler topology into
   config->topo: core + gossip subtopologies, net->shred/repair/quic
   links, shred tiles, the repair tile, sign tiles for shred/repair
   signing, and a shred-capture ("scap") tile that feeds stake and
   snapshot inputs.  See the note above: this mirrors a subset of
   src/app/firedancer/topology.c and must be kept in sync manually. */
static void
repair_topo( config_t * config ) {
  resolve_gossip_entrypoints( config );

  ulong net_tile_cnt = config->layout.net_tile_count;
  ulong shred_tile_cnt = config->layout.shred_tile_count;
  ulong quic_tile_cnt = config->layout.quic_tile_count;
  ulong sign_tile_cnt = config->firedancer.layout.sign_tile_count;
  ulong gossvf_tile_cnt = config->firedancer.layout.gossvf_tile_count;

  fd_topo_t * topo = { fd_topob_new( &config->topo, config->name ) };
  topo->max_page_size = fd_cstr_to_shmem_page_sz( config->hugetlbfs.max_page_size );
  topo->gigantic_page_threshold = config->hugetlbfs.gigantic_page_threshold_mib << 20;

  ulong tile_to_cpu[ FD_TILE_MAX ] = {0};
  ushort parsed_tile_to_cpu[ FD_TILE_MAX ];
  /* Unassigned tiles will be floating, unless auto topology is enabled. */
  for( ulong i=0UL; i<FD_TILE_MAX; i++ ) parsed_tile_to_cpu[ i ] = USHORT_MAX;

  int is_auto_affinity = !strcmp( config->layout.affinity, "auto" );
  int is_bench_auto_affinity = !strcmp( config->development.bench.affinity, "auto" );

  if( FD_UNLIKELY( is_auto_affinity != is_bench_auto_affinity ) ) {
    FD_LOG_ERR(( "The CPU affinity string in the configuration file under [layout.affinity] and [development.bench.affinity] must all be set to 'auto' or all be set to a specific CPU affinity string." ));
  }

  fd_topo_cpus_t cpus[1];
  fd_topo_cpus_init( cpus );

  ulong affinity_tile_cnt = 0UL;
  if( FD_LIKELY( !is_auto_affinity ) ) affinity_tile_cnt = fd_tile_private_cpus_parse( config->layout.affinity, parsed_tile_to_cpu );

  for( ulong i=0UL; i<affinity_tile_cnt; i++ ) {
    if( FD_UNLIKELY( parsed_tile_to_cpu[ i ]!=USHORT_MAX && parsed_tile_to_cpu[ i ]>=cpus->cpu_cnt ) )
      FD_LOG_ERR(( "The CPU affinity string in the configuration file under [layout.affinity] specifies a CPU index of %hu, but the system "
                   "only has %lu CPUs. You should either change the CPU allocations in the affinity string, or increase the number of CPUs "
                   "in the system.",
                   parsed_tile_to_cpu[ i ], cpus->cpu_cnt ));
    tile_to_cpu[ i ] = fd_ulong_if( parsed_tile_to_cpu[ i ]==USHORT_MAX, ULONG_MAX, (ulong)parsed_tile_to_cpu[ i ] );
  }

  fd_core_subtopo( config, tile_to_cpu );
  fd_gossip_subtopo( config, tile_to_cpu );

  /* topo, name */
  fd_topob_wksp( topo, "net_shred" );
  fd_topob_wksp( topo, "net_repair" );
  fd_topob_wksp( topo, "net_quic" );

  fd_topob_wksp( topo, "shred_out" );
  fd_topob_wksp( topo, "replay_stake" );

  fd_topob_wksp( topo, "poh_shred" );

  fd_topob_wksp( topo, "shred_sign" );
  fd_topob_wksp( topo, "sign_shred" );

  fd_topob_wksp( topo, "repair_sign" );
  fd_topob_wksp( topo, "sign_repair" );

  fd_topob_wksp( topo, "send_txns" );

  fd_topob_wksp( topo, "shred" );
  fd_topob_wksp( topo, "repair" );
  fd_topob_wksp( topo, "fec_sets" );
  fd_topob_wksp( topo, "snap_out" );

  fd_topob_wksp( topo, "slot_fseqs" ); /* fseqs for marked slots eg. turbine slot */

/* FOR uses an implicit loop variable i, visible inside the statement it
   prefixes. */
#define FOR(cnt) for( ulong i=0UL; i<cnt; i++ )

  ulong pending_fec_shreds_depth = fd_ulong_min( fd_ulong_pow2_up( config->tiles.shred.max_pending_shred_sets * FD_REEDSOL_DATA_SHREDS_MAX ), USHORT_MAX + 1 /* dcache max */ );

  /* topo, link_name, wksp_name, depth, mtu, burst */
  FOR(quic_tile_cnt) fd_topob_link( topo, "quic_net", "net_quic", config->net.ingress_buffer_size, FD_NET_MTU, 1UL );
  FOR(shred_tile_cnt) fd_topob_link( topo, "shred_net", "net_shred", config->net.ingress_buffer_size, FD_NET_MTU, 1UL );

  /**/ fd_topob_link( topo, "replay_stake", "replay_stake", 128UL, 40UL + 40200UL * 40UL, 1UL );

  FOR(shred_tile_cnt) fd_topob_link( topo, "shred_sign", "shred_sign", 128UL, 32UL, 1UL );
  FOR(shred_tile_cnt) fd_topob_link( topo, "sign_shred", "sign_shred", 128UL, 64UL, 1UL );

  /**/ fd_topob_link( topo, "repair_net", "net_repair", config->net.ingress_buffer_size, FD_NET_MTU, 1UL );

  FOR(shred_tile_cnt) fd_topob_link( topo, "shred_out", "shred_out", pending_fec_shreds_depth, FD_SHRED_OUT_MTU, 2UL /* at most 2 msgs per after_frag */ );

  FOR(shred_tile_cnt) fd_topob_link( topo, "repair_shred", "shred_out", pending_fec_shreds_depth, sizeof(fd_ed25519_sig_t), 1UL );

  /* NOTE(review): sign_tile_cnt-1 underflows if sign_tile_cnt==0 —
     assumes config guarantees at least one sign tile; confirm. */
  FOR(sign_tile_cnt-1) fd_topob_link( topo, "repair_sign", "repair_sign", 128UL, FD_REPAIR_MAX_PREIMAGE_SZ, 1UL );
  FOR(sign_tile_cnt-1) fd_topob_link( topo, "sign_repair", "sign_repair", 128UL, sizeof(fd_ed25519_sig_t), 1UL );

  /**/ fd_topob_link( topo, "poh_shred", "poh_shred", 16384UL, USHORT_MAX, 1UL );

  /**/ fd_topob_link( topo, "send_txns", "send_txns", 128UL, FD_TXN_MTU, 1UL );

  /**/ fd_topob_link( topo, "snap_out", "snap_out", 2UL, sizeof(fd_snapshot_manifest_t), 1UL );

  FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_repair", i, config->net.ingress_buffer_size );
  FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_quic", i, config->net.ingress_buffer_size );
  FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_shred", i, config->net.ingress_buffer_size );

  /* topo, tile_name, tile_wksp, metrics_wksp, cpu_idx, is_agave, uses_keyswitch */
  FOR(shred_tile_cnt) fd_topob_tile( topo, "shred", "shred", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 1 );
  fd_topo_tile_t * repair_tile = fd_topob_tile( topo, "repair", "repair", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 );

  /* Setup a shared wksp object for fec sets. */

  ulong shred_depth = 65536UL; /* from fdctl/topology.c shred_store link. MAKE SURE TO KEEP IN SYNC. */
  ulong fec_set_cnt = shred_depth + config->tiles.shred.max_pending_shred_sets + 4UL;
  ulong fec_sets_sz = fec_set_cnt*sizeof(fd_shred34_t)*4; /* mirrors # of dcache entires in frankendancer */
  fd_topo_obj_t * fec_sets_obj = setup_topo_fec_sets( topo, "fec_sets", shred_tile_cnt*fec_sets_sz );
  for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
    fd_topo_tile_t * shred_tile = &topo->tiles[ fd_topo_find_tile( topo, "shred", i ) ];
    fd_topob_tile_uses( topo, shred_tile, fec_sets_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
  }
  fd_topob_tile_uses( topo, repair_tile, fec_sets_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
  FD_TEST( fd_pod_insertf_ulong( topo->props, fec_sets_obj->id, "fec_sets" ) );

  /* There's another special fseq that's used to communicate the shred
     version from the Agave boot path to the shred tile. */
  fd_topo_obj_t * poh_shred_obj = fd_topob_obj( topo, "fseq", "poh_shred" );
  fd_topo_tile_t * poh_tile = &topo->tiles[ fd_topo_find_tile( topo, "gossip", 0UL ) ];
  fd_topob_tile_uses( topo, poh_tile, poh_shred_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );

  /* root_slot is an fseq marking the validator's current Tower root. */

  fd_topo_obj_t * root_slot_obj = fd_topob_obj( topo, "fseq", "slot_fseqs" );
  FD_TEST( fd_pod_insertf_ulong( topo->props, root_slot_obj->id, "root_slot" ) );

  for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
    fd_topo_tile_t * shred_tile = &topo->tiles[ fd_topo_find_tile( topo, "shred", i ) ];
    fd_topob_tile_uses( topo, shred_tile, poh_shred_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
  }
  FD_TEST( fd_pod_insertf_ulong( topo->props, poh_shred_obj->id, "poh_shred" ) );

  /* Validate the explicit affinity string against the number of tiles
     actually instantiated above. */
  if( FD_LIKELY( !is_auto_affinity ) ) {
    if( FD_UNLIKELY( affinity_tile_cnt<topo->tile_cnt ) )
      FD_LOG_ERR(( "The topology you are using has %lu tiles, but the CPU affinity specified in the config tile as [layout.affinity] only provides for %lu cores. "
                   "You should either increase the number of cores dedicated to Firedancer in the affinity string, or decrease the number of cores needed by reducing "
                   "the total tile count. You can reduce the tile count by decreasing individual tile counts in the [layout] section of the configuration file.",
                   topo->tile_cnt, affinity_tile_cnt ));
    if( FD_UNLIKELY( affinity_tile_cnt>topo->tile_cnt ) )
      FD_LOG_WARNING(( "The topology you are using has %lu tiles, but the CPU affinity specified in the config tile as [layout.affinity] provides for %lu cores. "
                       "Not all cores in the affinity will be used by Firedancer. You may wish to increase the number of tiles in the system by increasing "
                       "individual tile counts in the [layout] section of the configuration file.",
                       topo->tile_cnt, affinity_tile_cnt ));
  }

  /* topo, tile_name, tile_kind_id, fseq_wksp, link_name, link_kind_id, reliable, polled */
  for( ulong j=0UL; j<shred_tile_cnt; j++ )
    fd_topos_tile_in_net( topo, "metric_in", "shred_net", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
  for( ulong j=0UL; j<quic_tile_cnt; j++ )
    {fd_topos_tile_in_net( topo, "metric_in", "quic_net", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );} /* No reliable consumers of networking fragments, may be dropped or overrun */

  /**/ fd_topob_tile_in( topo, "gossip", 0UL, "metric_in", "send_txns", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );

  /**/ fd_topos_tile_in_net( topo, "metric_in", "repair_net", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */

  FOR(shred_tile_cnt) for( ulong j=0UL; j<net_tile_cnt; j++ )
    fd_topob_tile_in( topo, "shred", i, "metric_in", "net_shred", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
  FOR(shred_tile_cnt) fd_topob_tile_in( topo, "shred", i, "metric_in", "poh_shred", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  FOR(shred_tile_cnt) fd_topob_tile_in( topo, "shred", i, "metric_in", "replay_stake", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  FOR(shred_tile_cnt) fd_topob_tile_in( topo, "shred", i, "metric_in", "gossip_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  FOR(shred_tile_cnt) fd_topob_tile_out( topo, "shred", i, "shred_out", i );
  FOR(shred_tile_cnt) fd_topob_tile_out( topo, "shred", i, "shred_net", i );
  FOR(shred_tile_cnt) fd_topob_tile_in ( topo, "shred", i, "metric_in", "ipecho_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );

  FOR(shred_tile_cnt) fd_topob_tile_in( topo, "shred", i, "metric_in", "repair_shred", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );

  /**/ fd_topob_tile_out( topo, "repair", 0UL, "repair_net", 0UL );

  /* Sign links don't need to be reliable because they are synchronous,
     so there's at most one fragment in flight at a time anyway. The
     sign links are also not polled by the mux, instead the tiles will
     read the sign responses out of band in a dedicated spin loop. */
  for( ulong i=0UL; i<shred_tile_cnt; i++ ) {
    /**/ fd_topob_tile_in( topo, "sign", 0UL, "metric_in", "shred_sign", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
    /**/ fd_topob_tile_out( topo, "shred", i, "shred_sign", i );
    /**/ fd_topob_tile_in( topo, "shred", i, "metric_in", "sign_shred", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_UNPOLLED );
    /**/ fd_topob_tile_out( topo, "sign", 0UL, "sign_shred", i );
  }
  FOR(gossvf_tile_cnt) fd_topob_tile_in ( topo, "gossvf", i, "metric_in", "replay_stake", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );

  /**/ fd_topob_tile_in ( topo, "gossip", 0UL, "metric_in", "replay_stake", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );

  FOR(net_tile_cnt) fd_topob_tile_in( topo, "repair", 0UL, "metric_in", "net_repair", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */
  /**/ fd_topob_tile_in( topo, "repair", 0UL, "metric_in", "gossip_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  /**/ fd_topob_tile_in( topo, "repair", 0UL, "metric_in", "replay_stake", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
  fd_topob_tile_in( topo, "repair", 0UL, "metric_in", "snap_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  FOR(shred_tile_cnt) fd_topob_tile_in( topo, "repair", 0UL, "metric_in", "shred_out", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  FOR(shred_tile_cnt) fd_topob_tile_out( topo, "repair", 0UL, "repair_shred", i );
  /* Repair fans its sign requests out over sign tiles 1..sign_tile_cnt-1
     (sign tile 0 is dedicated to shred/gossip signing above). */
  FOR(sign_tile_cnt-1) fd_topob_tile_out( topo, "repair", 0UL, "repair_sign", i );
  FOR(sign_tile_cnt-1) fd_topob_tile_in ( topo, "sign", i+1, "metric_in", "repair_sign", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
  FOR(sign_tile_cnt-1) fd_topob_tile_out( topo, "sign", i+1, "sign_repair", i );
  FOR(sign_tile_cnt-1) fd_topob_tile_in ( topo, "repair", 0UL, "metric_in", "sign_repair", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );

  /**/ fd_topob_tile_in ( topo, "gossip", 0UL, "metric_in", "sign_gossip", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_UNPOLLED );

  /* The scap (shred capture) tile is always enabled in this profiler
     topology; it produces the replay_stake and snap_out inputs that a
     full validator would get from replay/snapshot tiles. */
  if( 1 ) {
    fd_topob_wksp( topo, "scap" );

    fd_topo_tile_t * scap_tile = fd_topob_tile( topo, "scap", "scap", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 );

    fd_topob_tile_in( topo, "scap", 0UL, "metric_in", "repair_net", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
    for( ulong j=0UL; j<net_tile_cnt; j++ ) {
      fd_topob_tile_in( topo, "scap", 0UL, "metric_in", "net_shred", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
    }
    for( ulong j=0UL; j<shred_tile_cnt; j++ ) {
      fd_topob_tile_in( topo, "scap", 0UL, "metric_in", "shred_out", j, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED );
    }
    fd_topob_tile_in( topo, "scap", 0UL, "metric_in", "gossip_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );

    fd_topob_tile_uses( topo, scap_tile, root_slot_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
    fd_topob_tile_out( topo, "scap", 0UL, "replay_stake", 0UL );
    fd_topob_tile_out( topo, "scap", 0UL, "snap_out", 0UL );
  }

  FD_TEST( fd_link_permit_no_producers( topo, "quic_net" ) == quic_tile_cnt );
  FD_TEST( fd_link_permit_no_producers( topo, "poh_shred" ) == 1UL );
  FD_TEST( fd_link_permit_no_producers( topo, "send_txns" ) == 1UL );

  FD_TEST( fd_link_permit_no_consumers( topo, "net_quic" ) == quic_tile_cnt );

  config->tiles.send.send_src_port = 0; /* disable send */

  FOR(net_tile_cnt) fd_topos_net_tile_finish( topo, i );

  for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
    fd_topo_tile_t * tile = &topo->tiles[ i ];
    fd_topo_configure_tile( tile, config );
  }

  if( FD_UNLIKELY( is_auto_affinity ) ) fd_topob_auto_layout( topo, 0 );

  fd_topob_finish( topo, CALLBACKS );

  config->topo = *topo;
}
304 :
305 : extern int * fd_log_private_shared_lock;
306 :
307 : void
308 : repair_cmd_args( int * pargc,
309 : char *** pargv,
310 0 : args_t * args ) {
311 :
312 0 : if( FD_UNLIKELY( !*pargc ) )
313 0 : FD_LOG_ERR(( "\n \
314 0 : usage: (1) repair --manifest-path <manifest_path> [--iptable-path <iptable_path>] \n \
315 0 : (2) repair --metrics [--iptable-path <iptable_path>] \n\n \
316 0 : (3) repair --tree \n\n \
317 0 : Passing --manifest-path starts up profiler mode, which runs a reduced topology that tests catchup and repair performance. \n \
318 0 : Passing --metrics prints recent slot completion times and response latencies during a live run. These modes are exclusive. \n \
319 0 : Passing --tree prints the tree of the repair process. \n \
320 0 : --iptable-path: optional path to iptable file to map IP addresses to locations." ));
321 :
322 0 : char const * manifest_path = fd_env_strip_cmdline_cstr( pargc, pargv, "--manifest-path", NULL, NULL );
323 0 : if( fd_env_strip_cmdline_contains( pargc, pargv, "--metrics" ) ) {
324 0 : args->repair.metrics_only = 1;
325 0 : if( FD_UNLIKELY( manifest_path ) ) FD_LOG_ERR(( "metrics mode does not support --manifest-path" ));
326 0 : } else if( fd_env_strip_cmdline_contains( pargc, pargv, "--tree" ) ) {
327 0 : args->repair.forest_only = 1;
328 0 : if( FD_UNLIKELY( manifest_path ) ) FD_LOG_ERR(( "tree mode does not support --manifest-path" ));
329 0 : } else {
330 0 : fd_cstr_fini( fd_cstr_append_cstr_safe( fd_cstr_init( args->repair.manifest_path ), manifest_path, sizeof(args->repair.manifest_path)-1UL ) );
331 0 : }
332 :
333 0 : char const * iptable_path = fd_env_strip_cmdline_cstr( pargc, pargv, "--iptable-path", NULL, NULL );
334 0 : if( FD_LIKELY( iptable_path ) ) {
335 0 : fd_cstr_fini( fd_cstr_append_cstr_safe( fd_cstr_init( args->repair.iptable_path ), iptable_path, sizeof(args->repair.iptable_path)-1UL ) );
336 0 : }
337 0 : }
338 :
339 : static char *
340 0 : fmt_count( char buf[ static 64 ], ulong count ) {
341 0 : char tmp[ 64 ];
342 0 : if( FD_LIKELY( count<1000UL ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%lu", count ) );
343 0 : else if( FD_LIKELY( count<1000000UL ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%.1f K", (double)count/1000.0 ) );
344 0 : else if( FD_LIKELY( count<1000000000UL ) ) FD_TEST( fd_cstr_printf_check( tmp, 64UL, NULL, "%.1f M", (double)count/1000000.0 ) );
345 :
346 0 : FD_TEST( fd_cstr_printf_check( buf, 64UL, NULL, "%12s", tmp ) );
347 0 : return buf;
348 0 : }
349 :
350 : static void
351 : print_histogram_buckets( volatile ulong * metrics,
352 : ulong offset,
353 : int converter,
354 : double histmin,
355 : double histmax,
356 0 : char * title ) {
357 0 : fd_histf_t hist[1];
358 :
359 : /* Create histogram structure only to get bucket edges for display */
360 0 : if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
361 : /* For SLOT_COMPLETE_TIME: min=0.2, max=2.0 seconds */
362 0 : FD_TEST( fd_histf_new( hist, fd_metrics_convert_seconds_to_ticks( histmin ), fd_metrics_convert_seconds_to_ticks( histmax ) ) );
363 0 : } else if( FD_LIKELY( converter == FD_METRICS_CONVERTER_NONE ) ) {
364 : /* For non-time histograms, we'd need the actual min/max values */
365 0 : FD_TEST( fd_histf_new( hist, (ulong)histmin, (ulong)histmax ) );
366 0 : } else {
367 0 : FD_LOG_ERR(( "unknown converter %i", converter ));
368 0 : }
369 :
370 0 : printf( " +---------------------+--------------------+--------------+\n" );
371 0 : printf( " | %-19s | | Count |\n", title );
372 0 : printf( " +---------------------+--------------------+--------------+\n" );
373 :
374 0 : ulong total_count = 0;
375 0 : for( ulong k = 0; k < FD_HISTF_BUCKET_CNT; k++ ) {
376 0 : ulong bucket_count = metrics[ offset + k ];
377 0 : total_count += bucket_count;
378 0 : }
379 :
380 0 : for( ulong k = 0; k < FD_HISTF_BUCKET_CNT; k++ ) {
381 : /* Get individual bucket count directly from metrics array */
382 0 : ulong bucket_count = metrics[ offset + k ];
383 :
384 0 : char * le_str;
385 0 : char le_buf[ 64 ];
386 0 : if( FD_UNLIKELY( k == FD_HISTF_BUCKET_CNT - 1UL ) ) {
387 0 : le_str = "+Inf";
388 0 : } else {
389 0 : ulong edge = fd_histf_right( hist, k );
390 0 : if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
391 0 : double edgef = fd_metrics_convert_ticks_to_seconds( edge - 1 );
392 0 : FD_TEST( fd_cstr_printf_check( le_buf, sizeof( le_buf ), NULL, "%.3f", edgef ) );
393 0 : } else {
394 0 : FD_TEST( fd_cstr_printf_check( le_buf, sizeof( le_buf ), NULL, "%.3f", (double)(edge - 1) / 1000000.0 ) );
395 0 : }
396 0 : le_str = le_buf;
397 0 : }
398 :
399 0 : char count_buf[ 64 ];
400 0 : fmt_count( count_buf, bucket_count );
401 :
402 : /* Create visual bar - scale to max 20 characters */
403 0 : char bar_buf[ 22 ];
404 0 : if( bucket_count > 0 && total_count > 0 ) {
405 0 : ulong bar_length = (bucket_count * 22UL) / total_count;
406 0 : if( bar_length == 0 ) bar_length = 1;
407 0 : for( ulong i = 0; i < bar_length; i++ ) { bar_buf[ i ] = '|'; }
408 0 : bar_buf[ bar_length ] = '\0';
409 0 : } else {
410 0 : bar_buf[ 0 ] = '\0';
411 0 : }
412 :
413 0 : printf( " | %-19s | %-18s | %s |\n", le_str, bar_buf, count_buf );
414 0 : }
415 :
416 : /* Print sum and total count */
417 0 : char sum_buf[ 64 ];
418 0 : char avg_buf[ 64 ];
419 0 : if( FD_LIKELY( converter == FD_METRICS_CONVERTER_SECONDS ) ) {
420 0 : double sumf = fd_metrics_convert_ticks_to_seconds( metrics[ offset + FD_HISTF_BUCKET_CNT ] );
421 0 : FD_TEST( fd_cstr_printf_check( sum_buf, sizeof( sum_buf ), NULL, "%.6f", sumf ) );
422 0 : double avg = sumf / (double)total_count;
423 0 : FD_TEST( fd_cstr_printf_check( avg_buf, sizeof( avg_buf ), NULL, "%.6f", avg ) );
424 0 : } else {
425 0 : FD_TEST( fd_cstr_printf_check( sum_buf, sizeof( sum_buf ), NULL, "%lu", metrics[ offset + FD_HISTF_BUCKET_CNT ] ));
426 0 : }
427 :
428 0 : printf( " +---------------------+--------------------+---------------+\n" );
429 0 : printf( " | Sum: %-14s | Count: %-11lu | Avg: %-8s |\n", sum_buf, total_count, avg_buf );
430 0 : printf( " +---------------------+--------------------+---------------+\n" );
431 0 : }
432 :
433 : static void
434 0 : print_catchup_slots( fd_wksp_t * repair_tile_wksp, ctx_t * repair_ctx, int verbose ) {
435 0 : fd_repair_metrics_t * catchup = repair_ctx->slot_metrics;
436 0 : ulong catchup_gaddr = fd_wksp_gaddr_fast( repair_ctx->wksp, catchup );
437 0 : fd_repair_metrics_t * catchup_table = (fd_repair_metrics_t *)fd_wksp_laddr( repair_tile_wksp, catchup_gaddr );
438 0 : fd_repair_metrics_print( catchup_table, verbose );
439 0 : }
440 :
/* Scratch shared by the metrics/profiler printers: the joined
   geolocation table (set up in repair_cmd_fn_metrics_mode) and a
   snapshot buffer of peer identities filled/sorted by
   sort_peers_by_latency. */
static fd_location_info_t * location_table;
static fd_pubkey_t peers_copy[ FD_ACTIVE_KEY_MAX ];
443 :
444 : static ulong
445 0 : sort_peers_by_latency( fd_policy_peer_t * active_table, fd_peer_dlist_t * peers_dlist, fd_peer_t * peers_arr ) {
446 0 : ulong i = 0;
447 0 : fd_peer_dlist_iter_t iter = fd_peer_dlist_iter_fwd_init( peers_dlist, peers_arr );
448 0 : while( !fd_peer_dlist_iter_done( iter, peers_dlist, peers_arr ) ) {
449 0 : fd_peer_t * peer = fd_peer_dlist_iter_ele( iter, peers_dlist, peers_arr );
450 0 : if( FD_UNLIKELY( !peer ) ) break;
451 0 : peers_copy[ i++ ] = peer->identity;
452 0 : if( FD_UNLIKELY( i >= FD_ACTIVE_KEY_MAX ) ) break;
453 0 : iter = fd_peer_dlist_iter_fwd_next( iter, peers_dlist, peers_arr );
454 0 : }
455 :
456 0 : ulong peer_cnt = i;
457 0 : for( uint i = 0; i < peer_cnt - 1; i++ ) {
458 0 : int swapped = 0;
459 0 : for( uint j = 0; j < peer_cnt - 1 - i; j++ ) {
460 0 : fd_policy_peer_t const * active_j = fd_policy_peer_map_query( active_table, peers_copy[ j ], NULL );
461 0 : fd_policy_peer_t const * active_j1 = fd_policy_peer_map_query( active_table, peers_copy[ j + 1 ], NULL );
462 :
463 : /* Skip peers with no responses */
464 0 : double latency_j = 10e9;
465 0 : double latency_j1 = 10e9;
466 0 : if( FD_LIKELY( active_j && active_j->res_cnt > 0 ) ) latency_j = ((double)active_j->total_lat / (double)active_j->res_cnt);
467 0 : if( FD_LIKELY( active_j1 && active_j1->res_cnt > 0 ) ) latency_j1 = ((double)active_j1->total_lat / (double)active_j1->res_cnt);
468 :
469 : /* Swap if j has higher latency than j+1 */
470 0 : if( latency_j > latency_j1 ) {
471 0 : fd_pubkey_t temp = peers_copy[ j ];
472 0 : peers_copy[ j ] = peers_copy[ j + 1 ];
473 0 : peers_copy[ j + 1 ] = temp;
474 0 : swapped = 1;
475 0 : }
476 0 : }
477 0 : if( !swapped ) break;
478 0 : }
479 0 : return peer_cnt;
480 0 : }
481 :
482 : static void
483 0 : print_peer_location_latency( fd_wksp_t * repair_tile_wksp, ctx_t * tile_ctx ) {
484 0 : ulong policy_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, tile_ctx->policy );
485 0 : fd_policy_t * policy = fd_wksp_laddr ( repair_tile_wksp, policy_gaddr );
486 0 : ulong peermap_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.map );
487 0 : ulong peerarr_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.pool );
488 0 : ulong peerlst_gaddr = fd_wksp_gaddr_fast( tile_ctx->wksp, policy->peers.dlist );
489 0 : fd_policy_peer_t * peers_map = (fd_policy_peer_t *)fd_wksp_laddr( repair_tile_wksp, peermap_gaddr );
490 0 : fd_peer_dlist_t * peers_dlist = (fd_peer_dlist_t *)fd_wksp_laddr( repair_tile_wksp, peerlst_gaddr );
491 0 : fd_peer_t * peers_arr = (fd_peer_t *)fd_wksp_laddr( repair_tile_wksp, peerarr_gaddr );
492 :
493 0 : ulong peer_cnt = sort_peers_by_latency( peers_map, peers_dlist, peers_arr );
494 0 : printf("\nPeer Location/Latency Information\n");
495 0 : printf( "| %-46s | %-7s | %-8s | %-8s | %-7s | %12s | %s\n", "Pubkey", "Req Cnt", "Req B/s", "Rx B/s", "Rx Rate", "Avg Latency", "Location Info" );
496 0 : for( uint i = 0; i < peer_cnt; i++ ) {
497 0 : fd_policy_peer_t const * active = fd_policy_peer_map_query( peers_map, peers_copy[ i ], NULL );
498 0 : if( FD_LIKELY( active && active->res_cnt > 0 ) ) {
499 0 : fd_location_info_t * info = fd_location_table_query( location_table, active->ip4, NULL );
500 0 : char * geolocation = info ? info->location : "Unknown";
501 0 : double peer_bps = (double)(active->res_cnt * FD_SHRED_MIN_SZ) / ((double)(active->last_resp_ts - active->first_resp_ts) / 1e9);
502 0 : double req_bps = (double)active->req_cnt * 202 / ((double)(active->last_req_ts - active->first_req_ts) / 1e9);
503 0 : printf( "| %-46s | %-7lu | %-8.2f | %-8.2f | %-7.2f | %10.3fms | %s\n", FD_BASE58_ENC_32_ALLOCA( &active->key ), active->req_cnt, req_bps, peer_bps, (double)active->res_cnt / (double)active->req_cnt, ((double)active->total_lat / (double)active->res_cnt) / 1e6, geolocation );
504 0 : }
505 0 : }
506 0 : fflush( stdout );
507 0 : }
508 :
509 : static void
510 0 : read_iptable( char * iptable_path, fd_location_info_t * location_table ) {
511 0 : int iptable_fd = open( iptable_path, O_RDONLY );
512 0 : if( FD_UNLIKELY( iptable_fd<0 ) ) {
513 0 : FD_LOG_NOTICE(( "iptable file: %s", iptable_path ));
514 0 : return;
515 0 : }
516 :
517 : /* read iptable line by line */
518 0 : if( FD_LIKELY( iptable_fd>=0 ) ) {
519 0 : char line[ 256 ];
520 0 : uchar istream_buf[256];
521 0 : fd_io_buffered_istream_t istream[1];
522 0 : fd_io_buffered_istream_init( istream, iptable_fd, istream_buf, sizeof(istream_buf) );
523 0 : for(;;) {
524 0 : int err;
525 0 : if( !fd_io_fgets( line, sizeof(line), istream, &err ) ) break;
526 0 : fd_location_info_t location_info;
527 0 : sscanf( line, "%lu %[^\n]", &location_info.ip4_addr, location_info.location );
528 : //FD_LOG_NOTICE(( "inserting location info for ip4_addr %lu, location %s", location_info.ip4_addr, location_info.location ));
529 0 : fd_location_info_t * info = fd_location_table_insert( location_table, location_info.ip4_addr );
530 0 : if( FD_UNLIKELY( info==NULL ) ) break;
531 0 : memcpy( info->location, location_info.location, sizeof(info->location) );
532 0 : }
533 0 : }
534 0 : }
535 :
/* Join the "repair" workspace of an already-running firedancer-dev
   instance read-only and either (a) dump peer latency + catchup slot
   info once and return (profiler path, !metrics_only), or (b) enter an
   interactive loop that reprints catchup slot metrics about once per
   second until Ctrl-D (--metrics path). */
static void
repair_cmd_fn_metrics_mode( args_t * args,
                            config_t * config ) {
  FD_LOG_NOTICE(( "Attempting to join with running firedancer-dev instance..." ));

  fd_topo_t * topo = &config->topo;
  ulong wksp_id = fd_topo_find_wksp( topo, "repair" );
  if( FD_UNLIKELY( wksp_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair workspace not found" ));

  fd_topo_wksp_t * repair_wksp = &topo->workspaces[ wksp_id ];

  ulong tile_id = fd_topo_find_tile( topo, "repair", 0UL );
  if( FD_UNLIKELY( tile_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair tile not found" ));

  fd_topo_join_workspace( topo, repair_wksp, FD_SHMEM_JOIN_MODE_READ_ONLY );

  /* Access the repair tile scratch memory where repair_tile_ctx is stored */
  fd_topo_tile_t * tile = &topo->tiles[ tile_id ];
  void * scratch = fd_topo_obj_laddr( &config->topo, tile->tile_obj_id );
  if( FD_UNLIKELY( !scratch ) ) FD_LOG_ERR(( "Failed to access repair tile scratch memory" ));

  /* The tile's ctx_t is the first allocation in its scratch region;
     this must stay in sync with fd_repair_tile.c's scratch layout. */
  FD_SCRATCH_ALLOC_INIT( l, scratch );
  ctx_t * repair_ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(ctx_t), sizeof(ctx_t) );

  /* catchup cmd owned memory */
  location_table = fd_location_table_join( fd_location_table_new( location_table_mem ) );

  read_iptable( args->repair.iptable_path, location_table );

  /* One-shot profiler path: without --metrics, print everything once
     and return. */
  if( FD_UNLIKELY( !args->repair.metrics_only ) ) {
    print_peer_location_latency( repair_wksp->wksp, repair_ctx );
    print_catchup_slots( repair_wksp->wksp, repair_ctx, 0 );
    printf( "\nCatchup tool completed successfully.\n" );
    return;
  }

  // Add terminal setup here - same as monitor.c
  atexit( restore_terminal );
  if( FD_UNLIKELY( 0!=tcgetattr( STDIN_FILENO, &termios_backup ) ) ) {
    FD_LOG_ERR(( "tcgetattr(STDIN_FILENO) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
  }

  /* Disable character echo and line buffering */
  struct termios term = termios_backup;
  term.c_lflag &= (tcflag_t)~(ICANON | ECHO);
  if( FD_UNLIKELY( 0!=tcsetattr( STDIN_FILENO, TCSANOW, &term ) ) ) {
    FD_LOG_WARNING(( "tcsetattr(STDIN_FILENO) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
  }

  int catchup_verbose = 0;
  long last_print = 0;
  for( ;; ) {
    int c = fd_getchar();
    if( FD_UNLIKELY( c=='i' ) ) catchup_verbose = !catchup_verbose;
    if( FD_UNLIKELY( c=='\x04' ) ) break; /* Ctrl-D */

    long now = fd_log_wallclock();
    /* Refresh at most once per second (timestamps are nanoseconds) */
    if( FD_UNLIKELY( now - last_print > 1e9L ) ) {
      last_print = now;
      print_catchup_slots( repair_wksp->wksp, repair_ctx, catchup_verbose );
      printf( "catchup slots | Use 'i' to toggle extra slot information" TEXT_NEWLINE );
      fflush( stdout );

      /* Peer location latency is not that useful post catchup, and also
         requires some concurrent dlist iteration, so only print it when
         in profiler mode. */
    }
  }
}
605 :
606 : static void
607 : repair_cmd_fn_tree_mode( args_t * args,
608 0 : config_t * config ) {
609 0 : (void)args;
610 0 : FD_LOG_NOTICE(( "Attempting to join with running firedancer-dev instance..." ));
611 :
612 0 : fd_topo_t * topo = &config->topo;
613 0 : ulong wksp_id = fd_topo_find_wksp( topo, "repair" );
614 0 : if( FD_UNLIKELY( wksp_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair workspace not found" ));
615 :
616 0 : fd_topo_wksp_t * repair_wksp = &topo->workspaces[ wksp_id ];
617 :
618 0 : ulong tile_id = fd_topo_find_tile( topo, "repair", 0UL );
619 0 : if( FD_UNLIKELY( tile_id==ULONG_MAX ) ) FD_LOG_ERR(( "repair tile not found" ));
620 :
621 0 : fd_topo_join_workspace( topo, repair_wksp, FD_SHMEM_JOIN_MODE_READ_ONLY );
622 :
623 : /* Access the repair tile scratch memory where repair_tile_ctx is stored */
624 0 : fd_topo_tile_t * tile = &topo->tiles[ tile_id ];
625 0 : void * scratch = fd_topo_obj_laddr( &config->topo, tile->tile_obj_id );
626 0 : if( FD_UNLIKELY( !scratch ) ) FD_LOG_ERR(( "Failed to access repair tile scratch memory" ));
627 :
628 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
629 0 : ctx_t * repair_ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(ctx_t), sizeof(ctx_t) );
630 :
631 0 : ulong forest_gaddr = fd_wksp_gaddr_fast( repair_ctx->wksp, repair_ctx->forest );
632 0 : fd_forest_t * forest = (fd_forest_t *)fd_wksp_laddr( repair_wksp->wksp, forest_gaddr );
633 :
634 0 : for( ;; ) {
635 0 : fd_forest_print( forest );
636 0 : sleep( 1 );
637 0 : }
638 0 : }
639 :
640 : static void
641 : repair_cmd_fn_profiler_mode( args_t * args,
642 0 : config_t * config ) {
643 0 : FD_LOG_NOTICE(( "Repair profiler topo" ));
644 :
645 0 : memset( &config->topo, 0, sizeof(config->topo) );
646 0 : repair_topo( config );
647 :
648 0 : for( ulong i=0UL; i<config->topo.tile_cnt; i++ ) {
649 0 : fd_topo_tile_t * tile = &config->topo.tiles[ i ];
650 0 : if( FD_UNLIKELY( !strcmp( tile->name, "scap" ) ) ) {
651 : /* This is not part of the config, and it must be set manually
652 : on purpose as a safety mechanism. */
653 0 : tile->shredcap.enable_publish_stake_weights = 1;
654 0 : strncpy( tile->shredcap.manifest_path, args->repair.manifest_path, PATH_MAX );
655 0 : }
656 0 : }
657 :
658 0 : FD_LOG_NOTICE(( "Repair profiler init" ));
659 0 : fd_topo_print_log( 1, &config->topo );
660 :
661 0 : args_t configure_args = {
662 0 : .configure.command = CONFIGURE_CMD_INIT,
663 0 : };
664 0 : for( ulong i=0UL; STAGES[ i ]; i++ ) {
665 0 : configure_args.configure.stages[ i ] = STAGES[ i ];
666 0 : }
667 0 : configure_cmd_fn( &configure_args, config );
668 0 : if( 0==strcmp( config->net.provider, "xdp" ) ) {
669 0 : fd_xdp_fds_t fds = fd_topo_install_xdp( &config->topo, config->net.bind_address_parsed );
670 0 : (void)fds;
671 0 : }
672 :
673 0 : run_firedancer_init( config, 1, 0 );
674 :
675 0 : fd_log_private_shared_lock[ 1 ] = 0;
676 0 : fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_WRITE );
677 :
678 0 : fd_topo_fill( &config->topo );
679 :
680 0 : ulong repair_tile_idx = fd_topo_find_tile( &config->topo, "repair", 0UL );
681 0 : FD_TEST( repair_tile_idx!=ULONG_MAX );
682 0 : fd_topo_tile_t * repair_tile = &config->topo.tiles[ repair_tile_idx ];
683 :
684 0 : ulong shred_tile_idx = fd_topo_find_tile( &config->topo, "shred", 0UL );
685 0 : FD_TEST( shred_tile_idx!=ULONG_MAX );
686 0 : fd_topo_tile_t * shred_tile = &config->topo.tiles[ shred_tile_idx ];
687 :
688 0 : volatile ulong * shred_metrics = fd_metrics_tile( shred_tile->metrics );
689 0 : FD_TEST( shred_metrics );
690 :
691 0 : volatile ulong * repair_metrics = fd_metrics_tile( repair_tile->metrics );
692 0 : FD_TEST( repair_metrics );
693 :
694 0 : FD_LOG_NOTICE(( "Repair profiler run" ));
695 :
696 0 : ulong shred_out_link_idx = fd_topo_find_link( &config->topo, "shred_out", 0UL );
697 0 : FD_TEST( shred_out_link_idx!=ULONG_MAX );
698 0 : fd_topo_link_t * shred_out_link = &config->topo.links[ shred_out_link_idx ];
699 0 : FD_TEST( shred_out_link );
700 0 : fd_frag_meta_t * shred_out_mcache = shred_out_link->mcache;
701 :
702 0 : ulong turbine_slot0 = 0UL;
703 0 : long last_print = fd_log_wallclock();
704 0 : fd_topo_run_single_process( &config->topo, 0, config->uid, config->gid, fdctl_tile_run );
705 0 : for(;;) {
706 :
707 0 : if( FD_UNLIKELY( !turbine_slot0 ) ) {
708 0 : fd_frag_meta_t * frag = &shred_out_mcache[1]; /* hack to get first frag */
709 0 : if ( frag->sz > 0 ) {
710 0 : turbine_slot0 = fd_disco_shred_out_shred_sig_slot( frag->sig );
711 0 : FD_LOG_NOTICE(("turbine_slot0: %lu", turbine_slot0));
712 0 : }
713 0 : }
714 :
715 : /* print metrics */
716 :
717 0 : long now = fd_log_wallclock();
718 0 : int catchup_finished = 0;
719 0 : if( FD_UNLIKELY( now - last_print > 1e9L ) ) {
720 0 : char buf2[ 64 ];
721 0 : ulong rcvd = shred_metrics [ MIDX( COUNTER, SHRED, SHRED_OUT_RCV ) ];
722 0 : ulong sent = repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_WINDOW ) ] +
723 0 : repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_HIGHEST_WINDOW ) ] +
724 0 : repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_ORPHAN ) ];
725 0 : ulong sign_tile_unavail = repair_metrics[ MIDX( COUNTER, REPAIR, SIGN_TILE_UNAVAIL ) ];
726 0 : printf(" Requests received: (%lu/%lu) %.1f%% \n", rcvd, sent, (double)rcvd / (double)sent * 100.0 );
727 0 : printf( " +---------------+--------------+\n" );
728 0 : printf( " | Request Type | Count |\n" );
729 0 : printf( " +---------------+--------------+\n" );
730 0 : printf( " | Orphan | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_ORPHAN ) ] ) );
731 0 : printf( " | HighestWindow | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_HIGHEST_WINDOW ) ] ) );
732 0 : printf( " | Index | %s |\n", fmt_count( buf2, repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_WINDOW ) ] ) );
733 0 : printf( " +---------------+--------------+\n" );
734 :
735 0 : print_histogram_buckets( repair_metrics,
736 0 : MIDX( HISTOGRAM, REPAIR, RESPONSE_LATENCY ),
737 0 : FD_METRICS_CONVERTER_NONE,
738 0 : FD_METRICS_HISTOGRAM_REPAIR_RESPONSE_LATENCY_MIN,
739 0 : FD_METRICS_HISTOGRAM_REPAIR_RESPONSE_LATENCY_MAX,
740 0 : "Response Latency" );
741 :
742 0 : printf(" Sign tile unavailable: %lu\n", sign_tile_unavail);
743 0 : printf(" Repair Peers: %lu\n", repair_metrics[ MIDX( COUNTER, REPAIR, REQUEST_PEERS ) ] );
744 0 : ulong slots_behind = turbine_slot0 > repair_metrics[ MIDX( COUNTER, REPAIR, REPAIRED_SLOTS ) ] ? turbine_slot0 - repair_metrics[ MIDX( COUNTER, REPAIR, REPAIRED_SLOTS ) ] : 0;
745 0 : printf(" Repaired slots: %lu/%lu (slots behind: %lu)\n", repair_metrics[ MIDX( COUNTER, REPAIR, REPAIRED_SLOTS ) ], turbine_slot0, slots_behind );
746 0 : if( turbine_slot0 && !slots_behind ) { catchup_finished = 1; }
747 : /* Print histogram buckets similar to Prometheus format */
748 0 : print_histogram_buckets( repair_metrics,
749 0 : MIDX( HISTOGRAM, REPAIR, SLOT_COMPLETE_TIME ),
750 0 : FD_METRICS_CONVERTER_SECONDS,
751 0 : FD_METRICS_HISTOGRAM_REPAIR_SLOT_COMPLETE_TIME_MIN,
752 0 : FD_METRICS_HISTOGRAM_REPAIR_SLOT_COMPLETE_TIME_MAX,
753 0 : "Slot Complete Time" );
754 :
755 0 : printf("\n");
756 0 : fflush( stdout );
757 0 : last_print = now;
758 0 : }
759 0 : if( FD_UNLIKELY( catchup_finished ) ) {
760 0 : repair_cmd_fn_metrics_mode( args, config );
761 0 : FD_LOG_ERR(("catchup finished. slot %lu", turbine_slot0));
762 0 : }
763 0 : }
764 0 : }
765 :
766 : static void
767 : repair_cmd_fn( args_t * args,
768 0 : config_t * config ) {
769 0 : if( args->repair.metrics_only ) {
770 0 : repair_cmd_fn_metrics_mode( args, config );
771 0 : } else if( args->repair.forest_only ) {
772 0 : repair_cmd_fn_tree_mode( args, config );
773 0 : } else {
774 0 : repair_cmd_fn_profiler_mode( args, config );
775 0 : }
776 0 : }
777 :
/* Command table entry wiring the "repair" subcommand to its argument
   parser, entrypoint and permission check (shared dev permissions). */
action_t fd_action_repair = {
  .name = "repair",
  .args = repair_cmd_args,
  .fn   = repair_cmd_fn,
  .perm = dev_cmd_perm,
};
|