Line data Source code
1 : #include "../metrics/fd_metrics.h"
2 : #include "../stem/fd_stem.h"
3 : #include "../topo/fd_topo.h"
4 :
5 : #include <fcntl.h>
6 : #include <errno.h>
7 : #include <stdlib.h>
8 : #include <sys/types.h> /* SEEK_SET */
9 : #include <time.h>
10 : #include <unistd.h>
11 :
12 : #include "generated/fd_diag_tile_seccomp.h"
13 :
/* Interval, in milliseconds, between two consecutive metric reports
   made by before_credit. */
#define REPORT_INTERVAL_MILLIS (100L)

/* Values published to the DIAG *_HEALTH gauges by
   check_engine_metric. */
#define FD_DIAG_HEALTH_UNHEALTHY (0UL)
#define FD_DIAG_HEALTH_HEALTHY   (1UL)
#define FD_DIAG_HEALTH_DISABLED  (2UL)
19 :
/* fd_diag_tile holds the state of the diag tile: the descriptors and
   metric regions of every monitored tile plus the bookkeeping used to
   derive the health gauges. */

struct fd_diag_tile {
  long next_report_nanos; /* wallclock time at which before_credit does its next report */

  ulong tile_cnt;  /* copied from topo->tile_cnt in privileged_init */
  int   is_voting; /* copied from tile->diag.is_voting; if zero, vote health is reported as disabled */

  /* Topology indices of the tiles inspected by check_engine_metric,
     resolved in unprivileged_init.  tower_idx and replay_idx are
     compared against ULONG_MAX before use. */
  struct {
    ulong bundle_tile_idx[ FD_TILE_MAX ];
    ulong bundle_cnt;
    ulong shred_tile_idx[ FD_TILE_MAX ];
    ulong shred_cnt;
    ulong tower_idx;
    ulong replay_idx;
  } tiles;

  ulong starttime_nanos[ FD_TILE_MAX ]; /* stat field 22 scaled to nanoseconds, read once in unprivileged_init */
  long  first_seen_died[ FD_TILE_MAX ]; /* 0: not seen dead yet, LONG_MAX: already warned, else wallclock of first observation */

  /* Open /proc/<pid>/task/<tid>/stat and .../sched descriptors, indexed
     by tile.  stat_fds[ i ]==-1 means tile i is not (or no longer)
     being sampled. */
  int stat_fds[ FD_TILE_MAX ];
  int sched_fds[ FD_TILE_MAX ];

  volatile ulong * metrics[ FD_TILE_MAX ]; /* metrics[ i ] is the result of fd_metrics_tile for tile i */

  /* State carried between calls of check_engine_metric. */
  struct {
    ulong prev_vote_slot;          /* last vote slot observed */
    long  vote_slot_changed_ns;    /* time the vote slot last changed */
    ulong prev_reset_slot;         /* last reset slot observed */
    long  reset_slot_changed_ns;   /* time the reset slot last changed */
    ulong prev_turbine_slot;       /* last turbine slot observed */
    long  turbine_slot_changed_ns; /* time the turbine slot last changed */

    ulong snapshot_turbine_bytes;  /* summed shred turbine byte counters at the last snapshot */
    ulong snapshot_repair_bytes;   /* summed shred repair byte counters at the last snapshot */
    long  byte_snapshot_ns;        /* time the byte snapshot was taken */
    int   repair_outpacing;        /* nonzero if repair bytes grew more than turbine bytes over the last window */
  } check_engine;
};

typedef struct fd_diag_tile fd_diag_tile_t;
59 :
60 : FD_FN_CONST static inline ulong
61 0 : scratch_align( void ) {
62 0 : return 128UL;
63 0 : }
64 :
65 : FD_FN_PURE static inline ulong
66 0 : scratch_footprint( fd_topo_tile_t const * tile ) {
67 0 : (void)tile;
68 0 : ulong l = FD_LAYOUT_INIT;
69 0 : l = FD_LAYOUT_APPEND( l, alignof( fd_diag_tile_t ), sizeof( fd_diag_tile_t ) );
70 0 : return FD_LAYOUT_FINI( l, scratch_align() );
71 0 : }
72 :
73 : static int
74 : read_stat_file( int fd,
75 : ulong ns_per_tick,
76 0 : volatile ulong * metrics ) {
77 0 : if( FD_UNLIKELY( -1==lseek( fd, 0, SEEK_SET ) ) ) FD_LOG_ERR(( "lseek failed (%i-%s)", errno, strerror( errno ) ));
78 :
79 0 : char contents[ 4096 ] = {0};
80 0 : ulong contents_len = 0UL;
81 :
82 0 : while( 1 ) {
83 0 : if( FD_UNLIKELY( contents_len>=sizeof( contents ) ) ) FD_LOG_ERR(( "stat contents overflow" ));
84 0 : long n = read( fd, contents + contents_len, sizeof( contents ) - contents_len );
85 0 : if( FD_UNLIKELY( -1==n ) ) {
86 0 : if( FD_UNLIKELY( errno==ESRCH ) ) return 1;
87 0 : FD_LOG_ERR(( "read failed (%i-%s)", errno, strerror( errno ) ));
88 0 : }
89 0 : if( FD_LIKELY( 0==n ) ) break;
90 0 : contents_len += (ulong)n;
91 0 : }
92 :
93 : /* Parse stat file: fields are space-separated.
94 : Field 10 (1-indexed) = minflt, field 12 = majflt,
95 : field 14 = utime, field 15 = stime (all in clock ticks). */
96 0 : char * saveptr;
97 0 : char * token = strtok_r( contents, " ", &saveptr );
98 0 : ulong field_idx = 0UL;
99 :
100 0 : while( token ) {
101 0 : if( FD_UNLIKELY( 9UL==field_idx ) ) {
102 0 : char * endptr;
103 0 : ulong minflt = strtoul( token, &endptr, 10 );
104 0 : if( FD_UNLIKELY( *endptr!='\0' || minflt==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul failed for minflt" ));
105 0 : metrics[ FD_METRICS_COUNTER_TILE_PAGE_FAULT_MINOR_COUNT_OFF ] = minflt;
106 0 : } else if( FD_UNLIKELY( 11UL==field_idx ) ) {
107 0 : char * endptr;
108 0 : ulong majflt = strtoul( token, &endptr, 10 );
109 0 : if( FD_UNLIKELY( *endptr!='\0' || majflt==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul failed for majflt" ));
110 0 : metrics[ FD_METRICS_COUNTER_TILE_PAGE_FAULT_MAJOR_COUNT_OFF ] = majflt;
111 0 : } else if( FD_UNLIKELY( 13UL==field_idx ) ) {
112 0 : char * endptr;
113 0 : ulong utime_ticks = strtoul( token, &endptr, 10 );
114 0 : if( FD_UNLIKELY( *endptr!='\0' || utime_ticks==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul failed for utime" ));
115 0 : metrics[ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_USER_OFF ] = utime_ticks*ns_per_tick;
116 0 : } else if( FD_UNLIKELY( 14UL==field_idx ) ) {
117 0 : char * endptr;
118 0 : ulong stime_ticks = strtoul( token, &endptr, 10 );
119 0 : if( FD_UNLIKELY( *endptr!='\0' || stime_ticks==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul failed for stime" ));
120 0 : metrics[ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_SYSTEM_OFF ] = stime_ticks*ns_per_tick;
121 0 : } else if( FD_UNLIKELY( 38UL==field_idx ) ) {
122 0 : char * endptr;
123 0 : ulong last_cpu = strtoul( token, &endptr, 10 );
124 0 : if( FD_UNLIKELY( *endptr!='\0' || last_cpu==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul failed for processor" ));
125 0 : metrics[ FD_METRICS_GAUGE_TILE_LAST_CPU_OFF ] = last_cpu;
126 0 : break; /* No need to parse stat further */
127 0 : }
128 0 : token = strtok_r( NULL, " ", &saveptr );
129 0 : field_idx++;
130 0 : }
131 :
132 0 : if( FD_UNLIKELY( field_idx!=38UL ) ) FD_LOG_ERR(( "failed to parse /proc/<pid>/task/<tid>/stat" ));
133 :
134 0 : return 0;
135 0 : }
136 :
137 : static int
138 : read_sched_file( int fd,
139 0 : volatile ulong * metrics ) {
140 0 : if( FD_UNLIKELY( -1==lseek( fd, 0, SEEK_SET ) ) ) FD_LOG_ERR(( "lseek failed (%i-%s)", errno, strerror( errno ) ));
141 :
142 0 : char contents[ 8192 ] = {0};
143 0 : ulong contents_len = 0UL;
144 :
145 0 : while( 1 ) {
146 0 : if( FD_UNLIKELY( contents_len>=sizeof( contents ) ) ) FD_LOG_ERR(( "sched contents overflow" ));
147 0 : long n = read( fd, contents + contents_len, sizeof( contents ) - contents_len );
148 0 : if( FD_UNLIKELY( -1==n ) ) {
149 0 : if( FD_UNLIKELY( errno==ESRCH ) ) return 1;
150 0 : FD_LOG_ERR(( "read failed (%i-%s)", errno, strerror( errno ) ));
151 0 : }
152 0 : if( FD_LIKELY( 0==n ) ) break;
153 0 : contents_len += (ulong)n;
154 0 : }
155 :
156 0 : int found_wait_sum = 0;
157 0 : int found_voluntary = 0;
158 0 : int found_involuntary = 0;
159 :
160 0 : char * line = contents;
161 0 : while( 1 ) {
162 0 : char * next_line = strchr( line, '\n' );
163 0 : if( FD_UNLIKELY( NULL==next_line ) ) break;
164 0 : *next_line = '\0';
165 :
166 0 : if( FD_UNLIKELY( !strncmp( line, "wait_sum", 8UL ) ) ) {
167 0 : char * colon = strchr( line, ':' );
168 0 : if( FD_LIKELY( colon ) ) {
169 0 : char * value = colon + 1;
170 0 : while( ' '==*value || '\t'==*value ) value++;
171 : /* wait_sum is displayed as seconds.microseconds (e.g., "123.456789").
172 : Parse both components as integers and convert to nanoseconds. */
173 0 : char * endptr;
174 0 : ulong seconds = strtoul( value, &endptr, 10 );
175 0 : if( FD_UNLIKELY( '.'!=*endptr ) ) FD_LOG_ERR(( "expected '.' after seconds in wait_sum" ));
176 0 : if( FD_UNLIKELY( seconds==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul overflow for wait_sum seconds" ));
177 0 : ulong microseconds = strtoul( endptr + 1, &endptr, 10 );
178 0 : if( FD_UNLIKELY( '\0'!=*endptr ) ) FD_LOG_ERR(( "unexpected char after microseconds in wait_sum" ));
179 0 : if( FD_UNLIKELY( microseconds==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul overflow for wait_sum microseconds" ));
180 0 : ulong wait_sum_ns = seconds*1000000000UL + microseconds*1000UL;
181 0 : metrics[ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_WAIT_OFF ] = wait_sum_ns;
182 0 : found_wait_sum = 1;
183 0 : }
184 0 : } else if( FD_UNLIKELY( !strncmp( line, "nr_voluntary_switches", 21UL ) ) ) {
185 0 : char * colon = strchr( line, ':' );
186 0 : if( FD_LIKELY( colon ) ) {
187 0 : char * value = colon + 1;
188 0 : while( ' '==*value || '\t'==*value ) value++;
189 0 : char * endptr;
190 0 : ulong voluntary_switches = strtoul( value, &endptr, 10 );
191 0 : if( FD_UNLIKELY( '\0'!=*endptr ) ) FD_LOG_ERR(( "unexpected char after nr_voluntary_switches" ));
192 0 : if( FD_UNLIKELY( voluntary_switches==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul overflow for nr_voluntary_switches" ));
193 0 : metrics[ FD_METRICS_COUNTER_TILE_CONTEXT_SWITCH_VOLUNTARY_COUNT_OFF ] = voluntary_switches;
194 0 : found_voluntary = 1;
195 0 : }
196 0 : } else if( FD_UNLIKELY( !strncmp( line, "nr_involuntary_switches", 23UL ) ) ) {
197 0 : char * colon = strchr( line, ':' );
198 0 : if( FD_LIKELY( colon ) ) {
199 0 : char * value = colon + 1;
200 0 : while( ' '==*value || '\t'==*value ) value++;
201 0 : char * endptr;
202 0 : ulong involuntary_switches = strtoul( value, &endptr, 10 );
203 0 : if( FD_UNLIKELY( '\0'!=*endptr ) ) FD_LOG_ERR(( "unexpected char after nr_involuntary_switches" ));
204 0 : if( FD_UNLIKELY( involuntary_switches==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul overflow for nr_involuntary_switches" ));
205 0 : metrics[ FD_METRICS_COUNTER_TILE_CONTEXT_SWITCH_INVOLUNTARY_COUNT_OFF ] = involuntary_switches;
206 0 : found_involuntary = 1;
207 0 : }
208 0 : }
209 :
210 0 : line = next_line + 1;
211 0 : }
212 :
213 : // wait_sum not present on kernels compiled without CONFIG_SCHEDSTATS=y
214 : // if( FD_UNLIKELY( !found_wait_sum ) ) FD_LOG_ERR(( "wait_sum not found in sched file" ));
215 0 : (void)found_wait_sum;
216 0 : if( FD_UNLIKELY( !found_voluntary ) ) FD_LOG_ERR(( "nr_voluntary_switches not found in sched file" ));
217 0 : if( FD_UNLIKELY( !found_involuntary ) ) FD_LOG_ERR(( "nr_involuntary_switches not found in sched file" ));
218 :
219 0 : return 0;
220 0 : }
221 :
222 : static void
223 0 : check_engine_metric( fd_diag_tile_t * ctx, long now ) {
224 0 : static ulong const vote_distance_threshold = 150UL;
225 0 : static long const vote_stall_threshold_ns = 60L*1000L*1000L*1000L;
226 0 : static ulong const replay_distance_threshold = 12UL;
227 0 : static long const replay_stall_threshold_ns = 12L*1000L*1000L*1000L;
228 0 : static long const turbine_stall_threshold_ns = 12L*1000L*1000L*1000L;
229 0 : static long const turbine_byte_cmp_window_ns = 12L*1000L*1000L*1000L;
230 :
231 0 : ulong bundle_cnt = ctx->tiles.bundle_cnt;
232 0 : ulong bundle_health = fd_ulong_if( bundle_cnt>0UL, FD_DIAG_HEALTH_UNHEALTHY, FD_DIAG_HEALTH_DISABLED );
233 0 : if( FD_LIKELY( bundle_cnt ) ) {
234 0 : int any_connected = 0;
235 0 : for( ulong i=0UL; i<bundle_cnt; i++ ) {
236 0 : volatile ulong * m = ctx->metrics[ ctx->tiles.bundle_tile_idx[ i ] ];
237 0 : if( FD_LIKELY( m[ FD_METRICS_GAUGE_BUNDLE_CONNECTED_OFF ]==1UL ) ) any_connected = 1;
238 0 : }
239 0 : bundle_health = fd_ulong_if( any_connected, FD_DIAG_HEALTH_HEALTHY, FD_DIAG_HEALTH_UNHEALTHY );
240 0 : }
241 :
242 0 : ulong tower_idx = ctx->tiles.tower_idx;
243 0 : ulong vote_health = fd_ulong_if( ctx->is_voting && tower_idx!=ULONG_MAX, FD_DIAG_HEALTH_UNHEALTHY, FD_DIAG_HEALTH_DISABLED );
244 0 : if( FD_LIKELY( ctx->is_voting && tower_idx!=ULONG_MAX && ctx->metrics[ tower_idx ][ FD_METRICS_GAUGE_TILE_STATUS_OFF ]==1UL ) ) {
245 0 : volatile ulong * m = ctx->metrics[ tower_idx ];
246 0 : ulong vote_slot = m[ FD_METRICS_GAUGE_TOWER_VOTE_SLOT_OFF ];
247 0 : ulong replay_slot = m[ FD_METRICS_GAUGE_TOWER_REPLAY_SLOT_OFF ];
248 0 : int bad;
249 0 : if( FD_UNLIKELY( vote_slot==ULONG_MAX || replay_slot==0UL ) ) {
250 0 : bad = 1;
251 0 : } else {
252 0 : if( FD_UNLIKELY( vote_slot!=ctx->check_engine.prev_vote_slot ) ) {
253 0 : ctx->check_engine.prev_vote_slot = vote_slot;
254 0 : ctx->check_engine.vote_slot_changed_ns = now;
255 0 : }
256 0 : bad = (replay_slot>vote_slot && replay_slot-vote_slot>vote_distance_threshold) ||
257 0 : (now-ctx->check_engine.vote_slot_changed_ns>vote_stall_threshold_ns);
258 0 : }
259 0 : vote_health = fd_ulong_if( bad, FD_DIAG_HEALTH_UNHEALTHY, FD_DIAG_HEALTH_HEALTHY );
260 0 : }
261 :
262 0 : ulong replay_idx = ctx->tiles.replay_idx;
263 0 : int replay_running = replay_idx!=ULONG_MAX && ctx->metrics[ replay_idx ][ FD_METRICS_GAUGE_TILE_STATUS_OFF ]==1UL;
264 0 : ulong replay_health = fd_ulong_if( replay_idx!=ULONG_MAX, FD_DIAG_HEALTH_UNHEALTHY, FD_DIAG_HEALTH_DISABLED );
265 0 : if( FD_LIKELY( replay_running ) ) {
266 0 : volatile ulong * m = ctx->metrics[ replay_idx ];
267 0 : ulong turbine_slot = m[ FD_METRICS_GAUGE_REPLAY_REASM_LATEST_SLOT_OFF ];
268 0 : ulong reset_slot = m[ FD_METRICS_GAUGE_REPLAY_RESET_SLOT_OFF ];
269 0 : if( FD_UNLIKELY( reset_slot!=ctx->check_engine.prev_reset_slot ) ) {
270 0 : ctx->check_engine.prev_reset_slot = reset_slot;
271 0 : ctx->check_engine.reset_slot_changed_ns = now;
272 0 : }
273 0 : int bad = (turbine_slot==0UL) || (reset_slot==0UL) || ((turbine_slot>reset_slot) && (turbine_slot-reset_slot>replay_distance_threshold)) || (now-ctx->check_engine.reset_slot_changed_ns>replay_stall_threshold_ns);
274 0 : replay_health = fd_ulong_if( bad, FD_DIAG_HEALTH_UNHEALTHY, FD_DIAG_HEALTH_HEALTHY );
275 0 : }
276 :
277 0 : ulong shred_cnt = ctx->tiles.shred_cnt;
278 0 : ulong turbine_health = fd_ulong_if( replay_idx!=ULONG_MAX && shred_cnt>0UL, FD_DIAG_HEALTH_UNHEALTHY, FD_DIAG_HEALTH_DISABLED );
279 0 : if( FD_LIKELY( replay_running && shred_cnt ) ) {
280 0 : int all_shred_running = 1;
281 0 : ulong cur_turbine_bytes = 0UL, cur_repair_bytes = 0UL;
282 0 : for( ulong i=0UL; i<shred_cnt; i++ ) {
283 0 : volatile ulong * sm = ctx->metrics[ ctx->tiles.shred_tile_idx[ i ] ];
284 0 : cur_turbine_bytes += sm[ FD_METRICS_COUNTER_SHRED_SHRED_TURBINE_RCV_BYTES_OFF ];
285 0 : cur_repair_bytes += sm[ FD_METRICS_COUNTER_SHRED_SHRED_REPAIR_RCV_BYTES_OFF ];
286 0 : if( FD_UNLIKELY( sm[ FD_METRICS_GAUGE_TILE_STATUS_OFF ]!=1UL ) ) {
287 0 : all_shred_running = 0;
288 0 : break;
289 0 : }
290 0 : }
291 0 : if( FD_LIKELY( all_shred_running ) ) {
292 0 : ulong turbine_slot = ctx->metrics[ replay_idx ][ FD_METRICS_GAUGE_REPLAY_REASM_LATEST_SLOT_OFF ];
293 0 : if( FD_UNLIKELY( turbine_slot!=ctx->check_engine.prev_turbine_slot ) ) {
294 0 : ctx->check_engine.prev_turbine_slot = turbine_slot;
295 0 : ctx->check_engine.turbine_slot_changed_ns = now;
296 0 : }
297 0 : if( FD_UNLIKELY( now-ctx->check_engine.byte_snapshot_ns>=turbine_byte_cmp_window_ns ) ) {
298 0 : ctx->check_engine.repair_outpacing = (cur_repair_bytes-ctx->check_engine.snapshot_repair_bytes)>(cur_turbine_bytes-ctx->check_engine.snapshot_turbine_bytes);
299 0 : ctx->check_engine.snapshot_turbine_bytes = cur_turbine_bytes;
300 0 : ctx->check_engine.snapshot_repair_bytes = cur_repair_bytes;
301 0 : ctx->check_engine.byte_snapshot_ns = now;
302 0 : }
303 :
304 0 : int bad = (turbine_slot==0UL) || (now-ctx->check_engine.turbine_slot_changed_ns>turbine_stall_threshold_ns) || ctx->check_engine.repair_outpacing;
305 0 : turbine_health = fd_ulong_if( bad, FD_DIAG_HEALTH_UNHEALTHY, FD_DIAG_HEALTH_HEALTHY );
306 0 : }
307 0 : }
308 :
309 0 : FD_MGAUGE_SET( DIAG, BUNDLE_HEALTH, bundle_health );
310 0 : FD_MGAUGE_SET( DIAG, VOTE_HEALTH, vote_health );
311 0 : FD_MGAUGE_SET( DIAG, REPLAY_HEALTH, replay_health );
312 0 : FD_MGAUGE_SET( DIAG, TURBINE_HEALTH, turbine_health );
313 0 : }
314 :
315 : static void
316 : before_credit( fd_diag_tile_t * ctx,
317 : fd_stem_context_t * stem,
318 0 : int * charge_busy ) {
319 0 : (void)stem;
320 :
321 0 : long now = fd_log_wallclock();
322 0 : if( now<ctx->next_report_nanos ) {
323 0 : long diff = ctx->next_report_nanos - now;
324 0 : diff = fd_long_min( diff, 2e6 /* 2ms */ );
325 0 : struct timespec const ts = {
326 0 : .tv_sec = diff / (long)1e9,
327 0 : .tv_nsec = diff % (long)1e9
328 0 : };
329 0 : clock_nanosleep( CLOCK_REALTIME, 0, &ts, NULL );
330 0 : return;
331 0 : }
332 0 : ctx->next_report_nanos += REPORT_INTERVAL_MILLIS*1000L*1000L;
333 :
334 0 : *charge_busy = 1;
335 :
336 0 : struct timespec boottime;
337 0 : if( FD_UNLIKELY( -1==clock_gettime( CLOCK_BOOTTIME, &boottime ) ) ) FD_LOG_ERR(( "clock_gettime(CLOCK_BOOTTIME) failed (%i-%s)", errno, strerror( errno ) ));
338 0 : ulong now_since_boot_nanos = (ulong)boottime.tv_sec*1000000000UL + (ulong)boottime.tv_nsec;
339 :
340 0 : for( ulong i=0UL; i<ctx->tile_cnt; i++ ) {
341 0 : if( FD_UNLIKELY( -1==ctx->stat_fds[ i ] ) ) continue;
342 :
343 : /* CLK_TCK is typically 100, so 1 tick = 10ms = 10,000,000 ns */
344 0 : int process_died1 = read_stat_file( ctx->stat_fds[ i ], 10000000UL, ctx->metrics[ i ] );
345 0 : int process_died2 = read_sched_file( ctx->sched_fds[ i ], ctx->metrics[ i ] );
346 :
347 0 : if( FD_UNLIKELY( process_died1 || process_died2 ) ) {
348 0 : ctx->stat_fds[ i ] = -1;
349 0 : continue;
350 0 : }
351 :
352 0 : ulong task_lifetime_nanos = now_since_boot_nanos - ctx->starttime_nanos[ i ];
353 0 : ulong user_nanos = ctx->metrics[ i ][ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_USER_OFF ];
354 0 : ulong system_nanos = ctx->metrics[ i ][ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_SYSTEM_OFF ];
355 0 : ulong wait_nanos = ctx->metrics[ i ][ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_WAIT_OFF ];
356 0 : ulong busy_nanos = user_nanos+system_nanos+wait_nanos;
357 0 : ulong idle_nanos = (task_lifetime_nanos>busy_nanos) ? (task_lifetime_nanos-busy_nanos) : 0UL;
358 :
359 : /* Counter can't go backwards in Prometheus else it thinks the
360 : application restarted. Use max to ensure monotonicity. */
361 0 : ctx->metrics[ i ][ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_IDLE_OFF ] = fd_ulong_max( idle_nanos, ctx->metrics[ i ][ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_IDLE_OFF ] );
362 0 : }
363 :
364 0 : for( ulong i=0UL; i<ctx->tile_cnt; i++ ) {
365 0 : if( FD_LIKELY( -1!=ctx->stat_fds[ i ] ) ) continue;
366 :
367 : /* The tile died, but it's a tile which is allowed to shutdown, so
368 : just stop updating metrics for it. */
369 0 : if( FD_LIKELY( 2UL==ctx->metrics[ i ][ FD_METRICS_GAUGE_TILE_STATUS_OFF ] ) ) continue;
370 :
371 : /* Supervisor is going to bring the whole process tree down if any
372 : of the target PIDs died, so we can ignore this and wait. */
373 0 : if( FD_UNLIKELY( !ctx->first_seen_died[ i ] ) ) {
374 0 : ctx->first_seen_died[ i ] = now;
375 0 : } else if( FD_LIKELY( ctx->first_seen_died[ i ]==LONG_MAX ) ) {
376 : /* We already reported this, so we can ignore it. */
377 0 : } else if( FD_UNLIKELY( now-ctx->first_seen_died[ i ] < 10L*1000L*1000L*1000L ) ) {
378 : /* Wait 10 seconds for supervisor to kill us before reporting WARNING */
379 0 : } else {
380 0 : FD_LOG_WARNING(( "cannot get metrics for dead tile idx %lu", i ));
381 0 : ctx->first_seen_died[ i ] = LONG_MAX;
382 0 : }
383 0 : }
384 :
385 0 : check_engine_metric( ctx, now );
386 0 : }
387 :
388 : static void
389 : privileged_init( fd_topo_t * topo,
390 0 : fd_topo_tile_t * tile ) {
391 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
392 :
393 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
394 0 : fd_diag_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_diag_tile_t), sizeof(fd_diag_tile_t) );
395 :
396 0 : FD_TEST( topo->tile_cnt<FD_TILE_MAX );
397 :
398 0 : FD_TEST( 100L == sysconf( _SC_CLK_TCK ) );
399 :
400 0 : ctx->tile_cnt = topo->tile_cnt;
401 0 : for( ulong i=0UL; i<FD_TILE_MAX; i++ ) {
402 0 : ctx->stat_fds[ i ] = -1;
403 0 : ctx->sched_fds[ i ] = -1;
404 0 : }
405 :
406 0 : for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
407 0 : ulong * metrics = fd_metrics_join( fd_topo_obj_laddr( topo, topo->tiles[ i ].metrics_obj_id ) );
408 :
409 0 : for(;;) {
410 0 : ulong pid, tid;
411 0 : if( FD_UNLIKELY( tile->id==i ) ) {
412 0 : pid = fd_sandbox_getpid();
413 0 : tid = fd_sandbox_gettid();
414 0 : } else {
415 0 : pid = fd_metrics_tile( metrics )[ FD_METRICS_GAUGE_TILE_PID_OFF ];
416 0 : tid = fd_metrics_tile( metrics )[ FD_METRICS_GAUGE_TILE_TID_OFF ];
417 0 : if( FD_UNLIKELY( !pid || !tid ) ) {
418 0 : FD_SPIN_PAUSE();
419 0 : continue;
420 0 : }
421 0 : }
422 :
423 0 : ctx->metrics[ i ] = fd_metrics_tile( metrics );
424 :
425 0 : char path[ 64UL ];
426 0 : FD_TEST( fd_cstr_printf_check( path, sizeof( path ), NULL, "/proc/%lu/task/%lu/stat", pid, tid ) );
427 0 : ctx->stat_fds[ i ] = open( path, O_RDONLY );
428 0 : if( FD_UNLIKELY( -1==ctx->stat_fds[ i ] ) ) {
429 : /* Might be a tile that's allowed to shutdown already did so
430 : before we got to here, due to a race condition. Just
431 : proceed, we will not be able to get metrics for the shut
432 : down process. */
433 0 : if( FD_LIKELY( 2UL!=ctx->metrics[ i ][ FD_METRICS_GAUGE_TILE_STATUS_OFF ] ) ) FD_LOG_ERR(( "open stat failed (%i-%s)", errno, strerror( errno ) ));
434 0 : break;
435 0 : }
436 :
437 0 : FD_TEST( fd_cstr_printf_check( path, sizeof( path ), NULL, "/proc/%lu/task/%lu/sched", pid, tid ) );
438 0 : ctx->sched_fds[ i ] = open( path, O_RDONLY );
439 0 : if( FD_UNLIKELY( -1==ctx->sched_fds[ i ] ) ) {
440 0 : if( FD_LIKELY( 2UL!=ctx->metrics[ i ][ FD_METRICS_GAUGE_TILE_STATUS_OFF ] ) ) FD_LOG_ERR(( "open sched failed (%i-%s)", errno, strerror( errno ) ));
441 0 : ctx->stat_fds[ i ] = -1;
442 0 : }
443 0 : break;
444 0 : }
445 0 : }
446 0 : }
447 :
448 : /* Read starttime (field 22) from stat file. Returns 0 on success, 1 if
449 : process died (ESRCH). */
450 :
451 : static int
452 : read_starttime( int fd,
453 : ulong ns_per_tick,
454 0 : ulong * out_starttime_nanos ) {
455 0 : char contents[ 4096 ] = {0};
456 0 : ulong contents_len = 0UL;
457 :
458 0 : while( 1 ) {
459 0 : if( FD_UNLIKELY( contents_len>=sizeof( contents ) ) ) FD_LOG_ERR(( "stat contents overflow" ));
460 0 : long n = read( fd, contents + contents_len, sizeof( contents ) - contents_len );
461 0 : if( FD_UNLIKELY( -1==n ) ) {
462 0 : if( FD_UNLIKELY( errno==ESRCH ) ) return 1;
463 0 : FD_LOG_ERR(( "read stat failed (%i-%s)", errno, strerror( errno ) ));
464 0 : }
465 0 : if( FD_LIKELY( 0L==n ) ) break;
466 0 : contents_len += (ulong)n;
467 0 : }
468 :
469 : /* Parse field 22 (starttime) from stat file */
470 0 : char * saveptr;
471 0 : char * token = strtok_r( contents, " ", &saveptr );
472 0 : ulong field_idx = 0UL;
473 :
474 0 : while( token && field_idx<21UL ) {
475 0 : token = strtok_r( NULL, " ", &saveptr );
476 0 : field_idx++;
477 0 : }
478 :
479 0 : if( FD_UNLIKELY( !token || field_idx!=21UL ) ) FD_LOG_ERR(( "starttime (field 22) not found in stat" ));
480 :
481 0 : char * endptr;
482 0 : ulong starttime_ticks = strtoul( token, &endptr, 10 );
483 0 : if( FD_UNLIKELY( *endptr!=' ' && *endptr!='\0' ) ) FD_LOG_ERR(( "strtoul failed for starttime" ));
484 0 : if( FD_UNLIKELY( starttime_ticks==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul overflow for starttime" ));
485 :
486 0 : *out_starttime_nanos = starttime_ticks * ns_per_tick;
487 0 : return 0;
488 0 : }
489 :
490 : static void
491 : unprivileged_init( fd_topo_t * topo,
492 0 : fd_topo_tile_t * tile ) {
493 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
494 :
495 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
496 0 : fd_diag_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_diag_tile_t), sizeof(fd_diag_tile_t) );
497 :
498 0 : memset( ctx->first_seen_died, 0, sizeof( ctx->first_seen_died ) );
499 0 : ctx->next_report_nanos = fd_log_wallclock();
500 :
501 : /* Read starttime (field 22) once at init for idle time calculation.
502 : CLK_TCK is always 100, so 1 tick = 10ms = 10,000,000 ns. */
503 0 : for( ulong i=0UL; i<ctx->tile_cnt; i++ ) {
504 0 : if( FD_LIKELY( -1!=ctx->stat_fds[ i ] ) ) {
505 0 : int died = read_starttime( ctx->stat_fds[ i ], 10000000UL, &ctx->starttime_nanos[ i ] );
506 0 : if( FD_UNLIKELY( died ) ) ctx->stat_fds[ i ] = -1;
507 0 : }
508 0 : }
509 :
510 0 : memset( &ctx->check_engine, 0, sizeof(ctx->check_engine) );
511 :
512 0 : ctx->tiles.bundle_cnt = fd_topo_tile_name_cnt( topo, "bundle" );
513 0 : for( ulong i=0UL; i<ctx->tiles.bundle_cnt; i++ ) ctx->tiles.bundle_tile_idx[ i ] = fd_topo_find_tile( topo, "bundle", i );
514 0 : ctx->tiles.shred_cnt = fd_topo_tile_name_cnt( topo, "shred" );
515 0 : for( ulong i=0UL; i<ctx->tiles.shred_cnt; i++ ) ctx->tiles.shred_tile_idx[ i ] = fd_topo_find_tile( topo, "shred", i );
516 0 : ctx->tiles.tower_idx = fd_topo_find_tile( topo, "tower", 0UL );
517 0 : ctx->tiles.replay_idx = fd_topo_find_tile( topo, "replay", 0UL );
518 :
519 0 : long now = fd_log_wallclock();
520 0 : ctx->is_voting = tile->diag.is_voting;
521 0 : ctx->check_engine.vote_slot_changed_ns = now;
522 0 : ctx->check_engine.reset_slot_changed_ns = now;
523 0 : ctx->check_engine.turbine_slot_changed_ns = now;
524 0 : ctx->check_engine.byte_snapshot_ns = now;
525 :
526 :
527 0 : ulong scratch_top = FD_SCRATCH_ALLOC_FINI( l, 1UL );
528 0 : if( FD_UNLIKELY( scratch_top > (ulong)scratch + scratch_footprint( tile ) ) )
529 0 : FD_LOG_ERR(( "scratch overflow %lu %lu %lu", scratch_top - (ulong)scratch - scratch_footprint( tile ), scratch_top, (ulong)scratch + scratch_footprint( tile ) ));
530 0 : }
531 :
532 : static ulong
533 : populate_allowed_seccomp( fd_topo_t const * topo,
534 : fd_topo_tile_t const * tile,
535 : ulong out_cnt,
536 0 : struct sock_filter * out ) {
537 0 : (void)topo;
538 0 : (void)tile;
539 :
540 0 : populate_sock_filter_policy_fd_diag_tile( out_cnt, out, (uint)fd_log_private_logfile_fd() );
541 0 : return sock_filter_policy_fd_diag_tile_instr_cnt;
542 0 : }
543 :
544 : static ulong
545 : populate_allowed_fds( fd_topo_t const * topo,
546 : fd_topo_tile_t const * tile,
547 : ulong out_fds_cnt,
548 0 : int * out_fds ) {
549 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
550 :
551 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
552 0 : fd_diag_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_diag_tile_t), sizeof(fd_diag_tile_t) );
553 :
554 0 : if( FD_UNLIKELY( out_fds_cnt<2UL+2UL*ctx->tile_cnt ) ) FD_LOG_ERR(( "out_fds_cnt %lu", out_fds_cnt ));
555 :
556 0 : ulong out_cnt = 0UL;
557 0 : out_fds[ out_cnt++ ] = 2; /* stderr */
558 0 : if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) )
559 0 : out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */
560 0 : for( ulong i=0UL; i<ctx->tile_cnt; i++ ) {
561 0 : if( -1!=ctx->stat_fds[ i ] ) out_fds[ out_cnt++ ] = ctx->stat_fds[ i ]; /* /proc/<pid>/task/<tid>/stat */
562 0 : if( -1!=ctx->sched_fds[ i ] ) out_fds[ out_cnt++ ] = ctx->sched_fds[ i ]; /* /proc/<pid>/task/<tid>/sched */
563 0 : }
564 0 : return out_cnt;
565 0 : }
566 :
/* Configuration consumed by the stem template included below.  The
   diag tile registers a single callback, before_credit, with
   fd_diag_tile_t as its context.
   NOTE(review): the exact meaning of STEM_BURST and STEM_LAZY is
   defined by fd_stem.c, which is not visible here. */

#define STEM_BURST (1UL)
#define STEM_LAZY  ((long)10e6) /* 10ms */

#define STEM_CALLBACK_CONTEXT_TYPE  fd_diag_tile_t
#define STEM_CALLBACK_CONTEXT_ALIGN alignof(fd_diag_tile_t)

#define STEM_CALLBACK_BEFORE_CREDIT before_credit

#include "../../disco/stem/fd_stem.c"
576 :
/* Run descriptor for the "diag" tile, wiring up the callbacks defined
   in this file.  stem_run is presumably provided by the fd_stem.c
   template included above -- confirm against that file. */

fd_topo_run_tile_t fd_tile_diag = {
  .name                     = "diag",
  .populate_allowed_seccomp = populate_allowed_seccomp,
  .populate_allowed_fds     = populate_allowed_fds,
  .scratch_align            = scratch_align,
  .scratch_footprint        = scratch_footprint,
  .privileged_init          = privileged_init,
  .unprivileged_init        = unprivileged_init,
  .run                      = stem_run,
};
|