Line data Source code
1 : #include "fd_diag_tile.h"
2 :
3 : #include "../bundle/fd_bundle_tile.h"
4 : #include "../metrics/fd_metrics.h"
5 : #include "../stem/fd_stem.h"
6 : #include "../topo/fd_topo.h"
7 : #include "../../util/tile/fd_tile_private.h"
8 :
9 : #include <fcntl.h>
10 : #include <errno.h>
11 : #include <stdlib.h>
12 : #include <sys/types.h> /* SEEK_SET */
13 : #include <time.h>
14 : #include <unistd.h>
15 :
16 : #include "fd_proc_interrupts.h"
17 : #include "generated/fd_diag_tile_seccomp.h"
18 :
/* REPORT_INTERVAL_MILLIS is the number of milliseconds by which
   before_credit advances the next report deadline each time it
   publishes a report. */
#define REPORT_INTERVAL_MILLIS (100L)
20 :
/* fd_diag_tile is the diag tile's context.  It lives at the start of
   the tile's scratch region and holds the file descriptors, baselines
   and per-tile bookkeeping used to publish diagnostic metrics. */
struct fd_diag_tile {
  /* Wallclock time (same clock as fd_log_wallclock) before which
     before_credit only sleeps instead of reporting. */
  long next_report_nanos;

  ulong tile_cnt;  /* copied from topo->tile_cnt */
  int   is_voting; /* copied from tile->diag.is_voting; when zero the vote status stays DISABLED */

  /* Topology indices of the tiles inspected by check_engine_metric,
     as returned by fd_topo_find_tile.  tower_idx and replay_idx are
     compared against ULONG_MAX to detect a missing tile. */
  struct {
    ulong bundle_tile_idx[ FD_TILE_MAX ];
    ulong bundle_cnt;
    ulong shred_tile_idx[ FD_TILE_MAX ];
    ulong shred_cnt;
    ulong tower_idx;
    ulong replay_idx;
  } tiles;

  /* Per tile: task start time from stat field 22 converted to
     nanoseconds, subtracted from CLOCK_BOOTTIME to get task lifetime. */
  ulong starttime_nanos[ FD_TILE_MAX ];
  /* Per tile: 0 until the tile is first observed dead, then the
     wallclock time of that observation, then LONG_MAX once the
     warning has been logged. */
  long  first_seen_died[ FD_TILE_MAX ];

  /* Per tile: open fds on /proc/<pid>/task/<tid>/stat and .../sched.
     stat_fds[ i ]==-1 means no metrics are collected for tile i. */
  int stat_fds[ FD_TILE_MAX ];
  int sched_fds[ FD_TILE_MAX ];

  /* Latest softirq counts filled by fd_proc_softirqs_sum.  Row 0 is
     additionally reused as scratch for the /proc/interrupts readers. */
  ulong       irq_cnt[ FD_METRICS_ENUM_SOFTIRQ_CNT ][ FD_TILE_MAX ];
  fd_cpuset_t cpu_has_tile[ fd_cpuset_word_cnt ]; /* set of cpu indices that have a tile assigned */
  int         proc_interrupts_fd;                 /* open fd on /proc/interrupts */
  int         proc_softirqs_fd;                   /* open fd on /proc/softirqs */
  /* Counter values captured in unprivileged_init; later readings are
     reported relative to these. */
  ulong       device_irq_baseline[ FD_TILE_MAX ];
  ulong       tlb_baseline[ FD_TILE_MAX ];
  ulong       softirq_baseline[ FD_METRICS_ENUM_SOFTIRQ_CNT ][ FD_TILE_MAX ];

  ulong volatile * metrics    [ FD_TILE_MAX ]; /* per tile: result of fd_metrics_tile for that tile */
  ushort           cpu_to_tile[ FD_TILE_MAX ]; /* cpu index -> tile index, USHORT_MAX if none */

  /* State carried between calls of check_engine_metric. */
  struct {
    ulong prev_vote_slot;          /* last vote slot seen on the tower tile */
    long  vote_slot_changed_ns;    /* wallclock time prev_vote_slot last changed */
    ulong prev_reset_slot;         /* last reset slot seen on the replay tile */
    long  reset_slot_changed_ns;   /* wallclock time prev_reset_slot last changed */
    ulong prev_turbine_slot;       /* last reassembly latest slot seen on the replay tile */
    long  turbine_slot_changed_ns; /* wallclock time prev_turbine_slot last changed */

    ulong snapshot_turbine_bytes;  /* summed shred turbine rx bytes at byte_snapshot_ns */
    ulong snapshot_repair_bytes;   /* summed shred repair rx bytes at byte_snapshot_ns */
    long  byte_snapshot_ns;        /* wallclock time the byte snapshot was taken */
    int   repair_outpacing;        /* 1 if repair bytes grew more than turbine bytes over the last window */
  } check_engine;
};

typedef struct fd_diag_tile fd_diag_tile_t;
69 :
70 : FD_FN_CONST static inline ulong
71 0 : scratch_align( void ) {
72 0 : return alignof(fd_diag_tile_t);
73 0 : }
74 :
75 : FD_FN_PURE static inline ulong
76 0 : scratch_footprint( fd_topo_tile_t const * tile ) {
77 0 : (void)tile;
78 0 : return sizeof(fd_diag_tile_t);
79 0 : }
80 :
81 : static int
82 : read_stat_file( int fd,
83 : ulong ns_per_tick,
84 0 : volatile ulong * metrics ) {
85 0 : if( FD_UNLIKELY( -1==lseek( fd, 0, SEEK_SET ) ) ) FD_LOG_ERR(( "lseek failed (%i-%s)", errno, strerror( errno ) ));
86 :
87 0 : char contents[ 4096 ] = {0};
88 0 : ulong contents_len = 0UL;
89 :
90 0 : while( 1 ) {
91 0 : if( FD_UNLIKELY( contents_len>=sizeof( contents ) ) ) FD_LOG_ERR(( "stat contents overflow" ));
92 0 : long n = read( fd, contents + contents_len, sizeof( contents ) - contents_len );
93 0 : if( FD_UNLIKELY( -1==n ) ) {
94 0 : if( FD_UNLIKELY( errno==ESRCH ) ) return 1;
95 0 : FD_LOG_ERR(( "read failed (%i-%s)", errno, strerror( errno ) ));
96 0 : }
97 0 : if( FD_LIKELY( 0==n ) ) break;
98 0 : contents_len += (ulong)n;
99 0 : }
100 :
101 : /* Parse stat file: fields are space-separated.
102 : Field 10 (1-indexed) = minflt, field 12 = majflt,
103 : field 14 = utime, field 15 = stime (all in clock ticks). */
104 0 : char * saveptr;
105 0 : char * token = strtok_r( contents, " ", &saveptr );
106 0 : ulong field_idx = 0UL;
107 :
108 0 : while( token ) {
109 0 : if( FD_UNLIKELY( 9UL==field_idx ) ) {
110 0 : char * endptr;
111 0 : ulong minflt = strtoul( token, &endptr, 10 );
112 0 : if( FD_UNLIKELY( *endptr!='\0' || minflt==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul failed for minflt" ));
113 0 : metrics[ FD_METRICS_COUNTER_TILE_PAGE_FAULT_MINOR_OFF ] = minflt;
114 0 : } else if( FD_UNLIKELY( 11UL==field_idx ) ) {
115 0 : char * endptr;
116 0 : ulong majflt = strtoul( token, &endptr, 10 );
117 0 : if( FD_UNLIKELY( *endptr!='\0' || majflt==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul failed for majflt" ));
118 0 : metrics[ FD_METRICS_COUNTER_TILE_PAGE_FAULT_MAJOR_OFF ] = majflt;
119 0 : } else if( FD_UNLIKELY( 13UL==field_idx ) ) {
120 0 : char * endptr;
121 0 : ulong utime_ticks = strtoul( token, &endptr, 10 );
122 0 : if( FD_UNLIKELY( *endptr!='\0' || utime_ticks==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul failed for utime" ));
123 0 : metrics[ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_USER_OFF ] = utime_ticks*ns_per_tick;
124 0 : } else if( FD_UNLIKELY( 14UL==field_idx ) ) {
125 0 : char * endptr;
126 0 : ulong stime_ticks = strtoul( token, &endptr, 10 );
127 0 : if( FD_UNLIKELY( *endptr!='\0' || stime_ticks==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul failed for stime" ));
128 0 : metrics[ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_SYSTEM_OFF ] = stime_ticks*ns_per_tick;
129 0 : } else if( FD_UNLIKELY( 38UL==field_idx ) ) {
130 0 : char * endptr;
131 0 : ulong last_cpu = strtoul( token, &endptr, 10 );
132 0 : if( FD_UNLIKELY( *endptr!='\0' || last_cpu==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul failed for processor" ));
133 0 : metrics[ FD_METRICS_GAUGE_TILE_LAST_CPU_OFF ] = last_cpu;
134 0 : break; /* No need to parse stat further */
135 0 : }
136 0 : token = strtok_r( NULL, " ", &saveptr );
137 0 : field_idx++;
138 0 : }
139 :
140 0 : if( FD_UNLIKELY( field_idx!=38UL ) ) FD_LOG_ERR(( "failed to parse /proc/<pid>/task/<tid>/stat" ));
141 :
142 0 : return 0;
143 0 : }
144 :
145 : static int
146 : read_sched_file( int fd,
147 0 : volatile ulong * metrics ) {
148 0 : if( FD_UNLIKELY( -1==lseek( fd, 0, SEEK_SET ) ) ) FD_LOG_ERR(( "lseek failed (%i-%s)", errno, strerror( errno ) ));
149 :
150 0 : char contents[ 8192 ] = {0};
151 0 : ulong contents_len = 0UL;
152 :
153 0 : while( 1 ) {
154 0 : if( FD_UNLIKELY( contents_len>=sizeof( contents ) ) ) FD_LOG_ERR(( "sched contents overflow" ));
155 0 : long n = read( fd, contents + contents_len, sizeof( contents ) - contents_len );
156 0 : if( FD_UNLIKELY( -1==n ) ) {
157 0 : if( FD_UNLIKELY( errno==ESRCH ) ) return 1;
158 0 : FD_LOG_ERR(( "read failed (%i-%s)", errno, strerror( errno ) ));
159 0 : }
160 0 : if( FD_LIKELY( 0==n ) ) break;
161 0 : contents_len += (ulong)n;
162 0 : }
163 :
164 0 : int found_wait_sum = 0;
165 0 : int found_voluntary = 0;
166 0 : int found_involuntary = 0;
167 :
168 0 : char * line = contents;
169 0 : while( 1 ) {
170 0 : char * next_line = strchr( line, '\n' );
171 0 : if( FD_UNLIKELY( NULL==next_line ) ) break;
172 0 : *next_line = '\0';
173 :
174 0 : if( FD_UNLIKELY( !strncmp( line, "wait_sum", 8UL ) ) ) {
175 0 : char * colon = strchr( line, ':' );
176 0 : if( FD_LIKELY( colon ) ) {
177 0 : char * value = colon + 1;
178 0 : while( ' '==*value || '\t'==*value ) value++;
179 : /* wait_sum is displayed as seconds.microseconds (e.g., "123.456789").
180 : Parse both components as integers and convert to nanoseconds. */
181 0 : char * endptr;
182 0 : ulong seconds = strtoul( value, &endptr, 10 );
183 0 : if( FD_UNLIKELY( '.'!=*endptr ) ) FD_LOG_ERR(( "expected '.' after seconds in wait_sum" ));
184 0 : if( FD_UNLIKELY( seconds==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul overflow for wait_sum seconds" ));
185 0 : ulong microseconds = strtoul( endptr + 1, &endptr, 10 );
186 0 : if( FD_UNLIKELY( '\0'!=*endptr ) ) FD_LOG_ERR(( "unexpected char after microseconds in wait_sum" ));
187 0 : if( FD_UNLIKELY( microseconds==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul overflow for wait_sum microseconds" ));
188 0 : ulong wait_sum_ns = seconds*1000000000UL + microseconds*1000UL;
189 0 : metrics[ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_WAIT_OFF ] = wait_sum_ns;
190 0 : found_wait_sum = 1;
191 0 : }
192 0 : } else if( FD_UNLIKELY( !strncmp( line, "nr_voluntary_switches", 21UL ) ) ) {
193 0 : char * colon = strchr( line, ':' );
194 0 : if( FD_LIKELY( colon ) ) {
195 0 : char * value = colon + 1;
196 0 : while( ' '==*value || '\t'==*value ) value++;
197 0 : char * endptr;
198 0 : ulong voluntary_switches = strtoul( value, &endptr, 10 );
199 0 : if( FD_UNLIKELY( '\0'!=*endptr ) ) FD_LOG_ERR(( "unexpected char after nr_voluntary_switches" ));
200 0 : if( FD_UNLIKELY( voluntary_switches==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul overflow for nr_voluntary_switches" ));
201 0 : metrics[ FD_METRICS_COUNTER_TILE_CONTEXT_SWITCH_VOLUNTARY_OFF ] = voluntary_switches;
202 0 : found_voluntary = 1;
203 0 : }
204 0 : } else if( FD_UNLIKELY( !strncmp( line, "nr_involuntary_switches", 23UL ) ) ) {
205 0 : char * colon = strchr( line, ':' );
206 0 : if( FD_LIKELY( colon ) ) {
207 0 : char * value = colon + 1;
208 0 : while( ' '==*value || '\t'==*value ) value++;
209 0 : char * endptr;
210 0 : ulong involuntary_switches = strtoul( value, &endptr, 10 );
211 0 : if( FD_UNLIKELY( '\0'!=*endptr ) ) FD_LOG_ERR(( "unexpected char after nr_involuntary_switches" ));
212 0 : if( FD_UNLIKELY( involuntary_switches==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul overflow for nr_involuntary_switches" ));
213 0 : metrics[ FD_METRICS_COUNTER_TILE_CONTEXT_SWITCH_INVOLUNTARY_OFF ] = involuntary_switches;
214 0 : found_involuntary = 1;
215 0 : }
216 0 : }
217 :
218 0 : line = next_line + 1;
219 0 : }
220 :
221 : // wait_sum not present on kernels compiled without CONFIG_SCHEDSTATS=y
222 : // if( FD_UNLIKELY( !found_wait_sum ) ) FD_LOG_ERR(( "wait_sum not found in sched file" ));
223 0 : (void)found_wait_sum;
224 0 : if( FD_UNLIKELY( !found_voluntary ) ) FD_LOG_ERR(( "nr_voluntary_switches not found in sched file" ));
225 0 : if( FD_UNLIKELY( !found_involuntary ) ) FD_LOG_ERR(( "nr_involuntary_switches not found in sched file" ));
226 :
227 0 : return 0;
228 0 : }
229 :
230 : static void
231 0 : check_engine_metric( fd_diag_tile_t * ctx, long now ) {
232 0 : static ulong const vote_distance_threshold = 150UL;
233 0 : static long const vote_stall_threshold_ns = 60L*1000L*1000L*1000L;
234 0 : static ulong const replay_distance_threshold = 12UL;
235 0 : static long const replay_stall_threshold_ns = 12L*1000L*1000L*1000L;
236 0 : static long const turbine_stall_threshold_ns = 12L*1000L*1000L*1000L;
237 0 : static long const turbine_byte_cmp_window_ns = 12L*1000L*1000L*1000L;
238 :
239 0 : ulong bundle_cnt = ctx->tiles.bundle_cnt;
240 0 : ulong bundle_status = FD_DIAG_BUNDLE_STATUS_DISABLED;
241 0 : if( FD_LIKELY( bundle_cnt ) ) {
242 : /* Find the best state across all bundle tiles.
243 : Priority: connected > sleeping > connecting > disconnected */
244 0 : int any_connected = 0;
245 0 : int any_sleeping = 0;
246 0 : int any_connecting = 0;
247 0 : for( ulong i=0UL; i<bundle_cnt; i++ ) {
248 0 : volatile ulong * m = ctx->metrics[ ctx->tiles.bundle_tile_idx[ i ] ];
249 0 : ulong state = m[ FD_METRICS_GAUGE_BUNDLE_STATE_OFF ];
250 0 : if( FD_LIKELY( state==FD_BUNDLE_STATE_CONNECTED ) ) any_connected = 1;
251 0 : else if( state==FD_BUNDLE_STATE_SLEEPING ) any_sleeping = 1;
252 0 : else if( state==FD_BUNDLE_STATE_CONNECTING ) any_connecting = 1;
253 0 : }
254 0 : if( any_connected ) bundle_status = FD_DIAG_BUNDLE_STATUS_CONNECTED;
255 0 : else if( any_sleeping ) bundle_status = FD_DIAG_BUNDLE_STATUS_SLEEPING;
256 0 : else if( any_connecting ) bundle_status = FD_DIAG_BUNDLE_STATUS_CONNECTING;
257 0 : else bundle_status = FD_DIAG_BUNDLE_STATUS_DISCONNECTED;
258 0 : }
259 :
260 0 : ulong tower_idx = ctx->tiles.tower_idx;
261 0 : ulong vote_status = FD_DIAG_VOTE_STATUS_DISABLED;
262 0 : if( FD_LIKELY( ctx->is_voting && tower_idx!=ULONG_MAX ) ) {
263 0 : if( FD_UNLIKELY( ctx->metrics[ tower_idx ][ FD_METRICS_GAUGE_TILE_STATUS_OFF ]!=1UL ) ) {
264 0 : vote_status = FD_DIAG_VOTE_STATUS_NOT_STARTED;
265 0 : } else {
266 0 : volatile ulong * m = ctx->metrics[ tower_idx ];
267 0 : ulong vote_slot = m[ FD_METRICS_GAUGE_TOWER_VOTE_SLOT_OFF ];
268 0 : ulong replay_slot = m[ FD_METRICS_GAUGE_TOWER_REPLAY_SLOT_OFF ];
269 0 : if( FD_UNLIKELY( vote_slot==ULONG_MAX || replay_slot==0UL ) ) {
270 0 : vote_status = FD_DIAG_VOTE_STATUS_NOT_STARTED;
271 0 : } else {
272 0 : if( FD_UNLIKELY( vote_slot!=ctx->check_engine.prev_vote_slot ) ) {
273 0 : ctx->check_engine.prev_vote_slot = vote_slot;
274 0 : ctx->check_engine.vote_slot_changed_ns = now;
275 0 : }
276 0 : int delinquent = (replay_slot>vote_slot && replay_slot-vote_slot>vote_distance_threshold) ||
277 0 : (now-ctx->check_engine.vote_slot_changed_ns>vote_stall_threshold_ns);
278 0 : vote_status = fd_ulong_if( delinquent,
279 0 : FD_DIAG_VOTE_STATUS_DELINQUENT,
280 0 : FD_DIAG_VOTE_STATUS_VOTING );
281 0 : }
282 0 : }
283 0 : }
284 :
285 0 : ulong replay_idx = ctx->tiles.replay_idx;
286 0 : int replay_running = replay_idx!=ULONG_MAX && ctx->metrics[ replay_idx ][ FD_METRICS_GAUGE_TILE_STATUS_OFF ]==1UL;
287 0 : ulong replay_status = FD_DIAG_REPLAY_STATUS_DISABLED;
288 0 : if( FD_LIKELY( replay_idx!=ULONG_MAX ) ) {
289 0 : if( FD_UNLIKELY( !replay_running ) ) {
290 0 : replay_status = FD_DIAG_REPLAY_STATUS_NOT_STARTED;
291 0 : } else {
292 0 : volatile ulong * m = ctx->metrics[ replay_idx ];
293 0 : ulong turbine_slot = m[ FD_METRICS_GAUGE_REPLAY_REASSEMBLY_LATEST_SLOT_OFF ];
294 0 : ulong reset_slot = m[ FD_METRICS_GAUGE_REPLAY_RESET_SLOT_OFF ];
295 0 : if( FD_UNLIKELY( reset_slot!=ctx->check_engine.prev_reset_slot ) ) {
296 0 : ctx->check_engine.prev_reset_slot = reset_slot;
297 0 : ctx->check_engine.reset_slot_changed_ns = now;
298 0 : }
299 0 : if( FD_UNLIKELY( (turbine_slot==0UL) || (reset_slot==0UL) ) ) {
300 0 : replay_status = FD_DIAG_REPLAY_STATUS_NOT_STARTED;
301 0 : } else if( FD_UNLIKELY( ((turbine_slot>reset_slot) && (turbine_slot-reset_slot>replay_distance_threshold)) ||
302 0 : (now-ctx->check_engine.reset_slot_changed_ns>replay_stall_threshold_ns) ) ) {
303 0 : replay_status = FD_DIAG_REPLAY_STATUS_BEHIND;
304 0 : } else {
305 0 : replay_status = FD_DIAG_REPLAY_STATUS_RUNNING;
306 0 : }
307 0 : }
308 0 : }
309 :
310 0 : ulong shred_cnt = ctx->tiles.shred_cnt;
311 0 : ulong turbine_status = FD_DIAG_TURBINE_STATUS_DISABLED;
312 0 : if( FD_LIKELY( replay_idx!=ULONG_MAX && shred_cnt>0UL ) ) {
313 0 : if( FD_UNLIKELY( !replay_running ) ) {
314 0 : turbine_status = FD_DIAG_TURBINE_STATUS_NOT_STARTED;
315 0 : } else {
316 0 : int all_shred_running = 1;
317 0 : ulong cur_turbine_bytes = 0UL, cur_repair_bytes = 0UL;
318 0 : for( ulong i=0UL; i<shred_cnt; i++ ) {
319 0 : volatile ulong * sm = ctx->metrics[ ctx->tiles.shred_tile_idx[ i ] ];
320 0 : cur_turbine_bytes += sm[ FD_METRICS_COUNTER_SHRED_SHRED_TURBINE_RX_BYTES_OFF ];
321 0 : cur_repair_bytes += sm[ FD_METRICS_COUNTER_SHRED_SHRED_REPAIR_RX_BYTES_OFF ];
322 0 : if( FD_UNLIKELY( sm[ FD_METRICS_GAUGE_TILE_STATUS_OFF ]!=1UL ) ) {
323 0 : all_shred_running = 0;
324 0 : break;
325 0 : }
326 0 : }
327 0 : if( FD_UNLIKELY( !all_shred_running ) ) {
328 0 : turbine_status = FD_DIAG_TURBINE_STATUS_NOT_STARTED;
329 0 : } else {
330 0 : ulong turbine_slot = ctx->metrics[ replay_idx ][ FD_METRICS_GAUGE_REPLAY_REASSEMBLY_LATEST_SLOT_OFF ];
331 0 : if( FD_UNLIKELY( turbine_slot!=ctx->check_engine.prev_turbine_slot ) ) {
332 0 : ctx->check_engine.prev_turbine_slot = turbine_slot;
333 0 : ctx->check_engine.turbine_slot_changed_ns = now;
334 0 : }
335 0 : if( FD_UNLIKELY( now-ctx->check_engine.byte_snapshot_ns>=turbine_byte_cmp_window_ns ) ) {
336 0 : ctx->check_engine.repair_outpacing = (cur_repair_bytes-ctx->check_engine.snapshot_repair_bytes)>(cur_turbine_bytes-ctx->check_engine.snapshot_turbine_bytes);
337 0 : ctx->check_engine.snapshot_turbine_bytes = cur_turbine_bytes;
338 0 : ctx->check_engine.snapshot_repair_bytes = cur_repair_bytes;
339 0 : ctx->check_engine.byte_snapshot_ns = now;
340 0 : }
341 :
342 0 : if( FD_UNLIKELY( turbine_slot==0UL ) ) {
343 0 : turbine_status = FD_DIAG_TURBINE_STATUS_NOT_STARTED;
344 0 : } else if( FD_UNLIKELY( now-ctx->check_engine.turbine_slot_changed_ns>turbine_stall_threshold_ns ) ) {
345 0 : turbine_status = FD_DIAG_TURBINE_STATUS_STALLED;
346 0 : } else if( FD_UNLIKELY( ctx->check_engine.repair_outpacing ) ) {
347 0 : turbine_status = FD_DIAG_TURBINE_STATUS_REPAIR_OUTPACING;
348 0 : } else {
349 0 : turbine_status = FD_DIAG_TURBINE_STATUS_RUNNING;
350 0 : }
351 0 : }
352 0 : }
353 0 : }
354 :
355 0 : FD_MGAUGE_SET( DIAG, BUNDLE_STATUS, bundle_status );
356 0 : FD_MGAUGE_SET( DIAG, VOTE_STATUS, vote_status );
357 0 : FD_MGAUGE_SET( DIAG, REPLAY_STATUS, replay_status );
358 0 : FD_MGAUGE_SET( DIAG, TURBINE_STATUS, turbine_status );
359 0 : }
360 :
361 : static void
362 0 : irq_metrics( fd_diag_tile_t * ctx ) {
363 0 : if( FD_UNLIKELY( -1==lseek( ctx->proc_softirqs_fd, 0, SEEK_SET ) ) ) FD_LOG_ERR(( "lseek failed (%i-%s)", errno, strerror( errno ) ));
364 0 : ulong softirq_cpu_cnt = fd_proc_softirqs_sum( ctx->proc_softirqs_fd, ctx->irq_cnt );
365 0 : if( FD_UNLIKELY( !softirq_cpu_cnt ) ) return; /* parse fail */
366 :
367 0 : ulong volatile * softirq_total = &fd_metrics_tl[ MIDX( COUNTER, DIAG, SOFTIRQ ) ];
368 0 : ulong volatile * softirq_undesired = &fd_metrics_tl[ MIDX( COUNTER, DIAG, SOFTIRQ_UNDESIRED ) ];
369 0 : for( ulong j=0UL; j<FD_METRICS_ENUM_SOFTIRQ_CNT; j++ ) {
370 0 : ulong tot_cnt = 0UL;
371 0 : ulong undesired_cnt = 0UL;
372 0 : for( ulong i=0UL; i<softirq_cpu_cnt; i++ ) {
373 0 : ulong since = fd_ulong_sat_sub( ctx->irq_cnt[ j ][ i ], ctx->softirq_baseline[ j ][ i ] );
374 0 : tot_cnt += since;
375 0 : if( fd_cpuset_test( ctx->cpu_has_tile, i ) ) {
376 0 : undesired_cnt += since;
377 0 : }
378 0 : }
379 0 : softirq_total [ j ] = tot_cnt;
380 0 : softirq_undesired[ j ] = undesired_cnt;
381 0 : }
382 :
383 0 : ulong * cpu_irq = ctx->irq_cnt[ 0 ]; /* re-use as scratch memory */
384 0 : if( FD_UNLIKELY( -1==lseek( ctx->proc_interrupts_fd, 0, SEEK_SET ) ) ) FD_LOG_ERR(( "lseek failed (%i-%s)", errno, strerror( errno ) ));
385 0 : ulong device_cpu_cnt = fd_proc_interrupts_colwise( ctx->proc_interrupts_fd, cpu_irq );
386 0 : if( FD_UNLIKELY( !device_cpu_cnt ) ) return; /* parse fail */
387 :
388 0 : ulong tot_cnt = 0UL;
389 0 : ulong undesired_cnt = 0UL;
390 0 : for( ulong i=0UL; i<device_cpu_cnt; i++ ) {
391 0 : ulong since = fd_ulong_sat_sub( cpu_irq[ i ], ctx->device_irq_baseline[ i ] );
392 0 : tot_cnt += since;
393 0 : if( fd_cpuset_test( ctx->cpu_has_tile, i ) ) {
394 0 : undesired_cnt += since;
395 0 : }
396 0 : ulong tile_id = ctx->cpu_to_tile[ i ];
397 0 : if( tile_id!=USHORT_MAX ) {
398 0 : ctx->metrics[ tile_id ][ FD_METRICS_COUNTER_TILE_IRQ_PREEMPTED_OFF ] = since;
399 0 : }
400 0 : }
401 0 : FD_MCNT_SET( DIAG, DEVICE_IRQ, tot_cnt );
402 0 : FD_MCNT_SET( DIAG, DEVICE_IRQ_UNDESIRED, undesired_cnt );
403 :
404 0 : ulong * cpu_tlb = ctx->irq_cnt[ 0 ]; /* re-use as scratch memory */
405 0 : if( FD_UNLIKELY( -1==lseek( ctx->proc_interrupts_fd, 0, SEEK_SET ) ) ) FD_LOG_ERR(( "lseek failed (%i-%s)", errno, strerror( errno ) ));
406 0 : ulong tlb_cpu_cnt = fd_proc_interrupts_tlb( ctx->proc_interrupts_fd, cpu_tlb );
407 0 : if( FD_UNLIKELY( !tlb_cpu_cnt ) ) return; /* parse fail */
408 :
409 0 : for( ulong i=0UL; i<tlb_cpu_cnt; i++ ) {
410 0 : ulong tile_id = ctx->cpu_to_tile[ i ];
411 0 : if( tile_id!=USHORT_MAX ) {
412 0 : ulong since = fd_ulong_sat_sub( cpu_tlb[ i ], ctx->tlb_baseline[ i ] );
413 0 : ctx->metrics[ tile_id ][ FD_METRICS_COUNTER_TILE_TLB_SHOOTDOWN_OFF ] = since;
414 0 : }
415 0 : }
416 0 : }
417 :
418 : static void
419 : before_credit( fd_diag_tile_t * ctx,
420 : fd_stem_context_t * stem,
421 0 : int * charge_busy ) {
422 0 : (void)stem;
423 :
424 0 : long now = fd_log_wallclock();
425 0 : if( now<ctx->next_report_nanos ) {
426 0 : long diff = ctx->next_report_nanos - now;
427 0 : diff = fd_long_min( diff, 2e6 /* 2ms */ );
428 0 : struct timespec const ts = {
429 0 : .tv_sec = diff / (long)1e9,
430 0 : .tv_nsec = diff % (long)1e9
431 0 : };
432 0 : clock_nanosleep( CLOCK_REALTIME, 0, &ts, NULL );
433 0 : return;
434 0 : }
435 0 : ctx->next_report_nanos += REPORT_INTERVAL_MILLIS*1000L*1000L;
436 :
437 0 : *charge_busy = 1;
438 :
439 0 : struct timespec boottime;
440 0 : if( FD_UNLIKELY( -1==clock_gettime( CLOCK_BOOTTIME, &boottime ) ) ) FD_LOG_ERR(( "clock_gettime(CLOCK_BOOTTIME) failed (%i-%s)", errno, strerror( errno ) ));
441 0 : ulong now_since_boot_nanos = (ulong)boottime.tv_sec*1000000000UL + (ulong)boottime.tv_nsec;
442 :
443 0 : for( ulong i=0UL; i<ctx->tile_cnt; i++ ) {
444 0 : if( FD_UNLIKELY( -1==ctx->stat_fds[ i ] ) ) continue;
445 :
446 : /* CLK_TCK is typically 100, so 1 tick = 10ms = 10,000,000 ns */
447 0 : int process_died1 = read_stat_file( ctx->stat_fds[ i ], 10000000UL, ctx->metrics[ i ] );
448 0 : int process_died2 = read_sched_file( ctx->sched_fds[ i ], ctx->metrics[ i ] );
449 :
450 0 : if( FD_UNLIKELY( process_died1 || process_died2 ) ) {
451 0 : ctx->stat_fds[ i ] = -1;
452 0 : continue;
453 0 : }
454 :
455 0 : ulong task_lifetime_nanos = now_since_boot_nanos - ctx->starttime_nanos[ i ];
456 0 : ulong user_nanos = ctx->metrics[ i ][ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_USER_OFF ];
457 0 : ulong system_nanos = ctx->metrics[ i ][ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_SYSTEM_OFF ];
458 0 : ulong wait_nanos = ctx->metrics[ i ][ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_WAIT_OFF ];
459 0 : ulong busy_nanos = user_nanos+system_nanos+wait_nanos;
460 0 : ulong idle_nanos = (task_lifetime_nanos>busy_nanos) ? (task_lifetime_nanos-busy_nanos) : 0UL;
461 :
462 : /* Counter can't go backwards in Prometheus else it thinks the
463 : application restarted. Use max to ensure monotonicity. */
464 0 : ctx->metrics[ i ][ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_IDLE_OFF ] = fd_ulong_max( idle_nanos, ctx->metrics[ i ][ FD_METRICS_COUNTER_TILE_CPU_DURATION_NANOS_IDLE_OFF ] );
465 0 : }
466 :
467 0 : for( ulong i=0UL; i<ctx->tile_cnt; i++ ) {
468 0 : if( FD_LIKELY( -1!=ctx->stat_fds[ i ] ) ) continue;
469 :
470 : /* The tile died, but it's a tile which is allowed to shutdown, so
471 : just stop updating metrics for it. */
472 0 : if( FD_LIKELY( 2UL==ctx->metrics[ i ][ FD_METRICS_GAUGE_TILE_STATUS_OFF ] ) ) continue;
473 :
474 : /* Supervisor is going to bring the whole process tree down if any
475 : of the target PIDs died, so we can ignore this and wait. */
476 0 : if( FD_UNLIKELY( !ctx->first_seen_died[ i ] ) ) {
477 0 : ctx->first_seen_died[ i ] = now;
478 0 : } else if( FD_LIKELY( ctx->first_seen_died[ i ]==LONG_MAX ) ) {
479 : /* We already reported this, so we can ignore it. */
480 0 : } else if( FD_UNLIKELY( now-ctx->first_seen_died[ i ] < 10L*1000L*1000L*1000L ) ) {
481 : /* Wait 10 seconds for supervisor to kill us before reporting WARNING */
482 0 : } else {
483 0 : FD_LOG_WARNING(( "cannot get metrics for dead tile idx %lu", i ));
484 0 : ctx->first_seen_died[ i ] = LONG_MAX;
485 0 : }
486 0 : }
487 :
488 0 : check_engine_metric( ctx, now );
489 0 : irq_metrics( ctx );
490 0 : }
491 :
492 : static void
493 : privileged_init( fd_topo_t const * topo,
494 0 : fd_topo_tile_t const * tile ) {
495 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
496 :
497 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
498 0 : fd_diag_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_diag_tile_t), sizeof(fd_diag_tile_t) );
499 :
500 0 : FD_TEST( topo->tile_cnt<FD_TILE_MAX );
501 :
502 0 : FD_TEST( 100L == sysconf( _SC_CLK_TCK ) );
503 :
504 0 : ctx->tile_cnt = topo->tile_cnt;
505 0 : for( ulong i=0UL; i<FD_TILE_MAX; i++ ) {
506 0 : ctx->stat_fds[ i ] = -1;
507 0 : ctx->sched_fds[ i ] = -1;
508 0 : }
509 :
510 0 : for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
511 0 : ulong * metrics = fd_metrics_join( fd_topo_obj_laddr( topo, topo->tiles[ i ].metrics_obj_id ) );
512 :
513 0 : for(;;) {
514 0 : ulong pid, tid;
515 0 : if( FD_UNLIKELY( tile->id==i ) ) {
516 0 : pid = fd_sandbox_getpid();
517 0 : tid = fd_sandbox_gettid();
518 0 : } else {
519 0 : pid = fd_metrics_tile( metrics )[ FD_METRICS_GAUGE_TILE_PID_OFF ];
520 0 : tid = fd_metrics_tile( metrics )[ FD_METRICS_GAUGE_TILE_TID_OFF ];
521 0 : if( FD_UNLIKELY( !pid || !tid ) ) {
522 0 : FD_SPIN_PAUSE();
523 0 : continue;
524 0 : }
525 0 : }
526 :
527 0 : ctx->metrics[ i ] = fd_metrics_tile( metrics );
528 :
529 0 : char path[ 64UL ];
530 0 : FD_TEST( fd_cstr_printf_check( path, sizeof( path ), NULL, "/proc/%lu/task/%lu/stat", pid, tid ) );
531 0 : ctx->stat_fds[ i ] = open( path, O_RDONLY );
532 0 : if( FD_UNLIKELY( -1==ctx->stat_fds[ i ] ) ) {
533 : /* Might be a tile that's allowed to shutdown already did so
534 : before we got to here, due to a race condition. Just
535 : proceed, we will not be able to get metrics for the shut
536 : down process. */
537 0 : if( FD_LIKELY( 2UL!=ctx->metrics[ i ][ FD_METRICS_GAUGE_TILE_STATUS_OFF ] ) ) FD_LOG_ERR(( "open stat failed (%i-%s)", errno, strerror( errno ) ));
538 0 : break;
539 0 : }
540 :
541 0 : FD_TEST( fd_cstr_printf_check( path, sizeof( path ), NULL, "/proc/%lu/task/%lu/sched", pid, tid ) );
542 0 : ctx->sched_fds[ i ] = open( path, O_RDONLY );
543 0 : if( FD_UNLIKELY( -1==ctx->sched_fds[ i ] ) ) {
544 0 : if( FD_LIKELY( 2UL!=ctx->metrics[ i ][ FD_METRICS_GAUGE_TILE_STATUS_OFF ] ) ) FD_LOG_ERR(( "open sched failed (%i-%s)", errno, strerror( errno ) ));
545 0 : ctx->stat_fds[ i ] = -1;
546 0 : }
547 0 : break;
548 0 : }
549 0 : }
550 :
551 0 : ctx->proc_interrupts_fd = open( "/proc/interrupts", O_RDONLY );
552 0 : if( FD_UNLIKELY( -1==ctx->proc_interrupts_fd ) ) FD_LOG_ERR(( "open(/proc/interrupts) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
553 :
554 0 : ctx->proc_softirqs_fd = open( "/proc/softirqs", O_RDONLY );
555 0 : if( FD_UNLIKELY( -1==ctx->proc_softirqs_fd ) ) FD_LOG_ERR(( "open(/proc/softirqs) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
556 0 : }
557 :
558 : /* Read starttime (field 22) from stat file. Returns 0 on success, 1 if
559 : process died (ESRCH). */
560 :
561 : static int
562 : read_starttime( int fd,
563 : ulong ns_per_tick,
564 0 : ulong * out_starttime_nanos ) {
565 0 : char contents[ 4096 ] = {0};
566 0 : ulong contents_len = 0UL;
567 :
568 0 : while( 1 ) {
569 0 : if( FD_UNLIKELY( contents_len>=sizeof( contents ) ) ) FD_LOG_ERR(( "stat contents overflow" ));
570 0 : long n = read( fd, contents + contents_len, sizeof( contents ) - contents_len );
571 0 : if( FD_UNLIKELY( -1==n ) ) {
572 0 : if( FD_UNLIKELY( errno==ESRCH ) ) return 1;
573 0 : FD_LOG_ERR(( "read stat failed (%i-%s)", errno, strerror( errno ) ));
574 0 : }
575 0 : if( FD_LIKELY( 0L==n ) ) break;
576 0 : contents_len += (ulong)n;
577 0 : }
578 :
579 : /* Parse field 22 (starttime) from stat file */
580 0 : char * saveptr;
581 0 : char * token = strtok_r( contents, " ", &saveptr );
582 0 : ulong field_idx = 0UL;
583 :
584 0 : while( token && field_idx<21UL ) {
585 0 : token = strtok_r( NULL, " ", &saveptr );
586 0 : field_idx++;
587 0 : }
588 :
589 0 : if( FD_UNLIKELY( !token || field_idx!=21UL ) ) FD_LOG_ERR(( "starttime (field 22) not found in stat" ));
590 :
591 0 : char * endptr;
592 0 : ulong starttime_ticks = strtoul( token, &endptr, 10 );
593 0 : if( FD_UNLIKELY( *endptr!=' ' && *endptr!='\0' ) ) FD_LOG_ERR(( "strtoul failed for starttime" ));
594 0 : if( FD_UNLIKELY( starttime_ticks==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul overflow for starttime" ));
595 :
596 0 : *out_starttime_nanos = starttime_ticks * ns_per_tick;
597 0 : return 0;
598 0 : }
599 :
600 : static void
601 : unprivileged_init( fd_topo_t const * topo,
602 0 : fd_topo_tile_t const * tile ) {
603 0 : fd_diag_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id );
604 :
605 0 : memset( ctx->first_seen_died, 0, sizeof( ctx->first_seen_died ) );
606 0 : ctx->next_report_nanos = fd_log_wallclock();
607 :
608 : /* Snapshot the cumulative-since-boot /proc interrupt/softirq counters
609 : so the metrics we report are counted since process startup. */
610 0 : memset( ctx->softirq_baseline, 0, sizeof( ctx->softirq_baseline ) );
611 0 : memset( ctx->device_irq_baseline, 0, sizeof( ctx->device_irq_baseline ) );
612 0 : memset( ctx->tlb_baseline, 0, sizeof( ctx->tlb_baseline ) );
613 0 : if( FD_UNLIKELY( -1==lseek( ctx->proc_softirqs_fd, 0, SEEK_SET ) ) ) FD_LOG_ERR(( "lseek failed (%i-%s)", errno, strerror( errno ) ));
614 0 : ulong softirq_cpu_cnt = fd_proc_softirqs_sum( ctx->proc_softirqs_fd, ctx->softirq_baseline );
615 0 : if( FD_UNLIKELY( !softirq_cpu_cnt ) ) FD_LOG_WARNING(( "failed to read softirq baseline from /proc/softirqs" ));
616 :
617 0 : if( FD_UNLIKELY( -1==lseek( ctx->proc_interrupts_fd, 0, SEEK_SET ) ) ) FD_LOG_ERR(( "lseek failed (%i-%s)", errno, strerror( errno ) ));
618 0 : ulong device_cpu_cnt = fd_proc_interrupts_colwise( ctx->proc_interrupts_fd, ctx->device_irq_baseline );
619 0 : if( FD_UNLIKELY( !device_cpu_cnt ) ) FD_LOG_WARNING(( "failed to read device IRQ baseline from /proc/interrupts" ));
620 :
621 0 : if( FD_UNLIKELY( -1==lseek( ctx->proc_interrupts_fd, 0, SEEK_SET ) ) ) FD_LOG_ERR(( "lseek failed (%i-%s)", errno, strerror( errno ) ));
622 0 : ulong tlb_cpu_cnt = fd_proc_interrupts_tlb( ctx->proc_interrupts_fd, ctx->tlb_baseline );
623 0 : if( FD_UNLIKELY( !tlb_cpu_cnt ) ) FD_LOG_WARNING(( "failed to read TLB baseline from /proc/interrupts" ));
624 :
625 : /* Read starttime (field 22) once at init for idle time calculation.
626 : CLK_TCK is always 100, so 1 tick = 10ms = 10,000,000 ns. */
627 0 : for( ulong i=0UL; i<ctx->tile_cnt; i++ ) {
628 0 : if( FD_LIKELY( -1!=ctx->stat_fds[ i ] ) ) {
629 0 : int died = read_starttime( ctx->stat_fds[ i ], 10000000UL, &ctx->starttime_nanos[ i ] );
630 0 : if( FD_UNLIKELY( died ) ) ctx->stat_fds[ i ] = -1;
631 0 : }
632 0 : }
633 :
634 0 : memset( &ctx->check_engine, 0, sizeof(ctx->check_engine) );
635 :
636 0 : ctx->tiles.bundle_cnt = fd_topo_tile_name_cnt( topo, "bundle" );
637 0 : for( ulong i=0UL; i<ctx->tiles.bundle_cnt; i++ ) ctx->tiles.bundle_tile_idx[ i ] = fd_topo_find_tile( topo, "bundle", i );
638 0 : ctx->tiles.shred_cnt = fd_topo_tile_name_cnt( topo, "shred" );
639 0 : for( ulong i=0UL; i<ctx->tiles.shred_cnt; i++ ) ctx->tiles.shred_tile_idx[ i ] = fd_topo_find_tile( topo, "shred", i );
640 0 : ctx->tiles.tower_idx = fd_topo_find_tile( topo, "tower", 0UL );
641 0 : ctx->tiles.replay_idx = fd_topo_find_tile( topo, "replay", 0UL );
642 :
643 0 : fd_cpuset_new( &ctx->cpu_has_tile );
644 0 : for( ulong i=0UL; i<(topo->tile_cnt); i++ ) {
645 0 : ulong cpu_idx = topo->tiles[ i ].cpu_idx;
646 0 : if( cpu_idx>=FD_TILE_MAX ) continue;
647 0 : fd_cpuset_insert( ctx->cpu_has_tile, cpu_idx );
648 0 : }
649 :
650 0 : for( ulong i=0UL; i<FD_TILE_MAX; i++ ) ctx->cpu_to_tile[ i ] = USHORT_MAX;
651 0 : for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
652 0 : ulong cpu_idx = topo->tiles[ i ].cpu_idx;
653 0 : if( cpu_idx>=FD_TILE_MAX ) continue;
654 0 : ctx->cpu_to_tile[ cpu_idx ] = (ushort)i;
655 0 : }
656 :
657 0 : long now = fd_log_wallclock();
658 0 : ctx->is_voting = tile->diag.is_voting;
659 0 : ctx->check_engine.vote_slot_changed_ns = now;
660 0 : ctx->check_engine.reset_slot_changed_ns = now;
661 0 : ctx->check_engine.turbine_slot_changed_ns = now;
662 0 : ctx->check_engine.byte_snapshot_ns = now;
663 0 : }
664 :
665 : static ulong
666 : populate_allowed_seccomp( fd_topo_t const * topo,
667 : fd_topo_tile_t const * tile,
668 : ulong out_cnt,
669 0 : struct sock_filter * out ) {
670 0 : (void)topo;
671 0 : (void)tile;
672 :
673 0 : populate_sock_filter_policy_fd_diag_tile( out_cnt, out, (uint)fd_log_private_logfile_fd() );
674 0 : return sock_filter_policy_fd_diag_tile_instr_cnt;
675 0 : }
676 :
677 : static ulong
678 : populate_allowed_fds( fd_topo_t const * topo,
679 : fd_topo_tile_t const * tile,
680 : ulong out_fds_cnt,
681 0 : int * out_fds ) {
682 0 : fd_diag_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id );
683 :
684 0 : if( FD_UNLIKELY( out_fds_cnt<4UL+2UL*ctx->tile_cnt ) ) FD_LOG_ERR(( "out_fds_cnt %lu", out_fds_cnt ));
685 :
686 0 : ulong out_cnt = 0UL;
687 0 : out_fds[ out_cnt++ ] = 2; /* stderr */
688 0 : if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) )
689 0 : out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */
690 0 : out_fds[ out_cnt++ ] = ctx->proc_interrupts_fd; /* /proc/interrupts */
691 0 : out_fds[ out_cnt++ ] = ctx->proc_softirqs_fd; /* /proc/softirqs */
692 0 : for( ulong i=0UL; i<ctx->tile_cnt; i++ ) {
693 0 : if( -1!=ctx->stat_fds[ i ] ) out_fds[ out_cnt++ ] = ctx->stat_fds[ i ]; /* /proc/<pid>/task/<tid>/stat */
694 0 : if( -1!=ctx->sched_fds[ i ] ) out_fds[ out_cnt++ ] = ctx->sched_fds[ i ]; /* /proc/<pid>/task/<tid>/sched */
695 0 : }
696 0 : return out_cnt;
697 0 : }
698 :
/* Configuration for the stem template included below: the context
   type and its alignment are those of fd_diag_tile_t and the only
   callback provided is before_credit.
   NOTE(review): the exact meaning of STEM_BURST and STEM_LAZY is
   defined by fd_stem.c, which is not visible here -- confirm there. */

#define STEM_BURST (1UL)
#define STEM_LAZY  ((long)10e6) /* 10ms */

#define STEM_CALLBACK_CONTEXT_TYPE  fd_diag_tile_t
#define STEM_CALLBACK_CONTEXT_ALIGN alignof(fd_diag_tile_t)

#define STEM_CALLBACK_BEFORE_CREDIT before_credit

#include "../../disco/stem/fd_stem.c"
708 :
/* fd_tile_diag is the run descriptor of the "diag" tile.  It wires up
   the sandbox, scratch and init callbacks defined in this file.
   NOTE(review): stem_run is not defined in the visible part of this
   file; presumably it is generated by the fd_stem.c include above --
   confirm. */

fd_topo_run_tile_t fd_tile_diag = {
  .name                     = "diag",
  .populate_allowed_seccomp = populate_allowed_seccomp,
  .populate_allowed_fds     = populate_allowed_fds,
  .scratch_align            = scratch_align,
  .scratch_footprint        = scratch_footprint,
  .privileged_init          = privileged_init,
  .unprivileged_init        = unprivileged_init,
  .run                      = stem_run,
};
|