Line data Source code
1 : #include "../metrics/fd_metrics.h"
2 : #include "../stem/fd_stem.h"
3 : #include "../topo/fd_topo.h"
4 :
5 : #include <fcntl.h>
6 : #include <errno.h>
7 : #include <stdlib.h>
8 : #include <sys/types.h> /* SEEK_SET */
9 : #include <time.h>
10 : #include <unistd.h>
11 :
12 : #include "generated/fd_cswtch_tile_seccomp.h"
13 :
/* REPORT_INTERVAL_MILLIS is the period, in milliseconds, by which
   before_credit advances its next sampling deadline. */

#define REPORT_INTERVAL_MILLIS (100L)
15 :
16 : typedef struct {
17 : long next_report_nanos;
18 :
19 : ulong tile_cnt;
20 : long first_seen_died[ FD_TILE_MAX ];
21 : int status_fds[ FD_TILE_MAX ];
22 : volatile ulong * metrics[ FD_TILE_MAX ];
23 : } fd_cswtch_ctx_t;
24 :
25 : FD_FN_CONST static inline ulong
26 0 : scratch_align( void ) {
27 0 : return 128UL;
28 0 : }
29 :
30 : FD_FN_PURE static inline ulong
31 0 : scratch_footprint( fd_topo_tile_t const * tile ) {
32 0 : (void)tile;
33 0 : ulong l = FD_LAYOUT_INIT;
34 0 : l = FD_LAYOUT_APPEND( l, alignof( fd_cswtch_ctx_t ), sizeof( fd_cswtch_ctx_t ) );
35 0 : return FD_LAYOUT_FINI( l, scratch_align() );
36 0 : }
37 :
38 : static void
39 : before_credit( fd_cswtch_ctx_t * ctx,
40 : fd_stem_context_t * stem,
41 0 : int * charge_busy ) {
42 0 : (void)stem;
43 :
44 0 : long now = fd_log_wallclock();
45 0 : if( now<ctx->next_report_nanos ) {
46 0 : long diff = ctx->next_report_nanos - now;
47 0 : diff = fd_long_min( diff, 2e6 /* 2ms */ );
48 0 : struct timespec const ts = {
49 0 : .tv_sec = diff / (long)1e9,
50 0 : .tv_nsec = diff % (long)1e9
51 0 : };
52 0 : clock_nanosleep( CLOCK_REALTIME, 0, &ts, NULL );
53 0 : return;
54 0 : }
55 0 : ctx->next_report_nanos += REPORT_INTERVAL_MILLIS*1000L*1000L;
56 :
57 0 : *charge_busy = 1;
58 :
59 0 : for( ulong i=0UL; i<ctx->tile_cnt; i++ ) {
60 0 : if( FD_UNLIKELY( -1==ctx->status_fds[ i ] ) ) continue;
61 :
62 0 : if( FD_UNLIKELY( -1==lseek( ctx->status_fds[ i ], 0, SEEK_SET ) ) ) FD_LOG_ERR(( "lseek failed (%i-%s)", errno, strerror( errno ) ));
63 :
64 0 : char contents[ 4096 ] = {0};
65 0 : ulong contents_len = 0UL;
66 :
67 0 : int process_died = 0;
68 0 : while( 1 ) {
69 0 : if( FD_UNLIKELY( contents_len>=sizeof( contents ) ) ) FD_LOG_ERR(( "contents overflow" ));
70 0 : long n = read( ctx->status_fds[ i ], contents + contents_len, sizeof( contents ) - contents_len );
71 0 : if( FD_UNLIKELY( -1==n ) ) {
72 0 : if( FD_UNLIKELY( errno==ESRCH ) ) {
73 0 : process_died = 1;
74 0 : break;
75 0 : }
76 0 : FD_LOG_ERR(( "read failed (%i-%s)", errno, strerror( errno ) ));
77 0 : }
78 0 : if( FD_LIKELY( 0==n ) ) break;
79 0 : contents_len += (ulong)n;
80 0 : }
81 :
82 0 : if( FD_UNLIKELY( process_died ) ) {
83 : /* The tile died, but it's a tile which is allowed to shutdown, so
84 : just stop updating metrics for it. */
85 0 : if( FD_LIKELY( 2UL==ctx->metrics[ i ][ FD_METRICS_GAUGE_TILE_STATUS_OFF ] ) ) {
86 0 : ctx->status_fds[ i ] = -1; /* stop trying to read from it */
87 0 : continue;
88 0 : }
89 0 : }
90 :
91 : /* Supervisor is going to bring the whole process tree down if any
92 : of the target PIDs died, so we can ignore this and wait. */
93 0 : if( FD_UNLIKELY( process_died ) ) {
94 0 : if( FD_UNLIKELY( !ctx->first_seen_died[ i ] ) ) {
95 0 : ctx->first_seen_died[ i ] = now;
96 0 : } else if( FD_LIKELY( ctx->first_seen_died[ i ]==LONG_MAX ) ) {
97 : /* We already reported this, so we can ignore it. */
98 0 : } else if( FD_UNLIKELY( now-ctx->first_seen_died[ i ] < 10L*1000L*1000L*1000L ) ) {
99 : /* Wait 10 seconds for supervisor to kill us before reporting WARNING */
100 0 : } else {
101 0 : FD_LOG_WARNING(( "cannot get context switch metrics for dead tile idx %lu", i ));
102 0 : ctx->first_seen_died[ i ] = LONG_MAX;
103 0 : }
104 0 : continue;
105 0 : }
106 :
107 0 : int found_voluntary = 0;
108 0 : int found_involuntary = 0;
109 :
110 0 : char * line = contents;
111 0 : while( 1 ) {
112 0 : char * next_line = strchr( line, '\n' );
113 0 : if( FD_UNLIKELY( NULL==next_line ) ) break;
114 0 : *next_line = '\0';
115 :
116 0 : char * colon = strchr( line, ':' );
117 0 : if( FD_UNLIKELY( NULL==colon ) ) FD_LOG_ERR(( "no colon in line '%s'", line ));
118 :
119 0 : *colon = '\0';
120 0 : char * key = line;
121 0 : char * value = colon + 1;
122 :
123 0 : while( ' '==*value || '\t'==*value ) value++;
124 :
125 0 : if( FD_LIKELY( !strncmp( key, "voluntary_ctxt_switches", 23UL ) ) ) {
126 0 : char * endptr;
127 0 : ulong voluntary_ctxt_switches = strtoul( value, &endptr, 10 );
128 0 : if( FD_UNLIKELY( *endptr!='\0' || voluntary_ctxt_switches==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul failed" ));
129 0 : ctx->metrics[ i ][ FD_METRICS_COUNTER_TILE_CONTEXT_SWITCH_VOLUNTARY_COUNT_OFF ] = voluntary_ctxt_switches;
130 0 : found_voluntary = 1;
131 0 : } else if( FD_LIKELY( !strncmp( key, "nonvoluntary_ctxt_switches", 26UL ) ) ) {
132 0 : char * endptr;
133 0 : ulong involuntary_ctxt_switches = strtoul( value, &endptr, 10 );
134 0 : if( FD_UNLIKELY( *endptr!='\0' || involuntary_ctxt_switches==ULONG_MAX ) ) FD_LOG_ERR(( "strtoul failed" ));
135 0 : ctx->metrics[ i ][ FD_METRICS_COUNTER_TILE_CONTEXT_SWITCH_INVOLUNTARY_COUNT_OFF ] = involuntary_ctxt_switches;
136 0 : found_involuntary = 1;
137 0 : }
138 :
139 0 : line = next_line + 1;
140 0 : }
141 :
142 0 : if( FD_UNLIKELY( !found_voluntary ) ) FD_LOG_ERR(( "voluntary_ctxt_switches not found" ));
143 0 : if( FD_UNLIKELY( !found_involuntary ) ) FD_LOG_ERR(( "nonvoluntary_ctxt_switches not found" ));
144 0 : }
145 0 : }
146 :
147 : static void
148 : privileged_init( fd_topo_t * topo,
149 0 : fd_topo_tile_t * tile ) {
150 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
151 :
152 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
153 0 : fd_cswtch_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_cswtch_ctx_t ), sizeof( fd_cswtch_ctx_t ) );
154 :
155 0 : FD_TEST( topo->tile_cnt<FD_TILE_MAX );
156 :
157 0 : ctx->tile_cnt = topo->tile_cnt;
158 0 : for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
159 0 : ulong * metrics = fd_metrics_join( fd_topo_obj_laddr( topo, topo->tiles[ i ].metrics_obj_id ) );
160 :
161 0 : for(;;) {
162 0 : ulong pid, tid;
163 0 : if( FD_UNLIKELY( tile->id==i ) ) {
164 0 : pid = fd_sandbox_getpid();
165 0 : tid = fd_sandbox_gettid();
166 0 : } else {
167 0 : pid = fd_metrics_tile( metrics )[ FD_METRICS_GAUGE_TILE_PID_OFF ];
168 0 : tid = fd_metrics_tile( metrics )[ FD_METRICS_GAUGE_TILE_TID_OFF ];
169 0 : if( FD_UNLIKELY( !pid || !tid ) ) {
170 0 : FD_SPIN_PAUSE();
171 0 : continue;
172 0 : }
173 0 : }
174 :
175 0 : char path[ 64 ];
176 0 : FD_TEST( fd_cstr_printf_check( path, sizeof( path ), NULL, "/proc/%lu/task/%lu/status", pid, tid ) );
177 0 : ctx->status_fds[ i ] = open( path, O_RDONLY );
178 0 : ctx->metrics[ i ] = fd_metrics_tile( metrics );
179 0 : if( FD_UNLIKELY( -1==ctx->status_fds[ i ] ) ) {
180 : /* Might be a tile that's allowed to shutdown already did so
181 : before we got to here, due to a race condition. Just
182 : proceed, we will not be able to get context switch metrics
183 : for the shut down process. */
184 0 : if( FD_LIKELY( 2UL!=ctx->metrics[ i ][ FD_METRICS_GAUGE_TILE_STATUS_OFF ] ) ) FD_LOG_ERR(( "open failed (%i-%s)", errno, strerror( errno ) ));
185 0 : }
186 0 : break;
187 0 : }
188 0 : }
189 0 : }
190 :
191 : static void
192 : unprivileged_init( fd_topo_t * topo,
193 0 : fd_topo_tile_t * tile ) {
194 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
195 :
196 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
197 0 : fd_cswtch_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_cswtch_ctx_t ), sizeof( fd_cswtch_ctx_t ) );
198 :
199 0 : memset( ctx->first_seen_died, 0, sizeof( ctx->first_seen_died ) );
200 0 : ctx->next_report_nanos = fd_log_wallclock();
201 :
202 0 : ulong scratch_top = FD_SCRATCH_ALLOC_FINI( l, 1UL );
203 0 : if( FD_UNLIKELY( scratch_top > (ulong)scratch + scratch_footprint( tile ) ) )
204 0 : FD_LOG_ERR(( "scratch overflow %lu %lu %lu", scratch_top - (ulong)scratch - scratch_footprint( tile ), scratch_top, (ulong)scratch + scratch_footprint( tile ) ));
205 0 : }
206 :
207 : static ulong
208 : populate_allowed_seccomp( fd_topo_t const * topo,
209 : fd_topo_tile_t const * tile,
210 : ulong out_cnt,
211 0 : struct sock_filter * out ) {
212 0 : (void)topo;
213 0 : (void)tile;
214 :
215 0 : populate_sock_filter_policy_fd_cswtch_tile( out_cnt, out, (uint)fd_log_private_logfile_fd() );
216 0 : return sock_filter_policy_fd_cswtch_tile_instr_cnt;
217 0 : }
218 :
219 : static ulong
220 : populate_allowed_fds( fd_topo_t const * topo,
221 : fd_topo_tile_t const * tile,
222 : ulong out_fds_cnt,
223 0 : int * out_fds ) {
224 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
225 :
226 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
227 0 : fd_cswtch_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_cswtch_ctx_t ), sizeof( fd_cswtch_ctx_t ) );
228 :
229 0 : if( FD_UNLIKELY( out_fds_cnt<2UL+ctx->tile_cnt ) ) FD_LOG_ERR(( "out_fds_cnt %lu", out_fds_cnt ));
230 :
231 0 : ulong out_cnt = 0UL;
232 0 : out_fds[ out_cnt++ ] = 2; /* stderr */
233 0 : if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) )
234 0 : out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */
235 0 : for( ulong i=0UL; i<ctx->tile_cnt; i++ ) {
236 0 : if( -1!=ctx->status_fds[ i ] ) out_fds[ out_cnt++ ] = ctx->status_fds[ i ]; /* /proc/<pid>/task/<tid>/status descriptor */
237 0 : }
238 0 : return out_cnt;
239 0 : }
240 :
241 0 : #define STEM_BURST (1UL)
242 0 : #define STEM_LAZY ((long)10e6) /* 10ms */
243 :
244 0 : #define STEM_CALLBACK_CONTEXT_TYPE fd_cswtch_ctx_t
245 0 : #define STEM_CALLBACK_CONTEXT_ALIGN alignof(fd_cswtch_ctx_t)
246 :
247 0 : #define STEM_CALLBACK_BEFORE_CREDIT before_credit
248 :
249 : #include "../../disco/stem/fd_stem.c"
250 :
251 : fd_topo_run_tile_t fd_tile_cswtch = {
252 : .name = "cswtch",
253 : .populate_allowed_seccomp = populate_allowed_seccomp,
254 : .populate_allowed_fds = populate_allowed_fds,
255 : .scratch_align = scratch_align,
256 : .scratch_footprint = scratch_footprint,
257 : .privileged_init = privileged_init,
258 : .unprivileged_init = unprivileged_init,
259 : .run = stem_run,
260 : };
|