Line data Source code
1 : #define _GNU_SOURCE
2 : #include "fd_topo.h"
3 :
4 : #include "../metrics/fd_metrics.h"
5 : #include "../../util/tile/fd_tile_private.h"
6 :
7 : #include <unistd.h>
8 : #include <signal.h>
9 : #include <errno.h>
10 : #include <pthread.h>
11 : #include <sys/syscall.h>
12 : #include <linux/futex.h>
13 : #include <sys/resource.h>
14 : #include <sys/prctl.h>
15 : #include <sys/stat.h>
16 : #include <sys/mman.h>
17 : #include <net/if.h>
18 :
19 : static void
20 : initialize_logging( char const * tile_name,
21 : ulong tile_kind_id,
22 0 : ulong tid ) {
23 0 : fd_log_cpu_set( NULL );
24 0 : fd_log_private_tid_set( tid );
25 0 : char thread_name[ 20 ];
26 0 : FD_TEST( fd_cstr_printf_check( thread_name, sizeof( thread_name ), NULL, "%s:%lu", tile_name, tile_kind_id ) );
27 0 : fd_log_thread_set( thread_name );
28 0 : fd_log_private_stack_discover( FD_TILE_PRIVATE_STACK_SZ,
29 0 : &fd_tile_private_stack0, &fd_tile_private_stack1 );
30 0 : FD_LOG_INFO(( "booting tile %s pid:%lu tid:%lu", thread_name, fd_log_group_id(), tid ));
31 :
32 : /* FD_LOG_* calls fd_log_wallclock_cstr, which calls localtime_r. In
33 : glibc, this ends up calling a function called tzset_internal. The
34 : first time tzset_internal is called by a process, it may (and
35 : almost always does) call __tzfile_read, which invokes the openat
36 : syscall, and possibly several others on the time zone file
37 : (typically /etc/localtime) or on a file in the time zone directory.
38 : This kind of behavior is tricky to sandbox, so the easiest thing to
39 : do is initialize it prior to the sandbox and hope whatever libc is
40 : used behaves like glibc. This only matters when both the logfile
41 : and stderr filters are strict enough so that the immediately prior
42 : FD_LOG_INFO call is a no-op, since otherwise that call would have
43 : taken care of it. */
44 0 : char wallclock[FD_LOG_WALLCLOCK_CSTR_BUF_SZ];
45 0 : fd_log_wallclock_cstr( 0L, wallclock );
46 0 : }
47 :
48 : static void
49 : check_wait_debugger( ulong pid,
50 : volatile int * wait,
51 0 : volatile int * debugger ) {
52 0 : if( FD_UNLIKELY( debugger ) ) {
53 0 : FD_LOG_WARNING(( "waiting for debugger to attach to tile pid:%lu", pid ));
54 0 : if( FD_UNLIKELY( -1==kill( getpid(), SIGSTOP ) ) )
55 0 : FD_LOG_ERR(( "kill(SIGSTOP) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
56 0 : *FD_VOLATILE( debugger ) = 1;
57 0 : }
58 :
59 0 : if( FD_UNLIKELY( wait ) ) {
60 0 : while( FD_LIKELY( !*FD_VOLATILE( wait ) ) ) FD_SPIN_PAUSE();
61 0 : }
62 0 : }
63 :
64 : void
65 : fd_topo_run_tile( fd_topo_t * topo,
66 : fd_topo_tile_t * tile,
67 : int sandbox,
68 : int keep_controlling_terminal,
69 : int core_dump_level,
70 : uint uid,
71 : uint gid,
72 : int allow_fd,
73 : volatile int * wait,
74 : volatile int * debugger,
75 0 : fd_topo_run_tile_t * tile_run ) {
76 0 : char thread_name[ 20 ];
77 0 : FD_TEST( fd_cstr_printf_check( thread_name, sizeof( thread_name ), NULL, "%s:%lu", tile->name, tile->kind_id ) );
78 0 : if( FD_UNLIKELY( prctl( PR_SET_NAME, thread_name, 0, 0, 0 ) ) ) FD_LOG_ERR(( "prctl(PR_SET_NAME) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
79 :
80 0 : ulong pid = fd_sandbox_getpid(); /* Need to read /proc again.. we got a new PID from clone */
81 0 : ulong tid = fd_sandbox_gettid(); /* Need to read /proc again.. we got a new TID from clone */
82 :
83 0 : check_wait_debugger( pid, wait, debugger );
84 0 : initialize_logging( tile->name, tile->kind_id, tid );
85 :
86 : /* preload shared memory before sandboxing, so it is already mapped */
87 0 : fd_topo_join_tile_workspaces( topo, tile, core_dump_level );
88 :
89 0 : if( FD_UNLIKELY( tile_run->privileged_init ) )
90 0 : tile_run->privileged_init( topo, tile );
91 :
92 0 : ulong allow_fds_offset = 0UL;
93 0 : int allow_fds[ 256 ] = { 0 };
94 0 : if( FD_LIKELY( -1!=allow_fd ) ) {
95 0 : allow_fds_offset = 1UL;
96 0 : allow_fds[ 0 ] = allow_fd;
97 0 : }
98 0 : ulong allow_fds_cnt = 0UL;
99 0 : if( FD_LIKELY( tile_run->populate_allowed_fds ) ) {
100 0 : allow_fds_cnt = tile_run->populate_allowed_fds( topo,
101 0 : tile,
102 0 : (sizeof(allow_fds)/sizeof(allow_fds[ 0 ]))-allow_fds_offset,
103 0 : allow_fds+allow_fds_offset );
104 0 : }
105 :
106 :
107 0 : struct sock_filter seccomp_filter[ 256UL ];
108 0 : ulong seccomp_filter_cnt = 0UL;
109 0 : if( FD_LIKELY( tile_run->populate_allowed_seccomp ) ) {
110 0 : seccomp_filter_cnt = tile_run->populate_allowed_seccomp( topo,
111 0 : tile,
112 0 : sizeof(seccomp_filter)/sizeof(seccomp_filter[ 0 ]),
113 0 : seccomp_filter );
114 0 : }
115 :
116 0 : ulong rlimit_file_cnt = tile_run->rlimit_file_cnt;
117 0 : if( tile_run->rlimit_file_cnt_fn ) {
118 0 : rlimit_file_cnt = tile_run->rlimit_file_cnt_fn( topo, tile );
119 0 : }
120 :
121 0 : if( FD_LIKELY( sandbox ) ) {
122 0 : int dumpable = core_dump_level == FD_TOPO_CORE_DUMP_LEVEL_DISABLED ? 0 : 1;
123 0 : fd_sandbox_enter( uid,
124 0 : gid,
125 0 : tile_run->keep_host_networking,
126 0 : tile_run->allow_connect,
127 0 : tile_run->allow_renameat,
128 0 : keep_controlling_terminal,
129 0 : dumpable,
130 0 : rlimit_file_cnt,
131 0 : tile_run->rlimit_address_space,
132 0 : tile_run->rlimit_data,
133 0 : allow_fds_cnt+allow_fds_offset,
134 0 : allow_fds,
135 0 : seccomp_filter_cnt,
136 0 : seccomp_filter );
137 0 : } else {
138 0 : fd_sandbox_switch_uid_gid( uid, gid );
139 0 : }
140 :
141 : /* Now we are sandboxed, join all the tango IPC objects in the workspaces */
142 0 : fd_topo_fill_tile( topo, tile );
143 :
144 0 : FD_TEST( tile->metrics );
145 0 : fd_metrics_register( tile->metrics );
146 :
147 0 : FD_MGAUGE_SET( TILE, PID, pid );
148 0 : FD_MGAUGE_SET( TILE, TID, tid );
149 :
150 0 : if( FD_UNLIKELY( tile_run->unprivileged_init ) )
151 0 : tile_run->unprivileged_init( topo, tile );
152 :
153 0 : tile_run->run( topo, tile );
154 0 : if( FD_UNLIKELY( !tile->allow_shutdown ) ) FD_LOG_ERR(( "tile %s:%lu run loop returned", tile->name, tile->kind_id ));
155 :
156 0 : FD_MGAUGE_SET( TILE, STATUS, 2UL );
157 0 : }
158 :
159 : typedef struct {
160 : fd_topo_t * topo;
161 : fd_topo_tile_t * tile;
162 : fd_topo_run_tile_t tile_run;
163 : uint uid;
164 : uint gid;
165 : volatile int copied;
166 : void * stack_lo;
167 : void * stack_hi;
168 : } fd_topo_run_thread_args_t;
169 :
170 : static void *
171 0 : run_tile_thread_main( void * _args ) {
172 0 : fd_topo_run_thread_args_t args = *(fd_topo_run_thread_args_t *)_args;
173 0 : FD_COMPILER_MFENCE();
174 0 : ((fd_topo_run_thread_args_t *)_args)->copied = 1;
175 0 : FD_COMPILER_MFENCE();
176 :
177 : /* Prevent fork() from smashing the stack */
178 0 : if( FD_UNLIKELY( madvise( args.stack_lo, (ulong)args.stack_hi - (ulong)args.stack_lo, MADV_DONTFORK ) ) ) {
179 0 : FD_LOG_ERR(( "madvise(stack,MADV_DONTFORK) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
180 0 : }
181 :
182 0 : fd_topo_run_tile( args.topo, args.tile, 0, 1, 1, args.uid, args.gid, -1, NULL, NULL, &args.tile_run );
183 0 : FD_TEST( args.tile->allow_shutdown );
184 0 : return NULL;
185 0 : }
186 :
187 : /* fd_topo_tile_stack_join_anon is a variant of fd_topo_tile_stack_join
188 : that acquires private anonymous memory instead of shared pages.
189 :
190 : This is required for fork() to work, as the parent and child process
191 : would otherwise share a stack and corrupt each other. While fork()
192 : is banned in tile user code, some dynamic analysis tools (like MSan)
193 : unfortunately rely on it. */
194 :
195 : FD_FN_UNUSED static void *
196 0 : fd_topo_tile_stack_join_anon( void ) {
197 0 :
198 0 : ulong sz = 2*FD_TILE_PRIVATE_STACK_SZ;
199 0 : int prot = PROT_READ|PROT_WRITE;
200 0 : int flags = MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK;
201 0 :
202 0 : uchar * stack = MAP_FAILED;
203 0 : #if !FD_HAS_ASAN && !FD_HAS_MSAN
204 0 : stack = mmap( NULL, sz, prot, flags|MAP_HUGETLB, -1, 0 );
205 0 : #endif
206 0 :
207 0 : if( stack==MAP_FAILED ) {
208 0 : stack = mmap( NULL, sz, prot, flags, -1, 0 );
209 0 : if( FD_UNLIKELY( stack==MAP_FAILED ) ) {
210 0 : FD_LOG_ERR(( "mmap() for stack failed (%i-%s)", errno, fd_io_strerror( errno ) ));
211 0 : }
212 0 : }
213 0 :
214 0 : /* Create the guard regions in the extra space */
215 0 : void * guard_lo = (void *)( stack - FD_SHMEM_NORMAL_PAGE_SZ );
216 0 : if( FD_UNLIKELY( mmap( guard_lo, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
217 0 : MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_lo ) )
218 0 : FD_LOG_ERR(( "mmap(%p) failed (%i-%s)", guard_lo, errno, fd_io_strerror( errno ) ));
219 0 :
220 0 : void * guard_hi = (void *)( stack + FD_TILE_PRIVATE_STACK_SZ );
221 0 : if( FD_UNLIKELY( mmap( guard_hi, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
222 0 : MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_hi ) )
223 0 : FD_LOG_ERR(( "mmap(%p) failed (%i-%s)", guard_hi, errno, fd_io_strerror( errno ) ));
224 0 :
225 0 : return stack;
226 0 : }
227 :
228 : void *
229 : fd_topo_tile_stack_join( char const * app_name,
230 : char const * tile_name,
231 0 : ulong tile_kind_id ) {
232 : #if FD_HAS_MSAN
233 : return fd_topo_tile_stack_join_anon();
234 : #endif
235 :
236 0 : char name[ PATH_MAX ];
237 0 : FD_TEST( fd_cstr_printf_check( name, PATH_MAX, NULL, "%s_stack_%s%lu", app_name, tile_name, tile_kind_id ) );
238 :
239 0 : int dump = strcmp( tile_name, "sign" ) ? 1 : 0; /* avoid core dumps of sign tile stacks */
240 0 : uchar * stack = fd_shmem_join( name, FD_SHMEM_JOIN_MODE_READ_WRITE, dump, NULL, NULL, NULL );
241 0 : if( FD_UNLIKELY( !stack ) ) FD_LOG_ERR(( "fd_shmem_join failed" ));
242 :
243 : /* Make space for guard lo and guard hi */
244 0 : if( FD_UNLIKELY( fd_shmem_release( stack, FD_SHMEM_HUGE_PAGE_SZ, 1UL ) ) )
245 0 : FD_LOG_ERR(( "fd_shmem_release (%d-%s)", errno, fd_io_strerror( errno ) ));
246 0 : stack += FD_SHMEM_HUGE_PAGE_SZ;
247 0 : if( FD_UNLIKELY( fd_shmem_release( stack + FD_TILE_PRIVATE_STACK_SZ, FD_SHMEM_HUGE_PAGE_SZ, 1UL ) ) )
248 0 : FD_LOG_ERR(( "fd_shmem_release (%d-%s)", errno, fd_io_strerror( errno ) ));
249 :
250 : /* Create the guard regions in the extra space */
251 0 : void * guard_lo = (void *)(stack - FD_SHMEM_NORMAL_PAGE_SZ );
252 0 : if( FD_UNLIKELY( mmap( guard_lo, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
253 0 : MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_lo ) )
254 0 : FD_LOG_ERR(( "mmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
255 :
256 0 : void * guard_hi = (void *)(stack + FD_TILE_PRIVATE_STACK_SZ);
257 0 : if( FD_UNLIKELY( mmap( guard_hi, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
258 0 : MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_hi ) )
259 0 : FD_LOG_ERR(( "mmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
260 :
261 0 : return stack;
262 0 : }
263 :
264 : static inline void
265 : run_tile_thread( fd_topo_t * topo,
266 : fd_topo_tile_t * tile,
267 : fd_topo_run_tile_t tile_run,
268 : uint uid,
269 : uint gid,
270 : fd_cpuset_t const * floating_cpu_set,
271 : int floating_priority,
272 0 : fd_topo_run_thread_args_t * args ) {
273 : /* tpool will assign a thread later */
274 0 : if( FD_UNLIKELY( tile_run.for_tpool ) ) return;
275 0 : void * stack = fd_topo_tile_stack_join( topo->app_name, tile->name, tile->kind_id );
276 :
277 0 : pthread_attr_t attr[ 1 ];
278 0 : if( FD_UNLIKELY( pthread_attr_init( attr ) ) ) FD_LOG_ERR(( "pthread_attr_init() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
279 0 : if( FD_UNLIKELY( pthread_attr_setstack( attr, stack, FD_TILE_PRIVATE_STACK_SZ ) ) ) FD_LOG_ERR(( "pthread_attr_setstacksize() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
280 :
281 0 : FD_CPUSET_DECL( cpu_set );
282 0 : if( FD_LIKELY( tile->cpu_idx<65535UL ) ) {
283 : /* set the thread affinity before we clone the new process to ensure
284 : kernel first touch happens on the desired thread. */
285 0 : fd_cpuset_insert( cpu_set, tile->cpu_idx );
286 0 : if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, -19 ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
287 0 : } else {
288 0 : fd_memcpy( cpu_set, floating_cpu_set, fd_cpuset_footprint() );
289 0 : if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, floating_priority ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
290 0 : }
291 :
292 0 : if( FD_UNLIKELY( fd_cpuset_setaffinity( 0, cpu_set ) ) ) {
293 0 : if( FD_LIKELY( errno==EINVAL ) ) {
294 0 : FD_LOG_ERR(( "Unable to set the thread affinity for tile %s:%lu on cpu %lu. It is likely that the affinity "
295 0 : "you have specified for this tile in [layout.affinity] of your configuration file contains a "
296 0 : "CPU (%lu) which does not exist on this machine.",
297 0 : tile->name, tile->kind_id, tile->cpu_idx, tile->cpu_idx ));
298 0 : } else {
299 0 : FD_LOG_ERR(( "sched_setaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
300 0 : }
301 0 : }
302 :
303 0 : *args = (fd_topo_run_thread_args_t) {
304 0 : .topo = topo,
305 0 : .tile = tile,
306 0 : .tile_run = tile_run,
307 0 : .uid = uid,
308 0 : .gid = gid,
309 0 : .copied = 0,
310 0 : .stack_lo = stack,
311 0 : .stack_hi = (uchar *)stack + FD_TILE_PRIVATE_STACK_SZ
312 0 : };
313 :
314 0 : pthread_t pthread;
315 0 : if( FD_UNLIKELY( pthread_create( &pthread, attr, run_tile_thread_main, args ) ) ) FD_LOG_ERR(( "pthread_create() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
316 0 : }
317 :
318 : void
319 : fd_topo_run_single_process( fd_topo_t * topo,
320 : int agave,
321 : uint uid,
322 : uint gid,
323 0 : fd_topo_run_tile_t (* tile_run )( fd_topo_tile_t const * tile ) ) {
324 0 : FD_LOG_NOTICE(( "running single threaded topology with %lu tiles and %lu GiB memory",
325 0 : topo->tile_cnt, fd_topo_mlock( topo ) / (1UL << 30) ));
326 :
327 : /* Save the current affinity, it will be restored after creating any child tiles */
328 0 : FD_CPUSET_DECL( floating_cpu_set );
329 0 : if( FD_UNLIKELY( fd_cpuset_getaffinity( 0, floating_cpu_set ) ) )
330 0 : FD_LOG_ERR(( "sched_getaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
331 :
332 0 : errno = 0;
333 0 : int save_priority = getpriority( PRIO_PROCESS, 0 );
334 0 : if( FD_UNLIKELY( -1==save_priority && errno ) ) FD_LOG_ERR(( "getpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
335 :
336 0 : fd_topo_run_thread_args_t args[ FD_TOPO_MAX_TILES ];
337 :
338 0 : for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
339 0 : fd_topo_tile_t * tile = &topo->tiles[ i ];
340 0 : if( !agave && tile->is_agave ) continue;
341 0 : if( agave==1 && !tile->is_agave ) continue;
342 :
343 0 : fd_topo_run_tile_t run_tile = tile_run( tile );
344 0 : run_tile_thread( topo, tile, run_tile, uid, gid, floating_cpu_set, save_priority, &args[ i ] );
345 0 : }
346 :
347 0 : for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
348 0 : fd_topo_tile_t * tile = &topo->tiles[ i ];
349 0 : if( !agave && tile->is_agave ) continue;
350 0 : if( agave==1 && !tile->is_agave ) continue;
351 :
352 0 : while( !FD_VOLATILE( args[ i ].copied ) ) FD_SPIN_PAUSE();
353 0 : }
354 :
355 0 : fd_sandbox_switch_uid_gid( uid, gid );
356 :
357 0 : if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, save_priority ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
358 0 : if( FD_UNLIKELY( fd_cpuset_setaffinity( 0, floating_cpu_set ) ) )
359 0 : FD_LOG_ERR(( "sched_setaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
360 0 : }
|