Line data Source code
1 : #define _GNU_SOURCE
2 : #include "run.h"
3 :
4 : #include <sys/wait.h>
5 : #include "generated/main_seccomp.h"
6 : #if defined(__aarch64__)
7 : #include "generated/pidns_arm64_seccomp.h"
8 : #else
9 : #include "generated/pidns_seccomp.h"
10 : #endif
11 :
12 : #include "../../../platform/fd_sys_util.h"
13 : #include "../../../platform/fd_file_util.h"
14 : #include "../../../platform/fd_net_util.h"
15 : #include "../../../../disco/net/fd_net_tile.h"
16 :
17 : #include "../configure/configure.h"
18 :
19 : #include <dirent.h>
20 : #include <sched.h>
21 : #include <stdio.h>
22 : #include <stdlib.h> /* getenv */
23 : #include <poll.h>
24 : #include <unistd.h>
25 : #include <errno.h>
26 : #include <fcntl.h>
27 : #include <sys/prctl.h>
28 : #include <sys/resource.h>
29 : #include <sys/mman.h>
30 : #include <sys/stat.h>
31 : #include <linux/capability.h>
32 :
33 : #include "../../../../util/tile/fd_tile_private.h"
34 :
35 : extern fd_topo_obj_callbacks_t * CALLBACKS[];
36 :
37 0 : #define NAME "run"
38 :
39 : void
40 : run_cmd_perm( args_t * args,
41 : fd_cap_chk_t * chk,
42 0 : config_t const * config ) {
43 0 : (void)args;
44 :
45 0 : ulong mlock_limit = fd_topo_mlock_max_tile( &config->topo );
46 :
47 0 : fd_cap_chk_raise_rlimit( chk, NAME, RLIMIT_MEMLOCK, mlock_limit, "call `rlimit(2)` to increase `RLIMIT_MEMLOCK` so all memory can be locked with `mlock(2)`" );
48 0 : fd_cap_chk_raise_rlimit( chk, NAME, RLIMIT_NICE, 40, "call `setpriority(2)` to increase thread priorities" );
49 0 : fd_cap_chk_raise_rlimit( chk, NAME, RLIMIT_NOFILE, CONFIGURE_NR_OPEN_FILES,
50 0 : "call `rlimit(2) to increase `RLIMIT_NOFILE` to allow more open files for Agave" );
51 0 : fd_cap_chk_cap( chk, NAME, CAP_NET_RAW, "call `socket(2)` to bind to a raw socket for use by XDP" );
52 0 : fd_cap_chk_cap( chk, NAME, CAP_SYS_ADMIN, "call `bpf(2)` with the `BPF_OBJ_GET` command to initialize XDP" );
53 0 : if( fd_sandbox_requires_cap_sys_admin( config->uid, config->gid ) )
54 0 : fd_cap_chk_cap( chk, NAME, CAP_SYS_ADMIN, "call `unshare(2)` with `CLONE_NEWUSER` to sandbox the process in a user namespace" );
55 0 : if( FD_LIKELY( getuid() != config->uid ) )
56 0 : fd_cap_chk_cap( chk, NAME, CAP_SETUID, "call `setresuid(2)` to switch uid to the sandbox user" );
57 0 : if( FD_LIKELY( getgid()!=config->gid ) )
58 0 : fd_cap_chk_cap( chk, NAME, CAP_SETGID, "call `setresgid(2)` to switch gid to the sandbox user" );
59 0 : if( FD_UNLIKELY( config->development.netns.enabled ) )
60 0 : fd_cap_chk_cap( chk, NAME, CAP_SYS_ADMIN, "call `setns(2)` to enter a network namespace" );
61 0 : if( FD_UNLIKELY( config->tiles.metric.prometheus_listen_port<1024 ) )
62 0 : fd_cap_chk_cap( chk, NAME, CAP_NET_BIND_SERVICE, "call `bind(2)` to bind to a privileged port for serving metrics" );
63 0 : if( FD_UNLIKELY( config->tiles.gui.gui_listen_port<1024 ) )
64 0 : fd_cap_chk_cap( chk, NAME, CAP_NET_BIND_SERVICE, "call `bind(2)` to bind to a privileged port for serving the GUI" );
65 0 : }
66 :
67 : struct pidns_clone_args {
68 : config_t const * config;
69 : int * pipefd;
70 : int closefd;
71 : };
72 :
/* Path of the log file, owned by the logging code. */
extern char fd_log_private_path[ 1024 ]; /* empty string on start */

/* Process that parent_signal kills with SIGKILL when it is non-zero.
   NOTE(review): nothing in the visible part of this file assigns it
   (the local in clone_firedancer is a different variable), presumably
   the caller of clone_firedancer stores the returned pid here. */
static pid_t pid_namespace;

/* FD_LOG_ERR_NOEXIT logs a message at level 4 through the private
   logging entry points and then carries on, so the caller can pick
   its own exit path afterwards.  The argument is a parenthesized
   printf-style argument list, used as FD_LOG_ERR_NOEXIT(( "fmt", ... )). */
#define FD_LOG_ERR_NOEXIT(a) do {                              \
    long _fd_log_msg_now = fd_log_wallclock();                 \
    fd_log_private_1( 4, _fd_log_msg_now, __FILE__, __LINE__,  \
                      __func__, fd_log_private_0 a );          \
  } while(0)

/* Pointer to the lock word used by the logger; this file repoints it
   at a private word where the shared lock must not be waited on. */
extern int * fd_log_private_shared_lock;
80 :
81 : static void
82 0 : parent_signal( int sig ) {
83 0 : if( FD_LIKELY( pid_namespace ) ) kill( pid_namespace, SIGKILL );
84 :
85 : /* A pretty gross hack. For the local process, clear the lock so that
86 : we can always print the messages without waiting on another process,
87 : particularly if one of those processes might have just died. The
88 : signal handler is re-entrant so this also avoids a deadlock since
89 : the log lock is not re-entrant. */
90 0 : int lock = 0;
91 0 : fd_log_private_shared_lock = &lock;
92 :
93 0 : if( -1!=fd_log_private_logfile_fd() ) FD_LOG_ERR_NOEXIT(( "Received signal %s\nLog at \"%s\"", fd_io_strsignal( sig ), fd_log_private_path ));
94 0 : else FD_LOG_ERR_NOEXIT(( "Received signal %s", fd_io_strsignal( sig ) ));
95 :
96 0 : if( FD_LIKELY( sig==SIGINT ) ) fd_sys_util_exit_group( 128+SIGINT );
97 0 : else fd_sys_util_exit_group( 0 );
98 0 : }
99 :
100 : static void
101 0 : install_parent_signals( void ) {
102 0 : struct sigaction sa = {
103 0 : .sa_handler = parent_signal,
104 0 : .sa_flags = 0,
105 0 : };
106 0 : if( FD_UNLIKELY( sigaction( SIGTERM, &sa, NULL ) ) )
107 0 : FD_LOG_ERR(( "sigaction(SIGTERM) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
108 0 : if( FD_UNLIKELY( sigaction( SIGINT, &sa, NULL ) ) )
109 0 : FD_LOG_ERR(( "sigaction(SIGINT) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
110 :
111 0 : sa.sa_handler = SIG_IGN;
112 0 : if( FD_UNLIKELY( sigaction( SIGUSR1, &sa, NULL ) ) )
113 0 : FD_LOG_ERR(( "sigaction(SIGUSR1) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
114 0 : if( FD_UNLIKELY( sigaction( SIGUSR2, &sa, NULL ) ) )
115 0 : FD_LOG_ERR(( "sigaction(SIGUSR2) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
116 0 : }
117 :
118 : void *
119 0 : create_clone_stack( void ) {
120 0 : ulong mmap_sz = FD_TILE_PRIVATE_STACK_SZ + 2UL*FD_SHMEM_NORMAL_PAGE_SZ;
121 0 : uchar * stack = (uchar *)mmap( NULL, mmap_sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, (off_t)0 );
122 0 : if( FD_UNLIKELY( stack==MAP_FAILED ) )
123 0 : FD_LOG_ERR(( "mmap() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
124 :
125 : /* Make space for guard lo and guard hi */
126 0 : if( FD_UNLIKELY( munmap( stack, FD_SHMEM_NORMAL_PAGE_SZ ) ) )
127 0 : FD_LOG_ERR(( "munmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
128 0 : stack += FD_SHMEM_NORMAL_PAGE_SZ;
129 0 : if( FD_UNLIKELY( munmap( stack + FD_TILE_PRIVATE_STACK_SZ, FD_SHMEM_NORMAL_PAGE_SZ ) ) )
130 0 : FD_LOG_ERR(( "munmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
131 :
132 : /* Create the guard regions in the extra space */
133 0 : void * guard_lo = (void *)(stack - FD_SHMEM_NORMAL_PAGE_SZ );
134 0 : if( FD_UNLIKELY( mmap( guard_lo, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
135 0 : MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_lo ) )
136 0 : FD_LOG_ERR(( "mmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
137 :
138 0 : void * guard_hi = (void *)(stack + FD_TILE_PRIVATE_STACK_SZ);
139 0 : if( FD_UNLIKELY( mmap( guard_hi, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
140 0 : MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_hi ) )
141 0 : FD_LOG_ERR(( "mmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
142 :
143 0 : return stack;
144 0 : }
145 :
146 :
147 : static int
148 : execve_agave( int config_memfd,
149 0 : int pipefd ) {
150 0 : if( FD_UNLIKELY( -1==fcntl( pipefd, F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
151 0 : pid_t child = fork();
152 0 : if( FD_UNLIKELY( -1==child ) ) FD_LOG_ERR(( "fork() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
153 0 : if( FD_LIKELY( !child ) ) {
154 0 : char _current_executable_path[ PATH_MAX ];
155 0 : FD_TEST( -1!=fd_file_util_self_exe( _current_executable_path ) );
156 :
157 0 : char config_fd[ 32 ];
158 0 : FD_TEST( fd_cstr_printf_check( config_fd, sizeof( config_fd ), NULL, "%d", config_memfd ) );
159 0 : char * args[ 5 ] = { _current_executable_path, "run-agave", "--config-fd", config_fd, NULL };
160 :
161 0 : char * envp[] = { NULL, NULL };
162 0 : char * google_creds = getenv( "GOOGLE_APPLICATION_CREDENTIALS" );
163 0 : char provide_creds[ PATH_MAX+30UL ];
164 0 : if( FD_UNLIKELY( google_creds ) ) {
165 0 : FD_TEST( fd_cstr_printf_check( provide_creds, sizeof( provide_creds ), NULL, "GOOGLE_APPLICATION_CREDENTIALS=%s", google_creds ) );
166 0 : envp[ 0 ] = provide_creds;
167 0 : }
168 :
169 0 : if( FD_UNLIKELY( -1==execve( _current_executable_path, args, envp ) ) ) FD_LOG_ERR(( "execve() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
170 0 : } else {
171 0 : if( FD_UNLIKELY( -1==fcntl( pipefd, F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
172 0 : return child;
173 0 : }
174 0 : return 0;
175 0 : }
176 :
177 : static pid_t
178 : execve_tile( fd_topo_tile_t const * tile,
179 : fd_cpuset_t const * floating_cpu_set,
180 : int floating_priority,
181 : int config_memfd,
182 0 : int pipefd ) {
183 0 : FD_CPUSET_DECL( cpu_set );
184 0 : if( FD_LIKELY( tile->cpu_idx!=ULONG_MAX ) ) {
185 : /* set the thread affinity before we clone the new process to ensure
186 : kernel first touch happens on the desired thread. */
187 0 : fd_cpuset_insert( cpu_set, tile->cpu_idx );
188 0 : if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, -19 ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
189 0 : } else {
190 0 : fd_memcpy( cpu_set, floating_cpu_set, fd_cpuset_footprint() );
191 0 : if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, floating_priority ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
192 0 : }
193 :
194 0 : if( FD_UNLIKELY( fd_cpuset_setaffinity( 0, cpu_set ) ) ) {
195 0 : if( FD_LIKELY( errno==EINVAL ) ) {
196 0 : FD_LOG_ERR(( "Unable to set the thread affinity for tile %s:%lu on cpu %lu. It is likely that the affinity "
197 0 : "you have specified for this tile in [layout.affinity] of your configuration file contains a "
198 0 : "CPU (%lu) which does not exist on this machine.",
199 0 : tile->name, tile->kind_id, tile->cpu_idx, tile->cpu_idx ));
200 0 : } else {
201 0 : FD_LOG_ERR(( "sched_setaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
202 0 : }
203 0 : }
204 :
205 : /* Clear CLOEXEC on the side of the pipe we want to pass to the tile. */
206 0 : if( FD_UNLIKELY( -1==fcntl( pipefd, F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
207 0 : pid_t child = fork();
208 0 : if( FD_UNLIKELY( -1==child ) ) FD_LOG_ERR(( "fork() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
209 0 : if( FD_LIKELY( !child ) ) {
210 0 : char _current_executable_path[ PATH_MAX ];
211 0 : FD_TEST( -1!=fd_file_util_self_exe( _current_executable_path ) );
212 :
213 0 : char kind_id[ 32 ], config_fd[ 32 ], pipe_fd[ 32 ];
214 0 : FD_TEST( fd_cstr_printf_check( kind_id, sizeof( kind_id ), NULL, "%lu", tile->kind_id ) );
215 0 : FD_TEST( fd_cstr_printf_check( config_fd, sizeof( config_fd ), NULL, "%d", config_memfd ) );
216 0 : FD_TEST( fd_cstr_printf_check( pipe_fd, sizeof( pipe_fd ), NULL, "%d", pipefd ) );
217 0 : char const * args[ 9 ] = { _current_executable_path, "run1", tile->name, kind_id, "--pipe-fd", pipe_fd, "--config-fd", config_fd, NULL };
218 0 : if( FD_UNLIKELY( -1==execve( _current_executable_path, (char **)args, NULL ) ) ) FD_LOG_ERR(( "execve() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
219 0 : } else {
220 0 : if( FD_UNLIKELY( -1==fcntl( pipefd, F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
221 0 : return child;
222 0 : }
223 0 : return 0;
224 0 : }
225 :
226 : extern int * fd_log_private_shared_lock;
227 :
228 : int
229 0 : main_pid_namespace( void * _args ) {
230 0 : struct pidns_clone_args * args = _args;
231 0 : if( FD_UNLIKELY( close( args->pipefd[ 0 ] ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
232 0 : if( FD_UNLIKELY( -1!=args->closefd ) ) {
233 0 : if( FD_UNLIKELY( close( args->closefd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
234 0 : }
235 :
236 0 : config_t const * config = args->config;
237 :
238 0 : fd_log_thread_set( "pidns" );
239 0 : ulong pid = fd_sandbox_getpid(); /* Need to read /proc again.. we got a new PID from clone */
240 0 : fd_log_private_group_id_set( pid );
241 0 : fd_log_private_thread_id_set( pid );
242 0 : fd_log_private_stack_discover( FD_TILE_PRIVATE_STACK_SZ,
243 0 : &fd_tile_private_stack0, &fd_tile_private_stack1 );
244 :
245 0 : if( FD_UNLIKELY( !config->development.sandbox ) ) {
246 : /* If no sandbox, then there's no actual PID namespace so we can't
247 : wait() grandchildren for the exit code. Do this as a workaround. */
248 0 : if( FD_UNLIKELY( -1==prctl( PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0 ) ) )
249 0 : FD_LOG_ERR(( "prctl(PR_SET_CHILD_SUBREAPER) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
250 0 : }
251 :
252 : /* Save the current affinity, it will be restored after creating any child tiles */
253 0 : FD_CPUSET_DECL( floating_cpu_set );
254 0 : if( FD_UNLIKELY( fd_cpuset_getaffinity( 0, floating_cpu_set ) ) )
255 0 : FD_LOG_ERR(( "fd_cpuset_getaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
256 :
257 0 : pid_t child_pids[ FD_TOPO_MAX_TILES+1 ];
258 0 : ulong actual_pids[ FD_TOPO_MAX_TILES+1 ];
259 0 : for( ulong i=0UL; i<FD_TOPO_MAX_TILES+1; i++ ) actual_pids[ i ] = ULONG_MAX;
260 0 : char child_names[ FD_TOPO_MAX_TILES+1 ][ 32 ];
261 0 : ulong child_idxs[ FD_TOPO_MAX_TILES+1 ];
262 0 : struct pollfd fds[ FD_TOPO_MAX_TILES+2 ];
263 :
264 0 : int config_memfd = fd_config_to_memfd( config );
265 0 : if( FD_UNLIKELY( -1==config_memfd ) ) FD_LOG_ERR(( "fd_config_to_memfd() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
266 :
267 0 : if( FD_UNLIKELY( config->development.debug_tile ) ) {
268 0 : fd_log_private_shared_lock[1] = 1;
269 0 : }
270 :
271 0 : ulong child_cnt = 0UL;
272 0 : if( FD_LIKELY( !config->is_firedancer && !config->development.no_agave ) ) {
273 0 : int pipefd[ 2 ];
274 0 : if( FD_UNLIKELY( pipe2( pipefd, O_CLOEXEC ) ) ) FD_LOG_ERR(( "pipe2() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
275 0 : fds[ child_cnt ] = (struct pollfd){ .fd = pipefd[ 0 ], .events = 0 };
276 0 : child_pids[ child_cnt ] = execve_agave( config_memfd, pipefd[ 1 ] );
277 0 : FD_TEST( child_pids[ child_cnt ]>0 );
278 0 : actual_pids[ child_cnt ] = (ulong)child_pids[ child_cnt ];
279 0 : child_idxs[ child_cnt ] = ULONG_MAX;
280 0 : if( FD_UNLIKELY( close( pipefd[ 1 ] ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
281 0 : strncpy( child_names[ child_cnt ], "agave", 32 );
282 0 : child_cnt++;
283 0 : }
284 :
285 0 : if( FD_UNLIKELY( config->development.netns.enabled ) ) {
286 0 : if( FD_UNLIKELY( -1==fd_net_util_netns_enter( config->net.interface, NULL ) ) )
287 0 : FD_LOG_ERR(( "failed to enter network namespace `%s` (%i-%s)", config->net.interface, errno, fd_io_strerror( errno ) ));
288 0 : }
289 :
290 0 : errno = 0;
291 0 : int save_priority = getpriority( PRIO_PROCESS, 0 );
292 0 : if( FD_UNLIKELY( -1==save_priority && errno ) ) FD_LOG_ERR(( "getpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
293 :
294 0 : int need_xdp = 0==strcmp( config->net.provider, "xdp" );
295 0 : fd_xdp_fds_t xdp_fds[ FD_TOPO_XDP_FDS_MAX ];
296 0 : uint xdp_fds_cnt = FD_TOPO_XDP_FDS_MAX;
297 0 : if( need_xdp ) {
298 0 : fd_topo_install_xdp( &config->topo, xdp_fds, &xdp_fds_cnt, config->net.bind_address_parsed, 0 );
299 0 : }
300 :
301 0 : for( ulong i=0UL; i<config->topo.tile_cnt; i++ ) {
302 0 : fd_topo_tile_t const * tile = &config->topo.tiles[ i ];
303 0 : if( FD_UNLIKELY( tile->is_agave ) ) continue;
304 :
305 0 : if( need_xdp ) {
306 0 : if( FD_UNLIKELY( strcmp( tile->name, "net" ) ) ) {
307 0 : for( uint i=0U; i<xdp_fds_cnt; i++ ) {
308 : /* close XDP related file descriptors */
309 0 : if( FD_UNLIKELY( -1==fcntl( xdp_fds[i].xsk_map_fd, F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
310 0 : if( FD_UNLIKELY( -1==fcntl( xdp_fds[i].prog_link_fd, F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
311 0 : }
312 0 : } else {
313 0 : for( uint i=0U; i<xdp_fds_cnt; i++ ) {
314 0 : if( FD_UNLIKELY( -1==fcntl( xdp_fds[i].xsk_map_fd, F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
315 0 : if( FD_UNLIKELY( -1==fcntl( xdp_fds[i].prog_link_fd, F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
316 0 : }
317 0 : }
318 0 : }
319 :
320 0 : int pipefd[ 2 ];
321 0 : if( FD_UNLIKELY( pipe2( pipefd, O_CLOEXEC ) ) ) FD_LOG_ERR(( "pipe2() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
322 0 : fds[ child_cnt ] = (struct pollfd){ .fd = pipefd[ 0 ], .events = 0 };
323 0 : child_pids[ child_cnt ] = execve_tile( tile, floating_cpu_set, save_priority, config_memfd, pipefd[ 1 ] );
324 0 : child_idxs[ child_cnt ] = i;
325 0 : if( FD_UNLIKELY( close( pipefd[ 1 ] ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
326 0 : strncpy( child_names[ child_cnt ], tile->name, 32 );
327 0 : child_cnt++;
328 0 : }
329 :
330 : /* Obtain the actual grandchild PID from the pipe */
331 0 : for( ulong i=0UL; i<child_cnt; i++ ) {
332 0 : if( FD_UNLIKELY( actual_pids[ i ]!=ULONG_MAX ) ) continue;
333 0 : FD_TEST( 8UL==read( fds[ i ].fd, &actual_pids[ i ], 8UL ) );
334 0 : }
335 :
336 0 : if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, save_priority ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
337 0 : if( FD_UNLIKELY( fd_cpuset_setaffinity( 0, floating_cpu_set ) ) )
338 0 : FD_LOG_ERR(( "fd_cpuset_setaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
339 :
340 0 : if( FD_UNLIKELY( close( config_memfd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
341 0 : if( FD_UNLIKELY( close( config->log.lock_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
342 0 : if( need_xdp ) {
343 0 : for( uint i=0U; i<xdp_fds_cnt; i++ ) {
344 0 : if( FD_UNLIKELY( close( xdp_fds[i].xsk_map_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
345 0 : if( FD_UNLIKELY( close( xdp_fds[i].prog_link_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
346 0 : }
347 0 : }
348 :
349 0 : int allow_fds[ 4+FD_TOPO_MAX_TILES ];
350 0 : ulong allow_fds_cnt = 0;
351 0 : allow_fds[ allow_fds_cnt++ ] = 2; /* stderr */
352 0 : if( FD_LIKELY( fd_log_private_logfile_fd()!=-1 ) )
353 0 : allow_fds[ allow_fds_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */
354 0 : allow_fds[ allow_fds_cnt++ ] = args->pipefd[ 1 ]; /* write end of main pipe */
355 0 : for( ulong i=0UL; i<child_cnt; i++ )
356 0 : allow_fds[ allow_fds_cnt++ ] = fds[ i ].fd; /* read end of child pipes */
357 :
358 0 : struct sock_filter seccomp_filter[ 128UL ];
359 0 : unsigned int instr_cnt;
360 : #if defined(__aarch64__)
361 : populate_sock_filter_policy_pidns_arm64( 128UL, seccomp_filter, (uint)fd_log_private_logfile_fd() );
362 : instr_cnt = sock_filter_policy_pidns_arm64_instr_cnt;
363 : #else
364 0 : populate_sock_filter_policy_pidns( 128UL, seccomp_filter, (uint)fd_log_private_logfile_fd() );
365 0 : instr_cnt = sock_filter_policy_pidns_instr_cnt;
366 0 : #endif
367 :
368 0 : if( FD_LIKELY( config->development.sandbox ) ) {
369 0 : fd_sandbox_enter( config->uid,
370 0 : config->gid,
371 0 : 0,
372 0 : 0,
373 0 : 0,
374 0 : 0,
375 0 : 0,
376 0 : 1UL+child_cnt, /* RLIMIT_NOFILE needs to be set to the nfds argument of poll() */
377 0 : 0UL,
378 0 : 0UL,
379 0 : allow_fds_cnt,
380 0 : allow_fds,
381 0 : instr_cnt,
382 0 : seccomp_filter );
383 0 : } else {
384 0 : fd_sandbox_switch_uid_gid( config->uid, config->gid );
385 0 : }
386 :
387 : /* The supervsior process should not share the log lock, because a
388 : child process might die while holding it and we still need to
389 : reap and print errors. */
390 0 : int lock = 0;
391 0 : fd_log_private_shared_lock = &lock;
392 :
393 : /* Reap child process PIDs so they don't show up in `ps` etc. All of
394 : these children should have exited immediately after clone(2)'ing
395 : another child with a huge page based stack. */
396 0 : for( ulong i=0UL; i<child_cnt; i++ ) {
397 0 : int wstatus;
398 0 : int exited_pid = wait4( child_pids[ i ], &wstatus, (int)__WALL, NULL );
399 0 : if( FD_UNLIKELY( -1==exited_pid ) ) {
400 0 : FD_LOG_ERR(( "pidns wait4() failed (%i-%s) %lu %hu", errno, fd_io_strerror( errno ), i, fds[i].revents ));
401 0 : } else if( FD_UNLIKELY( child_pids[ i ]!=exited_pid ) ) {
402 0 : FD_LOG_ERR(( "pidns wait4() returned unexpected pid %d %d", child_pids[ i ], exited_pid ));
403 0 : } else if( FD_UNLIKELY( !WIFEXITED( wstatus ) ) ) {
404 0 : FD_LOG_ERR_NOEXIT(( "tile %lu (%s) exited while booting with signal %d (%s)\n", i, child_names[ i ], WTERMSIG( wstatus ), fd_io_strsignal( WTERMSIG( wstatus ) ) ));
405 0 : fd_sys_util_exit_group( WTERMSIG( wstatus ) ? WTERMSIG( wstatus ) : 1 );
406 0 : }
407 0 : if( FD_UNLIKELY( WEXITSTATUS( wstatus ) ) ) {
408 0 : FD_LOG_ERR_NOEXIT(( "tile %lu (%s) exited while booting with code %d\n", i, child_names[ i ], WEXITSTATUS( wstatus ) ));
409 0 : fd_sys_util_exit_group( WEXITSTATUS( wstatus ) ? WEXITSTATUS( wstatus ) : 1 );
410 0 : }
411 0 : }
412 :
413 0 : fds[ child_cnt ] = (struct pollfd){ .fd = args->pipefd[ 1 ], .events = 0 };
414 0 : strncpy( child_names[ child_cnt ], "parent", 32UL );
415 0 : child_idxs[ child_cnt ] = ULONG_MAX;
416 :
417 : /* We are now the init process of the pid namespace. If the init
418 : process dies, all children are terminated. If any child dies, we
419 : terminate the init process, which will cause the kernel to
420 : terminate all other children bringing all of our processes down as
421 : a group. The parent process will also die if this process dies,
422 : due to getting SIGHUP on the pipe. */
423 0 : while( 1 ) {
424 0 : if( FD_UNLIKELY( -1==poll( fds, 1UL+child_cnt, -1 ) ) ) FD_LOG_ERR(( "poll() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
425 :
426 : /* Parent process died, probably SIGINT, exit gracefully. */
427 0 : if( FD_UNLIKELY( fds[ child_cnt ].revents ) ) fd_sys_util_exit_group( 0 );
428 :
429 : /* Child process died, reap it to figure out exit code. */
430 0 : int wstatus;
431 0 : int exited_pid = wait4( -1, &wstatus, (int)__WALL | (int)WNOHANG, NULL );
432 0 : if( FD_UNLIKELY( -1==exited_pid ) ) {
433 0 : FD_LOG_ERR(( "pidns wait4() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
434 0 : } else if( FD_UNLIKELY( !exited_pid ) ) {
435 : /* Spurious wakeup, no child actually dead yet. */
436 0 : continue;
437 0 : }
438 :
439 : /* Now find the tile corresponding to that PID */
440 0 : FD_TEST( exited_pid>0 );
441 0 : int found = 0;
442 0 : for( ulong i=0UL; i<child_cnt; i++ ) {
443 0 : if( FD_LIKELY( actual_pids[ i ]!=(ulong)exited_pid ) ) continue;
444 :
445 0 : found = 1;
446 0 : fds[ i ].fd = -1; /* Don't poll on this tile anymore */
447 :
448 0 : char * tile_name = child_names[ i ];
449 0 : ulong tile_idx = child_idxs[ i ];
450 0 : ulong tile_id = config->topo.tiles[ tile_idx ].kind_id;
451 :
452 0 : if( FD_UNLIKELY( !WIFEXITED( wstatus ) ) ) {
453 0 : FD_LOG_ERR_NOEXIT(( "tile %s:%lu exited with signal %d (%s)", tile_name, tile_id, WTERMSIG( wstatus ), fd_io_strsignal( WTERMSIG( wstatus ) ) ));
454 0 : fd_sys_util_exit_group( WTERMSIG( wstatus ) ? WTERMSIG( wstatus ) : 1 );
455 0 : } else {
456 0 : int exit_code = WEXITSTATUS( wstatus );
457 0 : if( FD_LIKELY( !exit_code && tile_idx!=ULONG_MAX && config->topo.tiles[ tile_idx ].allow_shutdown ) ) {
458 0 : found = 1;
459 0 : FD_LOG_INFO(( "tile %s:%lu exited gracefully with code %d", tile_name, tile_id, exit_code ));
460 0 : } else {
461 0 : FD_LOG_ERR_NOEXIT(( "tile %s:%lu exited with code %d", tile_name, tile_id, exit_code ));
462 0 : fd_sys_util_exit_group( exit_code ? exit_code : 1 );
463 0 : }
464 0 : }
465 0 : }
466 :
467 0 : if( FD_UNLIKELY( !found ) ) FD_LOG_ERR(( "wait4() returned unexpected pid %d", exited_pid ));
468 0 : }
469 :
470 0 : return 0;
471 0 : }
472 :
473 : int
474 : clone_firedancer( config_t const * config,
475 : int close_fd,
476 0 : int * out_pipe ) {
477 : /* This pipe is here just so that the child process knows when the
478 : parent has died (it will get a HUP). */
479 0 : int pipefd[2];
480 0 : if( FD_UNLIKELY( pipe2( pipefd, O_CLOEXEC | O_NONBLOCK ) ) ) FD_LOG_ERR(( "pipe2() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
481 :
482 : /* clone into a pid namespace */
483 0 : int flags = config->development.sandbox ? CLONE_NEWPID : 0;
484 0 : struct pidns_clone_args args = { .config = config, .closefd = close_fd, .pipefd = pipefd, };
485 :
486 0 : void * stack = create_clone_stack();
487 :
488 0 : int pid_namespace = clone( main_pid_namespace, (uchar *)stack + FD_TILE_PRIVATE_STACK_SZ, flags, &args );
489 0 : if( FD_UNLIKELY( pid_namespace<0 ) ) FD_LOG_ERR(( "clone() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
490 :
491 0 : if( FD_UNLIKELY( close( pipefd[ 1 ] ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
492 :
493 0 : *out_pipe = pipefd[ 0 ];
494 0 : return pid_namespace;
495 0 : }
496 :
497 : static void
498 : workspace_path( config_t const * config,
499 : fd_topo_wksp_t const * wksp,
500 0 : char out[ PATH_MAX ] ) {
501 0 : char const * mount_path;
502 0 : switch( wksp->page_sz ) {
503 0 : case FD_SHMEM_HUGE_PAGE_SZ:
504 0 : mount_path = config->hugetlbfs.huge_page_mount_path;
505 0 : break;
506 0 : case FD_SHMEM_GIGANTIC_PAGE_SZ:
507 0 : mount_path = config->hugetlbfs.gigantic_page_mount_path;
508 0 : break;
509 0 : case FD_SHMEM_NORMAL_PAGE_SZ:
510 0 : mount_path = config->hugetlbfs.normal_page_mount_path;
511 0 : break;
512 0 : default:
513 0 : FD_LOG_ERR(( "invalid page size %lu", wksp->page_sz ));
514 0 : }
515 :
516 0 : FD_TEST( fd_cstr_printf_check( out, PATH_MAX, NULL, "%s/%s_%s.wksp", mount_path, config->name, wksp->name ) );
517 0 : }
518 :
519 : static void
520 : warn_unknown_files( config_t const * config,
521 0 : ulong mount_type ) {
522 0 : char const * mount_path;
523 0 : switch( mount_type ) {
524 0 : case 0UL:
525 0 : mount_path = config->hugetlbfs.huge_page_mount_path;
526 0 : break;
527 0 : case 1UL:
528 0 : mount_path = config->hugetlbfs.gigantic_page_mount_path;
529 0 : break;
530 0 : default:
531 0 : FD_LOG_ERR(( "invalid mount type %lu", mount_type ));
532 0 : }
533 :
534 : /* Check if there are any files in mount_path */
535 0 : DIR * dir = opendir( mount_path );
536 0 : if( FD_UNLIKELY( !dir ) ) {
537 0 : if( FD_UNLIKELY( errno!=ENOENT ) ) FD_LOG_ERR(( "error opening `%s` (%i-%s)", mount_path, errno, fd_io_strerror( errno ) ));
538 0 : return;
539 0 : }
540 :
541 0 : struct dirent * entry;
542 0 : while(( FD_LIKELY( entry = readdir( dir ) ) )) {
543 0 : if( FD_UNLIKELY( !strcmp( entry->d_name, ".") || !strcmp( entry->d_name, ".." ) ) ) continue;
544 :
545 0 : char entry_path[ PATH_MAX ];
546 0 : FD_TEST( fd_cstr_printf_check( entry_path, PATH_MAX, NULL, "%s/%s", mount_path, entry->d_name ));
547 :
548 0 : int known_file = 0;
549 0 : for( ulong i=0UL; i<config->topo.wksp_cnt; i++ ) {
550 0 : fd_topo_wksp_t const * wksp = &config->topo.workspaces[ i ];
551 :
552 0 : char expected_path[ PATH_MAX ];
553 0 : workspace_path( config, wksp, expected_path );
554 :
555 0 : if( !strcmp( entry_path, expected_path ) ) {
556 0 : known_file = 1;
557 0 : break;
558 0 : }
559 0 : }
560 :
561 0 : if( mount_type==0UL ) {
562 0 : for( ulong i=0UL; i<config->topo.tile_cnt; i++ ) {
563 0 : fd_topo_tile_t const * tile = &config->topo.tiles [ i ];
564 :
565 0 : char expected_path[ PATH_MAX ];
566 0 : FD_TEST( fd_cstr_printf_check( expected_path, PATH_MAX, NULL, "%s/%s_stack_%s%lu", config->hugetlbfs.huge_page_mount_path, config->name, tile->name, tile->kind_id ) );
567 :
568 0 : if( !strcmp( entry_path, expected_path ) ) {
569 0 : known_file = 1;
570 0 : break;
571 0 : }
572 0 : }
573 0 : }
574 :
575 0 : if( FD_UNLIKELY( !known_file ) ) FD_LOG_WARNING(( "unknown file `%s` found in `%s`", entry->d_name, mount_path ));
576 0 : }
577 :
578 0 : if( FD_UNLIKELY( errno && errno!=ENOENT ) ) FD_LOG_ERR(( "error reading dir `%s` (%i-%s)", mount_path, errno, fd_io_strerror( errno ) ));
579 0 : if( FD_UNLIKELY( closedir( dir ) ) ) FD_LOG_ERR(( "error closing `%s` (%i-%s)", mount_path, errno, fd_io_strerror( errno ) ));
580 0 : }
581 :
582 : void
583 0 : initialize_workspaces( config_t * config ) {
584 : /* Switch to non-root uid/gid for workspace creation. Permissions
585 : checks are still done as the current user. */
586 0 : uint gid = getgid();
587 0 : uint uid = getuid();
588 0 : if( FD_LIKELY( gid!=config->gid && -1==setegid( config->gid ) ) )
589 0 : FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
590 0 : if( FD_LIKELY( uid!=config->uid && -1==seteuid( config->uid ) ) )
591 0 : FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
592 :
593 0 : for( ulong i=0UL; i<config->topo.wksp_cnt; i++ ) {
594 0 : fd_topo_wksp_t * wksp = &config->topo.workspaces[ i ];
595 :
596 0 : char path[ PATH_MAX ];
597 0 : workspace_path( config, wksp, path );
598 :
599 0 : struct stat st;
600 0 : int result = stat( path, &st );
601 :
602 0 : int update_existing;
603 0 : if( FD_UNLIKELY( !result && config->is_live_cluster ) ) {
604 0 : if( FD_UNLIKELY( -1==unlink( path ) && errno!=ENOENT ) ) FD_LOG_ERR(( "unlink() failed when trying to create workspace `%s` (%i-%s)", path, errno, fd_io_strerror( errno ) ));
605 0 : update_existing = 0;
606 0 : } else if( FD_UNLIKELY( !result ) ) {
607 : /* Creating all of the workspaces is very expensive because the
608 : kernel has to zero out all of the pages. There can be tens or
609 : hundreds of gigabytes of zeroing to do.
610 :
611 : What would be really nice is if the kernel let us create huge
612 : pages without zeroing them, but it's not possible. The
613 : ftruncate and fallocate calls do not support this type of
614 : resize with the hugetlbfs filesystem.
615 :
616 : Instead.. to prevent repeatedly doing this zeroing every time
617 : we start the validator, we have a small hack here to re-use the
618 : workspace files if they exist. */
619 0 : update_existing = 1;
620 0 : } else if( FD_LIKELY( result && errno==ENOENT ) ) {
621 0 : update_existing = 0;
622 0 : } else {
623 0 : FD_LOG_ERR(( "stat failed when trying to create workspace `%s` (%i-%s)", path, errno, fd_io_strerror( errno ) ));
624 0 : }
625 :
626 0 : if( FD_UNLIKELY( -1==fd_topo_create_workspace( &config->topo, wksp, update_existing ) ) ) {
627 0 : FD_TEST( errno==ENOMEM );
628 :
629 0 : warn_unknown_files( config, wksp->page_sz!=FD_SHMEM_HUGE_PAGE_SZ );
630 :
631 0 : char path[ PATH_MAX ];
632 0 : workspace_path( config, wksp, path );
633 0 : FD_LOG_ERR(( "ENOMEM-Out of memory when trying to create workspace `%s` at `%s` "
634 0 : "with %lu %s pages. Firedancer reserves enough memory for all of its workspaces "
635 0 : "during the `hugetlbfs` configure step, so it is likely you have unknown files "
636 0 : "left over in this directory which are consuming memory, or another program on "
637 0 : "the system is using pages from the same mount.",
638 0 : wksp->name, path, wksp->page_cnt, fd_shmem_page_sz_to_cstr( wksp->page_sz ) ));
639 0 : }
640 0 : fd_topo_join_workspace( &config->topo, wksp, FD_SHMEM_JOIN_MODE_READ_WRITE );
641 0 : fd_topo_wksp_new( &config->topo, wksp, CALLBACKS );
642 0 : fd_topo_leave_workspace( &config->topo, wksp );
643 0 : }
644 :
645 0 : if( FD_UNLIKELY( seteuid( uid ) ) ) FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
646 0 : if( FD_UNLIKELY( setegid( gid ) ) ) FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
647 0 : }
648 :
649 : void
650 0 : initialize_stacks( config_t const * config ) {
651 : # if FD_HAS_MSAN
652 : /* MSan calls an external symbolizer using fork() on crashes, which is
653 : incompatible with Firedancer's MAP_SHARED stacks. */
654 : (void)config;
655 : return;
656 : # endif
657 :
658 : /* Switch to non-root uid/gid for workspace creation. Permissions
659 : checks are still done as the current user. */
660 0 : uint gid = getgid();
661 0 : uint uid = getuid();
662 0 : if( FD_LIKELY( gid!=config->gid && -1==setegid( config->gid ) ) )
663 0 : FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
664 0 : if( FD_LIKELY( uid!=config->uid && -1==seteuid( config->uid ) ) )
665 0 : FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
666 :
667 0 : for( ulong i=0UL; i<config->topo.tile_cnt; i++ ) {
668 0 : fd_topo_tile_t const * tile = &config->topo.tiles[ i ];
669 :
670 0 : char path[ PATH_MAX ];
671 0 : FD_TEST( fd_cstr_printf_check( path, PATH_MAX, NULL, "%s/%s_stack_%s%lu", config->hugetlbfs.huge_page_mount_path, config->name, tile->name, tile->kind_id ) );
672 :
673 0 : struct stat st;
674 0 : int result = stat( path, &st );
675 :
676 0 : int update_existing;
677 0 : if( FD_UNLIKELY( !result && config->is_live_cluster ) ) {
678 0 : if( FD_UNLIKELY( -1==unlink( path ) && errno!=ENOENT ) ) FD_LOG_ERR(( "unlink() failed when trying to create stack workspace `%s` (%i-%s)", path, errno, fd_io_strerror( errno ) ));
679 0 : update_existing = 0;
680 0 : } else if( FD_UNLIKELY( !result ) ) {
681 : /* See above note about zeroing out pages. */
682 0 : update_existing = 1;
683 0 : } else if( FD_LIKELY( result && errno==ENOENT ) ) {
684 0 : update_existing = 0;
685 0 : } else {
686 0 : FD_LOG_ERR(( "stat failed when trying to create workspace `%s` (%i-%s)", path, errno, fd_io_strerror( errno ) ));
687 0 : }
688 :
689 : /* TODO: Use a better CPU idx for the stack if tile is floating */
690 0 : ulong stack_cpu_idx = 0UL;
691 0 : if( FD_LIKELY( tile->cpu_idx<65535UL ) ) stack_cpu_idx = tile->cpu_idx;
692 :
693 0 : char name[ PATH_MAX ];
694 0 : FD_TEST( fd_cstr_printf_check( name, PATH_MAX, NULL, "%s_stack_%s%lu", config->name, tile->name, tile->kind_id ) );
695 :
696 0 : ulong sub_page_cnt[ 1 ] = { 6 };
697 0 : ulong sub_cpu_idx [ 1 ] = { stack_cpu_idx };
698 0 : int err;
699 0 : if( FD_UNLIKELY( update_existing ) ) {
700 0 : err = fd_shmem_update_multi( name, FD_SHMEM_HUGE_PAGE_SZ, 1, sub_page_cnt, sub_cpu_idx, S_IRUSR | S_IWUSR ); /* logs details */
701 0 : } else {
702 0 : err = fd_shmem_create_multi( name, FD_SHMEM_HUGE_PAGE_SZ, 1, sub_page_cnt, sub_cpu_idx, S_IRUSR | S_IWUSR ); /* logs details */
703 0 : }
704 0 : if( FD_UNLIKELY( err && errno==ENOMEM ) ) {
705 0 : warn_unknown_files( config, 0UL );
706 :
707 0 : char path[ PATH_MAX ];
708 0 : FD_TEST( fd_cstr_printf_check( path, PATH_MAX, NULL, "%s/%s_stack_%s%lu", config->hugetlbfs.huge_page_mount_path, config->name, tile->name, tile->kind_id ) );
709 0 : FD_LOG_ERR(( "ENOMEM-Out of memory when trying to create huge page stack for tile `%s` at `%s`. "
710 0 : "Firedancer reserves enough memory for all of its stacks during the `hugetlbfs` configure "
711 0 : "step, so it is likely you have unknown files left over in this directory which are "
712 0 : "consuming memory, or another program on the system is using pages from the same mount.",
713 0 : tile->name, path ));
714 0 : } else if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "fd_shmem_create_multi failed" ));
715 0 : }
716 :
717 0 : if( FD_UNLIKELY( seteuid( uid ) ) ) FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
718 0 : if( FD_UNLIKELY( setegid( gid ) ) ) FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
719 0 : }
720 :
721 : void
722 0 : fdctl_check_configure( config_t const * config ) {
723 0 : configure_result_t check = fd_cfg_stage_hugetlbfs.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
724 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
725 0 : FD_LOG_ERR(( "Huge pages are not configured correctly: %s. You can run `fdctl configure init hugetlbfs` "
726 0 : "to create the mounts correctly. This must be done after every system restart before running "
727 0 : "Firedancer.", check.message ));
728 :
729 0 : if( FD_LIKELY( !config->development.netns.enabled && 0==strcmp( config->net.provider, "xdp" ) ) ) {
730 0 : if( fd_cfg_stage_bonding.enabled( config ) ) {
731 0 : check = fd_cfg_stage_bonding.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
732 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
733 0 : FD_LOG_ERR(( "Bonded network device is not configured correctly: %s. You can run `fdctl configure init bonding` "
734 0 : "to configure the bonding driver.", check.message ));
735 0 : }
736 :
737 0 : check = fd_cfg_stage_ethtool_channels.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
738 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
739 0 : FD_LOG_ERR(( "Network %s. You can run `fdctl configure init ethtool-channels` to set the number of channels on the "
740 0 : "network device correctly.", check.message ));
741 :
742 0 : check = fd_cfg_stage_ethtool_offloads.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
743 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
744 0 : FD_LOG_ERR(( "Network %s. You can run `fdctl configure init ethtool-offloads` to disable features "
745 0 : "as required.", check.message ));
746 :
747 0 : check = fd_cfg_stage_ethtool_loopback.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
748 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
749 0 : FD_LOG_ERR(( "Network %s. You can run `fdctl configure init ethtool-loopback` to disable tx-udp-segmentation "
750 0 : "on the loopback device.", check.message ));
751 0 : }
752 :
753 0 : check = fd_cfg_stage_sysctl.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
754 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
755 0 : FD_LOG_ERR(( "Kernel parameters are not configured correctly: %s. You can run `fdctl configure init sysctl` "
756 0 : "to set kernel parameters correctly.", check.message ));
757 :
758 0 : check = fd_cfg_stage_hyperthreads.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
759 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
760 0 : FD_LOG_ERR(( "Hyperthreading is not configured correctly: %s. You can run `fdctl configure init hyperthreads` "
761 0 : "to configure hyperthreading correctly.", check.message ));
762 0 : }
763 :
764 : void
765 : run_firedancer_init( config_t * config,
766 : int init_workspaces,
767 0 : int check_configure ) {
768 0 : struct stat st;
769 0 : int err = stat( config->paths.identity_key, &st );
770 0 : if( FD_UNLIKELY( -1==err && errno==ENOENT ) ) FD_LOG_ERR(( "[consensus.identity_path] key does not exist `%s`. You can generate an identity key at this path by running `fdctl keys new identity --config <toml>`", config->paths.identity_key ));
771 0 : else if( FD_UNLIKELY( -1==err ) ) FD_LOG_ERR(( "could not stat [consensus.identity_path] `%s` (%i-%s)", config->paths.identity_key, errno, fd_io_strerror( errno ) ));
772 :
773 0 : if( FD_UNLIKELY( !config->is_firedancer ) ) {
774 0 : for( ulong i=0UL; i<config->frankendancer.paths.authorized_voter_paths_cnt; i++ ) {
775 0 : err = stat( config->frankendancer.paths.authorized_voter_paths[ i ], &st );
776 0 : if( FD_UNLIKELY( -1==err && errno==ENOENT ) ) FD_LOG_ERR(( "[consensus.authorized_voter_paths] key does not exist `%s`", config->frankendancer.paths.authorized_voter_paths[ i ] ));
777 0 : else if( FD_UNLIKELY( -1==err ) ) FD_LOG_ERR(( "could not stat [consensus.authorized_voter_paths] `%s` (%i-%s)", config->frankendancer.paths.authorized_voter_paths[ i ], errno, fd_io_strerror( errno ) ));
778 0 : }
779 0 : }
780 :
781 : /* FIXME: fdctl_check_configure unconditionally checks for network
782 : stack prerequisites even if the command being run does not
783 : require networking. Hack around that here for now. */
784 0 : if( check_configure ) fdctl_check_configure( config );
785 0 : if( FD_LIKELY( init_workspaces ) ) initialize_workspaces( config );
786 0 : initialize_stacks( config );
787 0 : }
788 :
789 : void
790 : fdctl_setup_netns( config_t * config,
791 0 : int stay ) {
792 0 : if( !config->development.netns.enabled ) return;
793 :
794 0 : int original_netns_;
795 0 : int * original_netns = stay ? NULL : &original_netns_;
796 0 : if( FD_UNLIKELY( -1==fd_net_util_netns_enter( config->net.interface, original_netns ) ) )
797 0 : FD_LOG_ERR(( "failed to enter network namespace `%s` (%i-%s)", config->net.interface, errno, fd_io_strerror( errno ) ));
798 :
799 0 : if( 0==strcmp( config->net.provider, "xdp" ) ) {
800 0 : fd_cfg_stage_ethtool_channels.init( config );
801 0 : fd_cfg_stage_ethtool_offloads.init( config );
802 0 : fd_cfg_stage_ethtool_loopback.init( config );
803 0 : }
804 :
805 0 : if( FD_UNLIKELY( original_netns && -1==fd_net_util_netns_restore( original_netns_ ) ) )
806 0 : FD_LOG_ERR(( "failed to restore network namespace (fd=%d) (%i-%s)", original_netns_, errno, fd_io_strerror( errno ) ));
807 0 : }
808 :
/* The boot sequence is a little bit involved...

   A process tree is created that looks like,

   + main
   +-- pidns
       +-- agave
       +-- tile 0
       +-- tile 1
       ...

   What we want is that if any process in the tree dies, all other
   processes will also die.  This is done as follows,

    (a) pidns is the init process of a PID namespace, so if it dies the
        kernel will terminate the child processes.

    (b) main is the parent of pidns, so it can issue a waitpid() on the
        child PID, and when it completes terminate itself.

    (c) pidns is the parent of agave and the tiles, so it could
        issue a waitpid() of -1 to wait for any of them to terminate,
        but how would it know if main has died?

    (d) main creates a pipe, keeps the read end, and passes the write
        end to pidns.  If main dies, the read end will be closed, and
        pidns will see an error condition on the write end.  Then pidns
        creates a pipe per child and passes the write end to the child.
        If any of the children die, the pipe will be closed, and pidns
        will get a HUP on the read end.

        Then pidns can call poll() on both the write end of the main
        pipe and the read end of all the child pipes.  If poll()
        reports a hangup or error (POLLHUP / POLLERR) on any of them,
        then pidns knows that the parent or a child has died, and it
        can terminate itself, which due to (a) and (b) will kill all
        other processes. */
844 : void
845 : run_firedancer( config_t * config,
846 : int parent_pipefd,
847 0 : int init_workspaces ) {
848 : /* dump the topology we are using to the output log */
849 0 : fd_topo_print_log( 0, &config->topo );
850 :
851 0 : run_firedancer_init( config, init_workspaces, 1 );
852 :
853 0 : #if defined(__x86_64__) || defined(__aarch64__)
854 :
855 : #ifndef SYS_landlock_create_ruleset
856 : #define SYS_landlock_create_ruleset 444
857 : #endif
858 :
859 0 : #ifndef LANDLOCK_CREATE_RULESET_VERSION
860 0 : #define LANDLOCK_CREATE_RULESET_VERSION (1U << 0)
861 0 : #endif
862 :
863 0 : #endif
864 0 : long abi = syscall( SYS_landlock_create_ruleset, NULL, 0, LANDLOCK_CREATE_RULESET_VERSION );
865 0 : if( -1L==abi && (errno==ENOSYS || errno==EOPNOTSUPP ) ) {
866 0 : FD_LOG_WARNING(( "The Landlock access control system is not supported by your Linux kernel. Firedancer uses landlock to "
867 0 : "provide an additional layer of security to the sandbox, but it is not required." ));
868 0 : }
869 :
870 0 : if( FD_UNLIKELY( close( 0 ) ) ) FD_LOG_ERR(( "close(0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
871 0 : if( FD_UNLIKELY( fd_log_private_logfile_fd()!=1 && close( 1 ) ) ) FD_LOG_ERR(( "close(1) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
872 :
873 0 : int pipefd;
874 0 : pid_namespace = clone_firedancer( config, parent_pipefd, &pipefd );
875 :
876 : /* Print the location of the logfile on SIGINT or SIGTERM, and also
877 : kill the child. They are connected by a pipe which the child is
878 : polling so we don't strictly need to kill the child, but its helpful
879 : to do that before printing the log location line, else it might
880 : get interleaved due to timing windows in the shutdown. */
881 0 : install_parent_signals();
882 :
883 0 : if( FD_UNLIKELY( close( config->log.lock_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
884 :
885 0 : struct sock_filter seccomp_filter[ 128UL ];
886 0 : populate_sock_filter_policy_main( 128UL, seccomp_filter, (uint)fd_log_private_logfile_fd(), (uint)pid_namespace );
887 :
888 0 : int allow_fds[ 4 ];
889 0 : ulong allow_fds_cnt = 0;
890 0 : allow_fds[ allow_fds_cnt++ ] = 2; /* stderr */
891 0 : if( FD_LIKELY( fd_log_private_logfile_fd()!=-1 ) )
892 0 : allow_fds[ allow_fds_cnt++ ] = fd_log_private_logfile_fd(); /* logfile, or maybe stdout */
893 0 : allow_fds[ allow_fds_cnt++ ] = pipefd; /* read end of main pipe */
894 0 : if( FD_UNLIKELY( parent_pipefd!=-1 ) )
895 0 : allow_fds[ allow_fds_cnt++ ] = parent_pipefd; /* write end of parent pipe */
896 :
897 0 : if( FD_LIKELY( config->development.sandbox ) ) {
898 0 : fd_sandbox_enter( config->uid,
899 0 : config->gid,
900 0 : 0,
901 0 : 0,
902 0 : 0,
903 0 : 1, /* Keep controlling terminal for main so it can receive Ctrl+C */
904 0 : 0,
905 0 : 0UL,
906 0 : 0UL,
907 0 : 0UL,
908 0 : allow_fds_cnt,
909 0 : allow_fds,
910 0 : sock_filter_policy_main_instr_cnt,
911 0 : seccomp_filter );
912 0 : } else {
913 0 : fd_sandbox_switch_uid_gid( config->uid, config->gid );
914 0 : }
915 :
916 : /* The supervsior process should not share the log lock, because a
917 : child process might die while holding it and we still need to
918 : reap and print errors. */
919 0 : int lock = 0;
920 0 : fd_log_private_shared_lock = &lock;
921 :
922 : /* the only clean way to exit is SIGINT or SIGTERM on this parent process,
923 : so if wait4() completes, it must be an error */
924 0 : int wstatus;
925 0 : if( FD_UNLIKELY( -1==wait4( pid_namespace, &wstatus, (int)__WALL, NULL ) ) )
926 0 : FD_LOG_ERR(( "main wait4() failed (%i-%s)\nLog at \"%s\"", errno, fd_io_strerror( errno ), fd_log_private_path ));
927 :
928 0 : if( FD_UNLIKELY( WIFSIGNALED( wstatus ) ) ) fd_sys_util_exit_group( WTERMSIG( wstatus ) ? WTERMSIG( wstatus ) : 1 );
929 0 : else fd_sys_util_exit_group( WEXITSTATUS( wstatus ) ? WEXITSTATUS( wstatus ) : 1 );
930 0 : }
931 :
932 : void
933 : run_cmd_fn( args_t * args FD_PARAM_UNUSED,
934 0 : config_t * config ) {
935 0 : #define CHECK_PORT_NON_ZERO( field ) \
936 0 : if( FD_UNLIKELY( config->field==0 ) ) { \
937 0 : FD_LOG_ERR(( #field " is not set in your configuration file. Please set it to a non-zero value." )); \
938 0 : }
939 :
940 0 : if( FD_UNLIKELY( !config->gossip.entrypoints_cnt && !config->development.bootstrap ) )
941 0 : FD_LOG_ERR(( "No entrypoints specified in configuration file under [gossip.entrypoints], but "
942 0 : "at least one is needed to determine how to connect to the Solana cluster. If "
943 0 : "you want to start a new cluster in a development environment, use `fddev` instead "
944 0 : "of `fdctl`. If you want to use an existing genesis, set [development.bootstrap] "
945 0 : "to \"true\" in the configuration file." ));
946 :
947 0 : for( ulong i=0; i<config->gossip.entrypoints_cnt; i++ ) {
948 0 : if( FD_UNLIKELY( !strcmp( config->gossip.entrypoints[ i ], "" ) ) )
949 0 : FD_LOG_ERR(( "One of the entrypoints in your configuration file under [gossip.entrypoints] is "
950 0 : "empty. Please remove the empty entrypoint or set it correctly. "));
951 0 : }
952 :
953 0 : CHECK_PORT_NON_ZERO( gossip.port );
954 0 : CHECK_PORT_NON_ZERO( tiles.quic.quic_transaction_listen_port );
955 0 : CHECK_PORT_NON_ZERO( tiles.quic.regular_transaction_listen_port );
956 0 : CHECK_PORT_NON_ZERO( tiles.shred.shred_listen_port );
957 0 : CHECK_PORT_NON_ZERO( tiles.metric.prometheus_listen_port );
958 0 : CHECK_PORT_NON_ZERO( tiles.gui.gui_listen_port );
959 :
960 0 : #undef CHECK_PORT_NON_ZERO
961 :
962 0 : run_firedancer( config, -1, 1 );
963 0 : }
964 :
965 : action_t fd_action_run1 = {
966 : .name = "run1",
967 : .args = run1_cmd_args,
968 : .fn = run1_cmd_fn,
969 : .perm = NULL,
970 : .description = "Start up a single Firedancer tile"
971 : };
972 :
973 : action_t fd_action_run = {
974 : .name = "run",
975 : .args = NULL,
976 : .fn = run_cmd_fn,
977 : .require_config = 1,
978 : .perm = run_cmd_perm,
979 : .description = "Start up a Firedancer validator",
980 : .permission_err = "insufficient permissions to execute command `%s`. It is recommended "
981 : "to start Firedancer as the root user, but you can also start it "
982 : "with the missing capabilities listed above. The program only needs "
983 : "to start with elevated permissions to do privileged operations at "
984 : "boot, and will immediately drop permissions and switch to the user "
985 : "specified in your configuration file once they are complete. Firedancer "
986 : "will not execute outside of the boot process as root, and will refuse "
987 : "to start if it cannot drop privileges. Firedancer needs to be started "
988 : "privileged to configure high performance networking with XDP.",
989 : };
|