Line data Source code
1 : #define _GNU_SOURCE
2 : #include "run.h"
3 : #include "../../../../flamenco/accdb/fd_accdb.h"
4 :
5 : #include <sys/wait.h>
6 : #include "generated/main_seccomp.h"
7 : #if defined(__aarch64__)
8 : #include "generated/pidns_arm64_seccomp.h"
9 : #else
10 : #include "generated/pidns_seccomp.h"
11 : #endif
12 :
13 : #include "../../../platform/fd_sys_util.h"
14 : #include "../../../platform/fd_file_util.h"
15 : #include "../../../platform/fd_net_util.h"
16 : #include "../../../../disco/net/fd_net_tile.h"
17 :
18 : #include "../configure/configure.h"
19 :
20 : #include <dirent.h>
21 : #include <sched.h>
22 : #include <stdio.h>
23 : #include <stdlib.h> /* getenv */
24 : #include <poll.h>
25 : #include <unistd.h>
26 : #include <errno.h>
27 : #include <fcntl.h>
28 : #include <sys/prctl.h>
29 : #include <sys/resource.h>
30 : #include <sys/mman.h>
31 : #include <sys/stat.h>
32 : #include <linux/capability.h>
33 :
34 : #include "../../../../util/tile/fd_tile_private.h"
35 :
/* Table of topology object callbacks, defined in another translation
   unit.  In this file it is handed to fd_topo_wksp_new() when the
   workspaces are initialized. */
extern fd_topo_obj_callbacks_t * CALLBACKS[];

/* Name of this command, passed as the second argument of every
   fd_cap_chk_* call made by run_cmd_perm(). */
#define NAME "run"
39 :
40 : void
41 : run_cmd_perm( args_t * args,
42 : fd_cap_chk_t * chk,
43 0 : config_t const * config ) {
44 0 : (void)args;
45 :
46 0 : ulong mlock_limit = fd_topo_mlock_max_tile( &config->topo );
47 :
48 0 : fd_cap_chk_raise_rlimit( chk, NAME, RLIMIT_MEMLOCK, mlock_limit, "call `rlimit(2)` to increase `RLIMIT_MEMLOCK` so all memory can be locked with `mlock(2)`" );
49 0 : fd_cap_chk_raise_rlimit( chk, NAME, RLIMIT_NICE, 40, "call `setpriority(2)` to increase thread priorities" );
50 0 : fd_cap_chk_raise_rlimit( chk, NAME, RLIMIT_NOFILE, CONFIGURE_NR_OPEN_FILES,
51 0 : "call `rlimit(2) to increase `RLIMIT_NOFILE` to allow more open files for Agave" );
52 0 : fd_cap_chk_cap( chk, NAME, CAP_NET_RAW, "call `socket(2)` to bind to a raw socket for use by XDP" );
53 0 : fd_cap_chk_cap( chk, NAME, CAP_SYS_ADMIN, "call `bpf(2)` with the `BPF_OBJ_GET` command to initialize XDP" );
54 0 : if( fd_sandbox_requires_cap_sys_admin( config->uid, config->gid ) )
55 0 : fd_cap_chk_cap( chk, NAME, CAP_SYS_ADMIN, "call `unshare(2)` with `CLONE_NEWUSER` to sandbox the process in a user namespace" );
56 0 : if( FD_LIKELY( getuid() != config->uid ) )
57 0 : fd_cap_chk_cap( chk, NAME, CAP_SETUID, "call `setresuid(2)` to switch uid to the sandbox user" );
58 0 : if( FD_LIKELY( getgid()!=config->gid ) )
59 0 : fd_cap_chk_cap( chk, NAME, CAP_SETGID, "call `setresgid(2)` to switch gid to the sandbox user" );
60 0 : if( FD_UNLIKELY( config->tiles.metric.prometheus_listen_port<1024 ) )
61 0 : fd_cap_chk_cap( chk, NAME, CAP_NET_BIND_SERVICE, "call `bind(2)` to bind to a privileged port for serving metrics" );
62 0 : if( FD_UNLIKELY( config->tiles.gui.gui_listen_port<1024 ) )
63 0 : fd_cap_chk_cap( chk, NAME, CAP_NET_BIND_SERVICE, "call `bind(2)` to bind to a privileged port for serving the GUI" );
64 0 : }
65 :
/* Argument bundle handed from clone_firedancer() to
   main_pid_namespace() through the void * argument of clone(2). */
struct pidns_clone_args {
  config_t const * config;  /* configuration used to boot the tiles */
  int *            pipefd;  /* two element pipe; the child closes [0] and polls [1] */
  int              closefd; /* extra fd the child closes on entry, or -1 for none */
};
71 :
extern char fd_log_private_path[ 1024 ]; /* empty string on start */

/* PID that parent_signal() sends SIGKILL to when non-zero.  It is not
   assigned in the part of the file shown here (clone_firedancer() uses
   a separate local for its return value) -- NOTE(review): presumably
   set by the caller of clone_firedancer(); confirm. */
static pid_t pid_namespace;
75 :
/* FD_LOG_ERR_NOEXIT((fmt,...)) emits a log message by calling
   fd_log_private_1 directly with level 4.  In this file it is always
   followed by an explicit fd_sys_util_exit_group() so that the caller
   picks the exit code.  NOTE(review): level 4 presumably matches
   FD_LOG_ERR -- confirm against fd_log.h. */
#define FD_LOG_ERR_NOEXIT(a) do { long _fd_log_msg_now = fd_log_wallclock(); fd_log_private_1( 4, _fd_log_msg_now, __FILE__, __LINE__, __func__, fd_log_private_0 a ); } while(0)
77 :
78 : static void
79 0 : parent_signal( int sig ) {
80 0 : if( FD_LIKELY( pid_namespace ) ) kill( pid_namespace, SIGKILL );
81 :
82 0 : if( -1!=fd_log_private_logfile_fd() ) FD_LOG_ERR_NOEXIT(( "Received signal %s\nLog at \"%s\"", fd_io_strsignal( sig ), fd_log_private_path ));
83 0 : else FD_LOG_ERR_NOEXIT(( "Received signal %s", fd_io_strsignal( sig ) ));
84 :
85 0 : if( FD_LIKELY( sig==SIGINT ) ) fd_sys_util_exit_group( 128+SIGINT );
86 0 : else fd_sys_util_exit_group( 0 );
87 0 : }
88 :
89 : static void
90 0 : install_parent_signals( void ) {
91 0 : struct sigaction sa = {
92 0 : .sa_handler = parent_signal,
93 0 : .sa_flags = 0,
94 0 : };
95 0 : if( FD_UNLIKELY( sigaction( SIGTERM, &sa, NULL ) ) )
96 0 : FD_LOG_ERR(( "sigaction(SIGTERM) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
97 0 : if( FD_UNLIKELY( sigaction( SIGINT, &sa, NULL ) ) )
98 0 : FD_LOG_ERR(( "sigaction(SIGINT) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
99 :
100 0 : sa.sa_handler = SIG_IGN;
101 0 : if( FD_UNLIKELY( sigaction( SIGUSR1, &sa, NULL ) ) )
102 0 : FD_LOG_ERR(( "sigaction(SIGUSR1) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
103 0 : if( FD_UNLIKELY( sigaction( SIGUSR2, &sa, NULL ) ) )
104 0 : FD_LOG_ERR(( "sigaction(SIGUSR2) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
105 0 : }
106 :
107 : void *
108 0 : create_clone_stack( void ) {
109 0 : ulong mmap_sz = FD_TILE_PRIVATE_STACK_SZ + 2UL*FD_SHMEM_NORMAL_PAGE_SZ;
110 0 : uchar * stack = (uchar *)mmap( NULL, mmap_sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, (off_t)0 );
111 0 : if( FD_UNLIKELY( stack==MAP_FAILED ) )
112 0 : FD_LOG_ERR(( "mmap() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
113 :
114 : /* Make space for guard lo and guard hi */
115 0 : if( FD_UNLIKELY( munmap( stack, FD_SHMEM_NORMAL_PAGE_SZ ) ) )
116 0 : FD_LOG_ERR(( "munmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
117 0 : stack += FD_SHMEM_NORMAL_PAGE_SZ;
118 0 : if( FD_UNLIKELY( munmap( stack + FD_TILE_PRIVATE_STACK_SZ, FD_SHMEM_NORMAL_PAGE_SZ ) ) )
119 0 : FD_LOG_ERR(( "munmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
120 :
121 : /* Create the guard regions in the extra space */
122 0 : void * guard_lo = (void *)(stack - FD_SHMEM_NORMAL_PAGE_SZ );
123 0 : if( FD_UNLIKELY( mmap( guard_lo, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
124 0 : MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_lo ) )
125 0 : FD_LOG_ERR(( "mmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
126 :
127 0 : void * guard_hi = (void *)(stack + FD_TILE_PRIVATE_STACK_SZ);
128 0 : if( FD_UNLIKELY( mmap( guard_hi, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
129 0 : MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_hi ) )
130 0 : FD_LOG_ERR(( "mmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
131 :
132 0 : return stack;
133 0 : }
134 :
135 :
136 : static int
137 : execve_agave( int config_memfd,
138 0 : int pipefd ) {
139 0 : if( FD_UNLIKELY( -1==fcntl( pipefd, F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
140 0 : pid_t child = fork();
141 0 : if( FD_UNLIKELY( -1==child ) ) FD_LOG_ERR(( "fork() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
142 0 : if( FD_LIKELY( !child ) ) {
143 0 : char _current_executable_path[ PATH_MAX ];
144 0 : FD_TEST( -1!=fd_file_util_self_exe( _current_executable_path ) );
145 :
146 0 : char config_fd[ 32 ];
147 0 : FD_TEST( fd_cstr_printf_check( config_fd, sizeof( config_fd ), NULL, "%d", config_memfd ) );
148 0 : char * args[ 5 ] = { _current_executable_path, "run-agave", "--config-fd", config_fd, NULL };
149 :
150 0 : char * envp[] = { NULL, NULL };
151 0 : char * google_creds = getenv( "GOOGLE_APPLICATION_CREDENTIALS" );
152 0 : char provide_creds[ PATH_MAX+30UL ];
153 0 : if( FD_UNLIKELY( google_creds ) ) {
154 0 : FD_TEST( fd_cstr_printf_check( provide_creds, sizeof( provide_creds ), NULL, "GOOGLE_APPLICATION_CREDENTIALS=%s", google_creds ) );
155 0 : envp[ 0 ] = provide_creds;
156 0 : }
157 :
158 0 : if( FD_UNLIKELY( -1==execve( _current_executable_path, args, envp ) ) ) FD_LOG_ERR(( "execve() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
159 0 : } else {
160 0 : if( FD_UNLIKELY( -1==fcntl( pipefd, F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
161 0 : return child;
162 0 : }
163 0 : return 0;
164 0 : }
165 :
166 : static pid_t
167 : execve_tile( fd_topo_tile_t const * tile,
168 : fd_cpuset_t const * floating_cpu_set,
169 : int floating_priority,
170 : int config_memfd,
171 0 : int pipefd ) {
172 0 : FD_CPUSET_DECL( cpu_set );
173 0 : if( FD_LIKELY( tile->cpu_idx!=ULONG_MAX ) ) {
174 : /* set the thread affinity before we clone the new process to ensure
175 : kernel first touch happens on the desired thread. */
176 0 : fd_cpuset_insert( cpu_set, tile->cpu_idx );
177 0 : if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, -19 ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
178 0 : } else {
179 0 : fd_memcpy( cpu_set, floating_cpu_set, fd_cpuset_footprint() );
180 0 : if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, floating_priority ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
181 0 : }
182 :
183 0 : if( FD_UNLIKELY( fd_cpuset_setaffinity( 0, cpu_set ) ) ) {
184 0 : if( FD_LIKELY( errno==EINVAL ) ) {
185 0 : FD_LOG_ERR(( "Unable to set the thread affinity for tile %s:%lu on cpu %lu. It is likely that the affinity "
186 0 : "you have specified for this tile in [layout.affinity] of your configuration file contains a "
187 0 : "CPU (%lu) which does not exist on this machine.",
188 0 : tile->name, tile->kind_id, tile->cpu_idx, tile->cpu_idx ));
189 0 : } else {
190 0 : FD_LOG_ERR(( "sched_setaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
191 0 : }
192 0 : }
193 :
194 : /* Clear CLOEXEC on the side of the pipe we want to pass to the tile. */
195 0 : if( FD_UNLIKELY( -1==fcntl( pipefd, F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
196 0 : pid_t child = fork();
197 0 : if( FD_UNLIKELY( -1==child ) ) FD_LOG_ERR(( "fork() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
198 0 : if( FD_LIKELY( !child ) ) {
199 0 : char _current_executable_path[ PATH_MAX ];
200 0 : FD_TEST( -1!=fd_file_util_self_exe( _current_executable_path ) );
201 :
202 0 : char kind_id[ 32 ], config_fd[ 32 ], pipe_fd[ 32 ];
203 0 : FD_TEST( fd_cstr_printf_check( kind_id, sizeof( kind_id ), NULL, "%lu", tile->kind_id ) );
204 0 : FD_TEST( fd_cstr_printf_check( config_fd, sizeof( config_fd ), NULL, "%d", config_memfd ) );
205 0 : FD_TEST( fd_cstr_printf_check( pipe_fd, sizeof( pipe_fd ), NULL, "%d", pipefd ) );
206 0 : char const * args[ 9 ] = { _current_executable_path, "run1", tile->name, kind_id, "--pipe-fd", pipe_fd, "--config-fd", config_fd, NULL };
207 0 : if( FD_UNLIKELY( -1==execve( _current_executable_path, (char **)args, NULL ) ) ) FD_LOG_ERR(( "execve() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
208 0 : } else {
209 0 : if( FD_UNLIKELY( -1==fcntl( pipefd, F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
210 0 : return child;
211 0 : }
212 0 : return 0;
213 0 : }
214 :
215 : int
216 0 : main_pid_namespace( void * _args ) {
217 0 : struct pidns_clone_args * args = _args;
218 0 : if( FD_UNLIKELY( close( args->pipefd[ 0 ] ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
219 0 : if( FD_UNLIKELY( -1!=args->closefd ) ) {
220 0 : if( FD_UNLIKELY( close( args->closefd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
221 0 : }
222 :
223 0 : config_t const * config = args->config;
224 :
225 0 : fd_log_thread_set( "pidns" );
226 0 : ulong pid = fd_sandbox_getpid(); /* Need to read /proc again.. we got a new PID from clone */
227 0 : fd_log_private_group_id_set( pid );
228 0 : fd_log_private_thread_id_set( pid );
229 0 : fd_log_private_stack_discover( FD_TILE_PRIVATE_STACK_SZ,
230 0 : &fd_tile_private_stack0, &fd_tile_private_stack1 );
231 :
232 0 : if( FD_UNLIKELY( !config->development.sandbox ) ) {
233 : /* If no sandbox, then there's no actual PID namespace so we can't
234 : wait() grandchildren for the exit code. Do this as a workaround. */
235 0 : if( FD_UNLIKELY( -1==prctl( PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0 ) ) )
236 0 : FD_LOG_ERR(( "prctl(PR_SET_CHILD_SUBREAPER) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
237 0 : }
238 :
239 : /* Save the current affinity, it will be restored after creating any child tiles */
240 0 : FD_CPUSET_DECL( floating_cpu_set );
241 0 : if( FD_UNLIKELY( fd_cpuset_getaffinity( 0, floating_cpu_set ) ) )
242 0 : FD_LOG_ERR(( "fd_cpuset_getaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
243 :
244 0 : pid_t child_pids[ FD_TOPO_MAX_TILES+1 ];
245 0 : ulong actual_pids[ FD_TOPO_MAX_TILES+1 ];
246 0 : for( ulong i=0UL; i<FD_TOPO_MAX_TILES+1; i++ ) actual_pids[ i ] = ULONG_MAX;
247 0 : char child_names[ FD_TOPO_MAX_TILES+1 ][ 32 ];
248 0 : ulong child_idxs[ FD_TOPO_MAX_TILES+1 ];
249 0 : struct pollfd fds[ FD_TOPO_MAX_TILES+2 ];
250 :
251 0 : int config_memfd = fd_config_to_memfd( config );
252 0 : if( FD_UNLIKELY( -1==config_memfd ) ) FD_LOG_ERR(( "fd_config_to_memfd() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
253 :
254 0 : ulong child_cnt = 0UL;
255 0 : if( FD_LIKELY( !config->is_firedancer && !config->development.no_agave ) ) {
256 0 : int pipefd[ 2 ];
257 0 : if( FD_UNLIKELY( pipe2( pipefd, O_CLOEXEC ) ) ) FD_LOG_ERR(( "pipe2() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
258 0 : fds[ child_cnt ] = (struct pollfd){ .fd = pipefd[ 0 ], .events = 0 };
259 0 : child_pids[ child_cnt ] = execve_agave( config_memfd, pipefd[ 1 ] );
260 0 : FD_TEST( child_pids[ child_cnt ]>0 );
261 0 : actual_pids[ child_cnt ] = (ulong)child_pids[ child_cnt ];
262 0 : child_idxs[ child_cnt ] = ULONG_MAX;
263 0 : if( FD_UNLIKELY( close( pipefd[ 1 ] ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
264 0 : strncpy( child_names[ child_cnt ], "agave", 32 );
265 0 : child_cnt++;
266 0 : }
267 :
268 0 : errno = 0;
269 0 : int save_priority = getpriority( PRIO_PROCESS, 0 );
270 0 : if( FD_UNLIKELY( -1==save_priority && errno ) ) FD_LOG_ERR(( "getpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
271 :
272 0 : int need_xdp = 0==strcmp( config->net.provider, "xdp" );
273 0 : fd_xdp_fds_t xdp_fds[ FD_TOPO_XDP_FDS_MAX ];
274 0 : uint xdp_fds_cnt = FD_TOPO_XDP_FDS_MAX;
275 0 : if( need_xdp ) {
276 0 : fd_topo_install_xdp( &config->topo, xdp_fds, &xdp_fds_cnt, config->net.bind_address_parsed, 0 );
277 0 : }
278 :
279 0 : initialize_accdb_fd( config );
280 :
281 0 : for( ulong i=0UL; i<config->topo.tile_cnt; i++ ) {
282 0 : fd_topo_tile_t const * tile = &config->topo.tiles[ i ];
283 0 : if( FD_UNLIKELY( tile->is_agave ) ) continue;
284 :
285 0 : if( need_xdp ) {
286 0 : if( FD_UNLIKELY( strcmp( tile->name, "net" ) ) ) {
287 0 : for( uint i=0U; i<xdp_fds_cnt; i++ ) {
288 : /* close XDP related file descriptors */
289 0 : if( FD_UNLIKELY( -1==fcntl( xdp_fds[i].xsk_map_fd, F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
290 0 : if( FD_UNLIKELY( -1==fcntl( xdp_fds[i].prog_link_fd, F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
291 0 : }
292 0 : } else {
293 0 : for( uint i=0U; i<xdp_fds_cnt; i++ ) {
294 0 : if( FD_UNLIKELY( -1==fcntl( xdp_fds[i].xsk_map_fd, F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
295 0 : if( FD_UNLIKELY( -1==fcntl( xdp_fds[i].prog_link_fd, F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
296 0 : }
297 0 : }
298 0 : }
299 :
300 0 : if( FD_LIKELY( config->is_firedancer ) ) {
301 0 : int tile_uses_accdb = 0;
302 0 : int tile_uses_accdb_ro = 0;
303 0 : for( ulong i=0UL; i<tile->uses_obj_cnt; i++ ) {
304 0 : fd_topo_obj_t const * obj = &config->topo.objs[ tile->uses_obj_id[ i ] ];
305 0 : if( FD_UNLIKELY( !strcmp( obj->name, "accdb" ) ) ) {
306 0 : if( FD_UNLIKELY( tile->uses_obj_mode[ i ]==FD_SHMEM_JOIN_MODE_READ_ONLY ) ) tile_uses_accdb_ro = 1;
307 0 : else tile_uses_accdb = 1;
308 0 : break;
309 0 : }
310 0 : }
311 :
312 : /* The gui joins the accdb shmem read-only (for partition stats)
313 : but never reads account data from the on-disk file, so it does
314 : not need the accounts.db fd. Withhold it to keep the gui at
315 : least privilege. */
316 0 : if( FD_UNLIKELY( !strcmp( tile->name, "gui" ) ) ) tile_uses_accdb_ro = 0;
317 :
318 : /* snapwr writes accdb pwrite()s without joining accdb shmem, so
319 : it needs the RW fd despite not appearing as an accdb obj user
320 : in the topology. */
321 0 : if( FD_UNLIKELY( tile_uses_accdb || !strcmp( tile->name, "snapwr" ) ) ) {
322 0 : if( FD_UNLIKELY( -1==fcntl( FD_ACCDB_FD_RW, F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
323 0 : } else {
324 0 : if( FD_UNLIKELY( -1==fcntl( FD_ACCDB_FD_RW, F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
325 0 : }
326 :
327 0 : if( FD_UNLIKELY( tile_uses_accdb_ro ) ) {
328 0 : if( FD_UNLIKELY( -1==fcntl( FD_ACCDB_FD_RO, F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
329 0 : } else {
330 0 : if( FD_UNLIKELY( -1==fcntl( FD_ACCDB_FD_RO, F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
331 0 : }
332 0 : }
333 :
334 0 : int pipefd[ 2 ];
335 0 : if( FD_UNLIKELY( pipe2( pipefd, O_CLOEXEC ) ) ) FD_LOG_ERR(( "pipe2() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
336 0 : fds[ child_cnt ] = (struct pollfd){ .fd = pipefd[ 0 ], .events = 0 };
337 0 : child_pids[ child_cnt ] = execve_tile( tile, floating_cpu_set, save_priority, config_memfd, pipefd[ 1 ] );
338 0 : child_idxs[ child_cnt ] = i;
339 0 : if( FD_UNLIKELY( close( pipefd[ 1 ] ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
340 0 : strncpy( child_names[ child_cnt ], tile->name, 32 );
341 0 : child_cnt++;
342 0 : }
343 :
344 : /* Obtain the actual grandchild PID from the pipe */
345 0 : for( ulong i=0UL; i<child_cnt; i++ ) {
346 0 : if( FD_UNLIKELY( actual_pids[ i ]!=ULONG_MAX ) ) continue;
347 0 : FD_TEST( 8UL==read( fds[ i ].fd, &actual_pids[ i ], 8UL ) );
348 0 : }
349 :
350 0 : if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, save_priority ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
351 0 : if( FD_UNLIKELY( fd_cpuset_setaffinity( 0, floating_cpu_set ) ) )
352 0 : FD_LOG_ERR(( "fd_cpuset_setaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
353 :
354 0 : if( FD_UNLIKELY( close( config_memfd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
355 0 : if( need_xdp ) {
356 0 : for( uint i=0U; i<xdp_fds_cnt; i++ ) {
357 0 : if( FD_UNLIKELY( close( xdp_fds[i].xsk_map_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
358 0 : if( FD_UNLIKELY( close( xdp_fds[i].prog_link_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
359 0 : }
360 0 : }
361 :
362 0 : if( FD_LIKELY( config->is_firedancer ) ) {
363 0 : if( FD_UNLIKELY( -1==close( FD_ACCDB_FD_RW ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
364 0 : if( FD_UNLIKELY( -1==close( FD_ACCDB_FD_RO ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
365 0 : }
366 :
367 0 : int allow_fds[ 4+FD_TOPO_MAX_TILES ];
368 0 : ulong allow_fds_cnt = 0;
369 0 : allow_fds[ allow_fds_cnt++ ] = 2; /* stderr */
370 0 : if( FD_LIKELY( fd_log_private_logfile_fd()!=-1 ) )
371 0 : allow_fds[ allow_fds_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */
372 0 : allow_fds[ allow_fds_cnt++ ] = args->pipefd[ 1 ]; /* write end of main pipe */
373 0 : for( ulong i=0UL; i<child_cnt; i++ )
374 0 : allow_fds[ allow_fds_cnt++ ] = fds[ i ].fd; /* read end of child pipes */
375 :
376 0 : struct sock_filter seccomp_filter[ 128UL ];
377 0 : unsigned int instr_cnt;
378 : #if defined(__aarch64__)
379 : populate_sock_filter_policy_pidns_arm64( 128UL, seccomp_filter, (uint)fd_log_private_logfile_fd() );
380 : instr_cnt = sock_filter_policy_pidns_arm64_instr_cnt;
381 : #else
382 0 : populate_sock_filter_policy_pidns( 128UL, seccomp_filter, (uint)fd_log_private_logfile_fd() );
383 0 : instr_cnt = sock_filter_policy_pidns_instr_cnt;
384 0 : #endif
385 :
386 0 : if( FD_LIKELY( config->development.sandbox ) ) {
387 0 : fd_sandbox_enter( config->uid,
388 0 : config->gid,
389 0 : 0,
390 0 : 0,
391 0 : 0,
392 0 : 0,
393 0 : 0,
394 0 : 1UL+child_cnt, /* RLIMIT_NOFILE needs to be set to the nfds argument of poll() */
395 0 : 0UL,
396 0 : 0UL,
397 0 : 0UL,
398 0 : allow_fds_cnt,
399 0 : allow_fds,
400 0 : instr_cnt,
401 0 : seccomp_filter );
402 0 : } else {
403 0 : fd_sandbox_switch_uid_gid( config->uid, config->gid );
404 0 : }
405 :
406 : /* Reap child process PIDs so they don't show up in `ps` etc. All of
407 : these children should have exited immediately after clone(2)'ing
408 : another child with a huge page based stack. */
409 0 : for( ulong i=0UL; i<child_cnt; i++ ) {
410 0 : int wstatus;
411 0 : int exited_pid = wait4( child_pids[ i ], &wstatus, (int)__WALL, NULL );
412 0 : if( FD_UNLIKELY( -1==exited_pid ) ) {
413 0 : FD_LOG_ERR(( "pidns wait4() failed (%i-%s) %lu %hu", errno, fd_io_strerror( errno ), i, fds[i].revents ));
414 0 : } else if( FD_UNLIKELY( child_pids[ i ]!=exited_pid ) ) {
415 0 : FD_LOG_ERR(( "pidns wait4() returned unexpected pid %d %d", child_pids[ i ], exited_pid ));
416 0 : } else if( FD_UNLIKELY( !WIFEXITED( wstatus ) ) ) {
417 0 : FD_LOG_ERR_NOEXIT(( "tile %lu (%s) exited while booting with signal %d (%s)\n", i, child_names[ i ], WTERMSIG( wstatus ), fd_io_strsignal( WTERMSIG( wstatus ) ) ));
418 0 : fd_sys_util_exit_group( WTERMSIG( wstatus ) ? WTERMSIG( wstatus ) : 1 );
419 0 : }
420 0 : if( FD_UNLIKELY( WEXITSTATUS( wstatus ) ) ) {
421 0 : FD_LOG_ERR_NOEXIT(( "tile %lu (%s) exited while booting with code %d\n", i, child_names[ i ], WEXITSTATUS( wstatus ) ));
422 0 : fd_sys_util_exit_group( WEXITSTATUS( wstatus ) ? WEXITSTATUS( wstatus ) : 1 );
423 0 : }
424 0 : }
425 :
426 0 : fds[ child_cnt ] = (struct pollfd){ .fd = args->pipefd[ 1 ], .events = 0 };
427 0 : strncpy( child_names[ child_cnt ], "parent", 32UL );
428 0 : child_idxs[ child_cnt ] = ULONG_MAX;
429 :
430 : /* We are now the init process of the pid namespace. If the init
431 : process dies, all children are terminated. If any child dies, we
432 : terminate the init process, which will cause the kernel to
433 : terminate all other children bringing all of our processes down as
434 : a group. The parent process will also die if this process dies,
435 : due to getting SIGHUP on the pipe. */
436 0 : while( 1 ) {
437 0 : if( FD_UNLIKELY( -1==poll( fds, 1UL+child_cnt, (int)-1 ) ) ) FD_LOG_ERR(( "poll() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
438 :
439 : /* Parent process died, probably SIGINT, exit gracefully. */
440 0 : if( FD_UNLIKELY( fds[ child_cnt ].revents ) ) fd_sys_util_exit_group( 0 );
441 :
442 : /* Child process died, reap it to figure out exit code. */
443 0 : int wstatus;
444 0 : int exited_pid = wait4( -1, &wstatus, (int)__WALL | (int)WNOHANG, NULL );
445 0 : if( FD_UNLIKELY( -1==exited_pid ) ) {
446 0 : FD_LOG_ERR(( "pidns wait4() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
447 0 : } else if( FD_UNLIKELY( !exited_pid ) ) {
448 : /* Spurious wakeup, no child actually dead yet. */
449 0 : continue;
450 0 : }
451 :
452 : /* Now find the tile corresponding to that PID */
453 0 : FD_TEST( exited_pid>0 );
454 0 : int found = 0;
455 0 : for( ulong i=0UL; i<child_cnt; i++ ) {
456 0 : if( FD_LIKELY( actual_pids[ i ]!=(ulong)exited_pid ) ) continue;
457 :
458 0 : found = 1;
459 0 : fds[ i ].fd = -1; /* Don't poll on this tile anymore */
460 :
461 0 : char * tile_name = child_names[ i ];
462 0 : ulong tile_idx = child_idxs[ i ];
463 0 : ulong tile_id = config->topo.tiles[ tile_idx ].kind_id;
464 :
465 0 : if( FD_UNLIKELY( !WIFEXITED( wstatus ) ) ) {
466 0 : FD_LOG_ERR_NOEXIT(( "tile %s:%lu exited with signal %d (%s)", tile_name, tile_id, WTERMSIG( wstatus ), fd_io_strsignal( WTERMSIG( wstatus ) ) ));
467 0 : fd_sys_util_exit_group( WTERMSIG( wstatus ) ? WTERMSIG( wstatus ) : 1 );
468 0 : } else {
469 0 : int exit_code = WEXITSTATUS( wstatus );
470 0 : if( FD_LIKELY( !exit_code && tile_idx!=ULONG_MAX && config->topo.tiles[ tile_idx ].allow_shutdown ) ) {
471 0 : found = 1;
472 0 : FD_LOG_INFO(( "tile %s:%lu exited gracefully with code %d", tile_name, tile_id, exit_code ));
473 0 : } else {
474 0 : FD_LOG_ERR_NOEXIT(( "tile %s:%lu exited with code %d", tile_name, tile_id, exit_code ));
475 0 : fd_sys_util_exit_group( exit_code ? exit_code : 1 );
476 0 : }
477 0 : }
478 0 : }
479 :
480 0 : if( FD_UNLIKELY( !found ) ) FD_LOG_ERR(( "wait4() returned unexpected pid %d", exited_pid ));
481 0 : }
482 :
483 0 : return 0;
484 0 : }
485 :
486 : int
487 : clone_firedancer( config_t const * config,
488 : int close_fd,
489 0 : int * out_pipe ) {
490 : /* This pipe is here just so that the child process knows when the
491 : parent has died (it will get a HUP). */
492 0 : int pipefd[2];
493 0 : if( FD_UNLIKELY( pipe2( pipefd, O_CLOEXEC | O_NONBLOCK ) ) ) FD_LOG_ERR(( "pipe2() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
494 :
495 : /* clone into a pid namespace */
496 0 : int flags = config->development.sandbox ? CLONE_NEWPID : 0;
497 0 : struct pidns_clone_args args = { .config = config, .closefd = close_fd, .pipefd = pipefd, };
498 :
499 0 : void * stack = create_clone_stack();
500 :
501 0 : int pid_namespace = clone( main_pid_namespace, (uchar *)stack + FD_TILE_PRIVATE_STACK_SZ, flags, &args );
502 0 : if( FD_UNLIKELY( pid_namespace<0 ) ) FD_LOG_ERR(( "clone() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
503 :
504 0 : if( FD_UNLIKELY( close( pipefd[ 1 ] ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
505 :
506 0 : *out_pipe = pipefd[ 0 ];
507 0 : return pid_namespace;
508 0 : }
509 :
510 : static void
511 : workspace_path( config_t const * config,
512 : fd_topo_wksp_t const * wksp,
513 0 : char out[ PATH_MAX ] ) {
514 0 : char const * mount_path;
515 0 : switch( wksp->page_sz ) {
516 0 : case FD_SHMEM_HUGE_PAGE_SZ:
517 0 : mount_path = config->hugetlbfs.huge_page_mount_path;
518 0 : break;
519 0 : case FD_SHMEM_GIGANTIC_PAGE_SZ:
520 0 : mount_path = config->hugetlbfs.gigantic_page_mount_path;
521 0 : break;
522 0 : case FD_SHMEM_NORMAL_PAGE_SZ:
523 0 : mount_path = config->hugetlbfs.normal_page_mount_path;
524 0 : break;
525 0 : default:
526 0 : FD_LOG_ERR(( "invalid page size %lu", wksp->page_sz ));
527 0 : }
528 :
529 0 : FD_TEST( fd_cstr_printf_check( out, PATH_MAX, NULL, "%s/%s_%s.wksp", mount_path, config->name, wksp->name ) );
530 0 : }
531 :
532 : static void
533 : warn_unknown_files( config_t const * config,
534 0 : ulong mount_type ) {
535 0 : char const * mount_path;
536 0 : switch( mount_type ) {
537 0 : case 0UL:
538 0 : mount_path = config->hugetlbfs.huge_page_mount_path;
539 0 : break;
540 0 : case 1UL:
541 0 : mount_path = config->hugetlbfs.gigantic_page_mount_path;
542 0 : break;
543 0 : default:
544 0 : FD_LOG_ERR(( "invalid mount type %lu", mount_type ));
545 0 : }
546 :
547 : /* Check if there are any files in mount_path */
548 0 : DIR * dir = opendir( mount_path );
549 0 : if( FD_UNLIKELY( !dir ) ) {
550 0 : if( FD_UNLIKELY( errno!=ENOENT ) ) FD_LOG_ERR(( "error opening `%s` (%i-%s)", mount_path, errno, fd_io_strerror( errno ) ));
551 0 : return;
552 0 : }
553 :
554 0 : struct dirent * entry;
555 0 : for(;;) {
556 0 : errno = 0;
557 0 : entry = readdir( dir );
558 0 : if( FD_UNLIKELY( !entry ) ) break;
559 0 : if( FD_UNLIKELY( !strcmp( entry->d_name, ".") || !strcmp( entry->d_name, ".." ) ) ) continue;
560 :
561 0 : char entry_path[ PATH_MAX ];
562 0 : FD_TEST( fd_cstr_printf_check( entry_path, PATH_MAX, NULL, "%s/%s", mount_path, entry->d_name ));
563 :
564 0 : int known_file = 0;
565 0 : for( ulong i=0UL; i<config->topo.wksp_cnt; i++ ) {
566 0 : fd_topo_wksp_t const * wksp = &config->topo.workspaces[ i ];
567 :
568 0 : char expected_path[ PATH_MAX ];
569 0 : workspace_path( config, wksp, expected_path );
570 :
571 0 : if( !strcmp( entry_path, expected_path ) ) {
572 0 : known_file = 1;
573 0 : break;
574 0 : }
575 0 : }
576 :
577 0 : if( mount_type==0UL ) {
578 0 : for( ulong i=0UL; i<config->topo.tile_cnt; i++ ) {
579 0 : fd_topo_tile_t const * tile = &config->topo.tiles [ i ];
580 :
581 0 : char expected_path[ PATH_MAX ];
582 0 : FD_TEST( fd_cstr_printf_check( expected_path, PATH_MAX, NULL, "%s/%s_stack_%s%lu", config->hugetlbfs.huge_page_mount_path, config->name, tile->name, tile->kind_id ) );
583 :
584 0 : if( !strcmp( entry_path, expected_path ) ) {
585 0 : known_file = 1;
586 0 : break;
587 0 : }
588 0 : }
589 0 : }
590 :
591 0 : if( FD_UNLIKELY( !known_file ) ) FD_LOG_WARNING(( "unknown file `%s` found in `%s`", entry->d_name, mount_path ));
592 0 : }
593 :
594 0 : if( FD_UNLIKELY( errno ) ) FD_LOG_ERR(( "error reading dir `%s` (%i-%s)", mount_path, errno, fd_io_strerror( errno ) ));
595 0 : if( FD_UNLIKELY( closedir( dir ) ) ) FD_LOG_ERR(( "error closing `%s` (%i-%s)", mount_path, errno, fd_io_strerror( errno ) ));
596 0 : }
597 :
598 : void
599 0 : initialize_workspaces( config_t * config ) {
600 : /* Switch to non-root uid/gid for workspace creation. Permissions
601 : checks are still done as the current user. */
602 0 : uint gid = getgid();
603 0 : uint uid = getuid();
604 0 : if( FD_LIKELY( gid!=config->gid && -1==setegid( config->gid ) ) )
605 0 : FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
606 0 : if( FD_LIKELY( uid!=config->uid && -1==seteuid( config->uid ) ) )
607 0 : FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
608 :
609 0 : for( ulong i=0UL; i<config->topo.wksp_cnt; i++ ) {
610 0 : fd_topo_wksp_t * wksp = &config->topo.workspaces[ i ];
611 :
612 0 : char path[ PATH_MAX ];
613 0 : workspace_path( config, wksp, path );
614 :
615 0 : struct stat st;
616 0 : int result = stat( path, &st );
617 :
618 0 : int update_existing;
619 0 : if( FD_UNLIKELY( !result && config->is_live_cluster ) ) {
620 0 : if( FD_UNLIKELY( -1==unlink( path ) && errno!=ENOENT ) ) FD_LOG_ERR(( "unlink() failed when trying to create workspace `%s` (%i-%s)", path, errno, fd_io_strerror( errno ) ));
621 0 : update_existing = 0;
622 0 : } else if( FD_UNLIKELY( !result ) ) {
623 : /* Creating all of the workspaces is very expensive because the
624 : kernel has to zero out all of the pages. There can be tens or
625 : hundreds of gigabytes of zeroing to do.
626 :
627 : What would be really nice is if the kernel let us create huge
628 : pages without zeroing them, but it's not possible. The
629 : ftruncate and fallocate calls do not support this type of
630 : resize with the hugetlbfs filesystem.
631 :
632 : Instead.. to prevent repeatedly doing this zeroing every time
633 : we start the validator, we have a small hack here to re-use the
634 : workspace files if they exist. */
635 0 : update_existing = 1;
636 0 : } else if( FD_LIKELY( result && errno==ENOENT ) ) {
637 0 : update_existing = 0;
638 0 : } else {
639 0 : FD_LOG_ERR(( "stat failed when trying to create workspace `%s` (%i-%s)", path, errno, fd_io_strerror( errno ) ));
640 0 : }
641 :
642 0 : if( FD_UNLIKELY( -1==fd_topo_create_workspace( &config->topo, wksp, update_existing ) ) ) {
643 0 : FD_TEST( errno==ENOMEM );
644 :
645 0 : warn_unknown_files( config, wksp->page_sz!=FD_SHMEM_HUGE_PAGE_SZ );
646 :
647 0 : char path[ PATH_MAX ];
648 0 : workspace_path( config, wksp, path );
649 0 : FD_LOG_ERR(( "ENOMEM-Out of memory when trying to create workspace `%s` at `%s` "
650 0 : "with %lu %s pages. Firedancer reserves enough memory for all of its workspaces "
651 0 : "during the `hugetlbfs` configure step, so it is likely you have unknown files "
652 0 : "left over in this directory which are consuming memory, or another program on "
653 0 : "the system is using pages from the same mount.",
654 0 : wksp->name, path, wksp->page_cnt, fd_shmem_page_sz_to_cstr( wksp->page_sz ) ));
655 0 : }
656 0 : fd_topo_join_workspace( &config->topo, wksp, FD_SHMEM_JOIN_MODE_READ_WRITE, 0 );
657 0 : fd_topo_wksp_new( &config->topo, wksp, CALLBACKS );
658 0 : fd_topo_leave_workspace( &config->topo, wksp );
659 0 : }
660 :
661 0 : if( FD_UNLIKELY( seteuid( uid ) ) ) FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
662 0 : if( FD_UNLIKELY( setegid( gid ) ) ) FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
663 0 : }
664 :
665 : void
666 0 : initialize_stacks( config_t const * config ) {
667 : # if FD_HAS_MSAN
668 : /* MSan calls an external symbolizer using fork() on crashes, which is
669 : incompatible with Firedancer's MAP_SHARED stacks. */
670 : (void)config;
671 : return;
672 : # endif
673 :
674 : /* Switch to non-root uid/gid for workspace creation. Permissions
675 : checks are still done as the current user. */
676 0 : uint gid = getgid();
677 0 : uint uid = getuid();
678 0 : if( FD_LIKELY( gid!=config->gid && -1==setegid( config->gid ) ) )
679 0 : FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
680 0 : if( FD_LIKELY( uid!=config->uid && -1==seteuid( config->uid ) ) )
681 0 : FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
682 :
683 0 : for( ulong i=0UL; i<config->topo.tile_cnt; i++ ) {
684 0 : fd_topo_tile_t const * tile = &config->topo.tiles[ i ];
685 :
686 0 : char path[ PATH_MAX ];
687 0 : FD_TEST( fd_cstr_printf_check( path, PATH_MAX, NULL, "%s/%s_stack_%s%lu", config->hugetlbfs.huge_page_mount_path, config->name, tile->name, tile->kind_id ) );
688 :
689 0 : struct stat st;
690 0 : int result = stat( path, &st );
691 :
692 0 : int update_existing;
693 0 : if( FD_UNLIKELY( !result && config->is_live_cluster ) ) {
694 0 : if( FD_UNLIKELY( -1==unlink( path ) && errno!=ENOENT ) ) FD_LOG_ERR(( "unlink() failed when trying to create stack workspace `%s` (%i-%s)", path, errno, fd_io_strerror( errno ) ));
695 0 : update_existing = 0;
696 0 : } else if( FD_UNLIKELY( !result ) ) {
697 : /* See above note about zeroing out pages. */
698 0 : update_existing = 1;
699 0 : } else if( FD_LIKELY( result && errno==ENOENT ) ) {
700 0 : update_existing = 0;
701 0 : } else {
702 0 : FD_LOG_ERR(( "stat failed when trying to create workspace `%s` (%i-%s)", path, errno, fd_io_strerror( errno ) ));
703 0 : }
704 :
705 : /* TODO: Use a better CPU idx for the stack if tile is floating */
706 0 : ulong stack_cpu_idx = 0UL;
707 0 : if( FD_LIKELY( tile->cpu_idx<65535UL ) ) stack_cpu_idx = tile->cpu_idx;
708 :
709 0 : char name[ PATH_MAX ];
710 0 : FD_TEST( fd_cstr_printf_check( name, PATH_MAX, NULL, "%s_stack_%s%lu", config->name, tile->name, tile->kind_id ) );
711 :
712 0 : ulong sub_page_cnt[ 1 ] = { 6 };
713 0 : ulong sub_cpu_idx [ 1 ] = { stack_cpu_idx };
714 0 : int err;
715 0 : if( FD_UNLIKELY( update_existing ) ) {
716 0 : err = fd_shmem_update_multi( name, FD_SHMEM_HUGE_PAGE_SZ, 1, sub_page_cnt, sub_cpu_idx, S_IRUSR | S_IWUSR ); /* logs details */
717 0 : } else {
718 0 : err = fd_shmem_create_multi( name, FD_SHMEM_HUGE_PAGE_SZ, 1, sub_page_cnt, sub_cpu_idx, S_IRUSR | S_IWUSR ); /* logs details */
719 0 : }
720 0 : if( FD_UNLIKELY( err && errno==ENOMEM ) ) {
721 0 : warn_unknown_files( config, 0UL );
722 :
723 0 : char path[ PATH_MAX ];
724 0 : FD_TEST( fd_cstr_printf_check( path, PATH_MAX, NULL, "%s/%s_stack_%s%lu", config->hugetlbfs.huge_page_mount_path, config->name, tile->name, tile->kind_id ) );
725 0 : FD_LOG_ERR(( "ENOMEM-Out of memory when trying to create huge page stack for tile `%s` at `%s`. "
726 0 : "Firedancer reserves enough memory for all of its stacks during the `hugetlbfs` configure "
727 0 : "step, so it is likely you have unknown files left over in this directory which are "
728 0 : "consuming memory, or another program on the system is using pages from the same mount.",
729 0 : tile->name, path ));
730 0 : } else if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "fd_shmem_create_multi failed" ));
731 0 : }
732 :
733 0 : if( FD_UNLIKELY( seteuid( uid ) ) ) FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
734 0 : if( FD_UNLIKELY( setegid( gid ) ) ) FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
735 0 : }
736 :
737 : void
738 0 : fdctl_check_configure( config_t const * config ) {
739 0 : configure_result_t check = fd_cfg_stage_hugetlbfs.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
740 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
741 0 : FD_LOG_ERR(( "Huge pages are not configured correctly: %s. You can run `%s configure init hugetlbfs` "
742 0 : "to create the mounts correctly. This must be done after every system restart before running "
743 0 : "Firedancer.", check.message, FD_BINARY_NAME ));
744 :
745 0 : if( FD_LIKELY( 0==strcmp( config->net.provider, "xdp" ) ) ) {
746 0 : if( fd_cfg_stage_bonding.enabled( config ) ) {
747 0 : check = fd_cfg_stage_bonding.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
748 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
749 0 : FD_LOG_ERR(( "Bonded network device is not configured correctly: %s. You can run `%s configure init bonding` "
750 0 : "to configure the bonding driver.", check.message, FD_BINARY_NAME ));
751 0 : }
752 :
753 0 : check = fd_cfg_stage_ethtool_channels.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
754 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
755 0 : FD_LOG_ERR(( "Network %s. You can run `%s configure init ethtool-channels` to set the number of channels on the "
756 0 : "network device correctly.", check.message, FD_BINARY_NAME ));
757 :
758 0 : check = fd_cfg_stage_ethtool_offloads.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
759 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
760 0 : FD_LOG_ERR(( "Network %s. You can run `%s configure init ethtool-offloads` to disable features "
761 0 : "as required.", check.message, FD_BINARY_NAME ));
762 :
763 0 : check = fd_cfg_stage_ethtool_loopback.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
764 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
765 0 : FD_LOG_ERR(( "Network %s. You can run `%s configure init ethtool-loopback` to disable tx-udp-segmentation "
766 0 : "on the loopback device.", check.message, FD_BINARY_NAME ));
767 0 : }
768 :
769 0 : check = fd_cfg_stage_sysctl.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
770 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
771 0 : FD_LOG_ERR(( "Kernel parameters are not configured correctly: %s. You can run `%s configure init sysctl` "
772 0 : "to set kernel parameters correctly.", check.message, FD_BINARY_NAME ));
773 :
774 0 : check = fd_cfg_stage_hyperthreads.check( config, FD_CONFIGURE_CHECK_TYPE_RUN );
775 0 : if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
776 0 : FD_LOG_ERR(( "Hyperthreading is not configured correctly: %s. You can run `%s configure init hyperthreads` "
777 0 : "to configure hyperthreading correctly.", check.message, FD_BINARY_NAME ));
778 0 : }
779 :
780 : void
781 : run_firedancer_init( config_t * config,
782 : int init_workspaces,
783 0 : int check_configure ) {
784 0 : struct stat st;
785 0 : int err = stat( config->paths.identity_key, &st );
786 0 : if( FD_UNLIKELY( -1==err && errno==ENOENT ) ) FD_LOG_ERR(( "[consensus.identity_path] key does not exist `%s`. You can generate an identity key at this path by running `%s keys new %s --config <toml>`", config->paths.identity_key, FD_BINARY_NAME, config->paths.identity_key ));
787 0 : else if( FD_UNLIKELY( -1==err ) ) FD_LOG_ERR(( "could not stat [consensus.identity_path] `%s` (%i-%s)", config->paths.identity_key, errno, fd_io_strerror( errno ) ));
788 :
789 0 : if( FD_UNLIKELY( !config->is_firedancer ) ) {
790 0 : for( ulong i=0UL; i<config->frankendancer.paths.authorized_voter_paths_cnt; i++ ) {
791 0 : err = stat( config->frankendancer.paths.authorized_voter_paths[ i ], &st );
792 0 : if( FD_UNLIKELY( -1==err && errno==ENOENT ) ) FD_LOG_ERR(( "[consensus.authorized_voter_paths] key does not exist `%s`", config->frankendancer.paths.authorized_voter_paths[ i ] ));
793 0 : else if( FD_UNLIKELY( -1==err ) ) FD_LOG_ERR(( "could not stat [consensus.authorized_voter_paths] `%s` (%i-%s)", config->frankendancer.paths.authorized_voter_paths[ i ], errno, fd_io_strerror( errno ) ));
794 0 : }
795 0 : }
796 :
797 : /* FIXME: fdctl_check_configure unconditionally checks for network
798 : stack prerequisites even if the command being run does not
799 : require networking. Hack around that here for now. */
800 0 : if( check_configure ) fdctl_check_configure( config );
801 0 : if( FD_LIKELY( init_workspaces ) ) initialize_workspaces( config );
802 0 : initialize_stacks( config );
803 0 : }
804 :
805 : void
806 0 : initialize_accdb_fd( config_t const * config ) {
807 0 : if( FD_UNLIKELY( !config->is_firedancer ) ) return;
808 :
809 : /* TODO: O_TRUNC is a lot slower here, because it means we have to
810 : write out extents for the whole file instead of just marking them
811 : as free. Figure out performance implications of this and maybe
812 : resolve. */
813 0 : int accounts_fd = open( config->paths.accounts, O_RDWR|O_CREAT|O_NOATIME|O_TRUNC, S_IRUSR|S_IWUSR );
814 0 : if( FD_UNLIKELY( -1==accounts_fd ) ) FD_LOG_ERR(( "failed to open accounts.db (%i-%s)", errno, fd_io_strerror( errno ) ));
815 0 : if( FD_UNLIKELY( -1==dup2( accounts_fd, FD_ACCDB_FD_RW ) ) ) FD_LOG_ERR(( "dup2() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
816 0 : if( FD_UNLIKELY( -1==close( accounts_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
817 :
818 : /* Read-only fd for tiles (e.g. rpc) that consume accdb but must not
819 : be able to mutate the on-disk file. Reopen via /proc/self/fd to
820 : guarantee it refers to the same inode as the RW fd, avoiding any
821 : race where the file at the path could be replaced between opens. */
822 0 : char proc_path[ PATH_MAX ];
823 0 : FD_TEST( fd_cstr_printf_check( proc_path, sizeof(proc_path), NULL, "/proc/self/fd/%d", FD_ACCDB_FD_RW ) );
824 0 : int accounts_ro_fd = open( proc_path, O_RDONLY|O_NOATIME );
825 0 : if( FD_UNLIKELY( -1==accounts_ro_fd ) ) FD_LOG_ERR(( "failed to open accounts.db read-only (%i-%s)", errno, fd_io_strerror( errno ) ));
826 0 : if( FD_UNLIKELY( -1==dup2( accounts_ro_fd, FD_ACCDB_FD_RO ) ) ) FD_LOG_ERR(( "dup2() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
827 0 : if( FD_UNLIKELY( -1==close( accounts_ro_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
828 0 : }
829 :
830 : /* The boot sequence is a little bit involved...
831 :
832 : A process tree is created that looks like,
833 :
834 : + main
835 : +-- pidns
836 : +-- agave
837 : +-- tile 0
838 : +-- tile 1
839 : ...
840 :
841 : What we want is that if any process in the tree dies, all other
842 : processes will also die. This is done as follows,
843 :
844 : (a) pidns is the init process of a PID namespace, so if it dies the
845 : kernel will terminate the child processes.
846 :
847 : (b) main is the parent of pidns, so it can issue a waitpid() on the
848 : child PID, and when it completes terminate itself.
849 :
850 : (c) pidns is the parent of agave and the tiles, so it could
851 : issue a waitpid() of -1 to wait for any of them to terminate,
852 : but how would it know if main has died?
853 :
     (d) main creates a pipe, keeps the read end, and passes the write
         end to pidns. If main dies, its read end will be closed, and
         pidns will see an error condition on the write end. Then
         pidns creates a pipe per child and passes the write end to
         the child. If any of the children die, the pipe will be
         closed, and pidns will get a HUP on the read end.

         Then pidns can call poll() on both the write end of the main
         pipe and the read end of all the child pipes. If poll()
         reports a hangup or error on any of them, then pidns knows
         that the parent or a child has died, and it can terminate
         itself, which due to (a) and (b) will kill all other
         processes. */
865 : void
866 : run_firedancer( config_t * config,
867 : int parent_pipefd,
868 0 : int init_workspaces ) {
869 : /* dump the topology we are using to the output log */
870 0 : fd_topo_print_log( 0, &config->topo );
871 :
872 0 : run_firedancer_init( config, init_workspaces, 1 );
873 :
874 0 : #if defined(__x86_64__) || defined(__aarch64__)
875 :
876 : #ifndef SYS_landlock_create_ruleset
877 : #define SYS_landlock_create_ruleset 444
878 : #endif
879 :
880 0 : #ifndef LANDLOCK_CREATE_RULESET_VERSION
881 0 : #define LANDLOCK_CREATE_RULESET_VERSION (1U << 0)
882 0 : #endif
883 :
884 0 : #endif
885 0 : long abi = syscall( SYS_landlock_create_ruleset, NULL, 0, LANDLOCK_CREATE_RULESET_VERSION );
886 0 : if( -1L==abi && (errno==ENOSYS || errno==EOPNOTSUPP ) ) {
887 0 : FD_LOG_WARNING(( "The Landlock access control system is not supported by your Linux kernel. Firedancer uses landlock to "
888 0 : "provide an additional layer of security to the sandbox, but it is not required." ));
889 0 : }
890 :
891 0 : if( FD_UNLIKELY( close( 0 ) ) ) FD_LOG_ERR(( "close(0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
892 0 : if( FD_UNLIKELY( fd_log_private_logfile_fd()!=1 && close( 1 ) ) ) FD_LOG_ERR(( "close(1) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
893 :
894 0 : int pipefd;
895 0 : pid_namespace = clone_firedancer( config, parent_pipefd, &pipefd );
896 :
897 : /* Print the location of the logfile on SIGINT or SIGTERM, and also
898 : kill the child. They are connected by a pipe which the child is
899 : polling so we don't strictly need to kill the child, but its helpful
900 : to do that before printing the log location line, else it might
901 : get interleaved due to timing windows in the shutdown. */
902 0 : install_parent_signals();
903 :
904 0 : struct sock_filter seccomp_filter[ 128UL ];
905 0 : populate_sock_filter_policy_main( 128UL, seccomp_filter, (uint)fd_log_private_logfile_fd(), (uint)pid_namespace );
906 :
907 0 : int allow_fds[ 4 ];
908 0 : ulong allow_fds_cnt = 0;
909 0 : allow_fds[ allow_fds_cnt++ ] = 2; /* stderr */
910 0 : if( FD_LIKELY( fd_log_private_logfile_fd()!=-1 ) )
911 0 : allow_fds[ allow_fds_cnt++ ] = fd_log_private_logfile_fd(); /* logfile, or maybe stdout */
912 0 : allow_fds[ allow_fds_cnt++ ] = pipefd; /* read end of main pipe */
913 0 : if( FD_UNLIKELY( parent_pipefd!=-1 ) )
914 0 : allow_fds[ allow_fds_cnt++ ] = parent_pipefd; /* write end of parent pipe */
915 :
916 0 : if( FD_LIKELY( config->development.sandbox ) ) {
917 0 : fd_sandbox_enter( config->uid,
918 0 : config->gid,
919 0 : 0,
920 0 : 0,
921 0 : 0,
922 0 : 1, /* Keep controlling terminal for main so it can receive Ctrl+C */
923 0 : 0,
924 0 : 0UL,
925 0 : 0UL,
926 0 : 0UL,
927 0 : 0UL,
928 0 : allow_fds_cnt,
929 0 : allow_fds,
930 0 : sock_filter_policy_main_instr_cnt,
931 0 : seccomp_filter );
932 0 : } else {
933 0 : fd_sandbox_switch_uid_gid( config->uid, config->gid );
934 0 : }
935 :
936 : /* the only clean way to exit is SIGINT or SIGTERM on this parent process,
937 : so if wait4() completes, it must be an error */
938 0 : int wstatus;
939 0 : if( FD_UNLIKELY( -1==wait4( pid_namespace, &wstatus, (int)__WALL, NULL ) ) )
940 0 : FD_LOG_ERR(( "main wait4() failed (%i-%s)\nLog at \"%s\"", errno, fd_io_strerror( errno ), fd_log_private_path ));
941 :
942 0 : if( FD_UNLIKELY( WIFSIGNALED( wstatus ) ) ) fd_sys_util_exit_group( WTERMSIG( wstatus ) ? WTERMSIG( wstatus ) : 1 );
943 0 : else fd_sys_util_exit_group( WEXITSTATUS( wstatus ) ? WEXITSTATUS( wstatus ) : 1 );
944 0 : }
945 :
946 : void
947 : run_cmd_fn( args_t * args FD_PARAM_UNUSED,
948 0 : config_t * config ) {
949 0 : #define CHECK_PORT_NON_ZERO( field ) \
950 0 : if( FD_UNLIKELY( config->field==0 ) ) { \
951 0 : FD_LOG_ERR(( #field " is not set in your configuration file. Please set it to a non-zero value." )); \
952 0 : }
953 :
954 0 : if( FD_UNLIKELY( !config->gossip.entrypoints_cnt && !config->development.bootstrap ) )
955 0 : FD_LOG_ERR(( "No entrypoints specified in configuration file under [gossip.entrypoints], but "
956 0 : "at least one is needed to determine how to connect to the Solana cluster. If "
957 0 : "you want to start a new cluster in a development environment, use `fddev` instead "
958 0 : "of `fdctl`. If you want to use an existing genesis, set [development.bootstrap] "
959 0 : "to \"true\" in the configuration file." ));
960 :
961 0 : for( ulong i=0; i<config->gossip.entrypoints_cnt; i++ ) {
962 0 : if( FD_UNLIKELY( !strcmp( config->gossip.entrypoints[ i ], "" ) ) )
963 0 : FD_LOG_ERR(( "One of the entrypoints in your configuration file under [gossip.entrypoints] is "
964 0 : "empty. Please remove the empty entrypoint or set it correctly. "));
965 0 : }
966 :
967 0 : CHECK_PORT_NON_ZERO( gossip.port );
968 0 : CHECK_PORT_NON_ZERO( tiles.quic.quic_transaction_listen_port );
969 0 : CHECK_PORT_NON_ZERO( tiles.quic.regular_transaction_listen_port );
970 0 : CHECK_PORT_NON_ZERO( tiles.shred.shred_listen_port );
971 0 : CHECK_PORT_NON_ZERO( tiles.metric.prometheus_listen_port );
972 0 : CHECK_PORT_NON_ZERO( tiles.gui.gui_listen_port );
973 :
974 0 : #undef CHECK_PORT_NON_ZERO
975 :
976 0 : run_firedancer( config, -1, 1 );
977 0 : }
978 :
979 : static void
980 0 : run1_args_help( fd_action_help_t * help ) {
981 0 : fd_action_help_arg( help, "<tile-name>", NULL, "Type of tile to run (e.g. `net`, `quic`, `replay`). A tile is a single\n"
982 0 : "thread pinned to a CPU core that performs one part of the validator's work" );
983 : fd_action_help_arg( help, "<kind-id>", NULL, "Zero-based index selecting which instance of that tile type to run\n"
984 0 : "when the topology has more than one" );
985 0 : fd_action_help_arg( help, "--pipe-fd", "<fd>", "Internal use: file descriptor over which the parent supervisor process\n"
986 0 : "communicates with this tile (default -1, standalone)" );
987 0 : }
988 :
989 : action_t fd_action_run1 = {
990 : .name = "run1",
991 : .args = run1_cmd_args,
992 : .fn = run1_cmd_fn,
993 : .perm = NULL,
994 : .description = "Start up a single Firedancer tile",
995 : .detail = "Runs one tile of the validator topology in the current process. A tile is a\n"
996 : "single thread pinned to a CPU core that performs one part of the validator's\n"
997 : "work. This is primarily an internal command used by `run` to spawn individual\n"
998 : "tiles; most operators should use `run` instead.",
999 : .usage = "run1 <tile-name> <kind-id> [OPTIONS]",
1000 : .args_help = run1_args_help,
1001 : };
1002 :
1003 : action_t fd_action_run = {
1004 : .name = "run",
1005 : .args = NULL,
1006 : .fn = run_cmd_fn,
1007 : .require_config = 1,
1008 : .perm = run_cmd_perm,
1009 : .description = "Start up a Firedancer validator",
1010 : .detail = "Boots and runs the full validator described by the configuration file. This\n"
1011 : "is the main command operators use to run Firedancer. It must be started with\n"
1012 : "sufficient privileges to perform boot-time setup, after which it drops\n"
1013 : "privileges to the configured user.",
1014 : .usage = "run [OPTIONS]",
1015 : .permission_err = "insufficient permissions to execute command `%s`. It is recommended "
1016 : "to start Firedancer as the root user, but you can also start it "
1017 : "with the missing capabilities listed above. The program only needs "
1018 : "to start with elevated permissions to do privileged operations at "
1019 : "boot, and will immediately drop permissions and switch to the user "
1020 : "specified in your configuration file once they are complete. Firedancer "
1021 : "will not execute outside of the boot process as root, and will refuse "
1022 : "to start if it cannot drop privileges. Firedancer needs to be started "
1023 : "privileged to configure high performance networking with XDP.",
1024 : };
|