LCOV - code coverage report
Current view: top level - app/shared/commands/run - run.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 0 618 0.0 %
Date: 2025-10-16 04:31:23 Functions: 0 17 0.0 %

          Line data    Source code
       1             : #define _GNU_SOURCE
       2             : #include "run.h"
       3             : 
       4             : #include <sys/wait.h>
       5             : #include "generated/main_seccomp.h"
       6             : #if defined(__aarch64__)
       7             : #include "generated/pidns_arm64_seccomp.h"
       8             : #else
       9             : #include "generated/pidns_seccomp.h"
      10             : #endif
      11             : 
      12             : #include "../../../platform/fd_sys_util.h"
      13             : #include "../../../platform/fd_file_util.h"
      14             : #include "../../../platform/fd_net_util.h"
      15             : 
      16             : #include "../configure/configure.h"
      17             : 
      18             : #include <dirent.h>
      19             : #include <sched.h>
      20             : #include <stdio.h>
      21             : #include <stdlib.h> /* getenv */
      22             : #include <poll.h>
      23             : #include <unistd.h>
      24             : #include <errno.h>
      25             : #include <fcntl.h>
      26             : #include <sys/prctl.h>
      27             : #include <sys/resource.h>
      28             : #include <sys/mman.h>
      29             : #include <sys/stat.h>
      30             : #include <linux/capability.h>
      31             : #include <linux/unistd.h>
      32             : 
      33             : #include "../../../../util/tile/fd_tile_private.h"
      34             : 
      35             : extern fd_topo_obj_callbacks_t * CALLBACKS[];
      36             : 
      37           0 : #define NAME "run"
      38             : 
      39             : void
      40             : run_cmd_perm( args_t *         args,
      41             :               fd_cap_chk_t *   chk,
      42           0 :               config_t const * config ) {
      43           0 :   (void)args;
      44             : 
      45           0 :   ulong mlock_limit = fd_topo_mlock_max_tile( &config->topo );
      46             : 
      47           0 :   fd_cap_chk_raise_rlimit( chk, NAME, RLIMIT_MEMLOCK, mlock_limit, "call `rlimit(2)` to increase `RLIMIT_MEMLOCK` so all memory can be locked with `mlock(2)`" );
      48           0 :   fd_cap_chk_raise_rlimit( chk, NAME, RLIMIT_NICE,    40,          "call `setpriority(2)` to increase thread priorities" );
      49           0 :   fd_cap_chk_raise_rlimit( chk, NAME, RLIMIT_NOFILE,  CONFIGURE_NR_OPEN_FILES,
      50           0 :                                                                        "call `rlimit(2)  to increase `RLIMIT_NOFILE` to allow more open files for Agave" );
      51           0 :   fd_cap_chk_cap(          chk, NAME, CAP_NET_RAW,                 "call `socket(2)` to bind to a raw socket for use by XDP" );
      52           0 :   fd_cap_chk_cap(          chk, NAME, CAP_SYS_ADMIN,               "call `bpf(2)` with the `BPF_OBJ_GET` command to initialize XDP" );
      53           0 :   if( fd_sandbox_requires_cap_sys_admin( config->uid, config->gid ) )
      54           0 :     fd_cap_chk_cap(        chk, NAME, CAP_SYS_ADMIN,               "call `unshare(2)` with `CLONE_NEWUSER` to sandbox the process in a user namespace" );
      55           0 :   if( FD_LIKELY( getuid() != config->uid ) )
      56           0 :     fd_cap_chk_cap(        chk, NAME, CAP_SETUID,                  "call `setresuid(2)` to switch uid to the sandbox user" );
      57           0 :   if( FD_LIKELY( getgid()!=config->gid ) )
      58           0 :     fd_cap_chk_cap(        chk, NAME, CAP_SETGID,                  "call `setresgid(2)` to switch gid to the sandbox user" );
      59           0 :   if( FD_UNLIKELY( config->development.netns.enabled ) )
      60           0 :     fd_cap_chk_cap(        chk, NAME, CAP_SYS_ADMIN,               "call `setns(2)` to enter a network namespace" );
      61           0 :   if( FD_UNLIKELY( config->tiles.metric.prometheus_listen_port<1024 ) )
      62           0 :     fd_cap_chk_cap(        chk, NAME, CAP_NET_BIND_SERVICE,        "call `bind(2)` to bind to a privileged port for serving metrics" );
      63           0 :   if( FD_UNLIKELY( config->tiles.gui.gui_listen_port<1024 ) )
      64           0 :     fd_cap_chk_cap(        chk, NAME, CAP_NET_BIND_SERVICE,        "call `bind(2)` to bind to a privileged port for serving the GUI" );
      65           0 : }
      66             : 
      67             : struct pidns_clone_args {
      68             :   config_t const * config;
      69             :   int *            pipefd;
      70             :   int              closefd;
      71             : };
      72             : 
      73             : extern char fd_log_private_path[ 1024 ]; /* empty string on start */
      74             : 
      75             : static pid_t pid_namespace;
      76             : 
      77           0 : #define FD_LOG_ERR_NOEXIT(a) do { long _fd_log_msg_now = fd_log_wallclock(); fd_log_private_1( 4, _fd_log_msg_now, __FILE__, __LINE__, __func__, fd_log_private_0 a ); } while(0)
      78             : 
      79             : extern int * fd_log_private_shared_lock;
      80             : 
      81             : static void
      82           0 : parent_signal( int sig ) {
      83           0 :   if( FD_LIKELY( pid_namespace ) ) kill( pid_namespace, SIGKILL );
      84             : 
      85             :   /* A pretty gross hack.  For the local process, clear the lock so that
      86             :      we can always print the messages without waiting on another process,
      87             :      particularly if one of those processes might have just died.  The
      88             :      signal handler is re-entrant so this also avoids a deadlock since
      89             :      the log lock is not re-entrant. */
      90           0 :   int lock = 0;
      91           0 :   fd_log_private_shared_lock = &lock;
      92             : 
      93           0 :   if( -1!=fd_log_private_logfile_fd() ) FD_LOG_ERR_NOEXIT(( "Received signal %s\nLog at \"%s\"", fd_io_strsignal( sig ), fd_log_private_path ));
      94           0 :   else                                  FD_LOG_ERR_NOEXIT(( "Received signal %s",                fd_io_strsignal( sig ) ));
      95             : 
      96           0 :   if( FD_LIKELY( sig==SIGINT ) ) fd_sys_util_exit_group( 128+SIGINT );
      97           0 :   else                           fd_sys_util_exit_group( 0          );
      98           0 : }
      99             : 
     100             : static void
     101           0 : install_parent_signals( void ) {
     102           0 :   struct sigaction sa = {
     103           0 :     .sa_handler = parent_signal,
     104           0 :     .sa_flags   = 0,
     105           0 :   };
     106           0 :   if( FD_UNLIKELY( sigaction( SIGTERM, &sa, NULL ) ) )
     107           0 :     FD_LOG_ERR(( "sigaction(SIGTERM) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     108           0 :   if( FD_UNLIKELY( sigaction( SIGINT, &sa, NULL ) ) )
     109           0 :     FD_LOG_ERR(( "sigaction(SIGINT) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     110             : 
     111           0 :   sa.sa_handler = SIG_IGN;
     112           0 :   if( FD_UNLIKELY( sigaction( SIGUSR1, &sa, NULL ) ) )
     113           0 :     FD_LOG_ERR(( "sigaction(SIGUSR1) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     114           0 :   if( FD_UNLIKELY( sigaction( SIGUSR2, &sa, NULL ) ) )
     115           0 :     FD_LOG_ERR(( "sigaction(SIGUSR2) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     116           0 : }
     117             : 
     118             : void *
     119           0 : create_clone_stack( void ) {
     120           0 :   ulong mmap_sz = FD_TILE_PRIVATE_STACK_SZ + 2UL*FD_SHMEM_NORMAL_PAGE_SZ;
     121           0 :   uchar * stack = (uchar *)mmap( NULL, mmap_sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, (off_t)0 );
     122           0 :   if( FD_UNLIKELY( stack==MAP_FAILED ) )
     123           0 :     FD_LOG_ERR(( "mmap() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     124             : 
     125             :   /* Make space for guard lo and guard hi */
     126           0 :   if( FD_UNLIKELY( munmap( stack, FD_SHMEM_NORMAL_PAGE_SZ ) ) )
     127           0 :     FD_LOG_ERR(( "munmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     128           0 :   stack += FD_SHMEM_NORMAL_PAGE_SZ;
     129           0 :   if( FD_UNLIKELY( munmap( stack + FD_TILE_PRIVATE_STACK_SZ, FD_SHMEM_NORMAL_PAGE_SZ ) ) )
     130           0 :     FD_LOG_ERR(( "munmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     131             : 
     132             :   /* Create the guard regions in the extra space */
     133           0 :   void * guard_lo = (void *)(stack - FD_SHMEM_NORMAL_PAGE_SZ );
     134           0 :   if( FD_UNLIKELY( mmap( guard_lo, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
     135           0 :                          MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_lo ) )
     136           0 :     FD_LOG_ERR(( "mmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     137             : 
     138           0 :   void * guard_hi = (void *)(stack + FD_TILE_PRIVATE_STACK_SZ);
     139           0 :   if( FD_UNLIKELY( mmap( guard_hi, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
     140           0 :                          MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_hi ) )
     141           0 :     FD_LOG_ERR(( "mmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     142             : 
     143           0 :   return stack;
     144           0 : }
     145             : 
     146             : 
     147             : static int
     148             : execve_agave( int config_memfd,
     149           0 :                     int pipefd ) {
     150           0 :   if( FD_UNLIKELY( -1==fcntl( pipefd, F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     151           0 :   pid_t child = fork();
     152           0 :   if( FD_UNLIKELY( -1==child ) ) FD_LOG_ERR(( "fork() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     153           0 :   if( FD_LIKELY( !child ) ) {
     154           0 :     char _current_executable_path[ PATH_MAX ];
     155           0 :     FD_TEST( -1!=fd_file_util_self_exe( _current_executable_path ) );
     156             : 
     157           0 :     char config_fd[ 32 ];
     158           0 :     FD_TEST( fd_cstr_printf_check( config_fd, sizeof( config_fd ), NULL, "%d", config_memfd ) );
     159           0 :     char * args[ 5 ] = { _current_executable_path, "run-agave", "--config-fd", config_fd, NULL };
     160             : 
     161           0 :     char * envp[] = { NULL, NULL };
     162           0 :     char * google_creds = getenv( "GOOGLE_APPLICATION_CREDENTIALS" );
     163           0 :     char provide_creds[ PATH_MAX+30UL ];
     164           0 :     if( FD_UNLIKELY( google_creds ) ) {
     165           0 :       FD_TEST( fd_cstr_printf_check( provide_creds, sizeof( provide_creds ), NULL, "GOOGLE_APPLICATION_CREDENTIALS=%s", google_creds ) );
     166           0 :       envp[ 0 ] = provide_creds;
     167           0 :     }
     168             : 
     169           0 :     if( FD_UNLIKELY( -1==execve( _current_executable_path, args, envp ) ) ) FD_LOG_ERR(( "execve() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     170           0 :   } else {
     171           0 :     if( FD_UNLIKELY( -1==fcntl( pipefd, F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     172           0 :     return child;
     173           0 :   }
     174           0 :   return 0;
     175           0 : }
     176             : 
     177             : static pid_t
     178             : execve_tile( fd_topo_tile_t const * tile,
     179             :              fd_cpuset_t const *    floating_cpu_set,
     180             :              int                    floating_priority,
     181             :              int                    config_memfd,
     182           0 :              int                    pipefd ) {
     183           0 :   FD_CPUSET_DECL( cpu_set );
     184           0 :   if( FD_LIKELY( tile->cpu_idx!=ULONG_MAX ) ) {
     185             :     /* set the thread affinity before we clone the new process to ensure
     186             :         kernel first touch happens on the desired thread. */
     187           0 :     fd_cpuset_insert( cpu_set, tile->cpu_idx );
     188           0 :     if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, -19 ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     189           0 :   } else {
     190           0 :     fd_memcpy( cpu_set, floating_cpu_set, fd_cpuset_footprint() );
     191           0 :     if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, floating_priority ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     192           0 :   }
     193             : 
     194           0 :   if( FD_UNLIKELY( fd_cpuset_setaffinity( 0, cpu_set ) ) ) {
     195           0 :     if( FD_LIKELY( errno==EINVAL ) ) {
     196           0 :       FD_LOG_ERR(( "Unable to set the thread affinity for tile %s:%lu on cpu %lu. It is likely that the affinity "
     197           0 :                    "you have specified for this tile in [layout.affinity] of your configuration file contains a "
     198           0 :                    "CPU (%lu) which does not exist on this machine.",
     199           0 :                    tile->name, tile->kind_id, tile->cpu_idx, tile->cpu_idx ));
     200           0 :     } else {
     201           0 :       FD_LOG_ERR(( "sched_setaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     202           0 :     }
     203           0 :   }
     204             : 
     205             :   /* Clear CLOEXEC on the side of the pipe we want to pass to the tile. */
     206           0 :   if( FD_UNLIKELY( -1==fcntl( pipefd, F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     207           0 :   pid_t child = fork();
     208           0 :   if( FD_UNLIKELY( -1==child ) ) FD_LOG_ERR(( "fork() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     209           0 :   if( FD_LIKELY( !child ) ) {
     210           0 :     char _current_executable_path[ PATH_MAX ];
     211           0 :     FD_TEST( -1!=fd_file_util_self_exe( _current_executable_path ) );
     212             : 
     213           0 :     char kind_id[ 32 ], config_fd[ 32 ], pipe_fd[ 32 ];
     214           0 :     FD_TEST( fd_cstr_printf_check( kind_id,   sizeof( kind_id ),   NULL, "%lu", tile->kind_id ) );
     215           0 :     FD_TEST( fd_cstr_printf_check( config_fd, sizeof( config_fd ), NULL, "%d",  config_memfd ) );
     216           0 :     FD_TEST( fd_cstr_printf_check( pipe_fd,   sizeof( pipe_fd ),   NULL, "%d",  pipefd ) );
     217           0 :     char const * args[ 9 ] = { _current_executable_path, "run1", tile->name, kind_id, "--pipe-fd", pipe_fd, "--config-fd", config_fd, NULL };
     218           0 :     if( FD_UNLIKELY( -1==execve( _current_executable_path, (char **)args, NULL ) ) ) FD_LOG_ERR(( "execve() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     219           0 :   } else {
     220           0 :     if( FD_UNLIKELY( -1==fcntl( pipefd, F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     221           0 :     return child;
     222           0 :   }
     223           0 :   return 0;
     224           0 : }
     225             : 
     226             : extern int * fd_log_private_shared_lock;
     227             : 
     228             : int
     229           0 : main_pid_namespace( void * _args ) {
     230           0 :   struct pidns_clone_args * args = _args;
     231           0 :   if( FD_UNLIKELY( close( args->pipefd[ 0 ] ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     232           0 :   if( FD_UNLIKELY( -1!=args->closefd ) ) {
     233           0 :     if( FD_UNLIKELY( close( args->closefd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     234           0 :   }
     235             : 
     236           0 :   config_t const * config = args->config;
     237             : 
     238           0 :   fd_log_thread_set( "pidns" );
     239           0 :   ulong pid = fd_sandbox_getpid(); /* Need to read /proc again.. we got a new PID from clone */
     240           0 :   fd_log_private_group_id_set( pid );
     241           0 :   fd_log_private_thread_id_set( pid );
     242           0 :   fd_log_private_stack_discover( FD_TILE_PRIVATE_STACK_SZ,
     243           0 :                                  &fd_tile_private_stack0, &fd_tile_private_stack1 );
     244             : 
     245           0 :   if( FD_UNLIKELY( !config->development.sandbox ) ) {
     246             :     /* If no sandbox, then there's no actual PID namespace so we can't
     247             :        wait() grandchildren for the exit code.  Do this as a workaround. */
     248           0 :     if( FD_UNLIKELY( -1==prctl( PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0 ) ) )
     249           0 :       FD_LOG_ERR(( "prctl(PR_SET_CHILD_SUBREAPER) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     250           0 :   }
     251             : 
     252             :   /* Save the current affinity, it will be restored after creating any child tiles */
     253           0 :   FD_CPUSET_DECL( floating_cpu_set );
     254           0 :   if( FD_UNLIKELY( fd_cpuset_getaffinity( 0, floating_cpu_set ) ) )
     255           0 :     FD_LOG_ERR(( "fd_cpuset_getaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     256             : 
     257           0 :   pid_t child_pids[ FD_TOPO_MAX_TILES+1 ];
     258           0 :   ulong actual_pids[ FD_TOPO_MAX_TILES+1 ];
     259           0 :   for( ulong i=0UL; i<FD_TOPO_MAX_TILES+1; i++ ) actual_pids[ i ] = ULONG_MAX;
     260           0 :   char  child_names[ FD_TOPO_MAX_TILES+1 ][ 32 ];
     261           0 :   ulong child_idxs[ FD_TOPO_MAX_TILES+1 ];
     262           0 :   struct pollfd fds[ FD_TOPO_MAX_TILES+2 ];
     263             : 
     264           0 :   int config_memfd = fd_config_to_memfd( config );
     265           0 :   if( FD_UNLIKELY( -1==config_memfd ) ) FD_LOG_ERR(( "fd_config_to_memfd() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     266             : 
     267           0 :   if( FD_UNLIKELY( config->development.debug_tile ) ) {
     268           0 :     fd_log_private_shared_lock[1] = 1;
     269           0 :   }
     270             : 
     271           0 :   ulong child_cnt = 0UL;
     272           0 :   if( FD_LIKELY( !config->is_firedancer && !config->development.no_agave ) ) {
     273           0 :     int pipefd[ 2 ];
     274           0 :     if( FD_UNLIKELY( pipe2( pipefd, O_CLOEXEC ) ) ) FD_LOG_ERR(( "pipe2() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     275           0 :     fds[ child_cnt ] = (struct pollfd){ .fd = pipefd[ 0 ], .events = 0 };
     276           0 :     child_pids[ child_cnt ] = execve_agave( config_memfd, pipefd[ 1 ] );
     277           0 :     FD_TEST( child_pids[ child_cnt ]>0 );
     278           0 :     actual_pids[ child_cnt ] = (ulong)child_pids[ child_cnt ];
     279           0 :     child_idxs[ child_cnt ] = ULONG_MAX;
     280           0 :     if( FD_UNLIKELY( close( pipefd[ 1 ] ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     281           0 :     strncpy( child_names[ child_cnt ], "agave", 32 );
     282           0 :     child_cnt++;
     283           0 :   }
     284             : 
     285           0 :   if( FD_UNLIKELY( config->development.netns.enabled ) ) {
     286           0 :     if( FD_UNLIKELY( -1==fd_net_util_netns_enter( config->net.interface, NULL ) ) )
     287           0 :       FD_LOG_ERR(( "failed to enter network namespace `%s` (%i-%s)", config->net.interface, errno, fd_io_strerror( errno ) ));
     288           0 :   }
     289             : 
     290           0 :   errno = 0;
     291           0 :   int save_priority = getpriority( PRIO_PROCESS, 0 );
     292           0 :   if( FD_UNLIKELY( -1==save_priority && errno ) ) FD_LOG_ERR(( "getpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     293             : 
     294           0 :   int need_xdp = 0==strcmp( config->net.provider, "xdp" );
     295           0 :   fd_xdp_fds_t xdp_fds = {0};
     296           0 :   if( need_xdp ) {
     297           0 :     xdp_fds = fd_topo_install_xdp( &config->topo, config->net.bind_address_parsed );
     298           0 :   }
     299             : 
     300           0 :   for( ulong i=0UL; i<config->topo.tile_cnt; i++ ) {
     301           0 :     fd_topo_tile_t const * tile = &config->topo.tiles[ i ];
     302           0 :     if( FD_UNLIKELY( tile->is_agave ) ) continue;
     303             : 
     304           0 :     if( need_xdp ) {
     305           0 :       if( FD_UNLIKELY( strcmp( tile->name, "net" ) ) ) {
     306             :         /* close XDP related file descriptors */
     307           0 :         if( FD_UNLIKELY( -1==fcntl( xdp_fds.xsk_map_fd,   F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     308           0 :         if( FD_UNLIKELY( -1==fcntl( xdp_fds.prog_link_fd, F_SETFD, FD_CLOEXEC ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,FD_CLOEXEC) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     309           0 :       } else {
     310           0 :         if( FD_UNLIKELY( -1==fcntl( xdp_fds.xsk_map_fd,   F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     311           0 :         if( FD_UNLIKELY( -1==fcntl( xdp_fds.prog_link_fd, F_SETFD, 0 ) ) ) FD_LOG_ERR(( "fcntl(F_SETFD,0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     312           0 :       }
     313           0 :     }
     314             : 
     315           0 :     int pipefd[ 2 ];
     316           0 :     if( FD_UNLIKELY( pipe2( pipefd, O_CLOEXEC ) ) ) FD_LOG_ERR(( "pipe2() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     317           0 :     fds[ child_cnt ] = (struct pollfd){ .fd = pipefd[ 0 ], .events = 0 };
     318           0 :     child_pids[ child_cnt ] = execve_tile( tile, floating_cpu_set, save_priority, config_memfd, pipefd[ 1 ] );
     319           0 :     child_idxs[ child_cnt ] = i;
     320           0 :     if( FD_UNLIKELY( close( pipefd[ 1 ] ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     321           0 :     strncpy( child_names[ child_cnt ], tile->name, 32 );
     322           0 :     child_cnt++;
     323           0 :   }
     324             : 
     325             :   /* Obtain the actual grandchild PID from the pipe */
     326           0 :   for( ulong i=0UL; i<child_cnt; i++ ) {
     327           0 :     if( FD_UNLIKELY( actual_pids[ i ]!=ULONG_MAX ) ) continue;
     328           0 :     FD_TEST( 8UL==read( fds[ i ].fd, &actual_pids[ i ], 8UL ) );
     329           0 :   }
     330             : 
     331           0 :   if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, save_priority ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     332           0 :   if( FD_UNLIKELY( fd_cpuset_setaffinity( 0, floating_cpu_set ) ) )
     333           0 :     FD_LOG_ERR(( "fd_cpuset_setaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     334             : 
     335           0 :   if( FD_UNLIKELY( close( config_memfd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     336           0 :   if( FD_UNLIKELY( close( config->log.lock_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     337           0 :   if( need_xdp ) {
     338           0 :     if( FD_UNLIKELY( close( xdp_fds.xsk_map_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     339           0 :     if( FD_UNLIKELY( close( xdp_fds.prog_link_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     340           0 :   }
     341             : 
     342           0 :   int allow_fds[ 4+FD_TOPO_MAX_TILES ];
     343           0 :   ulong allow_fds_cnt = 0;
     344           0 :   allow_fds[ allow_fds_cnt++ ] = 2; /* stderr */
     345           0 :   if( FD_LIKELY( fd_log_private_logfile_fd()!=-1 ) )
     346           0 :     allow_fds[ allow_fds_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */
     347           0 :   allow_fds[ allow_fds_cnt++ ] = args->pipefd[ 1 ]; /* write end of main pipe */
     348           0 :   for( ulong i=0UL; i<child_cnt; i++ )
     349           0 :     allow_fds[ allow_fds_cnt++ ] = fds[ i ].fd; /* read end of child pipes */
     350             : 
     351           0 :   struct sock_filter seccomp_filter[ 128UL ];
     352           0 :   unsigned int instr_cnt;
     353             :   #if defined(__aarch64__)
     354             :   populate_sock_filter_policy_pidns_arm64( 128UL, seccomp_filter, (uint)fd_log_private_logfile_fd() );
     355             :   instr_cnt = sock_filter_policy_pidns_arm64_instr_cnt;
     356             :   #else
     357           0 :   populate_sock_filter_policy_pidns( 128UL, seccomp_filter, (uint)fd_log_private_logfile_fd() );
     358           0 :   instr_cnt = sock_filter_policy_pidns_instr_cnt;
     359           0 :   #endif
     360             : 
     361           0 :   if( FD_LIKELY( config->development.sandbox ) ) {
     362           0 :     fd_sandbox_enter( config->uid,
     363           0 :                       config->gid,
     364           0 :                       0,
     365           0 :                       0,
     366           0 :                       0,
     367           0 :                       0,
     368           0 :                       0,
     369           0 :                       1UL+child_cnt, /* RLIMIT_NOFILE needs to be set to the nfds argument of poll() */
     370           0 :                       0UL,
     371           0 :                       0UL,
     372           0 :                       allow_fds_cnt,
     373           0 :                       allow_fds,
     374           0 :                       instr_cnt,
     375           0 :                       seccomp_filter );
     376           0 :   } else {
     377           0 :     fd_sandbox_switch_uid_gid( config->uid, config->gid );
     378           0 :   }
     379             : 
     380             :   /* The supervsior process should not share the log lock, because a
     381             :      child process might die while holding it and we still need to
     382             :      reap and print errors. */
     383           0 :   int lock = 0;
     384           0 :   fd_log_private_shared_lock = &lock;
     385             : 
     386             :   /* Reap child process PIDs so they don't show up in `ps` etc.  All of
     387             :      these children should have exited immediately after clone(2)'ing
     388             :      another child with a huge page based stack. */
     389           0 :   for( ulong i=0UL; i<child_cnt; i++ ) {
     390           0 :     int wstatus;
     391           0 :     int exited_pid = wait4( child_pids[ i ], &wstatus, (int)__WALL, NULL );
     392           0 :     if( FD_UNLIKELY( -1==exited_pid ) ) {
     393           0 :       FD_LOG_ERR(( "pidns wait4() failed (%i-%s) %lu %hu", errno, fd_io_strerror( errno ), i, fds[i].revents ));
     394           0 :     } else if( FD_UNLIKELY( child_pids[ i ]!=exited_pid ) ) {
     395           0 :       FD_LOG_ERR(( "pidns wait4() returned unexpected pid %d %d", child_pids[ i ], exited_pid ));
     396           0 :     } else if( FD_UNLIKELY( !WIFEXITED( wstatus ) ) ) {
     397           0 :       FD_LOG_ERR_NOEXIT(( "tile %lu (%s) exited while booting with signal %d (%s)\n", i, child_names[ i ], WTERMSIG( wstatus ), fd_io_strsignal( WTERMSIG( wstatus ) ) ));
     398           0 :       fd_sys_util_exit_group( WTERMSIG( wstatus ) ? WTERMSIG( wstatus ) : 1 );
     399           0 :     }
     400           0 :     if( FD_UNLIKELY( WEXITSTATUS( wstatus ) ) ) {
     401           0 :       FD_LOG_ERR_NOEXIT(( "tile %lu (%s) exited while booting with code %d\n", i, child_names[ i ], WEXITSTATUS( wstatus ) ));
     402           0 :       fd_sys_util_exit_group( WEXITSTATUS( wstatus ) ? WEXITSTATUS( wstatus ) : 1 );
     403           0 :     }
     404           0 :   }
     405             : 
     406           0 :   fds[ child_cnt ] = (struct pollfd){ .fd = args->pipefd[ 1 ], .events = 0 };
     407           0 :   strncpy( child_names[ child_cnt ], "parent", 32UL );
     408           0 :   child_idxs[ child_cnt ] = ULONG_MAX;
     409             : 
     410             :   /* We are now the init process of the pid namespace.  If the init
     411             :      process dies, all children are terminated.  If any child dies, we
     412             :      terminate the init process, which will cause the kernel to
     413             :      terminate all other children bringing all of our processes down as
     414             :      a group.  The parent process will also die if this process dies,
     415             :      due to getting SIGHUP on the pipe. */
     416           0 :   while( 1 ) {
     417           0 :     if( FD_UNLIKELY( -1==poll( fds, 1UL+child_cnt, -1 ) ) ) FD_LOG_ERR(( "poll() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     418             : 
     419             :     /* Parent process died, probably SIGINT, exit gracefully. */
     420           0 :     if( FD_UNLIKELY( fds[ child_cnt ].revents ) ) fd_sys_util_exit_group( 0 );
     421             : 
     422             :     /* Child process died, reap it to figure out exit code. */
     423           0 :     int wstatus;
     424           0 :     int exited_pid = wait4( -1, &wstatus, (int)__WALL | (int)WNOHANG, NULL );
     425           0 :     if( FD_UNLIKELY( -1==exited_pid ) ) {
     426           0 :       FD_LOG_ERR(( "pidns wait4() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     427           0 :     } else if( FD_UNLIKELY( !exited_pid ) ) {
     428             :       /* Spurious wakeup, no child actually dead yet. */
     429           0 :       continue;
     430           0 :     }
     431             : 
     432             :     /* Now find the tile corresponding to that PID */
     433           0 :     FD_TEST( exited_pid>0 );
     434           0 :     int found = 0;
     435           0 :     for( ulong i=0UL; i<child_cnt; i++ ) {
     436           0 :       if( FD_LIKELY( actual_pids[ i ]!=(ulong)exited_pid ) ) continue;
     437             : 
     438           0 :       found = 1;
     439           0 :       fds[ i ].fd = -1; /* Don't poll on this tile anymore */
     440             : 
     441           0 :       char * tile_name = child_names[ i ];
     442           0 :       ulong  tile_idx = child_idxs[ i ];
     443           0 :       ulong  tile_id = config->topo.tiles[ tile_idx ].kind_id;
     444             : 
     445           0 :       if( FD_UNLIKELY( !WIFEXITED( wstatus ) ) ) {
     446           0 :         FD_LOG_ERR_NOEXIT(( "tile %s:%lu exited with signal %d (%s)", tile_name, tile_id, WTERMSIG( wstatus ), fd_io_strsignal( WTERMSIG( wstatus ) ) ));
     447           0 :         fd_sys_util_exit_group( WTERMSIG( wstatus ) ? WTERMSIG( wstatus ) : 1 );
     448           0 :       } else {
     449           0 :         int exit_code = WEXITSTATUS( wstatus );
     450           0 :         if( FD_LIKELY( !exit_code && tile_idx!=ULONG_MAX && config->topo.tiles[ tile_idx ].allow_shutdown ) ) {
     451           0 :           found = 1;
     452           0 :           FD_LOG_INFO(( "tile %s:%lu exited gracefully with code %d", tile_name, tile_id, exit_code ));
     453           0 :         } else {
     454           0 :           FD_LOG_ERR_NOEXIT(( "tile %s:%lu exited with code %d", tile_name, tile_id, exit_code ));
     455           0 :           fd_sys_util_exit_group( exit_code ? exit_code : 1 );
     456           0 :         }
     457           0 :       }
     458           0 :     }
     459             : 
     460           0 :     if( FD_UNLIKELY( !found ) ) FD_LOG_ERR(( "wait4() returned unexpected pid %d", exited_pid ));
     461           0 :   }
     462             : 
     463           0 :   return 0;
     464           0 : }
     465             : 
     466             : int
     467             : clone_firedancer( config_t const * config,
     468             :                   int              close_fd,
     469           0 :                   int *            out_pipe ) {
     470             :   /* This pipe is here just so that the child process knows when the
     471             :      parent has died (it will get a HUP). */
     472           0 :   int pipefd[2];
     473           0 :   if( FD_UNLIKELY( pipe2( pipefd, O_CLOEXEC | O_NONBLOCK ) ) ) FD_LOG_ERR(( "pipe2() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     474             : 
     475             :   /* clone into a pid namespace */
     476           0 :   int flags = config->development.sandbox ? CLONE_NEWPID : 0;
     477           0 :   struct pidns_clone_args args = { .config = config, .closefd = close_fd, .pipefd = pipefd, };
     478             : 
     479           0 :   void * stack = create_clone_stack();
     480             : 
     481           0 :   int pid_namespace = clone( main_pid_namespace, (uchar *)stack + FD_TILE_PRIVATE_STACK_SZ, flags, &args );
     482           0 :   if( FD_UNLIKELY( pid_namespace<0 ) ) FD_LOG_ERR(( "clone() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     483             : 
     484           0 :   if( FD_UNLIKELY( close( pipefd[ 1 ] ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     485             : 
     486           0 :   *out_pipe = pipefd[ 0 ];
     487           0 :   return pid_namespace;
     488           0 : }
     489             : 
     490             : static void
     491             : workspace_path( config_t const *       config,
     492             :                 fd_topo_wksp_t const * wksp,
     493           0 :                 char                   out[ PATH_MAX ] ) {
     494           0 :   char const * mount_path;
     495           0 :   switch( wksp->page_sz ) {
     496           0 :     case FD_SHMEM_HUGE_PAGE_SZ:
     497           0 :       mount_path = config->hugetlbfs.huge_page_mount_path;
     498           0 :       break;
     499           0 :     case FD_SHMEM_GIGANTIC_PAGE_SZ:
     500           0 :       mount_path = config->hugetlbfs.gigantic_page_mount_path;
     501           0 :       break;
     502           0 :     case FD_SHMEM_NORMAL_PAGE_SZ:
     503           0 :       mount_path = config->hugetlbfs.normal_page_mount_path;
     504           0 :       break;
     505           0 :     default:
     506           0 :       FD_LOG_ERR(( "invalid page size %lu", wksp->page_sz ));
     507           0 :   }
     508             : 
     509           0 :   FD_TEST( fd_cstr_printf_check( out, PATH_MAX, NULL, "%s/%s_%s.wksp", mount_path, config->name, wksp->name ) );
     510           0 : }
     511             : 
     512             : static void
     513             : warn_unknown_files( config_t const * config,
     514           0 :                     ulong            mount_type ) {
     515           0 :   char const * mount_path;
     516           0 :   switch( mount_type ) {
     517           0 :     case 0UL:
     518           0 :       mount_path = config->hugetlbfs.huge_page_mount_path;
     519           0 :       break;
     520           0 :     case 1UL:
     521           0 :       mount_path = config->hugetlbfs.gigantic_page_mount_path;
     522           0 :       break;
     523           0 :     default:
     524           0 :       FD_LOG_ERR(( "invalid mount type %lu", mount_type ));
     525           0 :   }
     526             : 
     527             :   /* Check if there are any files in mount_path */
     528           0 :   DIR * dir = opendir( mount_path );
     529           0 :   if( FD_UNLIKELY( !dir ) ) {
     530           0 :     if( FD_UNLIKELY( errno!=ENOENT ) ) FD_LOG_ERR(( "error opening `%s` (%i-%s)", mount_path, errno, fd_io_strerror( errno ) ));
     531           0 :     return;
     532           0 :   }
     533             : 
     534           0 :   struct dirent * entry;
     535           0 :   while(( FD_LIKELY( entry = readdir( dir ) ) )) {
     536           0 :     if( FD_UNLIKELY( !strcmp( entry->d_name, ".") || !strcmp( entry->d_name, ".." ) ) ) continue;
     537             : 
     538           0 :     char entry_path[ PATH_MAX ];
     539           0 :     FD_TEST( fd_cstr_printf_check( entry_path, PATH_MAX, NULL, "%s/%s", mount_path, entry->d_name ));
     540             : 
     541           0 :     int known_file = 0;
     542           0 :     for( ulong i=0UL; i<config->topo.wksp_cnt; i++ ) {
     543           0 :       fd_topo_wksp_t const * wksp = &config->topo.workspaces[ i ];
     544           0 :       if( !wksp->is_locked ) continue;
     545             : 
     546           0 :       char expected_path[ PATH_MAX ];
     547           0 :       workspace_path( config, wksp, expected_path );
     548             : 
     549           0 :       if( !strcmp( entry_path, expected_path ) ) {
     550           0 :         known_file = 1;
     551           0 :         break;
     552           0 :       }
     553           0 :     }
     554             : 
     555           0 :     if( mount_type==0UL ) {
     556           0 :       for( ulong i=0UL; i<config->topo.tile_cnt; i++ ) {
     557           0 :         fd_topo_tile_t const * tile = &config->topo.tiles [ i ];
     558             : 
     559           0 :         char expected_path[ PATH_MAX ];
     560           0 :         FD_TEST( fd_cstr_printf_check( expected_path, PATH_MAX, NULL, "%s/%s_stack_%s%lu", config->hugetlbfs.huge_page_mount_path, config->name, tile->name, tile->kind_id ) );
     561             : 
     562           0 :         if( !strcmp( entry_path, expected_path ) ) {
     563           0 :           known_file = 1;
     564           0 :           break;
     565           0 :         }
     566           0 :       }
     567           0 :     }
     568             : 
     569           0 :     if( FD_UNLIKELY( !known_file ) ) FD_LOG_WARNING(( "unknown file `%s` found in `%s`", entry->d_name, mount_path ));
     570           0 :   }
     571             : 
     572           0 :   if( FD_UNLIKELY( errno && errno!=ENOENT ) ) FD_LOG_ERR(( "error reading dir `%s` (%i-%s)", mount_path, errno, fd_io_strerror( errno ) ));
     573           0 :   if( FD_UNLIKELY( closedir( dir ) ) ) FD_LOG_ERR(( "error closing `%s` (%i-%s)", mount_path, errno, fd_io_strerror( errno ) ));
     574           0 : }
     575             : 
     576             : void
     577           0 : initialize_workspaces( config_t * config ) {
     578             :   /* Switch to non-root uid/gid for workspace creation.  Permissions
     579             :      checks are still done as the current user. */
     580           0 :   uint gid = getgid();
     581           0 :   uint uid = getuid();
     582           0 :   if( FD_LIKELY( gid!=config->gid && -1==setegid( config->gid ) ) )
     583           0 :     FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     584           0 :   if( FD_LIKELY( uid!=config->uid && -1==seteuid( config->uid ) ) )
     585           0 :     FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     586             : 
     587           0 :   for( ulong i=0UL; i<config->topo.wksp_cnt; i++ ) {
     588           0 :     fd_topo_wksp_t * wksp = &config->topo.workspaces[ i ];
     589             : 
     590           0 :     char path[ PATH_MAX ];
     591           0 :     workspace_path( config, wksp, path );
     592             : 
     593           0 :     struct stat st;
     594           0 :     int result = stat( path, &st );
     595             : 
     596           0 :     int update_existing;
     597           0 :     if( FD_UNLIKELY( !result && config->is_live_cluster ) ) {
     598           0 :       if( FD_UNLIKELY( -1==unlink( path ) && errno!=ENOENT ) ) FD_LOG_ERR(( "unlink() failed when trying to create workspace `%s` (%i-%s)", path, errno, fd_io_strerror( errno ) ));
     599           0 :       update_existing = 0;
     600           0 :     } else if( FD_UNLIKELY( !result ) ) {
     601             :       /* Creating all of the workspaces is very expensive because the
     602             :          kernel has to zero out all of the pages.  There can be tens or
     603             :          hundreds of gigabytes of zeroing to do.
     604             : 
     605             :          What would be really nice is if the kernel let us create huge
     606             :          pages without zeroing them, but it's not possible.  The
     607             :          ftruncate and fallocate calls do not support this type of
     608             :          resize with the hugetlbfs filesystem.
     609             : 
     610             :          Instead.. to prevent repeatedly doing this zeroing every time
     611             :          we start the validator, we have a small hack here to re-use the
     612             :          workspace files if they exist. */
     613           0 :       update_existing = 1;
     614           0 :     } else if( FD_LIKELY( result && errno==ENOENT ) ) {
     615           0 :       update_existing = 0;
     616           0 :     } else {
     617           0 :       FD_LOG_ERR(( "stat failed when trying to create workspace `%s` (%i-%s)", path, errno, fd_io_strerror( errno ) ));
     618           0 :     }
     619             : 
     620           0 :     if( FD_UNLIKELY( -1==fd_topo_create_workspace( &config->topo, wksp, update_existing ) ) ) {
     621           0 :       FD_TEST( errno==ENOMEM );
     622             : 
     623           0 :       warn_unknown_files( config, wksp->page_sz!=FD_SHMEM_HUGE_PAGE_SZ );
     624             : 
     625           0 :       char path[ PATH_MAX ];
     626           0 :       workspace_path( config, wksp, path );
     627           0 :       FD_LOG_ERR(( "ENOMEM-Out of memory when trying to create workspace `%s` at `%s` "
     628           0 :                    "with %lu %s pages. Firedancer reserves enough memory for all of its workspaces "
     629           0 :                    "during the `hugetlbfs` configure step, so it is likely you have unknown files "
     630           0 :                    "left over in this directory which are consuming memory, or another program on "
     631           0 :                    "the system is using pages from the same mount.",
     632           0 :                    wksp->name, path, wksp->page_cnt, fd_shmem_page_sz_to_cstr( wksp->page_sz ) ));
     633           0 :     }
     634           0 :     fd_topo_join_workspace( &config->topo, wksp, FD_SHMEM_JOIN_MODE_READ_WRITE );
     635           0 :     fd_topo_wksp_new( &config->topo, wksp, CALLBACKS );
     636           0 :     fd_topo_leave_workspace( &config->topo, wksp );
     637           0 :   }
     638             : 
     639           0 :   if( FD_UNLIKELY( seteuid( uid ) ) ) FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     640           0 :   if( FD_UNLIKELY( setegid( gid ) ) ) FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     641           0 : }
     642             : 
     643             : void
     644           0 : initialize_stacks( config_t const * config ) {
     645             : # if FD_HAS_MSAN
     646             :   /* MSan calls an external symbolizer using fork() on crashes, which is
     647             :      incompatible with Firedancer's MAP_SHARED stacks. */
     648             :   (void)config;
     649             :   return;
     650             : # endif
     651             : 
     652             :   /* Switch to non-root uid/gid for workspace creation.  Permissions
     653             :      checks are still done as the current user. */
     654           0 :   uint gid = getgid();
     655           0 :   uint uid = getuid();
     656           0 :   if( FD_LIKELY( gid!=config->gid && -1==setegid( config->gid ) ) )
     657           0 :     FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     658           0 :   if( FD_LIKELY( uid!=config->uid && -1==seteuid( config->uid ) ) )
     659           0 :     FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     660             : 
     661           0 :   for( ulong i=0UL; i<config->topo.tile_cnt; i++ ) {
     662           0 :     fd_topo_tile_t const * tile = &config->topo.tiles[ i ];
     663             : 
     664           0 :     char path[ PATH_MAX ];
     665           0 :     FD_TEST( fd_cstr_printf_check( path, PATH_MAX, NULL, "%s/%s_stack_%s%lu", config->hugetlbfs.huge_page_mount_path, config->name, tile->name, tile->kind_id ) );
     666             : 
     667           0 :     struct stat st;
     668           0 :     int result = stat( path, &st );
     669             : 
     670           0 :     int update_existing;
     671           0 :     if( FD_UNLIKELY( !result && config->is_live_cluster ) ) {
     672           0 :       if( FD_UNLIKELY( -1==unlink( path ) && errno!=ENOENT ) ) FD_LOG_ERR(( "unlink() failed when trying to create stack workspace `%s` (%i-%s)", path, errno, fd_io_strerror( errno ) ));
     673           0 :       update_existing = 0;
     674           0 :     } else if( FD_UNLIKELY( !result ) ) {
     675             :       /* See above note about zeroing out pages. */
     676           0 :       update_existing = 1;
     677           0 :     } else if( FD_LIKELY( result && errno==ENOENT ) ) {
     678           0 :       update_existing = 0;
     679           0 :     } else {
     680           0 :       FD_LOG_ERR(( "stat failed when trying to create workspace `%s` (%i-%s)", path, errno, fd_io_strerror( errno ) ));
     681           0 :     }
     682             : 
     683             :     /* TODO: Use a better CPU idx for the stack if tile is floating */
     684           0 :     ulong stack_cpu_idx = 0UL;
     685           0 :     if( FD_LIKELY( tile->cpu_idx<65535UL ) ) stack_cpu_idx = tile->cpu_idx;
     686             : 
     687           0 :     char name[ PATH_MAX ];
     688           0 :     FD_TEST( fd_cstr_printf_check( name, PATH_MAX, NULL, "%s_stack_%s%lu", config->name, tile->name, tile->kind_id ) );
     689             : 
     690           0 :     ulong sub_page_cnt[ 1 ] = { 6 };
     691           0 :     ulong sub_cpu_idx [ 1 ] = { stack_cpu_idx };
     692           0 :     int err;
     693           0 :     if( FD_UNLIKELY( update_existing ) ) {
     694           0 :       err = fd_shmem_update_multi( name, FD_SHMEM_HUGE_PAGE_SZ, 1, sub_page_cnt, sub_cpu_idx, S_IRUSR | S_IWUSR ); /* logs details */
     695           0 :     } else {
     696           0 :       err = fd_shmem_create_multi( name, FD_SHMEM_HUGE_PAGE_SZ, 1, sub_page_cnt, sub_cpu_idx, S_IRUSR | S_IWUSR ); /* logs details */
     697           0 :     }
     698           0 :     if( FD_UNLIKELY( err && errno==ENOMEM ) ) {
     699           0 :       warn_unknown_files( config, 0UL );
     700             : 
     701           0 :       char path[ PATH_MAX ];
     702           0 :       FD_TEST( fd_cstr_printf_check( path, PATH_MAX, NULL, "%s/%s_stack_%s%lu", config->hugetlbfs.huge_page_mount_path, config->name, tile->name, tile->kind_id ) );
     703           0 :       FD_LOG_ERR(( "ENOMEM-Out of memory when trying to create huge page stack for tile `%s` at `%s`. "
     704           0 :                    "Firedancer reserves enough memory for all of its stacks during the `hugetlbfs` configure "
     705           0 :                    "step, so it is likely you have unknown files left over in this directory which are "
     706           0 :                    "consuming memory, or another program on the system is using pages from the same mount.",
     707           0 :                    tile->name, path ));
     708           0 :     } else if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "fd_shmem_create_multi failed" ));
     709           0 :   }
     710             : 
     711           0 :   if( FD_UNLIKELY( seteuid( uid ) ) ) FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     712           0 :   if( FD_UNLIKELY( setegid( gid ) ) ) FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     713           0 : }
     714             : 
     715             : void
     716           0 : fdctl_check_configure( config_t const * config ) {
     717           0 :   configure_result_t check = fd_cfg_stage_hugetlbfs.check( config );
     718           0 :   if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
     719           0 :     FD_LOG_ERR(( "Huge pages are not configured correctly: %s. You can run `fdctl configure init hugetlbfs` "
     720           0 :                  "to create the mounts correctly. This must be done after every system restart before running "
     721           0 :                  "Firedancer.", check.message ));
     722             : 
     723           0 :   if( FD_LIKELY( !config->development.netns.enabled && 0==strcmp( config->net.provider, "xdp" ) ) ) {
     724           0 :     check = fd_cfg_stage_ethtool_channels.check( config );
     725           0 :     if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
     726           0 :       FD_LOG_ERR(( "Network %s. You can run `fdctl configure init ethtool-channels` to set the number of channels on the "
     727           0 :                   "network device correctly.", check.message ));
     728             : 
     729           0 :     check = fd_cfg_stage_ethtool_offloads.check( config );
     730           0 :     if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
     731           0 :       FD_LOG_ERR(( "Network %s. You can run `fdctl configure init ethtool-offloads` to disable features "
     732           0 :                   "as required.", check.message ));
     733             : 
     734           0 :     check = fd_cfg_stage_ethtool_loopback.check( config );
     735           0 :     if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
     736           0 :       FD_LOG_ERR(( "Network %s. You can run `fdctl configure init ethtool-loopback` to disable tx-udp-segmentation "
     737           0 :                   "on the loopback device.", check.message ));
     738           0 :   }
     739             : 
     740           0 :   check = fd_cfg_stage_sysctl.check( config );
     741           0 :   if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
     742           0 :     FD_LOG_ERR(( "Kernel parameters are not configured correctly: %s. You can run `fdctl configure init sysctl` "
     743           0 :                  "to set kernel parameters correctly.", check.message ));
     744             : 
     745           0 :   check = fd_cfg_stage_hyperthreads.check( config );
     746           0 :   if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) )
     747           0 :     FD_LOG_ERR(( "Hyperthreading is not configured correctly: %s. You can run `fdctl configure init hyperthreads` "
     748           0 :                  "to configure hyperthreading correctly.", check.message ));
     749           0 : }
     750             : 
     751             : void
     752             : run_firedancer_init( config_t * config,
     753             :                      int        init_workspaces,
     754           0 :                      int        check_configure ) {
     755           0 :   struct stat st;
     756           0 :   int err = stat( config->paths.identity_key, &st );
     757           0 :   if( FD_UNLIKELY( -1==err && errno==ENOENT ) ) FD_LOG_ERR(( "[consensus.identity_path] key does not exist `%s`. You can generate an identity key at this path by running `fdctl keys new identity --config <toml>`", config->paths.identity_key ));
     758           0 :   else if( FD_UNLIKELY( -1==err ) )             FD_LOG_ERR(( "could not stat [consensus.identity_path] `%s` (%i-%s)", config->paths.identity_key, errno, fd_io_strerror( errno ) ));
     759             : 
     760           0 :   if( FD_UNLIKELY( !config->is_firedancer ) ) {
     761           0 :     for( ulong i=0UL; i<config->frankendancer.paths.authorized_voter_paths_cnt; i++ ) {
     762           0 :       err = stat( config->frankendancer.paths.authorized_voter_paths[ i ], &st );
     763           0 :       if( FD_UNLIKELY( -1==err && errno==ENOENT ) ) FD_LOG_ERR(( "[consensus.authorized_voter_paths] key does not exist `%s`", config->frankendancer.paths.authorized_voter_paths[ i ] ));
     764           0 :       else if( FD_UNLIKELY( -1==err ) )             FD_LOG_ERR(( "could not stat [consensus.authorized_voter_paths] `%s` (%i-%s)", config->frankendancer.paths.authorized_voter_paths[ i ], errno, fd_io_strerror( errno ) ));
     765           0 :     }
     766           0 :   }
     767             : 
     768             :   /* FIXME: fdctl_check_configure unconditionally checks for network
     769             :             stack prerequisites even if the command being run does not
     770             :             require networking.  Hack around that here for now. */
     771           0 :   if( check_configure ) fdctl_check_configure( config );
     772           0 :   if( FD_LIKELY( init_workspaces ) ) initialize_workspaces( config );
     773           0 :   initialize_stacks( config );
     774           0 : }
     775             : 
     776             : void
     777             : fdctl_setup_netns( config_t * config,
     778           0 :                    int        stay ) {
     779           0 :   if( !config->development.netns.enabled ) return;
     780             : 
     781           0 :   int original_netns_;
     782           0 :   int * original_netns = stay ? NULL : &original_netns_;
     783           0 :   if( FD_UNLIKELY( -1==fd_net_util_netns_enter( config->net.interface, original_netns ) ) )
     784           0 :     FD_LOG_ERR(( "failed to enter network namespace `%s` (%i-%s)", config->net.interface, errno, fd_io_strerror( errno ) ));
     785             : 
     786           0 :   if( 0==strcmp( config->net.provider, "xdp" ) ) {
     787           0 :     fd_cfg_stage_ethtool_channels.init( config );
     788           0 :     fd_cfg_stage_ethtool_offloads.init( config );
     789           0 :     fd_cfg_stage_ethtool_loopback.init( config );
     790           0 :   }
     791             : 
     792           0 :   if( FD_UNLIKELY( original_netns && -1==fd_net_util_netns_restore( original_netns_ ) ) )
     793           0 :     FD_LOG_ERR(( "failed to restore network namespace (fd=%d) (%i-%s)", original_netns_, errno, fd_io_strerror( errno ) ));
     794           0 : }
     795             : 
     796             : /* The boot sequence is a little bit involved...
     797             : 
     798             :    A process tree is created that looks like,
     799             : 
     800             :    + main
     801             :    +-- pidns
     802             :        +-- agave
     803             :        +-- tile 0
     804             :        +-- tile 1
     805             :        ...
     806             : 
     807             :    What we want is that if any process in the tree dies, all other
     808             :    processes will also die.  This is done as follows,
     809             : 
     810             :     (a) pidns is the init process of a PID namespace, so if it dies the
     811             :         kernel will terminate the child processes.
     812             : 
     813             :     (b) main is the parent of pidns, so it can issue a waitpid() on the
     814             :         child PID, and when it completes terminate itself.
     815             : 
     816             :     (c) pidns is the parent of agave and the tiles, so it could
     817             :         issue a waitpid() of -1 to wait for any of them to terminate,
     818             :         but how would it know if main has died?
     819             : 
     820             :     (d) main creates a pipe, and passes the write end to pidns.  If main
     821             :         dies, the pipe will be closed, and pidns will get a HUP on the
     822             :         read end.  Then pidns creates a pipe per child and passes the
     823             :         write end to the child.  If any of the children die, the pipe
     824             :         will be closed, and pidns will get a HUP on the read end.
     825             : 
     826             :         Then pidns can call poll() on both the write end of the main
     827             :         pipe and the read end of all the child pipes.  If any of them
     828             :         raises SIGHUP, then pidns knows that the parent or a child has
     829             :         died, and it can terminate itself, which due to (a) and (b)
     830             :         will kill all other processes. */
     831             : void
     832             : run_firedancer( config_t * config,
     833             :                 int        parent_pipefd,
     834           0 :                 int        init_workspaces ) {
     835             :   /* dump the topology we are using to the output log */
     836           0 :   fd_topo_print_log( 0, &config->topo );
     837             : 
     838           0 :   run_firedancer_init( config, init_workspaces, 1 );
     839             : 
     840           0 : #if defined(__x86_64__) || defined(__aarch64__)
     841             : 
     842             : #ifndef SYS_landlock_create_ruleset
     843             : #define SYS_landlock_create_ruleset 444
     844             : #endif
     845             : 
     846           0 : #ifndef LANDLOCK_CREATE_RULESET_VERSION
     847           0 : #define LANDLOCK_CREATE_RULESET_VERSION (1U << 0)
     848           0 : #endif
     849             : 
     850           0 : #endif
     851           0 :   long abi = syscall( SYS_landlock_create_ruleset, NULL, 0, LANDLOCK_CREATE_RULESET_VERSION );
     852           0 :   if( -1L==abi && (errno==ENOSYS || errno==EOPNOTSUPP ) ) {
     853           0 :     FD_LOG_WARNING(( "The Landlock access control system is not supported by your Linux kernel. Firedancer uses landlock to "
     854           0 :                      "provide an additional layer of security to the sandbox, but it is not required." ));
     855           0 :   }
     856             : 
     857           0 :   if( FD_UNLIKELY( close( 0 ) ) ) FD_LOG_ERR(( "close(0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     858           0 :   if( FD_UNLIKELY( fd_log_private_logfile_fd()!=1 && close( 1 ) ) ) FD_LOG_ERR(( "close(1) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     859             : 
     860           0 :   int pipefd;
     861           0 :   pid_namespace = clone_firedancer( config, parent_pipefd, &pipefd );
     862             : 
     863             :   /* Print the location of the logfile on SIGINT or SIGTERM, and also
     864             :      kill the child.  They are connected by a pipe which the child is
     865             :      polling so we don't strictly need to kill the child, but its helpful
     866             :      to do that before printing the log location line, else it might
     867             :      get interleaved due to timing windows in the shutdown. */
     868           0 :   install_parent_signals();
     869             : 
     870           0 :   if( FD_UNLIKELY( close( config->log.lock_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     871             : 
     872           0 :   struct sock_filter seccomp_filter[ 128UL ];
     873           0 :   populate_sock_filter_policy_main( 128UL, seccomp_filter, (uint)fd_log_private_logfile_fd(), (uint)pid_namespace );
     874             : 
     875           0 :   int allow_fds[ 4 ];
     876           0 :   ulong allow_fds_cnt = 0;
     877           0 :   allow_fds[ allow_fds_cnt++ ] = 2; /* stderr */
     878           0 :   if( FD_LIKELY( fd_log_private_logfile_fd()!=-1 ) )
     879           0 :     allow_fds[ allow_fds_cnt++ ] = fd_log_private_logfile_fd(); /* logfile, or maybe stdout */
     880           0 :   allow_fds[ allow_fds_cnt++ ] = pipefd; /* read end of main pipe */
     881           0 :   if( FD_UNLIKELY( parent_pipefd!=-1 ) )
     882           0 :     allow_fds[ allow_fds_cnt++ ] = parent_pipefd; /* write end of parent pipe */
     883             : 
     884           0 :   if( FD_LIKELY( config->development.sandbox ) ) {
     885           0 :     fd_sandbox_enter( config->uid,
     886           0 :                       config->gid,
     887           0 :                       0,
     888           0 :                       0,
     889           0 :                       0,
     890           0 :                       1, /* Keep controlling terminal for main so it can receive Ctrl+C */
     891           0 :                       0,
     892           0 :                       0UL,
     893           0 :                       0UL,
     894           0 :                       0UL,
     895           0 :                       allow_fds_cnt,
     896           0 :                       allow_fds,
     897           0 :                       sock_filter_policy_main_instr_cnt,
     898           0 :                       seccomp_filter );
     899           0 :   } else {
     900           0 :     fd_sandbox_switch_uid_gid( config->uid, config->gid );
     901           0 :   }
     902             : 
     903             :   /* The supervsior process should not share the log lock, because a
     904             :      child process might die while holding it and we still need to
     905             :      reap and print errors. */
     906           0 :   int lock = 0;
     907           0 :   fd_log_private_shared_lock = &lock;
     908             : 
     909             :   /* the only clean way to exit is SIGINT or SIGTERM on this parent process,
     910             :      so if wait4() completes, it must be an error */
     911           0 :   int wstatus;
     912           0 :   if( FD_UNLIKELY( -1==wait4( pid_namespace, &wstatus, (int)__WALL, NULL ) ) )
     913           0 :     FD_LOG_ERR(( "main wait4() failed (%i-%s)\nLog at \"%s\"", errno, fd_io_strerror( errno ), fd_log_private_path ));
     914             : 
     915           0 :   if( FD_UNLIKELY( WIFSIGNALED( wstatus ) ) ) fd_sys_util_exit_group( WTERMSIG( wstatus ) ? WTERMSIG( wstatus ) : 1 );
     916           0 :   else fd_sys_util_exit_group( WEXITSTATUS( wstatus ) ? WEXITSTATUS( wstatus ) : 1 );
     917           0 : }
     918             : /* run_cmd_fn is the .fn handler registered by the "run" action
                          (fd_action_run).  It sanity-checks the loaded configuration and
                          then calls run_firedancer( config, -1, 1 ).  args is unused.

                          Checks performed, in order:
                            - at least one [gossip.entrypoints] entry is present, unless
                              [development.bootstrap] is set
                            - no entrypoint is the empty string
                            - each listen port named below is non-zero

                          A failed check is reported with FD_LOG_ERR.  CHECK_PORT_NON_ZERO
                          stringifies the field name into the message so the user can see
                          which option is missing.
                          NOTE(review): FD_LOG_ERR is presumably fatal (does not return);
                          confirm against the logging header. */
     919             : void
     920             : run_cmd_fn( args_t *   args FD_PARAM_UNUSED,
     921           0 :             config_t * config ) {
     922           0 :   #define CHECK_PORT_NON_ZERO( field ) \
     923           0 :     if( FD_UNLIKELY( config->field==0 ) ) { \
     924           0 :       FD_LOG_ERR(( #field " is not set in your configuration file. Please set it to a non-zero value." )); \
     925           0 :     }
     926             :   /* With no entrypoints configured, only bootstrap mode is accepted. */
     927           0 :   if( FD_UNLIKELY( !config->gossip.entrypoints_cnt && !config->development.bootstrap ) )
     928           0 :     FD_LOG_ERR(( "No entrypoints specified in configuration file under [gossip.entrypoints], but "
     929           0 :                  "at least one is needed to determine how to connect to the Solana cluster. If "
     930           0 :                  "you want to start a new cluster in a development environment, use `fddev` instead "
     931           0 :                  "of `fdctl`. If you want to use an existing genesis, set [development.bootstrap] "
     932           0 :                  "to \"true\" in the configuration file." ));
     933             :   /* Reject any entrypoint that is the empty string. */
     934           0 :   for( ulong i=0; i<config->gossip.entrypoints_cnt; i++ ) {
     935           0 :     if( FD_UNLIKELY( !strcmp( config->gossip.entrypoints[ i ], "" ) ) )
     936           0 :       FD_LOG_ERR(( "One of the entrypoints in your configuration file under [gossip.entrypoints] is "
     937           0 :                    "empty. Please remove the empty entrypoint or set it correctly. "));
     938           0 :   }
     939             :   /* Each of the following listen ports must be set to a non-zero value. */
     940           0 :   CHECK_PORT_NON_ZERO( gossip.port );
     941           0 :   CHECK_PORT_NON_ZERO( tiles.quic.quic_transaction_listen_port );
     942           0 :   CHECK_PORT_NON_ZERO( tiles.quic.regular_transaction_listen_port );
     943           0 :   CHECK_PORT_NON_ZERO( tiles.shred.shred_listen_port );
     944           0 :   CHECK_PORT_NON_ZERO( tiles.metric.prometheus_listen_port );
     945           0 :   CHECK_PORT_NON_ZERO( tiles.gui.gui_listen_port );
     946             :   /* The helper macro is only meaningful inside this function. */
     947           0 :   #undef CHECK_PORT_NON_ZERO
     948             :   /* NOTE(review): the meaning of the -1 and 1 arguments is set by run_firedancer, whose signature is outside this view. */
     949           0 :   run_firedancer( config, -1, 1 );
     950           0 : }
     951             : /* Action descriptor for the "run1" command.  run1_cmd_args and run1_cmd_fn are defined outside this view. */
     952             : action_t fd_action_run1 = {
     953             :   .name        = "run1",
     954             :   .args        = run1_cmd_args,
     955             :   .fn          = run1_cmd_fn,
     956             :   .perm        = NULL, /* no permission callback is registered for this action */
     957             :   .description = "Start up a single Firedancer tile"
     958             : };
     959             : /* Action descriptor for the "run" command: run_cmd_fn is the handler and run_cmd_perm the permission callback. */
     960             : action_t fd_action_run = {
     961             :   .name           = "run",
     962             :   .args           = NULL, /* no argument-parsing callback is registered */
     963             :   .fn             = run_cmd_fn,
     964             :   .require_config = 1,
     965             :   .perm           = run_cmd_perm,
     966             :   .description    = "Start up a Firedancer validator",
     967             :   .permission_err = "insufficient permissions to execute command `%s`. It is recommended " /* NOTE(review): the single %s is presumably filled with the command name; confirm against the code that prints permission_err */
     968             :                     "to start Firedancer as the root user, but you can also start it "
     969             :                     "with the missing capabilities listed above. The program only needs "
     970             :                     "to start with elevated permissions to do privileged operations at "
     971             :                     "boot, and will immediately drop permissions and switch to the user "
     972             :                     "specified in your configuration file once they are complete. Firedancer "
     973             :                     "will not execute outside of the boot process as root, and will refuse "
     974             :                     "to start if it cannot drop privileges. Firedancer needs to be started "
     975             :                     "privileged to configure high performance networking with XDP.",
     976             : };

Generated by: LCOV version 1.14