Line data Source code
1 : #define _GNU_SOURCE
2 : #include "fd_sandbox.h"
3 :
4 : #include "../cstr/fd_cstr.h"
5 : #include "../log/fd_log.h"
6 :
7 : #include <fcntl.h>
8 : #include <stdlib.h>
9 : #include <errno.h>
10 : #include <unistd.h>
11 : #include <sched.h>
12 : #include <dirent.h>
13 : #include <sys/stat.h>
14 : #include <sys/wait.h>
15 : #include <sys/prctl.h>
16 : #include <sys/mount.h>
17 : #include <sys/random.h>
18 : #include <sys/syscall.h>
19 : #include <sys/resource.h>
20 : #include <linux/keyctl.h>
21 : #include <linux/seccomp.h>
22 : #include <linux/securebits.h>
23 : #include <linux/capability.h>
24 :
/* The sandbox is built on Linux-only facilities, so refuse to compile
   for any other operating system. */
#ifndef __linux__
#error "Target operating system is unsupported by seccomp."
#endif

#if defined(__x86_64__) || defined(__aarch64__)

/* Fallback syscall numbers, used only when the included headers do not
   already provide the landlock syscall constants. */

#if !defined(SYS_landlock_create_ruleset)
#define SYS_landlock_create_ruleset 444
#endif

#if !defined(SYS_landlock_restrict_self)
#define SYS_landlock_restrict_self 446
#endif

#else
#error "Target architecture is unsupported by seccomp."
#endif
42 :
43 : void
44 : fd_sandbox_private_switch_uid_gid( uint desired_uid,
45 : uint desired_gid );
46 :
47 : static int
48 0 : check_unshare_eacces_main( void * _arg ) {
49 0 : ulong arg = (ulong)_arg;
50 0 : uint desired_uid = (uint)((arg >> 0UL) & 0xFFFFUL);
51 0 : uint desired_gid = (uint)((arg >> 32UL) & 0xFFFFUL);
52 :
53 0 : fd_sandbox_private_switch_uid_gid( desired_uid, desired_gid );
54 0 : int result = unshare( CLONE_NEWUSER );
55 0 : if( -1==result && errno==EACCES ) return 255;
56 0 : else if( -1==result ) FD_LOG_ERR(( "unshare(CLONE_NEWUSER) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
57 0 : result = open( "/proc/self/setgroups", O_WRONLY );
58 0 : if( -1==result && errno==EACCES ) return 255;
59 0 : if( -1==result ) FD_LOG_ERR(( "open(/proc/self/setgroups) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
60 0 : return 0;
61 0 : }
62 :
63 : int
64 : fd_sandbox_requires_cap_sys_admin( uint desired_uid,
65 0 : uint desired_gid ) {
66 :
67 : /* Check for the `unprivileged_userns_clone` sysctl which restricts
68 : unprivileged user namespaces on Debian. */
69 :
70 0 : int fd = open( "/proc/sys/kernel/unprivileged_userns_clone", O_RDONLY );
71 0 : if( -1==fd && errno!=ENOENT ) FD_LOG_ERR(( "open(/proc/sys/kernel/unprivileged_userns_clone) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
72 0 : else if( -1!=fd ) {
73 0 : char buf[ 16 ] = {0};
74 0 : long count = read( fd, buf, sizeof( buf ) );
75 0 : if( -1L==count ) FD_LOG_ERR(( "read(/proc/sys/kernel/unprivileged_userns_clone) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
76 0 : if( (ulong)count>=sizeof( buf ) ) FD_LOG_ERR(( "read(/proc/sys/kernel/unprivileged_userns_clone) returned truncated data" ));
77 0 : if( 0L!=read( fd, buf, sizeof( buf ) ) ) FD_LOG_ERR(( "read(/proc/sys/kernel/unprivileged_userns_clone) did not return all the data" ));
78 :
79 0 : char * end;
80 0 : ulong unprivileged_userns_clone = strtoul( buf, &end, 10 );
81 0 : if( *end!='\n' ) FD_LOG_ERR(( "read(/proc/sys/kernel/unprivileged_userns_clone) returned malformed data" ));
82 0 : if( close( fd ) ) FD_LOG_ERR(( "close(/proc/sys/kernel/unprivileged_userns_clone) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
83 :
84 0 : if( unprivileged_userns_clone!=0 && unprivileged_userns_clone!=1 ) FD_LOG_ERR(( "unprivileged_userns_clone has unexpected value %lu", unprivileged_userns_clone ));
85 :
86 0 : if( !unprivileged_userns_clone ) return 1;
87 0 : }
88 :
89 : /* Check for EACCES when actually trying to create a user namespace,
90 : which indicates an Ubuntu, AppArmor, or SELinux restriction. We do
91 : this in a forked process so it doesn't unintentionally sandbox the
92 : caller. Actually we can't fork here, because the stack might be
93 : MAP_SHARED, so do it in a clone with a new stack instead.
94 :
95 : From Ubuntu 23.10 til 24.04, user namespace creation is disallowed
96 : by default and trying to create one as an unprivileged user will
97 : return EACCES.
98 :
99 : From Ubuntu 24.04 onwards, user namespace creation is allowed, but
100 : trying to write to /proc/self/setgroups or set the UID/GID maps
101 : within the namespace will return EACCES. */
102 :
103 0 : do {
104 0 : uchar child_stack[ 2097152 ]; /* 2 MiB */
105 0 : ulong arg = ((ulong)desired_uid << 0UL) | (((ulong)desired_gid) << 32UL);
106 0 : int child_pid = clone( check_unshare_eacces_main, child_stack+sizeof(child_stack), 0, (void*)arg );
107 0 : if( -1==child_pid ) FD_LOG_ERR(( "clone() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
108 :
109 0 : int wstatus;
110 0 : if( -1==waitpid( child_pid, &wstatus, __WALL ) ) FD_LOG_ERR(( "waitpid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
111 0 : if( WIFSIGNALED( wstatus ) ) FD_LOG_ERR(( "user namespace privilege checking process terminated by signal %i-%s", WTERMSIG( wstatus ), fd_io_strsignal( WTERMSIG( wstatus ) ) ));
112 0 : if( WEXITSTATUS( wstatus ) && WEXITSTATUS( wstatus )!=255 ) FD_LOG_ERR(( "user namespace privilege checking process exited with status %i", WEXITSTATUS( wstatus ) ));
113 :
114 0 : if( WEXITSTATUS( wstatus ) ) return 1;
115 0 : } while(0);
116 :
117 0 : return 0;
118 0 : }
119 :
120 : extern char ** environ;
121 :
122 : void FD_FN_SENSITIVE
123 0 : fd_sandbox_private_explicit_clear_environment_variables( void ) {
124 0 : if( !environ ) return;
125 :
126 0 : for( char * const * env = environ; *env; env++ ) {
127 0 : ulong len = strlen( *env );
128 0 : explicit_bzero( *env, len );
129 0 : }
130 :
131 0 : if( clearenv() ) FD_LOG_ERR(( "clearenv failed" ));
132 0 : }
133 :
134 : void
135 : fd_sandbox_private_check_exact_file_descriptors( ulong allowed_file_descriptor_cnt,
136 0 : int const * allowed_file_descriptor ) {
137 0 : if( allowed_file_descriptor_cnt>256UL ) FD_LOG_ERR(( "allowed_file_descriptors_cnt must not be more than 256" ));
138 0 : int seen_fds[ 256 ] = {0};
139 :
140 0 : for( ulong i=0UL; i<allowed_file_descriptor_cnt; i++ ) {
141 0 : if( allowed_file_descriptor[ i ]<0 || allowed_file_descriptor[ i ]==INT_MAX )
142 0 : FD_LOG_ERR(( "allowed_file_descriptors contains invalid file descriptor %d", allowed_file_descriptor[ i ] ));
143 0 : }
144 :
145 0 : for( ulong i=0UL; i<allowed_file_descriptor_cnt; i++ ) {
146 0 : for( ulong j=0UL; j<allowed_file_descriptor_cnt; j++ ) {
147 0 : if( i==j ) continue;
148 0 : if( allowed_file_descriptor[ i ]==allowed_file_descriptor[ j ] )
149 0 : FD_LOG_ERR(( "allowed_file_descriptor contains duplicate entry %d", allowed_file_descriptor[ i ] ));
150 0 : }
151 0 : }
152 :
153 0 : int dirfd = open( "/proc/self/fd", O_RDONLY | O_DIRECTORY );
154 0 : if( dirfd<0 ) FD_LOG_ERR(( "open(/proc/self/fd) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
155 :
156 0 : for(;;) {
157 : /* The getdents64() syscall ABI does not require that buf is aligned,
158 : since dent->d_name field is variable length, the records are not
159 : always aligned and the cast below is going to be unaligned anyway
160 : however...
161 :
162 : If we don't align it the compiler might prove somthing weird and
163 : trash this code, and also ASAN would flag it as an error. So we
164 : just align it anyway. */
165 0 : uchar buf[ 4096 ] __attribute__((aligned(alignof(struct dirent64))));
166 :
167 0 : long dents_bytes = syscall( SYS_getdents64, dirfd, buf, sizeof( buf ) );
168 0 : if( !dents_bytes ) break;
169 0 : else if( -1L==dents_bytes ) FD_LOG_ERR(( "getdents64() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
170 :
171 0 : ulong offset = 0UL;
172 0 : while( offset<(ulong)dents_bytes ) {
173 0 : struct dirent64 const * dent = (struct dirent64 const *)(buf + offset);
174 0 : if( !strcmp( dent->d_name, "." ) || !strcmp( dent->d_name, ".." ) ) {
175 0 : offset += dent->d_reclen;
176 0 : continue;
177 0 : }
178 :
179 0 : char * end;
180 0 : long _fd = strtol( dent->d_name, &end, 10 );
181 0 : if( *end != '\0' ) FD_LOG_ERR(( "/proc/self/pid has unrecognized entry name %s", dent->d_name ));
182 0 : if( _fd>=INT_MAX ) FD_LOG_ERR(( "/proc/self/pid has file descriptor number %ld which is too large", _fd ));
183 0 : int fd = (int)_fd;
184 :
185 0 : if( fd==dirfd ) {
186 0 : offset += dent->d_reclen;
187 0 : continue;
188 0 : }
189 :
190 0 : int found = 0;
191 0 : for( ulong i=0UL; i<allowed_file_descriptor_cnt; i++ ) {
192 0 : if( fd==allowed_file_descriptor[ i ] ) {
193 0 : if( seen_fds[ i ] ) FD_LOG_ERR(( "/proc/self/fd contained the same file descriptor (%d) twice", fd ));
194 0 : seen_fds[ i ] = 1;
195 0 : found = 1;
196 0 : break;
197 0 : }
198 0 : }
199 :
200 0 : if( !found ) {
201 0 : char path[ PATH_MAX ];
202 0 : FD_TEST( fd_cstr_printf_check( path, sizeof( path ), NULL, "/proc/self/fd/%d", fd ) );
203 :
204 0 : char target[ PATH_MAX ];
205 0 : long count = readlink( path, target, PATH_MAX );
206 0 : if( count<0L ) FD_LOG_ERR(( "readlink(%s) failed (%i-%s)", path, errno, fd_io_strerror( errno ) ));
207 0 : if( count>=PATH_MAX ) FD_LOG_ERR(( "readlink(%s) returned truncated path", path ));
208 0 : target[ count ] = '\0';
209 :
210 0 : FD_LOG_ERR(( "unexpected file descriptor %d open %s", fd, target ));
211 0 : }
212 :
213 0 : offset += dent->d_reclen;
214 0 : }
215 0 : }
216 :
217 0 : for( ulong i=0UL; i<allowed_file_descriptor_cnt; i++ ) {
218 0 : if( !seen_fds[ i ] ) FD_LOG_ERR(( "allowed file descriptor %d not present", allowed_file_descriptor[ i ] ));
219 0 : }
220 :
221 0 : if( close( dirfd ) ) FD_LOG_ERR(( "close(/proc/self/fd) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
222 0 : }
223 :
224 : void
225 : fd_sandbox_private_switch_uid_gid( uint desired_uid,
226 0 : uint desired_gid ) {
227 : /* We do a small hack: in development environments we sometimes want
228 : to run all tiles in a single process. In that case, the sandbox
229 : doesn't get created except that we still switch to the desired uid
230 : and gid.
231 :
232 : There's a problem with this: POSIX states that all threads in a
233 : process must have the same uid and gid, so glibc does some wacky
234 : stuff... from man 2 setresgid
235 :
236 : C library/kernel differences
237 : At the kernel level, user IDs and group IDs are a per-thread
238 : attribute. However, POSIX requires that all threads in a
239 : process share the same credentials. The NPTL threading
240 : implementation handles the POSIX requirements by providing
241 : wrapper functions for the various system calls that change
242 : process UIDs and GIDs. These wrap‐ per functions
243 : (including those for setresuid() and setresgid()) employ a
244 : signal-based technique to ensure that when one thread
245 : changes credentials, all of the other threads in the process
246 : also change their credentials. For details, see nptl(7).
247 :
248 : We know all of our threads in this development case are going to
249 : switch to the target uid/gid at their own leisure (they need to
250 : so they can do privileged steps before dropping root), so to
251 : align this behavior between production and development, we invoke
252 : the syscall directly and do not let glibc switch uid/gid on the
253 : other threads in the process. */
254 0 : int changed = 0;
255 0 : gid_t curgid, curegid, cursgid;
256 0 : if( -1==getresgid( &curgid, &curegid, &cursgid ) ) FD_LOG_ERR(( "getresgid failed (%i-%s)", errno, fd_io_strerror( errno ) ));
257 0 : if( desired_gid!=curgid || desired_gid!=curegid || desired_gid!=cursgid ) {
258 0 : if( -1==syscall( __NR_setresgid, desired_gid, desired_gid, desired_gid ) ) FD_LOG_ERR(( "setresgid failed (%i-%s)", errno, fd_io_strerror( errno ) ));
259 0 : changed = 1;
260 0 : }
261 :
262 0 : uid_t curuid, cureuid, cursuid;
263 0 : if( -1==getresuid( &curuid, &cureuid, &cursuid ) ) FD_LOG_ERR(( "getresuid failed (%i-%s)", errno, fd_io_strerror( errno ) ));
264 0 : if( desired_uid!=curuid || desired_uid!=cureuid || desired_uid!=cursuid ) {
265 0 : if( -1==syscall( __NR_setresuid, desired_uid, desired_uid, desired_uid ) ) FD_LOG_ERR(( "setresuid failed (%i-%s)", errno, fd_io_strerror( errno ) ));
266 0 : changed = 1;
267 0 : }
268 :
269 : /* Calling setresgid/setresuid sets the dumpable bit to 0 which
270 : prevents debugging and stops us from setting our uid/gid maps in
271 : the user namespace so restore it if it was changed. */
272 0 : if( changed ) {
273 0 : if( -1==prctl( PR_SET_DUMPABLE, 1 ) ) FD_LOG_ERR(( "prctl(PR_SET_DUMPABLE, 1) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
274 0 : }
275 0 : }
276 :
277 : void
278 : fd_sandbox_private_write_userns_uid_gid_maps( uint uid_in_parent,
279 0 : uint gid_in_parent ) {
280 0 : int setgroups_fd = open( "/proc/self/setgroups", O_WRONLY );
281 0 : if( FD_UNLIKELY( setgroups_fd<0 ) ) FD_LOG_ERR(( "open(/proc/self/setgroups) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
282 :
283 0 : long written = write( setgroups_fd, "deny", strlen( "deny" ) );
284 0 : if( FD_UNLIKELY( -1L==written ) ) FD_LOG_ERR(( "write(/proc/self/setgroups) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
285 0 : else if( FD_UNLIKELY( written!=(long)strlen( "deny" ) ) ) FD_LOG_ERR(( "write(/proc/self/setgroups) failed to write all data" ));
286 0 : if( FD_UNLIKELY( close( setgroups_fd ) ) ) FD_LOG_ERR(( "close(/proc/self/setgroups) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
287 :
288 0 : static char const * MAP_PATHS[] = {
289 0 : "/proc/self/uid_map",
290 0 : "/proc/self/gid_map",
291 0 : };
292 :
293 0 : uint ids[] = {
294 0 : uid_in_parent,
295 0 : gid_in_parent
296 0 : };
297 :
298 0 : for( ulong i=0UL; i<2UL; i++ ) {
299 0 : int fd = open( MAP_PATHS[ i ], O_WRONLY );
300 0 : if( -1==fd ) FD_LOG_ERR(( "open(%s) failed (%i-%s)", MAP_PATHS[ i ], errno, fd_io_strerror( errno ) ));
301 :
302 0 : char map_line[ 64 ];
303 0 : FD_TEST( fd_cstr_printf_check( map_line, sizeof( map_line ), NULL, "1 %u 1\n", ids[ i ] ) );
304 0 : long written = write( fd, map_line, strlen( map_line ) );
305 0 : if( -1L==written ) FD_LOG_ERR(( "write(%s) failed (%i-%s)", MAP_PATHS[ i ], errno, fd_io_strerror( errno ) ));
306 0 : if( written != (long)strlen( map_line ) ) FD_LOG_ERR(( "write(%s) failed to write all data", MAP_PATHS[ i ] ));
307 0 : if( close( fd ) ) FD_LOG_ERR(( "close(%s) failed (%i-%s)", MAP_PATHS[ i ], errno, fd_io_strerror( errno ) ));
308 0 : }
309 0 : }
310 :
311 : void
312 0 : fd_sandbox_private_deny_namespaces( void ) {
313 0 : static char const * SYSCTLS[] = {
314 0 : "/proc/sys/user/max_user_namespaces",
315 0 : "/proc/sys/user/max_mnt_namespaces",
316 0 : "/proc/sys/user/max_cgroup_namespaces",
317 0 : "/proc/sys/user/max_ipc_namespaces",
318 0 : "/proc/sys/user/max_net_namespaces",
319 0 : "/proc/sys/user/max_pid_namespaces",
320 0 : "/proc/sys/user/max_uts_namespaces",
321 0 : };
322 :
323 0 : static char const * VALUES[] = {
324 0 : "1", /* One user namespace is allowed, to created the nested child. */
325 0 : "2", /* Two mount namespaces are allowed, the one in the parent user namespace, and the one we will use to pivot the root in the child namespace */
326 0 : "0",
327 0 : "0",
328 0 : "0",
329 0 : "0",
330 0 : "0",
331 0 : };
332 :
333 0 : for( ulong i=0UL; i<sizeof(SYSCTLS)/sizeof(SYSCTLS[ 0 ]); i++) {
334 0 : int fd = open( SYSCTLS[ i ], O_WRONLY );
335 0 : if( fd<0 ) FD_LOG_ERR(( "open(%s) failed (%i-%s)", SYSCTLS[ i ], errno, fd_io_strerror( errno ) ));
336 :
337 0 : long written = write( fd, VALUES[ i ], 1 );
338 0 : if( written==-1 ) FD_LOG_ERR(( "write(%s) failed (%i-%s)", SYSCTLS[ i ], errno, fd_io_strerror( errno ) ));
339 0 : else if( written!=1 ) FD_LOG_ERR(( "write(%s) failed to write data", SYSCTLS[ i ] ));
340 0 : if( FD_UNLIKELY( close( fd ) ) ) FD_LOG_ERR(( "close(%s) failed (%i-%s)", SYSCTLS[ i ], errno, fd_io_strerror( errno ) ));
341 0 : }
342 0 : }
343 :
344 : void
345 0 : fd_sandbox_private_pivot_root( void ) {
346 : /* The steps taken here to unmount the filesystem and jail us into an
347 : empty location look incredibly strange, but are a somewhat standard
348 : pattern copied from other sandboxes. For a couple of examples, see
349 :
350 : https://github.com/firecracker-microvm/firecracker/blob/main/src/jailer/src/chroot.rs
351 : https://github.com/hpc/charliecloud/blob/master/bin/ch-checkns.c
352 : https://github.com/opencontainers/runc/blob/HEAD/libcontainer/rootfs_linux.go#L671
353 : https://github.com/lxc/lxc/blob/HEAD/src/lxc/conf.c#L1121
354 : https://github.com/containers/bubblewrap/blob/main/bubblewrap.c#L3196
355 :
356 : The core problem is that calling pivot_root(2) will fail if the
357 : list of mounts in the namespace is not arranged very carefully. */
358 :
359 0 : if( -1==unshare( CLONE_NEWNS ) ) FD_LOG_ERR(( "unshare(CLONE_NEWNS) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
360 :
361 0 : ulong bytes;
362 0 : if( 8UL!=getrandom( &bytes, sizeof( bytes ), 0 ) ) FD_LOG_ERR(( "getrandom() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
363 :
364 0 : char new_root_path[ PATH_MAX ];
365 0 : FD_TEST( fd_cstr_printf_check( new_root_path, sizeof( new_root_path ), NULL, "/tmp/fd_sandbox_%lu", bytes ) );
366 :
367 0 : if( -1==mkdir( new_root_path, S_IRUSR | S_IWUSR | S_IXUSR ) ) FD_LOG_ERR(( "mkdir(%s, 0700) failed (%i-%s)", new_root_path, errno, fd_io_strerror( errno ) ));
368 0 : if( -1==mount( NULL, "/", NULL, MS_SLAVE | MS_REC, NULL ) ) FD_LOG_ERR(( "mount(NULL, /, NULL, MS_SLAVE | MS_REC, NULL) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
369 0 : if( -1==mount( new_root_path, new_root_path, NULL, MS_BIND | MS_REC, NULL ) ) FD_LOG_ERR(( "mount(%s, %s, NULL, MS_BIND | MS_REC, NULL) failed (%i-%s)", new_root_path, new_root_path, errno, fd_io_strerror( errno ) ));
370 0 : if( -1==chdir( new_root_path ) ) FD_LOG_ERR(( "chdir(%s) failed (%i-%s)", new_root_path, errno, fd_io_strerror( errno ) ));
371 0 : if( -1==syscall( SYS_pivot_root, ".", "." ) ) FD_LOG_ERR(( "pivot_root(., .) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
372 0 : if( -1==umount2( ".", MNT_DETACH ) ) FD_LOG_ERR(( "umount2(., MNT_DETACH) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
373 0 : if( -1==chdir( "/" ) ) FD_LOG_ERR(( "chdir(/) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
374 0 : }
375 :
376 : struct rlimit_setting {
377 : #ifdef __GLIBC__
378 : __rlimit_resource_t resource;
379 : #else /* non-glibc */
380 : int resource;
381 : #endif /* __GLIBC__ */
382 :
383 : ulong limit;
384 : };
385 :
386 : void
387 0 : fd_sandbox_private_set_rlimits( ulong rlimit_file_cnt ) {
388 0 : struct rlimit_setting rlimits[] = {
389 0 : { .resource=RLIMIT_NOFILE, .limit=rlimit_file_cnt },
390 : /* The man page for setrlimit(2) states about RLIMIT_NICE:
391 :
392 : The useful range for this limit is thus from 1 (corresponding
393 : to a nice value of 19) to 40 (corresponding to a nice value of
394 : -20).
395 :
396 : But this is misleading. The range of values is from 0 to 40,
397 : even though the "useful" range is 1 to 40, because a value of 0
398 : and a value of 1 for the rlimit both map to a nice value of 19.
399 :
400 : But... if you attempt to call setrlimit( RLIMIT_NICE, 1 ) without
401 : CAP_SYS_RESOURCE, and the hard limit is already 0, you will get
402 : EPERM, so we actually have to set the limit to 0 here, not 1. */
403 0 : { .resource=RLIMIT_NICE, .limit=0UL },
404 :
405 0 : { .resource=RLIMIT_AS, .limit=0UL },
406 0 : { .resource=RLIMIT_CORE, .limit=0UL },
407 0 : { .resource=RLIMIT_DATA, .limit=0UL },
408 0 : { .resource=RLIMIT_MEMLOCK, .limit=0UL },
409 0 : { .resource=RLIMIT_MSGQUEUE, .limit=0UL },
410 0 : { .resource=RLIMIT_NPROC, .limit=0UL },
411 0 : { .resource=RLIMIT_RTPRIO, .limit=0UL },
412 0 : { .resource=RLIMIT_RTTIME, .limit=0UL },
413 0 : { .resource=RLIMIT_SIGPENDING, .limit=0UL },
414 0 : { .resource=RLIMIT_STACK, .limit=0UL },
415 :
416 : /* Resources that can't be restricted. */
417 : // { .resource=RLIMIT_CPU, .limit=0UL },
418 : // { .resource=RLIMIT_FSIZE, .limit=0UL },
419 :
420 : /* Deprecated resources, not used. */
421 : // { .resource=RLIMIT_LOCKS, .limit=0UL },
422 : // { .resource=RLIMIT_RSS, .limit=0UL },
423 0 : };
424 :
425 0 : for( ulong i=0UL; i<sizeof(rlimits)/sizeof(rlimits[ 0 ]); i++ ) {
426 0 : struct rlimit limit = { .rlim_cur=rlimits[ i ].limit, .rlim_max=rlimits[ i ].limit };
427 0 : if( -1==setrlimit( rlimits[ i ].resource, &limit ) ) FD_LOG_ERR(( "setrlimit(%u) failed (%i-%s)", rlimits[ i ].resource, errno, fd_io_strerror( errno ) ));
428 0 : }
429 0 : }
430 :
431 : void
432 0 : fd_sandbox_private_drop_caps( ulong cap_last_cap ) {
433 0 : if( -1==prctl( PR_SET_SECUREBITS,
434 0 : SECBIT_KEEP_CAPS_LOCKED | SECBIT_NO_SETUID_FIXUP |
435 0 : SECBIT_NO_SETUID_FIXUP_LOCKED | SECBIT_NOROOT |
436 0 : SECBIT_NOROOT_LOCKED | SECBIT_NO_CAP_AMBIENT_RAISE |
437 0 : SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED ) ) FD_LOG_ERR(( "prctl(PR_SET_SECUREBITS) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
438 :
439 0 : for( ulong cap=0UL; cap<=cap_last_cap; cap++ ) {
440 0 : if( -1==prctl( PR_CAPBSET_DROP, cap, 0, 0, 0 ) ) FD_LOG_ERR(( "prctl(PR_CAPBSET_DROP) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
441 0 : }
442 :
443 0 : struct __user_cap_header_struct hdr = { _LINUX_CAPABILITY_VERSION_3, 0 };
444 0 : struct __user_cap_data_struct data[2] = { { 0 } };
445 0 : if( -1==syscall( SYS_capset, &hdr, data ) ) FD_LOG_ERR(( "syscall(SYS_capset) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
446 0 : if( -1==prctl( PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0 ) ) FD_LOG_ERR(( "prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
447 0 : }
448 :
/* Local copies of the landlock constants and ruleset attribute
   structure used by fd_sandbox_private_landlock_restrict_self.  Each
   access right is a single bit, written out here as a mask. */

/* Flag for landlock_create_ruleset. */
#define LANDLOCK_CREATE_RULESET_VERSION 0x1U

/* Filesystem access rights (bits 0 through 15). */
#define LANDLOCK_ACCESS_FS_EXECUTE      0x0001ULL
#define LANDLOCK_ACCESS_FS_WRITE_FILE   0x0002ULL
#define LANDLOCK_ACCESS_FS_READ_FILE    0x0004ULL
#define LANDLOCK_ACCESS_FS_READ_DIR     0x0008ULL
#define LANDLOCK_ACCESS_FS_REMOVE_DIR   0x0010ULL
#define LANDLOCK_ACCESS_FS_REMOVE_FILE  0x0020ULL
#define LANDLOCK_ACCESS_FS_MAKE_CHAR    0x0040ULL
#define LANDLOCK_ACCESS_FS_MAKE_DIR     0x0080ULL
#define LANDLOCK_ACCESS_FS_MAKE_REG     0x0100ULL
#define LANDLOCK_ACCESS_FS_MAKE_SOCK    0x0200ULL
#define LANDLOCK_ACCESS_FS_MAKE_FIFO    0x0400ULL
#define LANDLOCK_ACCESS_FS_MAKE_BLOCK   0x0800ULL
#define LANDLOCK_ACCESS_FS_MAKE_SYM     0x1000ULL
#define LANDLOCK_ACCESS_FS_REFER        0x2000ULL
#define LANDLOCK_ACCESS_FS_TRUNCATE     0x4000ULL
#define LANDLOCK_ACCESS_FS_IOCTL_DEV    0x8000ULL

/* Network access rights (bits 0 and 1). */
#define LANDLOCK_ACCESS_NET_BIND_TCP    0x1ULL
#define LANDLOCK_ACCESS_NET_CONNECT_TCP 0x2ULL

/* Argument passed to landlock_create_ruleset: the filesystem and
   network access rights that the ruleset handles. */
struct landlock_ruleset_attr {
  __u64 handled_access_fs;
  __u64 handled_access_net;
};
475 :
476 : void
477 0 : fd_sandbox_private_landlock_restrict_self( void ) {
478 0 : struct landlock_ruleset_attr attr = {
479 0 : .handled_access_fs =
480 0 : LANDLOCK_ACCESS_FS_EXECUTE |
481 0 : LANDLOCK_ACCESS_FS_WRITE_FILE |
482 0 : LANDLOCK_ACCESS_FS_READ_FILE |
483 0 : LANDLOCK_ACCESS_FS_READ_DIR |
484 0 : LANDLOCK_ACCESS_FS_REMOVE_DIR |
485 0 : LANDLOCK_ACCESS_FS_REMOVE_FILE |
486 0 : LANDLOCK_ACCESS_FS_MAKE_CHAR |
487 0 : LANDLOCK_ACCESS_FS_MAKE_DIR |
488 0 : LANDLOCK_ACCESS_FS_MAKE_REG |
489 0 : LANDLOCK_ACCESS_FS_MAKE_SOCK |
490 0 : LANDLOCK_ACCESS_FS_MAKE_FIFO |
491 0 : LANDLOCK_ACCESS_FS_MAKE_BLOCK |
492 0 : LANDLOCK_ACCESS_FS_MAKE_SYM |
493 0 : LANDLOCK_ACCESS_FS_REFER |
494 0 : LANDLOCK_ACCESS_FS_TRUNCATE |
495 0 : LANDLOCK_ACCESS_FS_IOCTL_DEV,
496 0 : .handled_access_net =
497 0 : LANDLOCK_ACCESS_NET_BIND_TCP |
498 0 : LANDLOCK_ACCESS_NET_CONNECT_TCP,
499 0 : };
500 :
501 0 : long abi = syscall( SYS_landlock_create_ruleset, NULL, 0, LANDLOCK_CREATE_RULESET_VERSION );
502 0 : if( -1L==abi && (errno==ENOSYS || errno==EOPNOTSUPP ) ) return;
503 0 : else if( -1L==abi ) FD_LOG_ERR(( "landlock_create_ruleset() failed (%i-%s).", errno, fd_io_strerror( errno ) ));
504 :
505 0 : switch (abi) {
506 0 : case 1L:
507 : /* Removes LANDLOCK_ACCESS_FS_REFER for ABI < 2 */
508 0 : attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_REFER;
509 0 : __attribute__((fallthrough));
510 0 : case 2L:
511 : /* Removes LANDLOCK_ACCESS_FS_TRUNCATE for ABI < 3 */
512 0 : attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_TRUNCATE;
513 0 : __attribute__((fallthrough));
514 0 : case 3L:
515 : /* Removes network support for ABI < 4 */
516 0 : attr.handled_access_net &=
517 0 : ~(LANDLOCK_ACCESS_NET_BIND_TCP |
518 0 : LANDLOCK_ACCESS_NET_CONNECT_TCP);
519 0 : __attribute__((fallthrough));
520 0 : case 4L:
521 : /* Removes LANDLOCK_ACCESS_FS_IOCTL_DEV for ABI < 5 */
522 0 : attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_IOCTL_DEV;
523 0 : }
524 :
525 0 : long landlock_fd = syscall( SYS_landlock_create_ruleset, &attr, 16, 0 );
526 0 : if( -1L==landlock_fd ) FD_LOG_ERR(( "landlock_create_ruleset() failed (%i-%s).", errno, fd_io_strerror( errno ) ));
527 :
528 0 : if( syscall( SYS_landlock_restrict_self, landlock_fd, 0 ) ) FD_LOG_ERR(( "landlock_restrict_self() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
529 0 : }
530 :
531 : void
532 : fd_sandbox_private_set_seccomp_filter( ushort seccomp_filter_cnt,
533 0 : struct sock_filter * seccomp_filter ) {
534 0 : struct sock_fprog program = {
535 0 : .len = seccomp_filter_cnt,
536 0 : .filter = seccomp_filter,
537 0 : };
538 :
539 0 : if( syscall( SYS_seccomp, SECCOMP_SET_MODE_FILTER, 0, &program ) ) FD_LOG_ERR(( "seccomp() failed (%i-%s)", errno, fd_io_strerror( errno ) ) );
540 0 : }
541 :
542 : ulong
543 0 : fd_sandbox_private_read_cap_last_cap( void ) {
544 0 : int fd = open( "/proc/sys/kernel/cap_last_cap", O_RDONLY );
545 0 : if( -1==fd ) FD_LOG_ERR(( "open(/proc/sys/kernel/cap_last_cap) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
546 0 : char buf[ 16 ] = {0};
547 0 : long count = read( fd, buf, sizeof( buf ) );
548 0 : if( -1L==count ) FD_LOG_ERR(( "read(/proc/sys/kernel/cap_last_cap) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
549 0 : if( (ulong)count>=sizeof( buf ) ) FD_LOG_ERR(( "read(/proc/sys/kernel/cap_last_cap) returned truncated data" ));
550 0 : if( 0L!=read( fd, buf, sizeof( buf ) ) ) FD_LOG_ERR(( "read(/proc/sys/kernel/cap_last_cap) did not return all the data" ));
551 :
552 0 : char * end;
553 0 : ulong cap_last_cap = strtoul( buf, &end, 10 );
554 0 : if( *end!='\n' ) FD_LOG_ERR(( "read(/proc/sys/kernel/cap_last_cap) returned malformed data" ));
555 0 : if( close( fd ) ) FD_LOG_ERR(( "close(/proc/sys/kernel/cap_last_cap) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
556 0 : if( !cap_last_cap || cap_last_cap>128 ) FD_LOG_ERR(( "read(/proc/sys/kernel/cap_last_cap) returned invalid data" ));
557 :
558 0 : return cap_last_cap;
559 0 : }
560 :
561 : void
562 : fd_sandbox_private_enter_no_seccomp( uint desired_uid,
563 : uint desired_gid,
564 : int keep_host_networking,
565 : int keep_controlling_terminal,
566 : ulong rlimit_file_cnt,
567 : ulong allowed_file_descriptor_cnt,
568 0 : int const * allowed_file_descriptor ) {
569 : /* Read the highest capability index on the currently running kernel
570 : from /proc */
571 0 : ulong cap_last_cap = fd_sandbox_private_read_cap_last_cap();
572 :
573 : /* The ordering here is quite delicate and should be preserved ...
574 :
575 : | Action | Must happen before | Reason
576 : |------------------------|-----------------------------|-------------------------------------
577 : | Check file descriptors | Pivot root | Requires access to /proc filesystem
578 : | Clear groups | Unshare namespaces | Cannot call setgroups(2) in user namespace
579 : | Unshare namespaces | Pivot root | Pivot root requires CAP_SYS_ADMIN
580 : | Pivot root | Drop caps | Requires CAP_SYS_ADMIN
581 : | Pivot root | Landlock | Accesses the filesystem
582 : | Landlock | Set resource limits | Creates a file descriptor
583 : | Set resource limits | Drop caps | Requires CAP_SYS_RESOURCE */
584 0 : fd_sandbox_private_explicit_clear_environment_variables();
585 0 : fd_sandbox_private_check_exact_file_descriptors( allowed_file_descriptor_cnt, allowed_file_descriptor );
586 :
587 : /* Dropping groups can increase privileges to resources that deny
588 : certain groups so don't do that, just check that we have no
589 : supplementary group IDs. */
590 0 : int getgroups_cnt = getgroups( 0UL, NULL );
591 0 : if( -1==getgroups_cnt ) FD_LOG_ERR(( "getgroups() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
592 0 : if( getgroups_cnt>1 ) FD_LOG_WARNING(( "getgroups() returned multiple supplementary groups (%d), run `id` to see them. "
593 0 : "Continuing, but it is suggested to run Firedancer with a sandbox user that has as few permissions as possible.", getgroups_cnt ));
594 :
595 : /* Replace the session keyring in the process with a new
596 : anonymous one, in case the systemd or other launcher
597 : provided us with something by mistake. */
598 0 : if( -1==syscall( SYS_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL ) ) FD_LOG_ERR(( "syscall(SYS_keyctl) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
599 :
600 : /* Detach from the controlling terminal to prevent TIOCSTI type of
601 : escapes. See https://github.com/containers/bubblewrap/issues/142 */
602 0 : if( !keep_controlling_terminal ) {
603 0 : if( -1==setsid() ) FD_LOG_ERR(( "setsid() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
604 0 : }
605 :
606 : /* Certain Linux kernels are configured to not allow user namespaces
607 : from an unprivileged process, since it's a common security exploit
608 : vector. You can still make the namespace if you have CAP_SYS_ADMIN
609 : so we need to make sure to carry this through the switch_uid_gid
610 : which would drop all capabilities by default. */
611 0 : int userns_requires_cap_sys_admin = fd_sandbox_requires_cap_sys_admin( desired_uid, desired_gid );
612 0 : if( userns_requires_cap_sys_admin ) {
613 0 : if( -1==prctl( PR_SET_KEEPCAPS, 1 ) ) FD_LOG_ERR(( "prctl(PR_SET_KEEPCAPS, 1) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
614 0 : }
615 0 : fd_sandbox_private_switch_uid_gid( desired_uid, desired_gid );
616 :
617 : /* Now raise CAP_SYS_ADMIN again after we switched UID/GID, if it's
618 : required to create the user namespace. */
619 0 : if( userns_requires_cap_sys_admin ) {
620 0 : struct __user_cap_header_struct capheader;
621 0 : capheader.version = _LINUX_CAPABILITY_VERSION_3;
622 0 : capheader.pid = 0;
623 0 : struct __user_cap_data_struct capdata[2] = { {0} };
624 0 : if( -1==syscall( SYS_capget, &capheader, capdata ) ) FD_LOG_ERR(( "syscall(SYS_capget) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
625 0 : capdata[ CAP_TO_INDEX( CAP_SYS_ADMIN ) ].effective |= CAP_TO_MASK( CAP_SYS_ADMIN );
626 0 : if( -1==syscall( SYS_capset, &capheader, capdata ) ) FD_LOG_ERR(( "syscall(SYS_capset) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
627 0 : }
628 :
629 : /* Now unshare the user namespace, disallow creating any more
630 : namespaces except one child user namespace, and then create the
631 : child user namespace so that the sandbox can't undo the change. */
632 0 : if( -1==unshare( CLONE_NEWUSER ) ) FD_LOG_ERR(( "unshare(CLONE_NEWUSER) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
633 0 : fd_sandbox_private_write_userns_uid_gid_maps( desired_uid, desired_gid );
634 :
635 : /* Unshare everything in the parent user namespace, so that the nested
636 : user namespace does not have privileges over them. */
637 0 : int flags = CLONE_NEWNS | CLONE_NEWCGROUP | CLONE_NEWIPC | CLONE_NEWUTS;
638 0 : if( !keep_host_networking ) flags |= CLONE_NEWNET;
639 :
640 0 : if( -1==unshare( flags ) ) FD_LOG_ERR(( "unshare(CLONE_NEWNS | CLONE_NEWNET | CLONE_NEWCGROUP | CLONE_NEWIPC | CLONE_NEWUTS) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
641 :
642 0 : fd_sandbox_private_deny_namespaces();
643 :
644 0 : if( -1==unshare( CLONE_NEWUSER ) ) FD_LOG_ERR(( "unshare(CLONE_NEWUSER) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
645 0 : fd_sandbox_private_write_userns_uid_gid_maps( 1, 1 );
646 :
647 : /* PR_SET_KEEPCAPS will already be 0 if we didn't need to raise
648 : CAP_SYS_ADMIN, but we always clear it anyway. */
649 0 : if( -1==prctl( PR_SET_KEEPCAPS, 0 ) ) FD_LOG_ERR(( "prctl(PR_SET_KEEPCAPS, 0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
650 0 : if( -1==prctl( PR_SET_DUMPABLE, 0 ) ) FD_LOG_ERR(( "prctl(PR_SET_DUMPABLE, 0) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
651 :
652 : /* Now remount the filesystem root so no files are accessible any more. */
653 0 : fd_sandbox_private_pivot_root();
654 :
655 : /* Add an empty landlock restriction to further prevent filesystem
656 : access. */
657 0 : fd_sandbox_private_landlock_restrict_self();
658 :
659 : /* And trim all the resource limits down to zero. */
660 0 : fd_sandbox_private_set_rlimits( rlimit_file_cnt );
661 :
662 : /* And drop all the capabilities we have in the new user namespace. */
663 0 : fd_sandbox_private_drop_caps( cap_last_cap );
664 :
665 0 : if( -1==prctl( PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0 ) ) FD_LOG_ERR(( "prctl(PR_SET_NO_NEW_PRIVS, 1) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
666 0 : }
667 :
668 : void
669 : fd_sandbox_enter( uint desired_uid,
670 : uint desired_gid,
671 : int keep_host_networking,
672 : int keep_controlling_terminal,
673 : ulong rlimit_file_cnt,
674 : ulong allowed_file_descriptor_cnt,
675 : int const * allowed_file_descriptor,
676 : ulong seccomp_filter_cnt,
677 0 : struct sock_filter * seccomp_filter ) {
678 0 : if( seccomp_filter_cnt>USHORT_MAX ) FD_LOG_ERR(( "seccomp_filter_cnt must not be more than %d", USHORT_MAX ));
679 :
680 0 : fd_sandbox_private_enter_no_seccomp( desired_uid,
681 0 : desired_gid,
682 0 : keep_host_networking,
683 0 : keep_controlling_terminal,
684 0 : rlimit_file_cnt,
685 0 : allowed_file_descriptor_cnt,
686 0 : allowed_file_descriptor );
687 :
688 0 : FD_LOG_INFO(( "sandbox: full sandbox is being enabled" )); /* log before seccomp in-case logging not allowed in sandbox */
689 :
690 : /* Now finally install the seccomp-bpf filter. */
691 0 : fd_sandbox_private_set_seccomp_filter( (ushort)seccomp_filter_cnt, seccomp_filter );
692 0 : }
693 :
694 : void
695 : fd_sandbox_switch_uid_gid( uint desired_uid,
696 0 : uint desired_gid ) {
697 0 : fd_sandbox_private_switch_uid_gid( desired_uid, desired_gid );
698 0 : FD_LOG_INFO(( "sandbox: sandbox disabled" ));
699 0 : }
700 :
701 : ulong
702 0 : fd_sandbox_getpid( void ) {
703 0 : char pid[ 11 ] = {0}; /* 10 characters for INT_MAX, and then a NUL terminator. */
704 0 : long count = readlink( "/proc/self", pid, sizeof(pid) );
705 0 : if( -1L==count ) FD_LOG_ERR(( "readlink(/proc/self) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
706 0 : if( (ulong)count>=sizeof(pid) ) FD_LOG_ERR(( "readlink(/proc/self) returned truncated pid" ));
707 :
708 0 : char * endptr;
709 0 : ulong result = strtoul( pid, &endptr, 10 );
710 : /* A pid > INT_MAX is malformed, even if we can represent it in the
711 : ulong we are returning. */
712 0 : if( *endptr!='\0' || result>INT_MAX ) FD_LOG_ERR(( "strtoul(/proc/self) returned invalid pid" ));
713 :
714 0 : return result;
715 0 : }
716 :
717 : ulong
718 0 : fd_sandbox_gettid( void ) {
719 0 : char tid[ 27 ] = {0}; /* 10 characters for INT_MAX, twice, + /task/ and then a NUL terminator. */
720 0 : long count = readlink( "/proc/thread-self", tid, sizeof(tid) );
721 0 : if( count<0L ) FD_LOG_ERR(( "readlink(/proc/thread-self) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
722 0 : if( (ulong)count>=sizeof(tid) ) FD_LOG_ERR(( "readlink(/proc/thread-self) returned truncated tid" ));
723 :
724 0 : char * taskstr = strchr( tid, '/' );
725 0 : if( !taskstr ) FD_LOG_ERR(( "readlink(/proc/thread-self) returned invalid tid" ));
726 0 : taskstr++;
727 :
728 0 : char * task = strchr( taskstr, '/' );
729 0 : if( !task ) FD_LOG_ERR(( "readlink(/proc/thread-self) returned invalid tid" ));
730 :
731 0 : char * endptr;
732 0 : ulong result = strtoul( task+1UL, &endptr, 10 );
733 : /* A tid > INT_MAX is malformed, even if we can represent it in the
734 : ulong we are returning. */
735 0 : if( *endptr!='\0' || result>INT_MAX ) FD_LOG_ERR(( "strtoul(/proc/self) returned invalid tid" ));
736 :
737 0 : return result;
738 0 : }
|