Line data Source code
1 : #define _GNU_SOURCE
2 : #include "fd_topo.h"
3 :
4 : #include "../metrics/fd_metrics.h"
5 : #include "../../waltz/xdp/fd_xdp1.h"
6 : #include "../../util/tile/fd_tile_private.h"
7 :
8 : #include <unistd.h>
9 : #include <signal.h>
10 : #include <errno.h>
11 : #include <pthread.h>
12 : #include <sys/syscall.h>
13 : #include <linux/futex.h>
14 : #include <sys/resource.h>
15 : #include <sys/prctl.h>
16 : #include <sys/stat.h>
17 : #include <sys/mman.h>
18 : #include <net/if.h>
19 :
20 : static void
21 : initialize_logging( char const * tile_name,
22 : ulong tile_kind_id,
23 : ulong pid,
24 0 : ulong tid ) {
25 0 : fd_log_cpu_set( NULL );
26 0 : fd_log_private_tid_set( pid );
27 0 : char thread_name[ 20 ];
28 0 : FD_TEST( fd_cstr_printf_check( thread_name, sizeof( thread_name ), NULL, "%s:%lu", tile_name, tile_kind_id ) );
29 0 : fd_log_thread_set( thread_name );
30 0 : fd_log_private_stack_discover( FD_TILE_PRIVATE_STACK_SZ,
31 0 : &fd_tile_private_stack0, &fd_tile_private_stack1 );
32 0 : FD_LOG_NOTICE(( "booting tile %s pid:%lu tid:%lu", thread_name, fd_log_group_id(), tid ));
33 0 : }
34 :
35 : static void
36 : check_wait_debugger( ulong pid,
37 : volatile int * wait,
38 0 : volatile int * debugger ) {
39 0 : if( FD_UNLIKELY( debugger ) ) {
40 0 : FD_LOG_WARNING(( "waiting for debugger to attach to tile pid:%lu", pid ));
41 0 : if( FD_UNLIKELY( -1==kill( getpid(), SIGSTOP ) ) )
42 0 : FD_LOG_ERR(( "kill(SIGSTOP) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
43 0 : *FD_VOLATILE( debugger ) = 1;
44 0 : }
45 :
46 0 : if( FD_UNLIKELY( wait ) ) {
47 0 : while( FD_LIKELY( !*FD_VOLATILE( wait ) ) ) FD_SPIN_PAUSE();
48 0 : }
49 0 : }
50 :
51 : void
52 : fd_topo_run_tile( fd_topo_t * topo,
53 : fd_topo_tile_t * tile,
54 : int sandbox,
55 : int keep_controlling_terminal,
56 : int dumpable,
57 : uint uid,
58 : uint gid,
59 : int allow_fd,
60 : volatile int * wait,
61 : volatile int * debugger,
62 0 : fd_topo_run_tile_t * tile_run ) {
63 0 : char thread_name[ 20 ];
64 0 : FD_TEST( fd_cstr_printf_check( thread_name, sizeof( thread_name ), NULL, "%s:%lu", tile->name, tile->kind_id ) );
65 0 : if( FD_UNLIKELY( prctl( PR_SET_NAME, thread_name, 0, 0, 0 ) ) ) FD_LOG_ERR(( "prctl(PR_SET_NAME) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
66 :
67 0 : ulong pid = fd_sandbox_getpid(); /* Need to read /proc again.. we got a new PID from clone */
68 0 : ulong tid = fd_sandbox_gettid(); /* Need to read /proc again.. we got a new TID from clone */
69 :
70 0 : check_wait_debugger( pid, wait, debugger );
71 0 : initialize_logging( tile->name, tile->kind_id, pid, tid );
72 :
73 : /* preload shared memory before sandboxing, so it is already mapped */
74 0 : fd_topo_join_tile_workspaces( topo, tile );
75 :
76 0 : if( FD_UNLIKELY( tile_run->privileged_init ) )
77 0 : tile_run->privileged_init( topo, tile );
78 :
79 0 : ulong allow_fds_offset = 0UL;
80 0 : int allow_fds[ 256 ] = { 0 };
81 0 : if( FD_LIKELY( -1!=allow_fd ) ) {
82 0 : allow_fds_offset = 1UL;
83 0 : allow_fds[ 0 ] = allow_fd;
84 0 : }
85 0 : ulong allow_fds_cnt = 0UL;
86 0 : if( FD_LIKELY( tile_run->populate_allowed_fds ) ) {
87 0 : allow_fds_cnt = tile_run->populate_allowed_fds( topo,
88 0 : tile,
89 0 : (sizeof(allow_fds)/sizeof(allow_fds[ 0 ]))-allow_fds_offset,
90 0 : allow_fds+allow_fds_offset );
91 0 : }
92 :
93 :
94 0 : struct sock_filter seccomp_filter[ 128UL ];
95 0 : ulong seccomp_filter_cnt = 0UL;
96 0 : if( FD_LIKELY( tile_run->populate_allowed_seccomp ) ) {
97 0 : seccomp_filter_cnt = tile_run->populate_allowed_seccomp( topo,
98 0 : tile,
99 0 : sizeof(seccomp_filter)/sizeof(seccomp_filter[ 0 ]),
100 0 : seccomp_filter );
101 0 : }
102 :
103 0 : ulong rlimit_file_cnt = tile_run->rlimit_file_cnt;
104 0 : if( tile_run->rlimit_file_cnt_fn ) {
105 0 : rlimit_file_cnt = tile_run->rlimit_file_cnt_fn( topo, tile );
106 0 : }
107 :
108 0 : if( FD_LIKELY( sandbox ) ) {
109 0 : fd_sandbox_enter( uid,
110 0 : gid,
111 0 : tile_run->keep_host_networking,
112 0 : tile_run->allow_connect,
113 0 : keep_controlling_terminal,
114 0 : dumpable,
115 0 : rlimit_file_cnt,
116 0 : tile_run->rlimit_address_space,
117 0 : tile_run->rlimit_data,
118 0 : allow_fds_cnt+allow_fds_offset,
119 0 : allow_fds,
120 0 : seccomp_filter_cnt,
121 0 : seccomp_filter );
122 0 : } else {
123 0 : fd_sandbox_switch_uid_gid( uid, gid );
124 0 : }
125 :
126 : /* Now we are sandboxed, join all the tango IPC objects in the workspaces */
127 0 : fd_topo_fill_tile( topo, tile );
128 :
129 0 : FD_TEST( tile->metrics );
130 0 : fd_metrics_register( tile->metrics );
131 :
132 0 : FD_MGAUGE_SET( TILE, PID, pid );
133 0 : FD_MGAUGE_SET( TILE, TID, tid );
134 :
135 0 : if( FD_UNLIKELY( tile_run->unprivileged_init ) )
136 0 : tile_run->unprivileged_init( topo, tile );
137 :
138 0 : tile_run->run( topo, tile );
139 0 : FD_LOG_ERR(( "tile run loop returned" ));
140 0 : }
141 :
142 : typedef struct {
143 : fd_topo_t * topo;
144 : fd_topo_tile_t * tile;
145 : fd_topo_run_tile_t tile_run;
146 : uint uid;
147 : uint gid;
148 : int * done_futex;
149 : volatile int copied;
150 : } fd_topo_run_thread_args_t;
151 :
152 : static void *
153 0 : run_tile_thread_main( void * _args ) {
154 0 : fd_topo_run_thread_args_t args = *(fd_topo_run_thread_args_t *)_args;
155 0 : FD_COMPILER_MFENCE();
156 0 : ((fd_topo_run_thread_args_t *)_args)->copied = 1;
157 :
158 0 : fd_topo_run_tile( args.topo, args.tile, 0, 1, 1, args.uid, args.gid, -1, NULL, NULL, &args.tile_run );
159 0 : if( FD_UNLIKELY( args.done_futex ) ) {
160 0 : for(;;) {
161 0 : if( FD_LIKELY( INT_MAX==FD_ATOMIC_CAS( args.done_futex, INT_MAX, (int)args.tile->id ) ) ) break;
162 0 : FD_SPIN_PAUSE();
163 0 : }
164 0 : if( FD_UNLIKELY( -1==syscall( SYS_futex, args.done_futex, FUTEX_WAKE, INT_MAX, NULL, NULL, 0 ) ) )
165 0 : FD_LOG_ERR(( "futex(FUTEX_WAKE) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
166 0 : } else {
167 0 : FD_LOG_ERR(( "fd_topo_run_tile() returned" ));
168 0 : }
169 0 : return NULL;
170 0 : }
171 :
172 : void *
173 : fd_topo_tile_stack_join( char const * app_name,
174 : char const * tile_name,
175 0 : ulong tile_kind_id ) {
176 0 : char name[ PATH_MAX ];
177 0 : FD_TEST( fd_cstr_printf_check( name, PATH_MAX, NULL, "%s_stack_%s%lu", app_name, tile_name, tile_kind_id ) );
178 :
179 0 : uchar * stack = fd_shmem_join( name, FD_SHMEM_JOIN_MODE_READ_WRITE, NULL, NULL, NULL );
180 0 : if( FD_UNLIKELY( !stack ) ) FD_LOG_ERR(( "fd_shmem_join failed" ));
181 :
182 : /* Make space for guard lo and guard hi */
183 0 : if( FD_UNLIKELY( fd_shmem_release( stack, FD_SHMEM_HUGE_PAGE_SZ, 1UL ) ) )
184 0 : FD_LOG_ERR(( "fd_shmem_release (%d-%s)", errno, fd_io_strerror( errno ) ));
185 0 : stack += FD_SHMEM_HUGE_PAGE_SZ;
186 0 : if( FD_UNLIKELY( fd_shmem_release( stack + FD_TILE_PRIVATE_STACK_SZ, FD_SHMEM_HUGE_PAGE_SZ, 1UL ) ) )
187 0 : FD_LOG_ERR(( "fd_shmem_release (%d-%s)", errno, fd_io_strerror( errno ) ));
188 :
189 : /* Create the guard regions in the extra space */
190 0 : void * guard_lo = (void *)(stack - FD_SHMEM_NORMAL_PAGE_SZ );
191 0 : if( FD_UNLIKELY( mmap( guard_lo, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
192 0 : MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_lo ) )
193 0 : FD_LOG_ERR(( "mmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
194 :
195 0 : void * guard_hi = (void *)(stack + FD_TILE_PRIVATE_STACK_SZ);
196 0 : if( FD_UNLIKELY( mmap( guard_hi, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
197 0 : MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_hi ) )
198 0 : FD_LOG_ERR(( "mmap failed (%i-%s)", errno, fd_io_strerror( errno ) ));
199 :
200 0 : return stack;
201 0 : }
202 :
203 : fd_xdp_fds_t
204 0 : fd_topo_install_xdp( fd_topo_t const * topo ) {
205 0 : ulong net0_tile_idx = fd_topo_find_tile( topo, "net", 0UL );
206 0 : FD_TEST( net0_tile_idx!=ULONG_MAX );
207 0 : fd_topo_tile_t const * net0_tile = &topo->tiles[ net0_tile_idx ];
208 :
209 0 : ushort udp_port_candidates[] = {
210 0 : (ushort)net0_tile->net.legacy_transaction_listen_port,
211 0 : (ushort)net0_tile->net.quic_transaction_listen_port,
212 0 : (ushort)net0_tile->net.shred_listen_port,
213 0 : (ushort)net0_tile->net.gossip_listen_port,
214 0 : (ushort)net0_tile->net.repair_intake_listen_port,
215 0 : (ushort)net0_tile->net.repair_serve_listen_port,
216 0 : };
217 :
218 0 : uint if_idx = if_nametoindex( net0_tile->net.interface );
219 0 : if( FD_UNLIKELY( !if_idx ) ) FD_LOG_ERR(( "if_nametoindex(%s) failed", net0_tile->net.interface ));
220 :
221 0 : fd_xdp_fds_t xdp_fds = fd_xdp_install( if_idx,
222 0 : sizeof(udp_port_candidates)/sizeof(udp_port_candidates[0]),
223 0 : udp_port_candidates,
224 0 : net0_tile->net.xdp_mode );
225 0 : if( FD_UNLIKELY( -1==dup2( xdp_fds.xsk_map_fd, 123462 ) ) ) FD_LOG_ERR(( "dup2() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
226 0 : if( FD_UNLIKELY( -1==close( xdp_fds.xsk_map_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
227 0 : if( FD_UNLIKELY( -1==dup2( xdp_fds.prog_link_fd, 123463 ) ) ) FD_LOG_ERR(( "dup2() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
228 0 : if( FD_UNLIKELY( -1==close( xdp_fds.prog_link_fd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
229 :
230 0 : xdp_fds.xsk_map_fd = 123462;
231 0 : xdp_fds.prog_link_fd = 123463;
232 :
233 0 : return xdp_fds;
234 0 : }
235 :
236 : static inline void
237 : run_tile_thread( fd_topo_t * topo,
238 : fd_topo_tile_t * tile,
239 : fd_topo_run_tile_t tile_run,
240 : uint uid,
241 : uint gid,
242 : int * done_futex,
243 : fd_cpuset_t const * floating_cpu_set,
244 0 : int floating_priority ) {
245 : /* tpool will assign a thread later */
246 0 : if( FD_UNLIKELY( tile_run.for_tpool ) ) return;
247 0 : void * stack = fd_topo_tile_stack_join( topo->app_name, tile->name, tile->kind_id );
248 :
249 0 : pthread_attr_t attr[ 1 ];
250 0 : if( FD_UNLIKELY( pthread_attr_init( attr ) ) ) FD_LOG_ERR(( "pthread_attr_init() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
251 0 : if( FD_UNLIKELY( pthread_attr_setstack( attr, stack, FD_TILE_PRIVATE_STACK_SZ ) ) ) FD_LOG_ERR(( "pthread_attr_setstacksize() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
252 :
253 0 : FD_CPUSET_DECL( cpu_set );
254 0 : if( FD_LIKELY( tile->cpu_idx<65535UL ) ) {
255 : /* set the thread affinity before we clone the new process to ensure
256 : kernel first touch happens on the desired thread. */
257 0 : fd_cpuset_insert( cpu_set, tile->cpu_idx );
258 0 : if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, -19 ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
259 0 : } else {
260 0 : fd_memcpy( cpu_set, floating_cpu_set, fd_cpuset_footprint() );
261 0 : if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, floating_priority ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
262 0 : }
263 :
264 0 : if( FD_UNLIKELY( fd_cpuset_setaffinity( 0, cpu_set ) ) ) {
265 0 : if( FD_LIKELY( errno==EINVAL ) ) {
266 0 : FD_LOG_ERR(( "Unable to set the thread affinity for tile %s:%lu on cpu %lu. It is likely that the affinity "
267 0 : "you have specified for this tile in [layout.affinity] of your configuration file contains a "
268 0 : "CPU (%lu) which does not exist on this machine.",
269 0 : tile->name, tile->kind_id, tile->cpu_idx, tile->cpu_idx ));
270 0 : } else {
271 0 : FD_LOG_ERR(( "sched_setaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
272 0 : }
273 0 : }
274 :
275 0 : fd_topo_run_thread_args_t args = {
276 0 : .topo = topo,
277 0 : .tile = tile,
278 0 : .tile_run = tile_run,
279 0 : .uid = uid,
280 0 : .gid = gid,
281 0 : .done_futex = done_futex,
282 0 : .copied = 0,
283 0 : };
284 :
285 0 : pthread_t pthread;
286 0 : if( FD_UNLIKELY( pthread_create( &pthread, attr, run_tile_thread_main, &args ) ) ) FD_LOG_ERR(( "pthread_create() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
287 :
288 0 : while( !FD_VOLATILE( args.copied ) ) FD_SPIN_PAUSE();
289 0 : }
290 :
291 : void
292 : fd_topo_run_single_process( fd_topo_t * topo,
293 : int agave,
294 : uint uid,
295 : uint gid,
296 : fd_topo_run_tile_t (* tile_run )( fd_topo_tile_t const * tile ),
297 0 : int * done_futex ) {
298 : /* Save the current affinity, it will be restored after creating any child tiles */
299 0 : FD_CPUSET_DECL( floating_cpu_set );
300 0 : if( FD_UNLIKELY( fd_cpuset_getaffinity( 0, floating_cpu_set ) ) )
301 0 : FD_LOG_ERR(( "sched_getaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
302 :
303 0 : errno = 0;
304 0 : int save_priority = getpriority( PRIO_PROCESS, 0 );
305 0 : if( FD_UNLIKELY( -1==save_priority && errno ) ) FD_LOG_ERR(( "getpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
306 :
307 0 : for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
308 0 : fd_topo_tile_t * tile = &topo->tiles[ i ];
309 0 : if( !agave && tile->is_agave ) continue;
310 0 : if( agave==1 && !tile->is_agave ) continue;
311 :
312 0 : fd_topo_run_tile_t run_tile = tile_run( tile );
313 0 : run_tile_thread( topo, tile, run_tile, uid, gid, done_futex, floating_cpu_set, save_priority );
314 0 : }
315 :
316 0 : fd_sandbox_switch_uid_gid( uid, gid );
317 :
318 0 : if( FD_UNLIKELY( -1==setpriority( PRIO_PROCESS, 0, save_priority ) ) ) FD_LOG_ERR(( "setpriority() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
319 0 : if( FD_UNLIKELY( fd_cpuset_setaffinity( 0, floating_cpu_set ) ) )
320 0 : FD_LOG_ERR(( "sched_setaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) ));
321 0 : }
|