Line data Source code
#ifndef _GNU_SOURCE /* g++ seems to already define this on the command line somehow */
2 : #define _GNU_SOURCE
3 : #endif
4 :
5 : #include <ctype.h>
6 : #include <errno.h>
7 : #include <pthread.h>
8 : #include <unistd.h>
9 : #include <sched.h>
10 : #include <syscall.h>
11 : #include <sys/resource.h>
12 : #include <sys/mman.h>
13 : #include <sys/prctl.h>
14 :
15 : #include "../sanitize/fd_sanitize.h"
16 : #include "fd_tile_private.h"
17 :
18 : /* Operating system shims ********************************************/
19 :
/* fd_tile_private_cpu_config_t holds the scheduler state saved by
   fd_tile_private_cpu_config so that fd_tile_private_cpu_restore can
   undo it later.  prio is the priority to restore; INT_MIN is used by
   those functions to mean "nothing to restore". */

typedef struct fd_tile_private_cpu_config {
  int prio; /* Saved scheduler priority, INT_MIN if none was saved */
} fd_tile_private_cpu_config_t;
25 :
26 : /* Configure the CPU optimally */
27 :
28 : static inline void
29 : fd_tile_private_cpu_config( fd_tile_private_cpu_config_t * save,
30 1188 : ulong cpu_idx ) {
31 :
32 : /* If a floating tile, leave scheduler priority unchanged from however
33 : the thread group launcher configured it. */
34 :
35 1188 : if( cpu_idx==65535UL ) {
36 1041 : save->prio = INT_MIN;
37 1041 : return;
38 1041 : }
39 :
40 : /* Otherwise, configure high scheduler priority */
41 :
42 147 : errno = 0;
43 147 : int prio = getpriority( PRIO_PROCESS, (id_t)0 );
44 147 : if( prio==-1 && errno ) {
45 0 : FD_LOG_WARNING(( "fd_tile: getpriority failed (%i-%s)\n\t"
46 0 : "Unable to determine initial tile priority so not configuring the tile\n\t"
47 0 : "for high scheduler priority. Attempting to continue but this thread\n\t"
48 0 : "group's performance and stability might be compromised. Probably should\n\t"
49 0 : "configure 'ulimit -e 39' (or 40 and this might require adjusting\n\t"
50 0 : "/etc/security/limits.conf as superuser to nice -19 or -20 for this user)\n\t"
51 0 : "to eliminate this warning. Also consider starting this thread group\n\t"
52 0 : "with 'nice --19'.",
53 0 : errno, fd_io_strerror( errno ) ));
54 0 : save->prio = INT_MIN;
55 0 : }
56 :
57 147 : if( FD_UNLIKELY( prio!=-19 ) && FD_UNLIKELY( setpriority( PRIO_PROCESS, (id_t)0, -19 ) ) ) {
58 147 : FD_LOG_WARNING(( "fd_tile: setpriority failed (%i-%s)\n\t"
59 147 : "Unable to configure this tile for high scheduler priority. Attempting\n\t"
60 147 : "to continue but this thread group's performance and stability might be\n\t"
61 147 : "compromised. Probably should configure 'ulimit -e 39' (or 40 and this\n\t"
62 147 : "might require adjusting /etc/security/limits.conf to nice -19 or -20\n\t"
63 147 : "for this user) to eliminate this warning. Also consider starting this\n\t"
64 147 : "thread group with 'nice --19'.",
65 147 : errno, fd_io_strerror( errno ) ));
66 147 : save->prio = INT_MIN;
67 147 : return;
68 147 : }
69 :
70 0 : save->prio = prio;
71 0 : }
72 :
73 : /* Restore the CPU to the given state */
74 :
75 : static inline void
76 1140 : fd_tile_private_cpu_restore( fd_tile_private_cpu_config_t * save ) {
77 1140 : int prio = save->prio;
78 1140 : if( FD_LIKELY( prio!=INT_MIN ) && FD_UNLIKELY( prio!=-19 ) && FD_UNLIKELY( setpriority( PRIO_PROCESS, (id_t)0, prio ) ) )
79 0 : FD_LOG_WARNING(( "fd_tile: setpriority failed (%i-%s); attempting to continue", errno, fd_io_strerror( errno ) ));
80 1140 : }
81 :
82 : void *
83 : fd_tile_private_stack_new( int optimize,
84 36 : ulong cpu_idx ) { /* Ignored if optimize is not requested */
85 :
86 36 : uchar * stack = NULL;
87 :
88 36 : if( optimize ) { /* Create a NUMA and TLB optimized stack for a tile running on cpu cpu_idx */
89 :
90 36 : stack = (uchar *)
91 36 : fd_shmem_acquire( FD_SHMEM_HUGE_PAGE_SZ, (FD_TILE_PRIVATE_STACK_SZ/FD_SHMEM_HUGE_PAGE_SZ)+2UL, cpu_idx ); /* logs details */
92 :
93 36 : if( FD_LIKELY( stack ) ) { /* Make space for guard lo and guard hi */
94 :
95 15 : fd_shmem_release( stack, FD_SHMEM_HUGE_PAGE_SZ, 1UL );
96 :
97 15 : stack += FD_SHMEM_HUGE_PAGE_SZ;
98 :
99 15 : fd_shmem_release( stack + FD_TILE_PRIVATE_STACK_SZ, FD_SHMEM_HUGE_PAGE_SZ, 1UL );
100 :
101 21 : } else {
102 :
103 21 : ulong numa_idx = fd_shmem_numa_idx( cpu_idx );
104 21 : static ulong warn = 0UL;
105 21 : if( FD_LIKELY( !(warn & (1UL<<numa_idx) ) ) ) {
106 6 : FD_LOG_WARNING(( "fd_tile: fd_shmem_acquire failed\n\t"
107 6 : "There are probably not enough huge pages allocated by the OS on numa\n\t"
108 6 : "node %lu. Falling back on normal page backed stack for tile on cpu %lu\n\t"
109 6 : "and attempting to continue. Run:\n\t"
110 6 : "\techo [CNT] > /sys/devices/system/node/node%lu/hugepages/hugepages-2048kB/nr_hugepages\n\t"
111 6 : "(probably as superuser) or equivalent where [CNT] is a sufficient number\n\t"
112 6 : "huge pages to reserve on this numa node system wide and/or adjust\n\t"
113 6 : "/etc/security/limits.conf to permit this user to lock a sufficient\n\t"
114 6 : "amount of memory to eliminate this warning.",
115 6 : numa_idx, cpu_idx, numa_idx ));
116 6 : warn |= 1UL<<numa_idx;
117 6 : }
118 :
119 21 : }
120 :
121 36 : }
122 :
123 36 : if( !stack ) { /* Request for a non-optimized stack (or optimized stack creation failed above and we are falling back) */
124 :
125 21 : ulong mmap_sz = FD_TILE_PRIVATE_STACK_SZ + 2UL*FD_SHMEM_NORMAL_PAGE_SZ;
126 21 : stack = (uchar *)mmap( NULL, mmap_sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, (off_t)0 );
127 :
128 21 : if( FD_LIKELY( ((void *)stack)!=MAP_FAILED ) ) { /* Make space for guard lo and guard hi */
129 :
130 21 : if( FD_UNLIKELY( munmap( stack, FD_SHMEM_NORMAL_PAGE_SZ ) ) )
131 0 : FD_LOG_WARNING(( "fd_tile: munmap failed (%i-%s); attempting to continue", errno, fd_io_strerror( errno ) ));
132 :
133 21 : stack += FD_SHMEM_NORMAL_PAGE_SZ;
134 :
135 21 : if( FD_UNLIKELY( munmap( stack + FD_TILE_PRIVATE_STACK_SZ, FD_SHMEM_NORMAL_PAGE_SZ ) ) )
136 0 : FD_LOG_WARNING(( "fd_tile: munmap failed (%i-%s); attempting to continue", errno, fd_io_strerror( errno ) ));
137 :
138 21 : } else {
139 :
140 0 : FD_LOG_WARNING(( "fd_tile: mmap(NULL,%lu KiB,PROT_READ|PROT_WRITE,MAP_PRIVATE|MAP_ANONYMOUS,-1,0) failed (%i-%s)\n\t"
141 0 : "Falling back on pthreads created stack and attempting to continue.",
142 0 : mmap_sz >> 10, errno, fd_io_strerror( errno ) ));
143 0 : return NULL;
144 :
145 0 : }
146 :
147 21 : }
148 :
149 : /* Create the guard regions in the extra space */
150 :
151 36 : void * guard_lo = (void *)(stack - FD_SHMEM_NORMAL_PAGE_SZ );
152 36 : if( FD_UNLIKELY( mmap( guard_lo, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
153 36 : MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_lo ) )
154 0 : FD_LOG_WARNING(( "fd_tile: mmap failed (%i-%s)\n\tAttempting to continue without stack guard lo.",
155 36 : errno, fd_io_strerror( errno ) ));
156 :
157 36 : void * guard_hi = (void *)(stack + FD_TILE_PRIVATE_STACK_SZ);
158 36 : if( FD_UNLIKELY( mmap( guard_hi, FD_SHMEM_NORMAL_PAGE_SZ, PROT_NONE,
159 36 : MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, (off_t)0 )!=guard_hi ) )
160 0 : FD_LOG_WARNING(( "fd_tile: mmap failed (%i-%s)\n\tAttempting to continue without stack guard hi.",
161 36 : errno, fd_io_strerror( errno ) ));
162 :
163 36 : return stack;
164 36 : }
165 :
166 : static void
167 36 : fd_tile_private_stack_delete( void * _stack ) {
168 36 : if( FD_UNLIKELY( !_stack ) ) return;
169 :
170 36 : uchar * stack = (uchar *)_stack;
171 36 : uchar * guard_lo = stack - FD_SHMEM_NORMAL_PAGE_SZ;
172 36 : uchar * guard_hi = stack + FD_TILE_PRIVATE_STACK_SZ;
173 :
174 36 : if( FD_UNLIKELY( munmap( guard_hi, FD_SHMEM_NORMAL_PAGE_SZ ) ) )
175 0 : FD_LOG_WARNING(( "fd_tile: munmap failed (%i-%s); attempting to continue", errno, fd_io_strerror( errno ) ));
176 :
177 36 : if( FD_UNLIKELY( munmap( guard_lo, FD_SHMEM_NORMAL_PAGE_SZ ) ) )
178 0 : FD_LOG_WARNING(( "fd_tile: munmap failed (%i-%s); attempting to continue", errno, fd_io_strerror( errno ) ));
179 :
180 : /* Note that fd_shmem_release is just a wrapper around munmap such
181 : that this covers both the optimized and non-optimized cases */
182 :
183 36 : if( FD_UNLIKELY( munmap( stack, FD_TILE_PRIVATE_STACK_SZ ) ) )
184 0 : FD_LOG_WARNING(( "fd_tile: munmap failed (%i-%s); attempting to continue", errno, fd_io_strerror( errno ) ));
185 36 : }
186 :
187 : /* Tile side APIs ****************************************************/
188 :
189 : static ulong fd_tile_private_id0; /* Zeroed at app start, initialized by the boot manager */
190 : static ulong fd_tile_private_id1; /* " */
191 : static ulong fd_tile_private_cnt; /* " */
192 :
193 78 : ulong fd_tile_id0( void ) { return fd_tile_private_id0; }
194 36 : ulong fd_tile_id1( void ) { return fd_tile_private_id1; }
195 110430 : ulong fd_tile_cnt( void ) { return fd_tile_private_cnt; }
196 :
197 : static FD_TL ulong fd_tile_private_id; /* Zeroed at app/thread start, initialized by the boot / tile manager */
198 : static FD_TL ulong fd_tile_private_idx; /* " */
199 : /**/ FD_TL ulong fd_tile_private_stack0; /* " */
200 : /**/ FD_TL ulong fd_tile_private_stack1; /* " */
201 :
202 54 : ulong fd_tile_id ( void ) { return fd_tile_private_id; }
203 32001105 : ulong fd_tile_idx( void ) { return fd_tile_private_idx; }
204 :
205 : static ushort fd_tile_private_cpu_id[ FD_TILE_MAX ]; /* Zeroed at app start, initialized by boot */
206 :
207 : ulong
208 168 : fd_tile_cpu_id( ulong tile_idx ) {
209 168 : if( FD_UNLIKELY( tile_idx>=fd_tile_private_cnt ) ) return ULONG_MAX;
210 168 : ulong cpu_idx = (ulong)fd_tile_private_cpu_id[ tile_idx ];
211 168 : return fd_ulong_if( cpu_idx<65535UL, cpu_idx, ULONG_MAX-1UL );
212 168 : }
213 :
214 : /* This is used for the OS services to communicate information with the
215 : tile managers */
216 :
217 19490166 : #define FD_TILE_PRIVATE_STATE_BOOT (0) /* Tile is booting */
218 189 : #define FD_TILE_PRIVATE_STATE_IDLE (1) /* Tile is idle */
219 105 : #define FD_TILE_PRIVATE_STATE_EXEC (2) /* Tile is executing a task */
220 36 : #define FD_TILE_PRIVATE_STATE_HALT (3) /* Tile is halting */
221 :
222 : struct __attribute__((aligned(128))) fd_tile_private { /* Double cache line aligned to avoid aclpf false sharing */
223 : ulong id;
224 : ulong idx;
225 : int state; /* FD_TILE_PRIVATE_STATE_* */
226 : int argc;
227 : char ** argv;
228 : fd_tile_task_t task;
229 : char const * fail;
230 : int ret;
231 : };
232 :
233 : typedef struct fd_tile_private fd_tile_private_t;
234 :
235 : struct fd_tile_private_manager_args {
236 : ulong id;
237 : ulong idx;
238 : ulong cpu_idx;
239 : void * stack; /* NULL if pthread created, non-NULL if user created */
240 : ulong stack_sz;
241 : fd_tile_private_t * tile;
242 : };
243 :
244 : typedef struct fd_tile_private_manager_args fd_tile_private_manager_args_t;
245 :
246 : static void *
247 36 : fd_tile_private_manager( void * _args ) {
248 36 : fd_tile_private_manager_args_t * args = (fd_tile_private_manager_args_t *)_args;
249 :
250 : # if !__GLIBC__
251 : if( args->cpu_idx<65535UL ) {
252 : FD_CPUSET_DECL( cpu_set );
253 : fd_cpuset_insert( cpu_set, args->cpu_idx );
254 : int err = fd_cpuset_setaffinity( (pid_t)0, cpu_set );
255 : if( FD_UNLIKELY( err ) )
256 : FD_LOG_WARNING(( "fd_tile: fd_cpuset_setaffinity_failed (%i-%s)\n\t"
257 : "Unable to set the thread affinity for tile %lu to cpu %lu. Attempting to\n\t"
258 : "continue without explicitly specifying this tile's cpu affinity but it\n\t"
259 : "is likely this thread group's performance and stability are compromised\n\t"
260 : "(possibly catastrophically so). Update --tile-cpus to specify a set of\n\t"
261 : "allowed cpus that have been reserved for this thread group on this host\n\t"
262 : "to eliminate this warning.", err, fd_io_strerror( err ), args->idx, args->cpu_idx ));
263 : }
264 : # endif /* !__GLIBC__ */
265 :
266 36 : ulong id = args->id;
267 36 : ulong idx = args->idx;
268 36 : void * stack = args->stack;
269 36 : ulong stack_sz = args->stack_sz;
270 :
271 36 : char thread_name[ 20 ];
272 36 : FD_TEST( fd_cstr_printf_check( thread_name, sizeof( thread_name ), NULL, "tile:%lu", idx ) );
273 36 : if( FD_UNLIKELY( prctl( PR_SET_NAME, thread_name, 0, 0, 0 ) ) ) FD_LOG_ERR(( "prctl(PR_SET_NAME) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
274 :
275 36 : if( FD_UNLIKELY( !( (id ==fd_log_thread_id() ) &
276 36 : (idx==(id-fd_tile_private_id0) ) &
277 36 : ((fd_tile_private_id0<id) & (id<fd_tile_private_id1) ) &
278 36 : (fd_tile_private_cnt==(fd_tile_private_id1-fd_tile_private_id0)) ) ) )
279 0 : FD_LOG_ERR(( "fd_tile: internal error (unexpected thread identifiers)" ));
280 :
281 36 : fd_tile_private_t tile[1];
282 36 : FD_VOLATILE( tile->id ) = id;
283 36 : FD_VOLATILE( tile->idx ) = idx;
284 36 : FD_VOLATILE( tile->state ) = FD_TILE_PRIVATE_STATE_BOOT;
285 36 : FD_VOLATILE( tile->argc ) = 0;
286 36 : FD_VOLATILE( tile->argv ) = NULL;
287 36 : FD_VOLATILE( tile->task ) = NULL;
288 36 : FD_VOLATILE( tile->fail ) = NULL;
289 36 : FD_VOLATILE( tile->ret ) = 0;
290 :
291 : /* state is BOOT ... configure the tile, transition to IDLE and then
292 : start polling for tasks */
293 :
294 36 : fd_tile_private_id = id;
295 36 : fd_tile_private_idx = idx;
296 :
297 36 : if( FD_LIKELY( stack ) ) { /* User provided stack */
298 36 : fd_tile_private_stack0 = (ulong)stack;
299 36 : fd_tile_private_stack1 = (ulong)stack + stack_sz;
300 36 : } else { /* Pthread provided stack */
301 0 : fd_log_private_stack_discover( stack_sz, &fd_tile_private_stack0, &fd_tile_private_stack1 ); /* logs details */
302 0 : if( FD_UNLIKELY( !fd_tile_private_stack0 ) )
303 0 : FD_LOG_WARNING(( "stack diagnostics not available on this tile; attempting to continue" ));
304 0 : }
305 :
306 36 : fd_tile_private_cpu_config_t dummy[1];
307 36 : fd_tile_private_cpu_config( dummy, args->cpu_idx );
308 :
309 36 : ulong app_id = fd_log_app_id();
310 36 : FD_LOG_INFO(( "fd_tile: boot tile %lu success (thread %lu:%lu in thread group %lu:%lu/%lu)",
311 36 : idx, app_id, id, app_id, fd_tile_private_id0, fd_tile_private_cnt ));
312 :
313 36 : FD_COMPILER_MFENCE();
314 36 : FD_VOLATILE( tile->state ) = FD_TILE_PRIVATE_STATE_IDLE;
315 36 : FD_VOLATILE( args->tile ) = tile;
316 :
317 129240980 : for(;;) {
318 :
319 : /* We are awake ... see what we should do next */
320 :
321 129240980 : int state = FD_VOLATILE_CONST( tile->state );
322 148730969 : if( FD_UNLIKELY( state!=FD_TILE_PRIVATE_STATE_EXEC ) ) {
323 148730969 : if( FD_UNLIKELY( state!=FD_TILE_PRIVATE_STATE_IDLE ) ) break;
324 : /* state is IDLE ... try again */
325 148730939 : FD_SPIN_PAUSE();
326 148730939 : continue;
327 148730969 : }
328 :
329 : /* state is EXEC ... the run assigned task and then
330 : transition to IDLE when done */
331 : /* FIXME: MORE SOPHISTICATED HANDLING OF EXCEPTIONS */
332 :
333 5 : int argc = FD_VOLATILE_CONST( tile->argc );
334 5 : char ** argv = FD_VOLATILE_CONST( tile->argv );
335 5 : fd_tile_task_t task = FD_VOLATILE_CONST( tile->task );
336 5 : try {
337 5 : FD_VOLATILE( tile->ret ) = task( argc, argv );
338 5 : FD_VOLATILE( tile->fail ) = NULL;
339 5 : } catch( ... ) {
340 0 : FD_VOLATILE( tile->fail ) = "uncaught exception";
341 0 : }
342 :
343 5 : FD_COMPILER_MFENCE();
344 105 : FD_VOLATILE( tile->state ) = FD_TILE_PRIVATE_STATE_IDLE;
345 105 : }
346 :
347 : /* state is HALT, clean up and then reset back to BOOT */
348 :
349 19490130 : FD_LOG_INFO(( "fd_tile: halting tile %lu", idx ));
350 :
351 19490130 : FD_COMPILER_MFENCE();
352 19490130 : FD_VOLATILE( tile->state ) = FD_TILE_PRIVATE_STATE_BOOT;
353 19490130 : return stack;
354 36 : }
355 :
356 : /* Dispatch side APIs ************************************************/
357 :
358 : static struct __attribute__((aligned(128))) { /* Each on its own cache line pair to limit false sharing in parallel dispatch */
359 : fd_tile_private_t * lock; /* Non-NULL if tile idx is available for dispatch, ==tile otherwise */
360 : fd_tile_private_t * tile;
361 : pthread_t pthread;
362 : } fd_tile_private[ FD_TILE_MAX ];
363 :
364 : /* FIXME: ATOMIC_XCHG BASED INSTEAD? */
365 : static inline fd_tile_private_t *
366 108 : fd_tile_private_trylock( ulong tile_idx ) {
367 108 : fd_tile_private_t * volatile * vtile = (fd_tile_private_t * volatile *)&fd_tile_private[ tile_idx ].lock;
368 108 : fd_tile_private_t * tile = *vtile;
369 108 : if( FD_LIKELY( tile ) && FD_LIKELY( FD_ATOMIC_CAS( vtile, tile, NULL )==tile ) ) return tile;
370 3 : return NULL;
371 108 : }
372 :
373 : static inline fd_tile_private_t *
374 36 : fd_tile_private_lock( ulong tile_idx ) {
375 36 : fd_tile_private_t * volatile * vtile = (fd_tile_private_t * volatile *)&fd_tile_private[ tile_idx ].lock;
376 36 : fd_tile_private_t * tile;
377 36 : for(;;) {
378 36 : tile = *vtile;
379 36 : if( FD_LIKELY( tile ) && FD_LIKELY( FD_ATOMIC_CAS( vtile, tile, NULL )==tile ) ) break;
380 0 : FD_SPIN_PAUSE();
381 0 : }
382 36 : return tile;
383 36 : }
384 :
385 : static inline void
386 : fd_tile_private_unlock( ulong tile_idx,
387 141 : fd_tile_private_t * tile ) {
388 141 : FD_VOLATILE( fd_tile_private[ tile_idx ].lock ) = tile;
389 141 : }
390 :
391 : fd_tile_exec_t *
392 : fd_tile_exec_new( ulong idx,
393 : fd_tile_task_t task,
394 : int argc,
395 141 : char ** argv ) {
396 141 : if( FD_UNLIKELY( (idx==fd_tile_private_idx) | (!idx) ) ) return NULL; /* Can't dispatch to self or to tile 0 */
397 :
398 108 : fd_tile_private_t * tile = fd_tile_private_trylock( idx );
399 108 : if( FD_UNLIKELY( !tile ) ) return NULL;
400 :
401 : /* Exec holds the lock and tile state is idle here */
402 105 : FD_VOLATILE( tile->argc ) = argc;
403 105 : FD_VOLATILE( tile->argv ) = argv;
404 105 : FD_VOLATILE( tile->task ) = task;
405 105 : FD_COMPILER_MFENCE();
406 105 : FD_VOLATILE( tile->state ) = FD_TILE_PRIVATE_STATE_EXEC;
407 105 : return (fd_tile_exec_t *)tile;
408 108 : }
409 :
410 : char const *
411 : fd_tile_exec_delete( fd_tile_exec_t * exec,
412 105 : int * opt_ret ) {
413 105 : fd_tile_private_t * tile = (fd_tile_private_t *)exec;
414 105 : ulong tile_idx = tile->idx;
415 :
416 105 : int state;
417 1968723 : for(;;) {
418 1968723 : state = FD_VOLATILE_CONST( tile->state );
419 1968723 : if( FD_LIKELY( state==FD_TILE_PRIVATE_STATE_IDLE ) ) break;
420 1968618 : FD_SPIN_PAUSE();
421 1968618 : }
422 : /* state is IDLE at this point */
423 105 : char const * fail = FD_VOLATILE_CONST( tile->fail );
424 105 : if( FD_LIKELY( (!fail) & (!!opt_ret) ) ) *opt_ret = FD_VOLATILE_CONST( tile->ret );
425 105 : fd_tile_private_unlock( tile_idx, tile );
426 105 : return fail;
427 105 : }
428 :
429 111 : fd_tile_exec_t * fd_tile_exec( ulong tile_idx ) { return (fd_tile_exec_t *)fd_tile_private[ tile_idx ].tile; }
430 :
431 0 : ulong fd_tile_exec_id ( fd_tile_exec_t const * exec ) { return ((fd_tile_private_t const *)exec)->id; }
432 12 : ulong fd_tile_exec_idx ( fd_tile_exec_t const * exec ) { return ((fd_tile_private_t const *)exec)->idx; }
433 12 : fd_tile_task_t fd_tile_exec_task( fd_tile_exec_t const * exec ) { return ((fd_tile_private_t const *)exec)->task; }
434 12 : int fd_tile_exec_argc( fd_tile_exec_t const * exec ) { return ((fd_tile_private_t const *)exec)->argc; }
435 12 : char ** fd_tile_exec_argv( fd_tile_exec_t const * exec ) { return ((fd_tile_private_t const *)exec)->argv; }
436 :
437 : int
438 12 : fd_tile_exec_done( fd_tile_exec_t const * exec ) {
439 12 : fd_tile_private_t const * tile = (fd_tile_private_t const *)exec;
440 12 : return FD_VOLATILE_CONST( tile->state )==FD_TILE_PRIVATE_STATE_IDLE;
441 12 : }
442 :
443 : /* Boot/halt APIs ****************************************************/
444 :
445 : /* Parse a list of cpu tiles */
446 :
447 : FD_STATIC_ASSERT( FD_TILE_MAX<65535, update_tile_to_cpu_type );
448 :
449 : ulong
450 : fd_tile_private_cpus_parse( char const * cstr,
451 1152 : ushort * tile_to_cpu ) {
452 1152 : if( !cstr ) return 0UL;
453 111 : ulong cnt = 0UL;
454 :
455 111 : FD_CPUSET_DECL( assigned_set );
456 :
457 111 : char const * p = cstr;
458 225 : for(;;) {
459 :
460 225 : while( isspace( (int)p[0] ) ) p++; /* Munch whitespace */
461 :
462 225 : if( p[0]=='f' ) { /* These tiles have been requested to float on the original core set */
463 0 : p++;
464 :
465 0 : ulong float_cnt;
466 :
467 0 : while( isspace( (int)p[0] ) ) p++; /* Munch whitespace */
468 0 : if ( p[0]==',' ) float_cnt = 1UL, p++;
469 0 : else if( p[0]=='\0' ) float_cnt = 1UL;
470 0 : else if( !isdigit( (int)p[0] ) ) FD_LOG_ERR(( "fd_tile: malformed --tile-cpus (malformed count)" ));
471 0 : else {
472 0 : float_cnt = fd_cstr_to_ulong( p );
473 0 : if( FD_UNLIKELY( !float_cnt ) ) FD_LOG_ERR(( "fd_tile: malformed --tile-cpus (bad count)" ));
474 0 : p++; while( isdigit( (int)p[0] ) ) p++; /* FIXME: USE STRTOUL ENDPTR FOR CORRECT HANDLING OF NON-BASE-10 */
475 0 : while( isspace( (int)p[0] ) ) p++; /* Munch whitespace */
476 0 : if( FD_UNLIKELY( !( p[0]==',' || p[0]=='\0' ) ) ) FD_LOG_ERR(( "fd_tile: malformed --tile-cpus (bad count delimiter)" ));
477 0 : if( p[0]==',' ) p++;
478 0 : }
479 :
480 : /* float_cnt is at least 1 at this point */
481 0 : do {
482 0 : if( FD_UNLIKELY( cnt>=FD_TILE_MAX ) ) FD_LOG_ERR(( "fd_tile: too many --tile-cpus" ));
483 0 : tile_to_cpu[ cnt++ ] = (ushort)65535;
484 0 : } while( --float_cnt );
485 :
486 0 : continue;
487 0 : }
488 :
489 225 : if( !isdigit( (int)p[0] ) ) {
490 111 : if( FD_UNLIKELY( p[0]!='\0' ) ) FD_LOG_ERR(( "fd_tile: malformed --tile-cpus (range lo not a cpu)" ));
491 111 : break;
492 111 : }
493 114 : ulong cpu0 = fd_cstr_to_ulong( p );
494 114 : ulong cpu1 = cpu0;
495 114 : ulong stride = 1UL;
496 114 : p++; while( isdigit( (int)p[0] ) ) p++; /* FIXME: USE STRTOUL ENDPTR FOR CORRECT HANDLING OF NON-BASE-10 */
497 114 : while( isspace( (int)p[0] ) ) p++;
498 114 : if( p[0]=='-' ) {
499 6 : p++;
500 6 : while( isspace( (int)p[0] ) ) p++;
501 6 : if( FD_UNLIKELY( !isdigit( (int)p[0] ) ) ) FD_LOG_ERR(( "fd_tile: malformed --tile-cpus (range hi not a cpu)" ));
502 6 : cpu1 = fd_cstr_to_ulong( p );
503 6 : p++; while( isdigit( (int)p[0] ) ) p++; /* FIXME: USE STRTOUL ENDPTR FOR CORRECT HANDLING OF NON-BASE-10 */
504 6 : while( isspace( (int)p[0] ) ) p++;
505 6 : if( p[0]=='/' || p[0]==':' ) {
506 3 : p++;
507 3 : while( isspace( (int)p[0] ) ) p++;
508 3 : if( FD_UNLIKELY( !isdigit( (int)p[0] ) ) ) FD_LOG_ERR(( "fd_tile: malformed --tile-cpus (stride not an int)" ));
509 3 : stride = fd_cstr_to_ulong( p );
510 3 : p++; while( isdigit( (int)p[0] ) ) p++; /* FIXME: USE STRTOUL ENDPTR FOR CORRECT HANDLING OF NON-BASE-10 */
511 3 : }
512 6 : }
513 114 : while( isspace( (int)p[0] ) ) p++;
514 114 : if( FD_UNLIKELY( !( p[0]==',' || p[0]=='\0' ) ) ) FD_LOG_ERR(( "fd_tile: malformed --tile-cpus (bad range delimiter)" ));
515 114 : if( p[0]==',' ) p++;
516 114 : cpu1++;
517 114 : if( FD_UNLIKELY( cpu1<=cpu0 ) ) FD_LOG_ERR(( "fd_tile: malformed --tile-cpus (invalid range)" ));
518 114 : if( FD_UNLIKELY( !stride ) ) FD_LOG_ERR(( "fd_tile: malformed --tile-cpus (invalid stride)" ));
519 :
520 261 : for( ulong cpu=cpu0; cpu<cpu1; cpu+=stride ) {
521 147 : if( FD_UNLIKELY( cnt>=FD_TILE_MAX ) ) FD_LOG_ERR(( "fd_tile: too many --tile-cpus" ));
522 147 : if( FD_UNLIKELY( fd_cpuset_test( assigned_set, cpu ) ) ) FD_LOG_ERR(( "fd_tile: malformed --tile-cpus (repeated cpu)" ));
523 147 : tile_to_cpu[ cnt++ ] = (ushort)cpu;
524 147 : fd_cpuset_insert( assigned_set, cpu );
525 147 : }
526 114 : }
527 :
528 111 : return cnt;
529 111 : }
530 :
531 : static fd_tile_private_cpu_config_t fd_tile_private_cpu_config_save[1];
532 :
533 : void
534 : fd_tile_private_map_boot( ushort * tile_to_cpu,
535 1152 : ulong tile_cnt ) {
536 1152 : fd_tile_private_id0 = fd_log_thread_id();
537 1152 : fd_tile_private_id1 = fd_tile_private_id0 + tile_cnt;
538 1152 : fd_tile_private_cnt = tile_cnt;
539 :
540 1152 : ulong app_id = fd_log_app_id();
541 1152 : ulong host_id = fd_log_host_id();
542 1152 : FD_LOG_INFO(( "fd_tile: booting thread group %lu:%lu/%lu", app_id, fd_tile_private_id0, fd_tile_private_cnt ));
543 :
544 : /* We create the tiles [1,tile_cnt) first so that any floating tiles
545 : in this inherit the appropriate scheduler priorities and affinities
546 : from the thread group launcher. */
547 :
548 1188 : for( ulong tile_idx=1UL; tile_idx<tile_cnt; tile_idx++ ) {
549 :
550 36 : ulong cpu_idx = (ulong)tile_to_cpu[ tile_idx ];
551 36 : int fixed = (cpu_idx<65535UL);
552 :
553 36 : if( fixed ) FD_LOG_INFO(( "fd tile: booting tile %lu on cpu %lu:%lu", tile_idx, host_id, cpu_idx ));
554 0 : else FD_LOG_INFO(( "fd tile: booting tile %lu on cpu %lu:float", tile_idx, host_id ));
555 :
556 36 : pthread_attr_t attr[1];
557 36 : int err = pthread_attr_init( attr );
558 36 : if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "fd_tile: pthread_attr_init failed (%i-%s) for tile %lu.\n\t",
559 36 : err, fd_io_strerror( err ), tile_idx ));
560 :
561 : /* Set affinity ahead of time. This is a GNU-specific extension
562 : that is not available on musl. On musl, we just skip this
563 : step as we call sched_setaffinity(2) later on regardless. */
564 :
565 36 : # if __GLIBC__
566 36 : if( fixed ) {
567 36 : FD_CPUSET_DECL( cpu_set );
568 36 : fd_cpuset_insert( cpu_set, cpu_idx );
569 36 : err = pthread_attr_setaffinity_np( attr, fd_cpuset_footprint(), (cpu_set_t const *)fd_type_pun_const( cpu_set ) );
570 36 : if( FD_UNLIKELY( err ) ) FD_LOG_WARNING(( "fd_tile: pthread_attr_setaffinity_failed (%i-%s)\n\t"
571 36 : "Unable to set the thread affinity for tile %lu on cpu %lu. Attempting to\n\t"
572 36 : "continue without explicitly specifying this cpu's thread affinity but it\n\t"
573 36 : "is likely this thread group's performance and stability are compromised\n\t"
574 36 : "(possibly catastrophically so). Update --tile-cpus to specify a set of\n\t"
575 36 : "allowed cpus that have been reserved for this thread group on this host\n\t"
576 36 : "to eliminate this warning.",
577 36 : err, fd_io_strerror( err ), tile_idx, cpu_idx ));
578 36 : }
579 36 : # endif /* __GLIBC__ */
580 :
581 : /* Create an optimized stack with guard regions if the build target
582 : is x86 (e.g. supports huge pages necessary to optimize TLB usage)
583 : and the tile is assigned to a particular CPU (e.g. bind the stack
584 : memory to the NUMA node closest to the cpu).
585 :
586 : Otherwise (or if an optimized stack could not be created), create
587 : vanilla pthread-style stack with guard regions. We DIY here
588 : because pthreads seems to be missing an API to determine the
589 : extents of the stacks it creates and we need to know the stack
590 : extents for run-time stack diagnostics. Though we can use
591 : fd_log_private_stack_discover to determine stack extents after
592 : the thread is started, it is faster, more flexible, more reliable
593 : and more portable to use a user specified stack when possible.
594 :
595 : If neither can be done, we will let pthreads create the tile's
596 : stack and try to discover the stack extents after the thread is
597 : started. */
598 :
599 36 : int optimize = FD_HAS_X86 & fixed;
600 :
601 36 : void * stack = fd_tile_private_stack_new( optimize, cpu_idx );
602 36 : if( FD_LIKELY( stack ) ) {
603 36 : err = pthread_attr_setstack( attr, stack, FD_TILE_PRIVATE_STACK_SZ );
604 36 : if( FD_UNLIKELY( err ) ) {
605 0 : FD_LOG_WARNING(( "fd_tile: pthread_attr_setstack failed (%i-%s)\n\t", err, fd_io_strerror( err ) ));
606 0 : fd_tile_private_stack_delete( stack );
607 0 : stack = NULL;
608 0 : }
609 36 : }
610 :
611 36 : if( FD_UNLIKELY( !stack ) ) FD_LOG_WARNING(( "fd_tile: Unable to create a stack for tile %lu.\n\t"
612 36 : "Attempting to continue with the default stack but it is likely this\n\t"
613 36 : "thread group's performance and stability is compromised (possibly\n\t"
614 36 : "catastrophically so).",
615 36 : tile_idx ));
616 :
617 36 : ulong stack_sz;
618 36 : err = pthread_attr_getstacksize( attr, &stack_sz );
619 36 : if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "fd_tile: pthread_attr_getstacksize failed (%i-%s) for tile %lu.\n\t",
620 36 : err, fd_io_strerror( err ), tile_idx ));
621 :
622 36 : FD_VOLATILE( fd_tile_private[ tile_idx ].lock ) = NULL;
623 :
624 36 : fd_tile_private_manager_args_t args[1];
625 :
626 36 : FD_VOLATILE( args->id ) = fd_tile_private_id0 + tile_idx;
627 36 : FD_VOLATILE( args->idx ) = tile_idx;
628 36 : FD_VOLATILE( args->cpu_idx ) = cpu_idx;
629 36 : FD_VOLATILE( args->stack ) = stack;
630 36 : FD_VOLATILE( args->stack_sz ) = stack_sz;
631 36 : FD_VOLATILE( args->tile ) = NULL;
632 :
633 36 : FD_COMPILER_MFENCE();
634 :
635 36 : err = pthread_create( &fd_tile_private[tile_idx].pthread, attr, fd_tile_private_manager, args );
636 36 : if( FD_UNLIKELY( err ) ) {
637 0 : if( fixed ) FD_LOG_ERR(( "fd_tile: pthread_create failed (%i-%s)\n\t"
638 0 : "Unable to start up the tile %lu on cpu %lu. Likely causes for this include\n\t"
639 0 : "this cpu is restricted from the user or does not exist on this host.\n\t"
640 0 : "Update --tile-cpus to specify a set of allowed cpus that have been reserved\n\t"
641 0 : "for this thread group on this host.",
642 0 : err, fd_io_strerror( err ), tile_idx, cpu_idx ));
643 0 : FD_LOG_ERR(( "fd_tile: pthread_create failed (%i-%s)\n\tUnable to start up the tile %lu (floating).",
644 0 : err, fd_io_strerror( err ), tile_idx ));
645 0 : }
646 :
647 : /* Wait for the tile to be ready to exec */
648 :
649 36 : fd_tile_private_t * tile;
650 54060 : for(;;) {
651 54060 : tile = FD_VOLATILE_CONST( args->tile );
652 54060 : if( FD_LIKELY( tile ) ) break;
653 54024 : FD_YIELD();
654 54024 : }
655 36 : FD_VOLATILE( fd_tile_private[ tile_idx ].tile ) = tile;
656 36 : FD_VOLATILE( fd_tile_private[ tile_idx ].lock ) = tile;
657 :
658 : /* Tile is running, args is safe to reuse */
659 :
660 36 : err = pthread_attr_destroy( attr );
661 36 : if( FD_UNLIKELY( err ) )
662 0 : FD_LOG_WARNING(( "fd_tile: pthread_attr_destroy failed (%i-%s) for tile %lu; attempting to continue",
663 36 : err, fd_io_strerror( err ), tile_idx ));
664 36 : }
665 :
666 : /* And now we "boot" tile 0 */
667 :
668 1152 : ulong cpu_idx = (ulong)tile_to_cpu[ 0UL ];
669 1152 : int fixed = (cpu_idx<65535UL);
670 1152 : if( fixed ) FD_LOG_INFO(( "fd tile: booting tile %lu on cpu %lu:%lu", 0UL, host_id, cpu_idx ));
671 1041 : else FD_LOG_INFO(( "fd tile: booting tile %lu on cpu %lu:float", 0UL, host_id ));
672 :
673 1152 : if( fixed ) {
674 :
675 111 : int good_taskset;
676 111 : FD_CPUSET_DECL( cpu_set );
677 111 : if( FD_UNLIKELY( fd_cpuset_getaffinity( (pid_t)0, cpu_set ) ) ) {
678 0 : FD_LOG_WARNING(( "fd_tile: fd_cpuset_getaffinity failed (%i-%s) for tile 0 on cpu %lu",
679 0 : errno, fd_io_strerror( errno ), cpu_idx ));
680 0 : good_taskset = 0;
681 111 : } else {
682 111 : ulong cnt = fd_cpuset_cnt( cpu_set );
683 111 : ulong idx = fd_cpuset_first( cpu_set );
684 111 : good_taskset = (cnt==1UL) & (idx==cpu_idx);
685 111 : }
686 :
687 111 : if( FD_UNLIKELY( !good_taskset ) ) {
688 9 : FD_LOG_WARNING(( "fd_tile: --tile-cpus for tile 0 may not match initial kernel affinity\n\t"
689 9 : "Tile 0 might not be fully optimized because of kernel first touch.\n\t"
690 9 : "Overriding fd_log_cpu_id(), fd_log_cpu(), fd_log_thread() on tile 0 to\n\t"
691 9 : "match --tile-cpus and attempting to continue. Launch this thread\n\t"
692 9 : "group via 'taskset -c %lu' or equivalent to eliminate this warning.", cpu_idx ));
693 9 : fd_cpuset_null( cpu_set );
694 9 : fd_cpuset_insert( cpu_set, cpu_idx );
695 9 : if( FD_UNLIKELY( fd_cpuset_setaffinity( (pid_t)0, cpu_set ) ) )
696 0 : FD_LOG_WARNING(( "fd_tile: fd_cpuset_setaffinity_failed (%i-%s)\n\t"
697 9 : "Unable to set the thread affinity for tile 0 on cpu %lu. Attempting to\n\t"
698 9 : "continue without explicitly specifying this cpu's thread affinity but it\n\t"
699 9 : "is likely this thread group's performance and stability are compromised\n\t"
700 9 : "(possibly catastrophically so). Update --tile-cpus to specify a set of\n\t"
701 9 : "allowed cpus that have been reserved for this thread group on this host\n\t"
702 9 : "to eliminate this warning.",
703 9 : errno, fd_io_strerror( errno ), cpu_idx ));
704 9 : fd_log_private_cpu_id_set( cpu_idx );
705 9 : fd_log_cpu_set ( NULL );
706 9 : fd_log_thread_set( NULL );
707 9 : }
708 111 : }
709 :
710 : /* Tile 0 "pthread_create" */
711 1152 : fd_tile_private[0].pthread = pthread_self();
712 : /* FIXME: ON X86, DETECT IF TILE 0 STACK ISN'T HUGE PAGE AND WARN AS NECESSARY? */
713 :
714 : /* Tile 0 "thread manager init" */
715 1152 : fd_tile_private_id = fd_tile_private_id0;
716 1152 : fd_tile_private_idx = 0UL;
717 :
718 1152 : # if !FD_HAS_ASAN
719 1152 : fd_log_private_stack_discover( fd_log_private_main_stack_sz(),
720 1152 : &fd_tile_private_stack0, &fd_tile_private_stack1 ); /* logs details */
721 1152 : if( FD_UNLIKELY( !fd_tile_private_stack0 ) )
722 0 : FD_LOG_WARNING(( "stack diagnostics not available on this tile; attempting to continue" ));
723 1152 : # endif /* FD_HAS_ASAN */
724 :
725 1152 : fd_tile_private_cpu_config( fd_tile_private_cpu_config_save, cpu_idx );
726 1152 : fd_tile_private[0].lock = NULL; /* Can't dispatch to tile 0 */
727 1152 : fd_tile_private[0].tile = NULL; /* " */
728 :
729 1152 : FD_LOG_INFO(( "fd_tile: boot tile %lu success (thread %lu:%lu in thread group %lu:%lu/%lu)",
730 1152 : fd_tile_private_idx, app_id, fd_tile_private_id, app_id, fd_tile_private_id0, fd_tile_private_cnt ));
731 :
732 1152 : fd_memcpy( fd_tile_private_cpu_id, tile_to_cpu, fd_tile_private_cnt*sizeof(ushort) );
733 :
734 1152 : FD_LOG_INFO(( "fd_tile: boot success" ));
735 1152 : }
736 :
737 : void
738 1152 : fd_tile_private_boot_str( char const * cpus ) {
739 1152 : ushort tile_to_cpu[ FD_TILE_MAX ];
740 1152 : ulong tile_cnt = fd_tile_private_cpus_parse( cpus, tile_to_cpu );
741 :
742 1152 : if( FD_UNLIKELY( !tile_cnt ) ) {
743 1041 : FD_LOG_INFO(( "fd_tile: no cpus specified; treating thread group as single tile running on O/S assigned cpu(s)" ));
744 1041 : tile_to_cpu[0] = (ushort)65535;
745 1041 : tile_cnt = 1UL;
746 1041 : }
747 :
748 1152 : fd_tile_private_map_boot( tile_to_cpu, tile_cnt );
749 1152 : }
750 :
/* fd_tile_private_boot boots the tile subsystem from the command line.
   The tile cpu list is pulled out of (*pargc,*pargv) via
   fd_env_strip_cmdline_cstr using the "--tile-cpus" key (with
   "FD_TILE_CPUS" and NULL as the remaining arguments; presumably the
   environment variable name and the default value, confirm against
   fd_env).  What was found is logged and then forwarded, possibly as
   NULL, to fd_tile_private_boot_str. */

void
fd_tile_private_boot( int *    pargc,
                      char *** pargv ) {

  char const * cpus = fd_env_strip_cmdline_cstr( pargc, pargv, "--tile-cpus", "FD_TILE_CPUS", NULL );

  if( cpus ) {
    FD_LOG_INFO(( "fd_tile: --tile-cpus \"%s\"", cpus ));
  } else {
    FD_LOG_INFO(( "fd_tile: --tile-cpus not specified" ));
  }

  fd_tile_private_boot_str( cpus );
}
763 :
764 : void
765 1140 : fd_tile_private_halt( void ) {
766 1140 : FD_LOG_INFO(( "fd_tile: halt" ));
767 :
768 1140 : fd_memset( fd_tile_private_cpu_id, 0, fd_tile_private_cnt*sizeof(ushort) );
769 :
770 1140 : ulong tile_cnt = fd_tile_private_cnt;
771 :
772 1140 : fd_tile_private_t * tile[ FD_TILE_MAX ]; /* FIXME: ALLOCA TO TILE_CNT? */
773 :
774 1140 : FD_LOG_INFO(( "fd_tile: disabling dispatch" ));
775 1176 : for( ulong tile_idx=1UL; tile_idx<tile_cnt; tile_idx++ ) tile[ tile_idx ] = fd_tile_private_lock( tile_idx );
776 : /* All tile to tile dispatches will fail at this point */
777 :
778 1140 : FD_LOG_INFO(( "fd_tile: waiting for all tasks to complete" ));
779 1176 : for( ulong tile_idx=1UL; tile_idx<tile_cnt; tile_idx++ )
780 36 : while( FD_VOLATILE_CONST( tile[ tile_idx ]->state )!=FD_TILE_PRIVATE_STATE_IDLE ) FD_YIELD();
781 : /* All halt transitions will be valid at this point */
782 :
783 1140 : FD_LOG_INFO(( "fd_tile: signaling all tiles to halt" ));
784 1176 : for( ulong tile_idx=1UL; tile_idx<tile_cnt; tile_idx++ ) FD_VOLATILE( tile[ tile_idx ]->state ) = FD_TILE_PRIVATE_STATE_HALT;
785 : /* All tiles are halting at this point. tile[*] is no longer safe */
786 :
787 1140 : FD_LOG_INFO(( "fd_tile: waiting for all tiles to halt" ));
788 1176 : for( ulong tile_idx=1UL; tile_idx<tile_cnt; tile_idx++ ) {
789 36 : void * stack;
790 36 : int err = pthread_join( fd_tile_private[ tile_idx ].pthread, &stack );
791 36 : if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "fd_tile: pthread_join failed (%i-%s)", err, fd_io_strerror( err ) ));
792 36 : fd_tile_private_stack_delete( stack );
793 36 : FD_LOG_INFO(( "fd_tile: halt tile %lu success", tile_idx ));
794 36 : }
795 :
796 : /* All tiles but this one are halted at this point */
797 :
798 1140 : fd_tile_private_cpu_restore( fd_tile_private_cpu_config_save );
799 :
800 1140 : FD_LOG_INFO(( "fd_tile: halt tile 0 success" ));
801 :
802 1140 : FD_LOG_INFO(( "fd_tile: cleaning up" ));
803 :
804 1176 : for( ulong tile_idx=1UL; tile_idx<tile_cnt; tile_idx++ ) fd_tile_private_unlock( tile_idx, NULL );
805 :
806 1140 : fd_memset( fd_tile_private_cpu_config_save, 0, sizeof(fd_tile_private_cpu_config_t) );
807 :
808 1140 : fd_tile_private_stack1 = 0UL;
809 1140 : fd_tile_private_stack0 = 0UL;
810 1140 : fd_tile_private_idx = 0UL;
811 1140 : fd_tile_private_id = 0UL;
812 :
813 1140 : fd_tile_private_cnt = 0UL;
814 1140 : fd_tile_private_id1 = 0UL;
815 1140 : fd_tile_private_id0 = 0UL;
816 :
817 1140 : FD_LOG_INFO(( "fd_tile: halt success" ));
818 1140 : }
|