LCOV - code coverage report
Current view: top level - tango/cnc - fd_cnc.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 105 193 54.4 %
Date: 2025-01-08 12:08:44 Functions: 11 11 100.0 %

          Line data    Source code
       1             : #include "fd_cnc.h"
       2             : 
       3             : ulong
       4          63 : fd_cnc_align( void ) {
       5          63 :   return FD_CNC_ALIGN;
       6          63 : }
       7             : 
       8             : ulong
       9         123 : fd_cnc_footprint( ulong app_sz ) {
      10         123 :   if( FD_UNLIKELY( app_sz > (ULONG_MAX-191UL) ) ) return 0UL; /* overflow */
      11         117 :   return FD_CNC_FOOTPRINT( app_sz );
      12         123 : }
      13             : 
      14             : void *
      15             : fd_cnc_new( void * shmem,
      16             :             ulong  app_sz,
      17             :             ulong  type,
      18          57 :             long   now ) {
      19          57 :   fd_cnc_t * cnc = (fd_cnc_t *)shmem;
      20             : 
      21          57 :   if( FD_UNLIKELY( !shmem ) ) {
      22           0 :     FD_LOG_WARNING(( "NULL shmem" ));
      23           0 :     return NULL;
      24           0 :   }
      25             : 
      26          57 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, fd_cnc_align() ) ) ) {
      27           0 :     FD_LOG_WARNING(( "misaligned shmem" ));
      28           0 :     return NULL;
      29           0 :   }
      30             : 
      31          57 :   ulong footprint = fd_cnc_footprint( app_sz );
      32          57 :   if( FD_UNLIKELY( !footprint ) ) {
      33           0 :     FD_LOG_WARNING(( "bad app_sz (%lu)", app_sz ));
      34           0 :     return NULL;
      35           0 :   }
      36             : 
      37          57 :   fd_memset( cnc, 0, footprint );
      38             : 
      39          57 :   cnc->app_sz     = app_sz;
      40          57 :   cnc->type       = type;
      41          57 :   cnc->heartbeat0 = now;
      42          57 :   cnc->heartbeat  = now;
      43          57 :   cnc->lock       = 0UL;
      44          57 :   cnc->signal     = FD_CNC_SIGNAL_BOOT;
      45             : 
      46          57 :   FD_COMPILER_MFENCE();
      47          57 :   FD_VOLATILE( cnc->magic ) = FD_CNC_MAGIC;
      48          57 :   FD_COMPILER_MFENCE();
      49             : 
      50          57 :   return (void *)cnc;
      51          57 : }
      52             : 
      53             : fd_cnc_t *
      54         216 : fd_cnc_join( void * shcnc ) {
      55             : 
      56         216 :   if( FD_UNLIKELY( !shcnc ) ) {
      57           0 :     FD_LOG_WARNING(( "NULL shcnc" ));
      58           0 :     return NULL;
      59           0 :   }
      60             : 
      61         216 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shcnc, fd_cnc_align() ) ) ) {
      62           0 :     FD_LOG_WARNING(( "misaligned shcnc" ));
      63           0 :     return NULL;
      64           0 :   }
      65             : 
      66         216 :   fd_cnc_t * cnc = (fd_cnc_t *)shcnc;
      67             : 
      68         216 :   if( FD_UNLIKELY( cnc->magic!=FD_CNC_MAGIC ) ) {
      69           0 :     FD_LOG_WARNING(( "bad magic" ));
      70           0 :     return NULL;
      71           0 :   }
      72             : 
      73         216 :   return cnc;
      74         216 : }
      75             : 
      76             : void *
      77         216 : fd_cnc_leave( fd_cnc_t const * cnc ) {
      78             : 
      79         216 :   if( FD_UNLIKELY( !cnc ) ) {
      80           0 :     FD_LOG_WARNING(( "NULL cnc" ));
      81           0 :     return NULL;
      82           0 :   }
      83             : 
      84         216 :   return (void *)cnc; /* Kinda ugly const cast */
      85         216 : }
      86             : 
      87             : void *
      88           9 : fd_cnc_delete( void * shcnc ) {
      89             : 
      90           9 :   if( FD_UNLIKELY( !shcnc ) ) {
      91           0 :     FD_LOG_WARNING(( "NULL shcnc" ));
      92           0 :     return NULL;
      93           0 :   }
      94             : 
      95           9 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shcnc, fd_cnc_align() ) ) ) {
      96           0 :     FD_LOG_WARNING(( "misaligned shcnc" ));
      97           0 :     return NULL;
      98           0 :   }
      99             : 
     100           9 :   fd_cnc_t * cnc = (fd_cnc_t *)shcnc;
     101             : 
     102           9 :   if( FD_UNLIKELY( cnc->magic!=FD_CNC_MAGIC ) ) {
     103           3 :     FD_LOG_WARNING(( "bad magic" ));
     104           3 :     return NULL;
     105           3 :   }
     106             : 
     107           6 :   FD_COMPILER_MFENCE();
     108           6 :   FD_VOLATILE( cnc->magic ) = 0UL;
     109           6 :   FD_COMPILER_MFENCE();
     110             : 
     111           6 :   return (void *)cnc;
     112           9 : }
     113             : 
     114             : #if FD_HAS_HOSTED && FD_HAS_ATOMIC
     115             : 
     116             : #include <errno.h>
     117             : #include <signal.h>
     118             : #include <sched.h>
     119             : 
     120             : int
     121         105 : fd_cnc_open( fd_cnc_t * cnc ) {
     122             : 
     123             :   /* Check input args */
     124             : 
     125         105 :   if( FD_UNLIKELY( !cnc ) ) {
     126           0 :     FD_LOG_WARNING(( "NULL cnc" ));
     127           0 :     return FD_CNC_ERR_INVAL;
     128           0 :   }
     129             : 
     130         105 :   ulong my_pid = fd_log_group_id();
     131         105 :   if( FD_UNLIKELY( (!my_pid) | (my_pid!=(ulong)(pid_t)my_pid) ) ) {
     132           0 :     FD_LOG_WARNING(( "unexpected pid (%lu)", my_pid ));
     133           0 :     return FD_CNC_ERR_UNSUP;
     134           0 :   }
     135             : 
     136             :   /* Try to acquire a lock on the cnc */
     137             : 
     138         105 :   FD_COMPILER_MFENCE();
     139         105 :   ulong cnc_pid = FD_ATOMIC_CAS( &cnc->lock, 0UL, my_pid );
     140         105 :   FD_COMPILER_MFENCE();
     141             : 
     142         105 :   if( FD_LIKELY( !cnc_pid ) ) {
     143             : 
     144             :     /* Got the lock ... get the status of the app thread. */
     145             : 
     146         105 :     ulong signal = fd_cnc_signal_query( cnc );
     147             : 
     148             :     /* If the app thread was in the run state, return success. */
     149             : 
     150         105 :     if( FD_LIKELY( signal==FD_CNC_SIGNAL_RUN ) ) return FD_CNC_SUCCESS;
     151             : 
     152             :     /* At this point, since RUN was not observed, we can't safely issue
     153             :        signals to the app thread.  So we unlock the lock.  If FAIL was
     154             :        observed, we know that this thread is permanently dead and we
     155             :        hard fail the open request.  If BOOT, HALT or USER defined, we
     156             :        can't guarantee that we will never be able to open up a command
     157             :        session, so we tell the user to try again later. */
     158             : 
     159           0 :     FD_COMPILER_MFENCE();
     160           0 :     FD_VOLATILE( cnc->lock ) = 0UL;
     161           0 :     FD_COMPILER_MFENCE();
     162             : 
     163           0 :     if( FD_LIKELY( signal==FD_CNC_SIGNAL_FAIL ) ) {
     164           0 :       FD_LOG_WARNING(( "app thread failed; unable to open command session" ));
     165           0 :       return FD_CNC_ERR_FAIL;
     166           0 :     }
     167             : 
     168           0 :     char buf[ FD_CNC_SIGNAL_CSTR_BUF_MAX ];
     169           0 :     FD_LOG_WARNING(( "signal %s (%lu) in progress on app thread; try again later?", fd_cnc_signal_cstr( signal, buf ), signal ));
     170           0 :     return FD_CNC_ERR_AGAIN;
     171           0 :   }
     172             : 
     173             :   /* Somebody else seems to have an open command session on the app
     174             :      thread.  Check that the somebody else is alive. */
     175             : 
     176           0 :   if( FD_UNLIKELY( cnc_pid!=my_pid && kill( (pid_t)cnc_pid, 0 ) ) ) {
     177             : 
     178           0 :     int err = errno;
     179           0 :     if( FD_LIKELY( err==ESRCH ) ) {
     180             : 
     181             :       /* A process died with an open command session.  Try to clean up
     182             :          after it and resume. */
     183             : 
     184           0 :       if( FD_LIKELY( FD_ATOMIC_CAS( &cnc->lock, cnc_pid, my_pid )==cnc_pid ) ) {
     185             : 
     186             :         /* We successfully reclaimed the lock from the dead process.  If
     187             :            there is a pending signal from it still being processed by
     188             :            the app thread (e.g. HALT or USER defined), wait briefly for
     189             :            it to complete and then decide how best to proceed.  (Note: this
     190             :            assumes no pid reuse between the kill above and the cas.) */
     191             : 
     192           0 :         ulong signal = fd_cnc_signal_query( cnc );
     193             : 
     194           0 :         if( FD_UNLIKELY( !( (signal==FD_CNC_SIGNAL_BOOT) | (signal==FD_CNC_SIGNAL_RUN ) | (signal==FD_CNC_SIGNAL_FAIL) ) ) )
     195           0 :           signal = fd_cnc_wait( cnc, signal, (ulong)100e6, NULL ); /* 100 ms */
     196             : 
     197           0 :         if( FD_LIKELY( signal==FD_CNC_SIGNAL_RUN ) ) {
     198             : 
     199             :           /* App thread seems to be running and we have the lock.  Looks
     200             :              like we can recover. */
     201             : 
     202           0 :           FD_LOG_WARNING(( "pid %lu died with an open command session; attempting to recover", cnc_pid ));
     203           0 :           return FD_CNC_SUCCESS;
     204           0 :         }
     205             : 
     206           0 :         if( FD_LIKELY( signal==FD_CNC_SIGNAL_BOOT ) ) {
     207             : 
     208             :           /* Last signal apparently stopped the app thread and left it
     209             :              in a state where it can be booted again safely.  Unlock the
     210             :              session lock to end the stale command session (so that the
     211             :              thread can be booted again) and fail this open request with
     212             :              try again later as this open might succeed in the future
     213             :              (i.e. after the thread is booted in the run state again). */
     214             : 
     215           0 :           FD_COMPILER_MFENCE();
     216           0 :           FD_VOLATILE( cnc->lock ) = 0UL;
     217           0 :           FD_COMPILER_MFENCE();
     218             : 
     219           0 :           FD_LOG_WARNING(( "pid %lu died with an open command session that cleanly halted the app thread; try again later?",
     220           0 :                            cnc_pid ));
     221           0 :           return FD_CNC_ERR_AGAIN;
     222           0 :         }
     223             : 
     224           0 :         if( FD_LIKELY( signal==FD_CNC_SIGNAL_FAIL ) ) {
     225             : 
     226             :           /* Last signal apparently stopped the app thread and left it
     227             :              in a state where it cannot be booted again safely.  Unlock
     228             :              the session lock to end the stale command session (so that
     229             :              the app thread can be cleaned up) and fail this open
     230             :              request. */
     231             : 
     232           0 :           FD_COMPILER_MFENCE();
     233           0 :           FD_VOLATILE( cnc->lock ) = 0UL;
     234           0 :           FD_COMPILER_MFENCE();
     235             : 
     236           0 :           FD_LOG_WARNING(( "pid %lu died with an open command session that uncleanly halted the app thread", cnc_pid ));
     237           0 :           return FD_CNC_ERR_FAIL;
     238           0 :         }
     239             : 
     240             :         /* App thread seems to be still processing a HALT or USER
     241             :            defined signal.  Restore the lock to the dead pid and tell
     242             :            the user to try again later (when we might know better how to
     243             :            recover). */
     244             : 
     245           0 :         FD_COMPILER_MFENCE();
     246           0 :         FD_VOLATILE( cnc->lock ) = cnc_pid;
     247           0 :         FD_COMPILER_MFENCE();
     248             : 
     249           0 :         FD_LOG_WARNING(( "pid %lu died with an open command session and last signal issued (%lu) still seems to be pending; "
     250           0 :                          "try again later?", cnc_pid, signal ));
     251           0 :         return FD_CNC_ERR_AGAIN;
     252           0 :       }
     253             : 
     254             :       /* Another thread reclaimed the lock before we could.  Presumably
     255             :          that thread will recover the lock so we tell the user to try
     256             :          again later. */
     257             : 
     258           0 :       FD_LOG_WARNING(( "pid %lu died with an open command session and another thread is trying to clean it up; try again later?",
     259           0 :                        cnc_pid ));
     260           0 :       return FD_CNC_ERR_AGAIN;
     261           0 :     }
     262             : 
     263             :     /* There is an open command session but we can't tell if the pid
     264             :        running it is live.  Assume it is and tell the user to try again
     265             :        later. */
     266             : 
     267           0 :     FD_LOG_WARNING(( "pid %lu currently command session and unable to diagnose pid's state (%i-%s); try again later?",
     268           0 :                      cnc_pid, err, fd_io_strerror( err ) ));
     269           0 :     return FD_CNC_ERR_AGAIN;
     270           0 :   }
     271             : 
     272             :   /* There is already an open command session from a seemingly live
     273             :      process */
     274             : 
     275           0 :   FD_LOG_WARNING(( "pid %lu currently has an open command session; try again later?", cnc_pid ));
     276           0 :   return FD_CNC_ERR_AGAIN;
     277           0 : }
     278             : 
     279             : #else
     280             : 
     281             : int
     282             : fd_cnc_open( fd_cnc_t * cnc ) {
     283             :   (void)cnc;
     284             :   FD_LOG_WARNING(( "unsupported for this build target" ));
     285             :   return FD_CNC_ERR_UNSUP;
     286             : }
     287             : 
     288             : #endif
     289             : 
     290             : ulong
     291             : fd_cnc_wait( fd_cnc_t const * cnc,
     292             :              ulong            test,
     293             :              long             dt,
     294         492 :              long *           _opt_now ) {
     295         492 :   long then = fd_log_wallclock();
     296         492 :   long now  = then;
     297             : 
     298         492 :   ulong obs;
     299        7236 :   for(;;) {
     300        7236 :     obs = fd_cnc_signal_query( cnc );
     301        7236 :     int done = ((obs!=test) | ((now-then)>dt));
     302        7236 :     FD_COMPILER_FORGET( done ); /* avoid compiler misoptimization */
     303        7236 :     if( FD_LIKELY( done ) ) break; /* optimize for exit, single exit to optimize spin pause hinting */
     304        6744 :     FD_YIELD();
     305        6744 :     now = fd_log_wallclock();
     306        6744 :   }
     307             : 
     308         492 :   if( _opt_now ) *_opt_now = now; /* usage dep prob */
     309         492 :   return obs;
     310         492 : }
     311             : 
     312             : char const *
     313          18 : fd_cnc_strerror( int err ) {
     314          18 :   switch( err ) {
     315           3 :   case FD_CNC_SUCCESS:   return "success";
     316           3 :   case FD_CNC_ERR_UNSUP: return "unsupported here";
     317           3 :   case FD_CNC_ERR_INVAL: return "bad inputs";
     318           3 :   case FD_CNC_ERR_AGAIN: return "try again later";
     319           3 :   case FD_CNC_ERR_FAIL:  return "app thread failed";
     320           3 :   default: break;
     321          18 :   }
     322           3 :   return "unknown---possibly not a cnc error code";
     323          18 : }
     324             : 
     325             : ulong
     326         120 : fd_cstr_to_cnc_signal( char const * cstr ) {
     327         120 :   if( FD_UNLIKELY( !cstr ) ) return FD_CNC_SIGNAL_RUN;
     328         120 :   if( !fd_cstr_casecmp( cstr, "run"  ) ) return FD_CNC_SIGNAL_RUN;
     329         117 :   if( !fd_cstr_casecmp( cstr, "boot" ) ) return FD_CNC_SIGNAL_BOOT;
     330         114 :   if( !fd_cstr_casecmp( cstr, "fail" ) ) return FD_CNC_SIGNAL_FAIL;
     331         111 :   if( !fd_cstr_casecmp( cstr, "halt" ) ) return FD_CNC_SIGNAL_HALT;
     332           3 :   return fd_cstr_to_ulong( cstr );
     333         111 : }
     334             : 
     335             : char *
     336             : fd_cnc_signal_cstr( ulong  signal,
     337         222 :                     char * buf ) {
     338         222 :   if( FD_LIKELY( buf ) ) {
     339         222 :     switch( signal ) {
     340           3 :     case FD_CNC_SIGNAL_RUN:  strcpy( buf, "run"  ); break;
     341         108 :     case FD_CNC_SIGNAL_BOOT: strcpy( buf, "boot" ); break;
     342           3 :     case FD_CNC_SIGNAL_FAIL: strcpy( buf, "fail" ); break;
     343         105 :     case FD_CNC_SIGNAL_HALT: strcpy( buf, "halt" ); break;
     344           3 :     default:                 fd_cstr_printf( buf, FD_CNC_SIGNAL_CSTR_BUF_MAX, NULL, "%lu", signal ); break;
     345         222 :     }
     346         222 :   }
     347         222 :   return buf;
     348         222 : }

Generated by: LCOV version 1.14