Line data Source code
1 : #ifndef HEADER_fd_src_tango_cnc_fd_cnc_h 2 : #define HEADER_fd_src_tango_cnc_fd_cnc_h 3 : 4 : #include "../fd_tango_base.h" 5 : 6 : /* A fd_cnc_t provides APIs for out-of-band low bandwidth 7 : command-and-control ("cnc") signals to a high performance app thread 8 : ("app"). In the app thread's run loop, as part of its out-of-band 9 : housekeeping, it uses fd_cnc_t object dedicated to it to send and 10 : receive information from command and control threads and/or monitoring 11 : threads. The basic template for an fd_cnc_t state machine is: 12 : 13 : app 14 : new +---------<----------+ 15 : | | | 16 : | | +---<----+ | 17 : cnc v v | app | ^ 18 : | | v ^ | 19 : | | app | cnc | | 20 : +--<-- BOOT -->-- RUN -->-- USER -->---+ 21 : | cnc | | | | | | 22 : | | ^ v cnc | ^ v 23 : | v | app | | | | 24 : | | +--<-- HALT | +---<---+ 25 : v | | v app/cnc 26 : | +---->----+ | | 27 : | app | v app | 28 : | v | | 29 : | | | | 30 : delete -----<----- FAIL ---<--+ 31 : cnc app 32 : 33 : That is, when a cnc is created, it is in the BOOT state and the app 34 : thread that uses it is not running. When the app thread starts 35 : running and finishes booting up, it should transition the cnc to the 36 : RUN state if the thread started up successfully or the FAIL state if 37 : booting failed (the thread is not running and is considered to be 38 : unsafe to try restarting). While in the RUN state: 39 : 40 : - If a cnc thread raises a HALT signal on the app thread's cnc, the 41 : app thread should cleanup after itself and, just before it stops 42 : running, transition its cnc to BOOT (FAIL) if the app thread can 43 : (cannot) be booted again safely. 44 : 45 : - If a cnc thread raises a USER defined signal, the app thread should 46 : process the signal. If the app thread resumes running after 47 : processing the signal, it should transition its cnc to RUN. If 48 : this processing results in termination of the app thread, just before 49 : it stops running, it should transition its cnc to BOOT (FAIL) if the 50 : app thread can (cannot) be booted again safely. Note that the cnc 51 : state alone does not indicate if a USER defined signal was processed 52 : successfully (e.g. the app thread might chose to ignore a malformed 53 : command, log the details, and then resume running). For such 54 : information, the application can encode additional inputs and 55 : outputs regarding commands in the cnc app region. USER defined to 56 : USER defined transitions (either driven by the app thread as it 57 : processes a complex signal or by a back-and-forth interaction with 58 : the cnc thread) are fine and up to the application to define. 59 : 60 : The only thing that can be done to an app thread in the FAIL state is 61 : postmortem autopsies and clean up. An app thread should not do a 62 : RUN->FAIL transition, even if it dies in the RUN state. If a thread 63 : dies while in the RUN state, note that the cnc has a heartbeat to help 64 : cnc threads and/or monitor threads detect such without needing to open 65 : a cnc command session (specific heartbeating conventions are 66 : application defined). 67 : 68 : It is often useful to have one USER defined signal to be a no-op 69 : "ACK". For this, a cnc thread can signal ACK to the app thread. If 70 : the cnc returns to RUN reasonably promptly, the app thread has self 71 : reported to the cnc thread it is operating correctly. If it doesn't 72 : (i.e. times out), the cnc thread can forcibly terminate the app 73 : thread, move the cnc post termination into the FAIL state and then 74 : proceed like a normal FAIL. 75 : 76 : A cnc has an application defined type field to help applications 77 : distinguish between what USER defined signals might be supported by a 78 : particular app thread. */ 79 : 80 : /* FD_CNC_{ALIGN,FOOTPRINT} describe the alignment and footprint of a 81 : fd_cnc_t. ALIGN is a positive integer power of 2. FOOTPRINT is a 82 : multiple of ALIGN. ALIGN is recommended to be at least double cache 83 : line to mitigate various kinds of false sharing. app_sz is assumed 84 : to be valid (e.g. will not require a footprint larger than 85 : ULONG_MAX). These are provided to facilitate compile time 86 : declarations. */ 87 : 88 57 : #define FD_CNC_ALIGN (128UL) 89 : #define FD_CNC_FOOTPRINT( app_sz ) \ 90 114 : FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \ 91 114 : FD_CNC_ALIGN, 64UL ), \ 92 114 : FD_CNC_APP_ALIGN, (app_sz) ), \ 93 114 : FD_CNC_ALIGN ) 94 : 95 : /* FD_CNC_ALIGN describes the alignment and footprint of a fd_cnc_t's 96 : application region. This is a power of 2 of the minimal malloc 97 : alignment (typically 8) and at most FD_CNC_ALIGN. */ 98 : 99 : #define FD_CNC_APP_ALIGN (64UL) 100 : 101 : /* FD_CNC_SIGNAL_* are the standard cnc signals. All remaining values 102 : ([4,ULONG_MAX]) are available to implement user defined signals. 103 : Details of the standard signals are provided above. */ 104 : 105 303 : #define FD_CNC_SIGNAL_RUN (0UL) 106 273 : #define FD_CNC_SIGNAL_BOOT (1UL) 107 6 : #define FD_CNC_SIGNAL_FAIL (2UL) 108 408 : #define FD_CNC_SIGNAL_HALT (3UL) 109 : 110 : /* FD_CNC_SUCCESS, FD_CNC_ERR_* are error code return values used by 111 : cnc APIs. SUCCESS must be zero, ERR_* are negative and distinct. */ 112 : 113 108 : #define FD_CNC_SUCCESS (0) /* success */ 114 3 : #define FD_CNC_ERR_UNSUP (-1) /* unsupported on this caller */ 115 3 : #define FD_CNC_ERR_INVAL (-2) /* bad inputs */ 116 3 : #define FD_CNC_ERR_AGAIN (-3) /* potentially transient failure */ 117 3 : #define FD_CNC_ERR_FAIL (-4) /* permanent failure */ 118 : 119 : /* fd_cnc_t is an opaque handle of a command-and-control object. 120 : Details are exposed here to facilitate inlining of many cnc 121 : operations in performance critical app thread paths. */ 122 : 123 57 : #define FD_CNC_MAGIC (0xf17eda2c37c2c000UL) /* firedancer cnc ver 0 */ 124 : 125 : struct __attribute__((aligned(FD_CNC_ALIGN))) fd_cnc_private { 126 : ulong magic; /* ==FD_CNC_MAGIC */ 127 : ulong app_sz; 128 : ulong type; 129 : long heartbeat0; 130 : long heartbeat; 131 : ulong lock; 132 : ulong signal; 133 : /* Padding to FD_CNC_APP_ALIGN here */ 134 : /* app_sz bytes here */ 135 : /* Padding to FD_CNC_ALIGN here */ 136 : }; 137 : 138 : typedef struct fd_cnc_private fd_cnc_t; 139 : 140 : FD_PROTOTYPES_BEGIN 141 : 142 : /* fd_cnc_{align,footprint} return the required alignment and footprint 143 : of a memory region suitable for use as a cnc. fd_cnc_align returns 144 : FD_CNC_ALIGN. If footprint is larger than ULONG_MAX, footprint will 145 : silently return 0 (and thus can be used by the caller to validate the 146 : cnc configuration parameters). */ 147 : 148 : FD_FN_CONST ulong 149 : fd_cnc_align( void ); 150 : 151 : FD_FN_CONST ulong 152 : fd_cnc_footprint( ulong app_sz ); 153 : 154 : /* fd_cnc_new formats an unused memory region for use as a cnc. Assumes 155 : shmem is a non-NULL pointer to this region in the local address space 156 : with the required footprint and alignment. The cnc will be 157 : initialized to have the given type (should be in [0,UINT_MAX]) with 158 : an initial heartbeat of now. The cnc application region will be 159 : initialized to zero. Returns shmem (and the memory region it points 160 : to will be formatted as a cnc, caller is not joined) and NULL on 161 : failure (logs details). Reasons for failure include an obviously bad 162 : shmem region or app_sz. */ 163 : 164 : void * 165 : fd_cnc_new( void * shmem, 166 : ulong app_sz, 167 : ulong type, 168 : long now ); 169 : 170 : /* fd_cnc_join joins the caller to the cnc. shcnc points to the first 171 : byte of the memory region backing the cnc in the caller's address 172 : space. Returns a pointer in the local address space to the cnc on 173 : success (this should not be assumed to be just a cast of shcnc) or 174 : NULL on failure (logs details). Reasons for failure include the 175 : shcnc is obviously not a local pointer to a memory region holding a 176 : cnc. Every successful join should have a matching leave. The 177 : lifetime of the join is until the matching leave or caller's thread 178 : group is terminated. */ 179 : 180 : fd_cnc_t * 181 : fd_cnc_join( void * shcnc ); 182 : 183 : /* fd_cnc_leave leaves a current local join. Returns a pointer to the 184 : underlying shared memory region on success (this should not be 185 : assumed to be just a cast of cnc) and NULL on failure (logs details). 186 : Reasons for failure include cnc is NULL. */ 187 : 188 : void * 189 : fd_cnc_leave( fd_cnc_t const * cnc ); 190 : 191 : /* fd_cnc_delete unformats a memory region used as a cnc. Assumes 192 : nobody is joined to the region. Returns a pointer to the underlying 193 : shared memory region or NULL if used obviously in error (e.g. shcnc 194 : obviously does not point to a cnc ... logs details). The ownership 195 : of the memory region is transferred to the caller on success. */ 196 : 197 : void * 198 : fd_cnc_delete( void * shcnc ); 199 : 200 : /* fd_cnc_app_sz returns the size of a the cnc's application region. 201 : Assumes cnc is a current local join. */ 202 : 203 9 : FD_FN_PURE static inline ulong fd_cnc_app_sz( fd_cnc_t const * cnc ) { return cnc->app_sz; } 204 : 205 : /* fd_cnc_app_laddr returns local address of the cnc's application 206 : region. This will have FD_CNC_APP_ALIGN alignment and room for at 207 : least fd_cnc_app_sz( cnc ) bytes. Assumes cnc is a current local 208 : join. fd_cnc_app_laddr_const is for const correctness. The return 209 : values are valid for the lifetime of the local join. */ 210 : 211 12 : FD_FN_CONST static inline void * fd_cnc_app_laddr ( fd_cnc_t * cnc ) { return (void * )(((ulong)cnc)+64UL); } 212 9 : FD_FN_CONST static inline void const * fd_cnc_app_laddr_const( fd_cnc_t const * cnc ) { return (void const *)(((ulong)cnc)+64UL); } 213 : 214 : /* fd_cnc_type returns the application defined type of a cnc. Assumes 215 : cnc is a current local join. */ 216 : 217 3 : FD_FN_PURE static inline ulong fd_cnc_type( fd_cnc_t const * cnc ) { return cnc->type; } 218 : 219 : /* fd_cnc_heartbeat0 returns the heartbeat assigned when the cnc was 220 : created. Assumes cnc is a current local join. */ 221 : 222 3 : FD_FN_PURE static inline long fd_cnc_heartbeat0( fd_cnc_t const * cnc ) { return cnc->heartbeat0; } 223 : 224 : /* fd_cnc_heartbeat_query returns the value of the cnc's heartbeat 225 : as of some point in time between when this was called and when this 226 : returned. Assumes cnc is a current local join. This acts as a 227 : compiler memory fence. */ 228 : 229 : static inline long 230 99 : fd_cnc_heartbeat_query( fd_cnc_t const * cnc ) { 231 99 : FD_COMPILER_MFENCE(); 232 99 : long then = FD_VOLATILE_CONST( cnc->heartbeat ); 233 99 : FD_COMPILER_MFENCE(); 234 99 : return then; 235 99 : } 236 : 237 : /* fd_cnc_heartbeat is used by an app thread to update the cnc's 238 : heartbeat. Heartbeat values are application defined but typical 239 : usage is something that monotonically increases (e.g. the host 240 : wallclock, host tickcounter or just a flat counter). It is 241 : recommended app threads do cnc heartbeats with intervals that are 242 : uniform random distributed in a range like [min,2*min] nanoseconds 243 : for some reasonably fast to a human but slow to the computer value 244 : of min. This keeps load from heartbeating low, keeps the system 245 : human real time responsive, prevents heartbeats from multiple cnc 246 : auto-synchronizing and gives a strict range in time over which a cnc 247 : thread should expect to see a heartbeat from a normally running app 248 : thread. Assumes cnc is a current local join. This acts as a 249 : compiler memory fence. */ 250 : 251 : static inline void 252 : fd_cnc_heartbeat( fd_cnc_t * cnc, 253 17354418 : long now ) { 254 17354418 : FD_COMPILER_MFENCE(); 255 17354418 : FD_VOLATILE( cnc->heartbeat ) = now; 256 17354418 : FD_COMPILER_MFENCE(); 257 17354418 : } 258 : 259 : /* fd_cnc_signal query observes the current signal posted to the cnc. 260 : Assumes cnc is a current local join. This is a compiler fence. 261 : Returns the current signal on the cnc at some point in time between 262 : when this was called and this returned. */ 263 : 264 : static inline ulong 265 39383254 : fd_cnc_signal_query( fd_cnc_t const * cnc ) { 266 39383254 : FD_COMPILER_MFENCE(); 267 39383254 : ulong s = FD_VOLATILE_CONST( cnc->signal ); 268 39383254 : FD_COMPILER_MFENCE(); 269 39383254 : return s; 270 39383254 : } 271 : 272 : /* fd_cnc_signal atomically transitions the cnc to signal s. Assumes 273 : cnc is a current local join and the caller is currently allowed to do 274 : a transition to s. Specifically: 275 : 276 : CNC thread with open command session: 277 : 278 : - RUN->HALT: signal an app thread to shutdown 279 : 280 : - RUN->USER defined: as per application requirements 281 : 282 : - USER defined->USER defined: as per application requirements 283 : 284 : Running APP thread: 285 : 286 : - BOOT->RUN: when app thread it is done booting ... should be just 287 : before app thread enters its run loop. 288 : 289 : - BOOT->FAIL: if app thread failed to boot ... should be just 290 : before app thread stops running. 291 : 292 : - HALT->BOOT: when app thread is done halting ... should be just 293 : before app thread stops running. 294 : 295 : - USER defined->RUN: when app thread is done processing signal and 296 : can resume running ... should be just before app thread resumes 297 : its run loop. 298 : 299 : - USER defined->BOOT: when CNC thread signal processing halted the 300 : app thread normally ... should be just before app thread stops 301 : running. 302 : 303 : - USER defined->FAIL: when CNC thread signal processing halted the 304 : app thread abnormally ... should be just before app thread stops 305 : running. 306 : 307 : - USER defined->USER defined: as per application requirements 308 : 309 : See above state machine for more details. This function is a 310 : compiler memory fence (e.g. caller can populate the cnc app region 311 : with app signal specific details and all the memory operations to the 312 : app region will be issued before s is signaled). */ 313 : 314 : static inline void 315 : fd_cnc_signal( fd_cnc_t * cnc, 316 891 : ulong s ) { 317 891 : FD_COMPILER_MFENCE(); 318 891 : FD_VOLATILE( cnc->signal ) = s; 319 891 : FD_COMPILER_MFENCE(); 320 891 : } 321 : 322 : /* fd_cnc_open opens a new command session to an app thread. Returns 0 323 : (FD_CNC_SUCCESS) on success and a negative (FD_CNC_ERR_*) on failure 324 : (logs details). On successful return, caller will have an open 325 : command session on the cnc and the cnc will be in the RUN state. On 326 : failure, caller does not have a command session on cnc. 327 : 328 : Reasons for FD_CNC_ERR_UNSUP include not running on a hosted target, 329 : not running on an atomic capable target and strange thread group id; 330 : this is a permanent failure. Reasons for FD_CNC_ERR_INVAL include 331 : NULL cur; this is a permanent failure. Reasons for FD_CNC_ERR_FAIL 332 : include app thread is not running and cannot be restarted cleanly; 333 : this is a permanent failure. Reasons for FD_CNC_ERR_AGAIN include 334 : app thread is bootable, is in the process of booting or is in the 335 : process of halting (and thus might be running later) or there is 336 : already an open command session on app thread; this failure is 337 : _potentially_ transient. 338 : 339 : Caller should not leave the join while it has an open command 340 : session. Caller should not close an open command session while it 341 : has a signal pending on it. If the caller dies with an open command 342 : session, the next cnc thread will try to implicitly close it to 343 : recover (logging details as necessary). */ 344 : 345 : int 346 : fd_cnc_open( fd_cnc_t * cnc ); 347 : 348 : /* fd_cnc_wait waits up to dt ns for the cnc to transition to something 349 : other than test. Returns the last observed cnc signal (which can be 350 : used detect result of the way). dt==LONG_MAX will do a blocking 351 : wait. dt<=0 will poll cnc once. If _opt_now is non-NULL, *_opt_now 352 : will contain the wallclock observed just before the last time the cnc 353 : was queried on return. The wait is OS friendly (e.g. will not block 354 : other threads that might be running on the same core as the cnc 355 : thread as such threads are often scheduled to shared a common 356 : administrative core). */ 357 : 358 : ulong 359 : fd_cnc_wait( fd_cnc_t const * cnc, 360 : ulong test, 361 : long dt, 362 : long * _opt_now ); 363 : 364 : /* fd_cnc_close ends the current command session on cnc. Assumes caller 365 : has an open command session on cnc and there are no signals being 366 : processed by the app thread (e.g. the sync is in the RUN, BOOT or 367 : FAIL state). This function is a compiler fence. */ 368 : 369 : static inline void 370 105 : fd_cnc_close( fd_cnc_t * cnc ) { 371 105 : FD_COMPILER_MFENCE(); 372 105 : FD_VOLATILE( cnc->lock ) = 0UL; 373 105 : FD_COMPILER_MFENCE(); 374 105 : } 375 : 376 : /* fd_cnc_strerror converts a FD_CNC_SUCCESS / FD_CNC_ERR_* code into 377 : a human readable cstr. The lifetime of the returned pointer is 378 : infinite. The returned pointer is always to a non-NULL cstr. */ 379 : 380 : FD_FN_CONST char const * 381 : fd_cnc_strerror( int err ); 382 : 383 : /* fd_cstr_to_cnc_signal converts the cstr pointed to by into a cnc 384 : signal value. Return value undefined if cstr does not point to a cnc 385 : signal cstr. */ 386 : 387 : FD_FN_PURE ulong 388 : fd_cstr_to_cnc_signal( char const * cstr ); 389 : 390 : /* fd_cnc_signal_cstr pretty prints the cnc signal value into buf. buf 391 : must point to a character buffer with at least 392 : FD_CNC_SIGNAL_CSTR_BUF_MAX bytes. Always returns buf. If buf is 393 : non-NULL, the buffer pointed at will be populated with a proper '\0' 394 : terminated cstr on return (and one that fd_cstr_to_cnc_signal 395 : properly convert back to signal). */ 396 : 397 3 : #define FD_CNC_SIGNAL_CSTR_BUF_MAX (21UL) 398 : 399 : char * 400 : fd_cnc_signal_cstr( ulong signal, 401 : char * buf ); 402 : 403 : FD_PROTOTYPES_END 404 : 405 : #endif /* HEADER_fd_src_tango_cnc_fd_cnc_h */ 406 :