LCOV - code coverage report
Current view: top level - util - fd_util_base.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 110 122 90.2 %
Date: 2026-06-04 08:36:56 Functions: 426 7021 6.1 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_fd_util_base_h
       2             : #define HEADER_fd_src_util_fd_util_base_h
       3             : 
       4             : /* Base development environment */
       5             : 
       6             : /* Compiler checks ****************************************************/
       7             : 
       8             : #ifdef __cplusplus
       9             : 
      10             : #if __cplusplus<201703L
      11             : #error "Firedancer requires C++17 or later"
      12             : #endif
      13             : 
      14             : #else
      15             : 
      16             : #if __STDC_VERSION__<201710L
      17             : #error "Firedancer requires C Standard version C17 or later"
      18             : #endif
      19             : 
      20             : #endif //__cplusplus
      21             : 
      22             : /* Build target capabilities ******************************************/
      23             : 
      24             : /* Different build targets often have different levels of support for
      25             :    various language and hardware features.  The presence of various
      26             :    features can be tested at preprocessor, compile, or run time via the
      27             :    below capability macros.
      28             : 
      29             :    Code that does not exploit any of these capabilities written within
      30             :    the base development environment should be broadly portable across a
      31             :    range of build targets ranging from on-chain virtual machines to
      32             :    commodity hosts to custom hardware.
      33             : 
      34             :    As such, highly portable yet high performance code is possible by
      35             :    writing generic implementations that do not exploit any of the below
      36             :    capabilities as a portable fallback along with build target specific
      37             :    optimized implementations that are invoked when the build target
      38             :    supports the appropriate capabilities.
      39             : 
      40             :    The base development environment itself provides lots of functionality to help
      41             :    with implementing portable fallbacks while making very minimal
      42             :    assumptions about the build targets and zero use of 3rd party
      43             :    libraries (these might make unknown additional assumptions about the
      44             :    build target, including availability of a quality implementation of
      45             :    the library on the build target). */
      46             : 
      47             : /* FD_HAS_HOSTED:  If the build target is hosted (e.g. resides on a host
      48             :    with a POSIX-ish environment ... practically speaking, stdio.h,
      49             :    stdlib.h, unistd.h, et al more or less behave normally ...
      50             :    pedantically XOPEN_SOURCE=700), FD_HAS_HOSTED will be 1.  It will be
      51             :    zero otherwise. */
      52             : 
      53             : #ifndef FD_HAS_HOSTED
      54             : #define FD_HAS_HOSTED 0
      55             : #endif
      56             : 
      57             : /* FD_HAS_ATOMIC:  If the build target supports atomic operations
      58             :    between threads accessing a common memory region (including threads
      59             :    that reside in different processes on a host communicating via a
      60             :    shared memory region with potentially different local virtual
      61             :    mappings).  Practically speaking, does atomic compare-and-swap et al
      62             :    work? */
      63             : 
      64             : #ifndef FD_HAS_ATOMIC
      65             : #define FD_HAS_ATOMIC 0
      66             : #endif
      67             : 
      68             : /* FD_HAS_THREADS:  If the build target supports a POSIX-ish notion of
      69             :    threads (e.g. practically speaking, global variables declared within
      70             :    a compile unit are visible to more than one thread of execution,
      71             :    pthreads.h / threading parts of C standard, the atomics parts of the
      72             :    C standard, ... more or less work normally), FD_HAS_THREADS will be
      73             :    1.  It will be zero otherwise.  FD_HAS_THREADS implies FD_HAS_HOSTED
      74             :    and FD_HAS_ATOMIC. */
      75             : 
      76             : #ifndef FD_HAS_THREADS
      77             : #define FD_HAS_THREADS 0
      78             : #endif
      79             : 
      80             : /* FD_HAS_INT128:  If the build target supports reasonably efficient
      81             :    128-bit wide integer operations, define FD_HAS_INT128 to 1 to enable
      82             :    use of them in implementations. */
      83             : 
      84             : #ifndef FD_HAS_INT128
      85             : #define FD_HAS_INT128 0
      86             : #endif
      87             : 
      88             : /* FD_HAS_DOUBLE:  If the build target supports reasonably efficient
      89             :    IEEE 754 64-bit wide double precision floating point operations, define
      90             :    FD_HAS_DOUBLE to 1 to enable use of them in implementations.  Note
      91             :    that even if the build target does not, va_args handling in the C /
      92             :    C++ language requires promotion of a float in a va_arg list to a
      93             :    double.  Thus, C / C++ language support for IEEE 754 float also
      94             :    implies a minimum level of support for double (though not necessarily
      95             :    efficient or IEEE 754).  That is, even if a target does not have
      96             :    FD_HAS_DOUBLE, there might still be limited use of double in va_arg
      97             :    list handling. */
      98             : 
      99             : #ifndef FD_HAS_DOUBLE
     100             : #define FD_HAS_DOUBLE 0
     101             : #endif
     102             : 
     103             : /* FD_HAS_ALLOCA:  If the build target supports fast alloca-style
     104             :    dynamic stack memory allocation (e.g. alloca.h / __builtin_alloca
     105             :    more or less work normally), define FD_HAS_ALLOCA to 1 to enable use
     106             :    of it in implementations. */
     107             : 
     108             : #ifndef FD_HAS_ALLOCA
     109             : #define FD_HAS_ALLOCA 0
     110             : #endif
     111             : 
     112             : /* FD_HAS_X86:  If the build target supports x86 specific features and
     113             :    can benefit from x86 specific optimizations, define FD_HAS_X86.  Code
     114             :    needing more specific target features (Intel / AMD / SSE / AVX2 /
     115             :    AVX512 / etc) can specialize further as necessary with even more
     116             :    precise capabilities (that in turn imply FD_HAS_X86). */
     117             : 
     118             : #ifndef FD_HAS_X86
     119             : #define FD_HAS_X86 0
     120             : #endif
     121             : 
     122             : /* These allow even more precise targeting for X86. */
     123             : 
     124             : /* FD_HAS_SSE indicates the target supports Intel SSE4 style SIMD
     125             :    (basically do the 128-bit wide parts of "x86intrin.h" work).
     126             :    Recommend using the simd/fd_sse.h APIs instead of raw Intel
     127             :    intrinsics for readability and to facilitate portability to non-x86
     128             :    platforms.  Implies FD_HAS_X86. */
     129             : 
     130             : #ifndef FD_HAS_SSE
     131             : #define FD_HAS_SSE 0
     132             : #endif
     133             : 
     134             : /* FD_HAS_AVX indicates the target supports Intel AVX2 style SIMD
     135             :    (basically do the 256-bit wide parts of "x86intrin.h" work).
     136             :    Recommend using the simd/fd_avx.h APIs instead of raw Intel
     137             :    intrinsics for readability and to facilitate portability to non-x86
     138             :    platforms.  Implies FD_HAS_SSE. */
     139             : 
     140             : #ifndef FD_HAS_AVX
     141             : #define FD_HAS_AVX 0
     142             : #endif
     143             : 
     144             : /* FD_HAS_AVX512 indicates the target supports Intel AVX-512 style SIMD
     145             :    (basically do the 512-bit wide parts of "x86intrin.h" work).
     146             :    Recommend using the simd/fd_avx512.h APIs instead of raw Intel
     147             :    intrinsics for readability and to facilitate portability to non-x86
     148             :    platforms.  Implies FD_HAS_AVX. */
     149             : 
     150             : #ifndef FD_HAS_AVX512
     151           4 : #define FD_HAS_AVX512 0
     152             : #endif
     153             : 
     154             : /* FD_HAS_SHANI indicates that the target supports Intel SHA extensions
     155             :    which accelerate SHA-1 and SHA-256 computation.  This extension is
     156             :    also called SHA-NI or SHA_NI (Secure Hash Algorithm New
     157             :    Instructions).  Although proposed in 2013, they're only supported on
     158             :    Intel Ice Lake and AMD Zen CPUs and newer.  Implies FD_HAS_AVX. */
     159             : 
     160             : #ifndef FD_HAS_SHANI
     161             : #define FD_HAS_SHANI 0
     162             : #endif
     163             : 
     164             : /* FD_HAS_GFNI indicates that the target supports Intel Galois Field
     165             :    extensions, which accelerate operations over binary extension fields,
     166             :    especially GF(2^8).  These instructions are supported on Intel Ice
     167             :    Lake and newer and AMD Zen4 and newer CPUs.  Implies FD_HAS_AVX. */
     168             : 
     169             : #ifndef FD_HAS_GFNI
     170             : #define FD_HAS_GFNI 0
     171             : #endif
     172             : 
     173             : /* FD_HAS_AESNI indicates that the target supports AES-NI extensions,
     174             :    which accelerate AES encryption and decryption.  While AVX predates
     175             :    the original AES-NI extension, the combination of AES-NI+AVX adds
     176             :    additional opcodes (such as vaesenc, a more flexible variant of
     177             :    aesenc).  Thus, implies FD_HAS_AVX.  A conservative estimate for
     178             :    minimum platform support is Intel Haswell or AMD Zen. */
     179             : 
     180             : #ifndef FD_HAS_AESNI
     181             : #define FD_HAS_AESNI 0
     182             : #endif
     183             : 
     184             : /* FD_HAS_ARM:  If the build target supports armv8-a specific features
     185             :    and can benefit from aarch64 specific optimizations, define
     186             :    FD_HAS_ARM. */
     187             : 
     188             : #ifndef FD_HAS_ARM
     189             : #define FD_HAS_ARM 0
     190             : #endif
     191             : 
     192             : /* FD_HAS_LZ4 indicates that the target supports LZ4 compression.
     193             :    Roughly, does "#include <lz4.h>" and the APIs therein work? */
     194             : 
     195             : #ifndef FD_HAS_LZ4
     196             : #define FD_HAS_LZ4 0
     197             : #endif
     198             : 
     199             : /* FD_HAS_ZSTD indicates that the target supports ZSTD compression.
     200             :    Roughly, does "#include <zstd.h>" and the APIs therein work? */
     201             : 
     202             : #ifndef FD_HAS_ZSTD
     203             : #define FD_HAS_ZSTD 0
     204             : #endif
     205             : 
     206             : /* FD_HAS_COVERAGE indicates that the build target is built with coverage instrumentation. */
     207             : 
     208             : #ifndef FD_HAS_COVERAGE
     209             : #define FD_HAS_COVERAGE 0
     210             : #endif
     211             : 
     212             : /* FD_HAS_ASAN indicates that the build target is using ASAN. */
     213             : 
     214             : #ifndef FD_HAS_ASAN
     215             : #define FD_HAS_ASAN 0
     216             : #endif
     217             : 
     218             : /* FD_HAS_UBSAN indicates that the build target is using UBSAN. */
     219             : 
     220             : #ifndef FD_HAS_UBSAN
     221             : #define FD_HAS_UBSAN 0
     222             : #endif
     223             : 
     224             : /* FD_HAS_DEEPASAN indicates that the build target is using ASAN with manual
     225             :    memory poisoning for fd_alloc, fd_wksp, and fd_scratch. */
     226             : 
     227             : #ifndef FD_HAS_DEEPASAN
     228             : #define FD_HAS_DEEPASAN 0
     229             : #endif
     230             : 
     231             : /* Base development environment ***************************************/
     232             : 
     233             : /* The functionality provided by these vanilla headers are always
     234             :    available within the base development environment.  Notably, stdio.h
     235             :    / stdlib.h / et al are not included here as these make lots of
     236             :    assumptions about the build target that may not be true (especially
     237             :    for on-chain and custom hardware use).  Code should prefer the fd
     238             :    util equivalents for such functionality when possible. */
     239             : 
     240             : #include <stdalign.h>
     241             : #include <string.h>
     242             : #include <limits.h>
     243             : #include <float.h>
     244             : 
     245             : /* Work around some library naming irregularities */
     246             : /* FIXME: Consider this for FLOAT/FLT, DOUBLE/DBL too? */
     247             : 
     248           3 : #define  SHORT_MIN  SHRT_MIN
     249           3 : #define  SHORT_MAX  SHRT_MAX
     250     2912775 : #define USHORT_MAX USHRT_MAX
     251             : 
     252             : /* Primitive types ****************************************************/
     253             : 
     254             : /* These typedefs provide single token regularized names for all the
     255             :    primitive types in the base development environment:
     256             : 
     257             :      char !
     258             :      schar !   short   int   long   int128 !!
     259             :      uchar    ushort  uint  ulong  uint128 !!
     260             :      float
     261             :      double !!!
     262             : 
     263             :    ! Does not assume the sign of char.  A naked char should be treated
     264             :      as cstr character and mathematical operations should be avoided on
     265             :      them.  This is less than ideal as the patterns for integer types in
     266             :      the C/C++ language spec itself are far more consistent with a naked
     267             :      char naturally being treated as signed (see above).  But there are
     268             :      lots of conflicts between architectures, languages and standard
     269             :      libraries about this so any use of a naked char shouldn't assume
     270             :      the sign ... sigh.
     271             : 
     272             :    !! Only available if FD_HAS_INT128 is defined
     273             : 
     274             :    !!! Should only be used if FD_HAS_DOUBLE is defined but see note in
     275             :        FD_HAS_DOUBLE about C/C++ silent promotions of float to double in
     276             :        va_arg lists.
     277             : 
     278             :    Note also that these token names more naturally interoperate with
     279             :    integer constant declarations, type generic code generation
     280             :    techniques, and printf-style format strings than the stdint.h /
     281             :    inttypes.h handling.
     282             : 
     283             :    To minimize portability issues, unexpected silent type conversion
     284             :    issues, align with typical developer implicit usage, align with
     285             :    typical build target usage, ..., assumes char / short / int / long
     286             :    are 8 / 16 / 32 / 64 twos complement integers and float is IEEE-754
     287             :    single precision.  Further assumes little endian, truncating signed
     288             :    integer division, sign extending (arithmetic) signed right shift and
     289             :    signed left shift behaves the same as an unsigned left shift from bit
     290             :    operations point of view (technically the standard says signed left
     291             :    shift is undefined if the result would overflow).  Also, except for
     292             :    int128/uint128, assumes that aligned access to these will be
     293             :    naturally atomic.  Lastly assumes that unaligned access to these is
     294             :    functionally valid but does not assume that unaligned access to these
     295             :    is efficient or atomic.
     296             : 
     297             :    For values meant to be held in registers, code should prefer long /
     298             :    ulong types (improves asm generation given the prevalence of 64-bit
     299             :    targets and also to avoid lots of tricky bugs with silent promotions
     300             :    in the language ... e.g. ushort should ideally only be used for
     301             :    in-memory representations).
     302             : 
     303             :    These are currently not prefixed given how often they are used.  If
     304             :    this becomes problematic prefixes can be added as necessary.
     305             :    Specifically, C++ allows typedefs to be defined multiple times so
     306             :    long as they are equivalent.  Inequivalent collisions are not
     307             :    supported but should be rare (e.g. if a 3rd party header thinks
     308             :    "ulong" should be something other than "unsigned long", the 3rd party
     309             :    header probably should be nuked from orbit).  C11 and forward also
     310             :    allow multiple equivalent typedefs.  C99 and earlier don't but this
     311             :    is typically only a warning and then only if pedantic warnings are
     312             :    enabled.  Thus, if we want to support users using C99 and earlier who
     313             :    want to do a strict compile and have a superfluous collision with
     314             :    these types in other libraries, uncomment the below (or do something
     315             :    equivalent for the compiler). */
     316             : 
     317             : //#pragma GCC diagnostic push
     318             : //#pragma GCC diagnostic ignored "-Wpedantic"
     319             : 
     320             : typedef signed char schar; /* See above note of sadness */
     321             : 
     322             : typedef unsigned char  uchar;
     323             : typedef unsigned short ushort;
     324             : typedef unsigned int   uint;
     325             : typedef unsigned long  ulong;
     326             : 
     327             : #if FD_HAS_INT128
     328             : 
     329             : __extension__ typedef          __int128  int128;
     330             : __extension__ typedef unsigned __int128 uint128;
     331             : 
     332  1200003879 : #define UINT128_MAX (~(uint128)0)
     333           6 : #define  INT128_MAX ((int128)(UINT128_MAX>>1))
     334           3 : #define  INT128_MIN (-INT128_MAX-(int128)1)
     335             : 
     336             : #endif
     337             : 
     338             : //#pragma GCC diagnostic pop
     339             : 
     340             : /* Compiler tricks ****************************************************/
     341             : 
     342             : /* FD_STRINGIFY,FD_CONCAT{2,3,4}:  Various macros for token
     343             :    stringification and pasting.  FD_STRINGIFY returns the argument as a
     344             :    cstr (e.g. FD_STRINGIFY(foo) -> "foo").  FD_CONCAT* pastes the tokens
     345             :    together into a single token (e.g.  FD_CONCAT3(a,b,c) -> abc).  The
     346             :    EXPAND variants first expand their arguments and then do the token
     347             :    operation (e.g.  FD_EXPAND_THEN_STRINGIFY(__LINE__) -> "104" if done
     348             :    on line 104 of the source code file). */
     349             : 
     350           0 : #define FD_STRINGIFY(x)#x
     351     7394889 : #define FD_CONCAT2(a,b)a##b
     352    39588294 : #define FD_CONCAT3(a,b,c)a##b##c
     353          54 : #define FD_CONCAT4(a,b,c,d)a##b##c##d
     354             : 
     355             : #define FD_EXPAND_THEN_STRINGIFY(x)FD_STRINGIFY(x)
     356     4219623 : #define FD_EXPAND_THEN_CONCAT2(a,b)FD_CONCAT2(a,b)
     357 >49817*10^7 : #define FD_EXPAND_THEN_CONCAT3(a,b,c)FD_CONCAT3(a,b,c)
     358     5549877 : #define FD_EXPAND_THEN_CONCAT4(a,b,c,d)FD_CONCAT4(a,b,c,d)
     359             : 
     360             : /* FD_VA_ARGS_SELECT(__VA_ARGS__,e32,e31,...e1):  Macro that expands to
     361             :    en at compile time where n is number of items in the __VA_ARGS__
     362             :    list.  If __VA_ARGS__ is empty, returns e1.  Assumes __VA_ARGS__ has
     363             :    at most 32 arguments.  Useful for making a variadic macro whose
     364             :    behavior depends on the number of arguments in __VA_ARGS__. */
     365             : 
     366             : #define FD_VA_ARGS_SELECT(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,a,b,c,d,e,f,_,...)_
     367             : 
     368             : /* FD_SRC_LOCATION returns a const cstr holding the line of code where
     369             :    FD_SRC_LOCATION was used. */
     370             : 
     371             : #define FD_SRC_LOCATION __FILE__ "(" FD_EXPAND_THEN_STRINGIFY(__LINE__) ")"
     372             : 
     373             : /* FD_STATIC_ASSERT tests at compile time if c is non-zero.  If not,
     374             :    it aborts the compile with an error.  err itself should be a token
     375             :    (e.g. not a string, no whitespace, etc). */
     376             : 
     377             : #ifdef __cplusplus
     378             : #define FD_STATIC_ASSERT(c,err) static_assert(c, #err)
     379             : #else
     380       91110 : #define FD_STATIC_ASSERT(c,err) _Static_assert(c, #err)
     381             : #endif
     382             : 
     383             : /* FD_ADDRESS_OF_PACKED_MEMBER(x):  Linguistically does &(x) but without
     384             :    recent compiler complaints that &x might be unaligned if x is a
     385             :    member of a packed datastructure.  (Often needed for interfacing with
     386             :    hardware / packets / etc.) */
     387             : 
     388           3 : #define FD_ADDRESS_OF_PACKED_MEMBER( x ) (__extension__({                                      \
     389           3 :     char * _fd_aopm = (char *)&(x);                                                            \
     390           3 :     __asm__( "# FD_ADDRESS_OF_PACKED_MEMBER(" #x ") @" FD_SRC_LOCATION : "+r" (_fd_aopm) :: ); \
     391           3 :     (__typeof__(&(x)))_fd_aopm;                                                                \
     392           3 :   }))
     393             : 
     394             : /* FD_PROTOTYPES_{BEGIN,END}:  Headers that might be included in C++
     395             :    source should encapsulate the prototypes of code and globals
     396             :    contained in compilation units compiled as C with a
     397             :    FD_PROTOTYPES_{BEGIN,END} pair. */
     398             : 
     399             : #ifdef __cplusplus
     400             : #define FD_PROTOTYPES_BEGIN extern "C" {
     401             : #else
     402             : #define FD_PROTOTYPES_BEGIN
     403             : #endif
     404             : 
     405             : #ifdef __cplusplus
     406             : #define FD_PROTOTYPES_END }
     407             : #else
     408             : #define FD_PROTOTYPES_END
     409             : #endif
     410             : 
     411             : /* FD_ASM_LG_ALIGN(lg_n) expands to an alignment assembler directive
     412             :    appropriate for the current architecture/ABI.  The resulting align
     413             :    is 2^(lg_n) bytes, i.e. FD_ASM_LG_ALIGN(3) aligns by 8 bytes. */
     414             : 
     415             : #if defined(__aarch64__)
     416             : #define FD_ASM_LG_ALIGN(lg_n) ".align " #lg_n "\n"
     417             : #elif defined(__x86_64__) || defined(__powerpc64__) || defined(__riscv)
     418             : #define FD_ASM_LG_ALIGN(lg_n) ".p2align " #lg_n "\n"
     419             : #endif
     420             : 
     421             : /* FD_IMPORT declares a variable name and initializes with the contents
     422             :    of the file at path (with potentially some assembly directives for
     423             :    additional footer info).  It is equivalent to:
     424             : 
     425             :      type const name[] __attribute__((aligned(align))) = {
     426             : 
     427             :        ... code that would initialize the contents of name to the
     428             :        ... raw binary data found in the file at path at compile time
     429             :        ... (with any appended information as specified by footer)
     430             : 
     431             :      };
     432             : 
     433             :      ulong const name_sz = ... number of bytes pointed to by name;
     434             : 
     435             :    More precisely, this creates a symbol "name" in the object file that
     436             :    points to a read-only copy of the raw data in the file at "path" as
     437             :    it was at compile time.  2^lg_align specifies the minimum alignment
     438             :    required for the copy's first byte as an unsuffixed decimal integer.
     439             :    footer are assembly commands to permit additional data to be appended
     440             :    to the copy (use "" for footer if no footer is necessary).
     441             : 
     442             :    Then it exposes a pointer to this copy in the current compilation
     443             :    unit as name and the byte size as name_sz.  name_sz covers the first
     444             :    byte of the included data to the last byte of the footer inclusive.
     445             : 
     446             :    The dummy linker symbol _fd_import_name_sz will also be created in
     447             :    the object file as some under the hood magic to make this work.  This
     448             :    should not be used in any compile unit as some compilers (I'm looking
     449             :    at you clang-15, but apparently not clang-10) will sometimes mangle
     450             :    its value from what it was set to in the object file even when marked as
     451             :    absolute in the object file.
     452             : 
     453             :    This should only be used at global scope and should be done at most
     454             :    once over all object files / libraries used to make a program.  If
     455             :    other compilation units want to make use of an import in a different
     456             :    compilation unit, they should declare:
     457             : 
     458             :      extern type const name[] __attribute__((aligned(align)));
     459             : 
     460             :    and/or:
     461             : 
     462             :      extern ulong const name_sz;
     463             : 
     464             :    as necessary (that is, do the usual to use name and name_sz as shown
     465             :    for the pseudo code above).
     466             : 
     467             :    Important safety tip!  gcc -M will generally not detect the
     468             :    dependency this creates between the importing file and the imported
     469             :    file.  This can cause incremental builds to miss changes to the
     470             :    imported file.  Ideally, we would have FD_IMPORT automatically do
     471             :    something like:
     472             : 
     473             :      _Pragma( "GCC dependency \"" path "\"" )
     474             : 
     475             :    This doesn't work as is because _Pragma needs some macro expansion
     476             :    hacks to accept this (this is doable).  After that workaround, this
     477             :    still doesn't work because, due to tooling limitations, the pragma
     478             :    path is relative to the source file directory and the FD_IMPORT path
     479             :    is relative to the make directory (working around this would
     480             :    require a __FILE__-like directive for the source code directory base
     481             :    path).  Even if that did exist, it might still not work because
     482             :    out-of-tree builds often require some substitutions to the gcc -M
     483             :    generated dependencies that this might not pick up (at least not
     484             :    without some build system surgery).  And then it still wouldn't work
     485             :    because gcc -M seems to ignore all of this anyways (which is the
     486             :    actual show stopper as this pragma does something subtly different
     487             :    than what the name suggests and there isn't any obvious support for a
     488             :    "pseudo-include".)  Another reminder that make clean and fast builds
     489             :    are our friend. */
     490             : 
     491             : #if defined(__ELF__)
     492             : 
     493             : #define FD_IMPORT( name, path, type, lg_align, footer )      \
     494             :   __asm__( ".section .rodata,\"a\",@progbits\n"              \
     495             :            ".type " #name ",@object\n"                       \
     496             :            ".globl " #name "\n"                              \
     497             :            FD_ASM_LG_ALIGN(lg_align)                         \
     498             :            #name ":\n"                                       \
     499             :            ".incbin \"" path "\"\n"                          \
     500             :            footer "\n"                                       \
     501             :            ".size " #name ",. - " #name "\n"                 \
     502             :            "_fd_import_" #name "_sz = . - " #name "\n"       \
     503             :            ".type " #name "_sz,@object\n"                    \
     504             :            ".globl " #name "_sz\n"                           \
     505             :            FD_ASM_LG_ALIGN(3)                                \
     506             :            #name "_sz:\n"                                    \
     507             :            ".quad _fd_import_" #name "_sz\n"                 \
     508             :            ".size " #name "_sz,8\n"                          \
     509             :            ".previous\n" );                                  \
     510             :   extern type  const name[] __attribute__((aligned(1<<(lg_align)))); \
     511             :   extern ulong const name##_sz
     512             : 
     513             : #elif defined(__MACH__)
     514             : 
     515             : #define FD_IMPORT( name, path, type, lg_align, footer )      \
     516             :   __asm__( ".section __DATA,__const\n"                       \
     517             :            ".globl _" #name "\n"                             \
     518             :            FD_ASM_LG_ALIGN(lg_align)                         \
     519             :            "_" #name ":\n"                                   \
     520             :            ".incbin \"" path "\"\n"                          \
     521             :            footer "\n"                                       \
     522             :            "_fd_import_" #name "_sz = . - _" #name "\n"      \
     523             :            ".globl _" #name "_sz\n"                          \
     524             :            FD_ASM_LG_ALIGN(3)                                \
     525             :            "_" #name "_sz:\n"                                \
     526             :            ".quad _fd_import_" #name "_sz\n"                 \
     527             :            ".previous\n" );                                  \
     528             :   extern type  const name[] __attribute__((aligned(1<<(lg_align)))); \
     529             :   extern ulong const name##_sz
     530             : 
     531             : #endif
     532             : 
     533             : /* FD_IMPORT_{BINARY,CSTR} are common cases for FD_IMPORT.
     534             : 
     535             :    In BINARY, the file is imported into the object file and exposed to
     536             :    the caller as uchar binary data.  name_sz will be the number of
     537             :    bytes in the file at time of import.  name will have 128 byte
     538             :    alignment.
     539             : 
     540             :    In CSTR, the file is imported into the object file with a '\0'
     541             :    termination appended and exposed to the caller as a cstr.  Assuming
     542             :    the file is text (i.e. has no internal '\0's), strlen(name) will be
     543             :    the number of bytes in the file and name_sz will be strlen(name)+1.
     544             :    name can have arbitrary alignment. */
     545             : 
     546             : #ifdef FD_IMPORT
     547             : #define FD_IMPORT_BINARY(name, path) FD_IMPORT( name, path, uchar, 7, ""        )
     548             : #define FD_IMPORT_CSTR(  name, path) FD_IMPORT( name, path,  char, 1, ".byte 0" )
     549             : #endif
     550             : 
     551             : /* Optimizer tricks ***************************************************/
     552             : 
     553             : /* FD_RESTRICT is a pointer modifier to designate a pointer as
     554             :    restricted.  Hoops jumped because C++-17 still doesn't understand
     555             :    restrict ... sigh */
     556             : 
     557             : #ifndef FD_RESTRICT
     558             : #ifdef __cplusplus
     559             : #define FD_RESTRICT __restrict
     560             : #else
     561             : #define FD_RESTRICT restrict
     562             : #endif
     563             : #endif
     564             : 
     565             : /* fd_type_pun(p), fd_type_pun_const(p):  These allow use of type
     566             :    punning while keeping strict aliasing optimizations enabled (e.g.
     567             :    some UNIX APIs, like sockaddr related APIs are dependent on type
     568             :    punning).  These allow these API's to be used cleanly while keeping
     569             :    strict aliasing optimizations enabled and strict alias checking done. */
     570             : 
     571             : static inline void *
     572   802528881 : fd_type_pun( void * p ) {
     573   802528881 :   __asm__( "# fd_type_pun @" FD_SRC_LOCATION : "+r" (p) :: "memory" );
     574   802528881 :   return p;
     575   802528881 : }
     576             : 
     577             : static inline void const *
     578   236738653 : fd_type_pun_const( void const * p ) {
     579   236738653 :   __asm__( "# fd_type_pun_const @" FD_SRC_LOCATION : "+r" (p) :: "memory" );
     580   236738653 :   return p;
     581   236738653 : }
     582             : 
     583             : /* FD_{LIKELY,UNLIKELY}(c):  Evaluates c and returns whether it is
     584             :    logical true/false as long (1L/0L).  It also hints to the optimizer
     585             :    whether it should optimize for the case of c evaluating as
     586             :    true/false. */
     587             : 
     588 >13499*10^7 : #define FD_LIKELY(c)   __builtin_expect( !!(c), 1L )
     589 >54858*10^7 : #define FD_UNLIKELY(c) __builtin_expect( !!(c), 0L )
     590             : 
     591             : /* FD_FN_PURE hints to the optimizer that the function, roughly
     592             :    speaking, does not have side effects.  As such, the compiler can
     593             :    replace a call to the function with the result of an earlier call to
     594             :    that function provided the inputs and memory used haven't changed.
     595             : 
     596             :    IMPORTANT SAFETY TIP!  Recent compilers seem to take an undocumented
     597             :    and debatable stance that pure functions do no writes to memory.
     598             :    This is a sufficient condition for the above but not a necessary one.
     599             : 
     600             :    Consider, for example, the real world case of an otherwise pure
     601             :    function that uses pass-by-reference to return more than one value
     602             :    (an unpleasant practice that is sadly often necessary because C/C++,
     603             :    compilers and underlying platform ABIs are very bad at helping
     604             :    developers simply and clearly express their intent to return multiple
     605             :    values and then generate good assembly for such).
     606             : 
     607             :    If called multiple times sequentially, all but the first call to such
     608             :    a "pure" function could be optimized away because the non-volatile
     609             :    memory writes done in all but the 1st call for the
     610             :    pass-by-reference-returns write the same value to normal memory that
     611             :    was written on the 1st call.  That is, these calls return the same
     612             :    value for their direct return and do writes that do not have any
     613             :    visible effect.
     614             : 
     615             :    Thus, while it is safe for the compiler to eliminate all but the
     616             :    first call via techniques like common subexpression elimination, it
     617             :    is not safe for the compiler to infer that the first call did no
     618             :    writes.
     619             : 
     620             :    But recent compilers seem to do exactly that.
     621             : 
     622             :    Sigh ... we can't use FD_FN_PURE on such functions because of all the
     623             :    above linguistic, compiler, documentation and ABI infinite sadness.
     624             : 
     625             :    TL;DR To be safe against the above vagaries, recommend using
     626             :    FD_FN_PURE to annotate functions that do no memory writes (including
     627             :    trivial memory writes) and try to design HPC APIs to avoid returning
     628             :    multiple values as much as possible.
     629             : 
     630             :    Followup: FD_FN_PURE expands to nothing by default given additional
     631             :    confusion between how current languages, compilers, CI, fuzzing, and
     632             :    developers interpret this function attribute.  We keep it around
     633             :    given it documents the intent of various APIs and so it can be
     634             :    manually enabled to find implementation surprises during bullet
     635             :    proofing (e.g. under compiler options like "extra-brutality").
     636             :    Hopefully, pure function attributes will someday be handled
     637             :    more consistently across the board. */
     638             : 
     639             : #ifndef FD_FN_PURE
     640             : #define FD_FN_PURE
     641             : #endif
     642             : 
     643             : /* FD_FN_CONST is like pure but also, even stronger, indicates that the
     644             :    function does not depend on the state of memory.  See note above
     645             :    about why this expands to nothing by default. */
     646             : 
     647             : #ifndef FD_FN_CONST
     648             : #define FD_FN_CONST
     649             : #endif
     650             : 
     651             : /* FD_FN_UNUSED indicates that it is okay if the function with static
     652             :    linkage is not used.  Allows working around -Winline in header only
     653             :    APIs where the compiler decides not to actually inline the function.
     654             :    (This belief, frequently promulgated by anti-macro cults, that "An
     655             :    Inline Function is As Fast As a Macro" ... an entire section in gcc's
     656             :    documentation devoted to it in fact ... remains among the biggest
     657             :    lies in computer science.  Yes, an inline function is as fast as a
     658             :    macro ... when the compiler actually decides to treat the inline
     659             :    keyword more than just for entertainment purposes only.  Which, as
     660             :    -Winline proves, it frequently doesn't.  Sigh ... force_inline like
     661             :    compiler extensions might be an alternative here but they have their
     662             :    own portability issues.) */
     663             : 
     664          48 : #define FD_FN_UNUSED __attribute__((unused))
     665             : 
     666             : /* FD_FN_UNSANITIZED tells the compiler to disable AddressSanitizer and
     667             :    UndefinedBehaviorSanitizer instrumentation.  For some functions, this
     668             :    can improve instrumented compile time by ~30x. */
     669             : 
     670             : #if FD_HAS_MSAN
     671             : #define FD_FN_UNSANITIZED __attribute__((no_sanitize("memory")))
     672             : #else
     673             : #define FD_FN_UNSANITIZED __attribute__((no_sanitize("address", "undefined")))
     674             : #endif
     675             : 
     676             : /* FD_FN_SENSITIVE instructs the compiler to sanitize sensitive functions.
     677             :    https://eprint.iacr.org/2023/1713 (Sec 3.2)
     678             :    - Clear all registers with __attribute__((zero_call_used_regs("all")))
     679             :    - Clear stack with __attribute__((strub)), available in gcc 14+ */
     680             : 
     681             : #if __has_attribute(strub)
     682             : #define FD_FN_SENSITIVE __attribute__((strub)) __attribute__((zero_call_used_regs("all")))
     683             : #elif __has_attribute(zero_call_used_regs)
     684             : #define FD_FN_SENSITIVE __attribute__((zero_call_used_regs("all")))
     685             : #else
     686             : #define FD_FN_SENSITIVE
     687             : #endif
     688             : 
     689             : /* FD_PARAM_UNUSED indicates that it is okay if the function parameter is not
     690             :    used. */
     691             : 
     692             : #define FD_PARAM_UNUSED __attribute__((unused))
     693             : 
     694             : /* FD_TYPE_PACKED indicates that a type is to be packed, resetting its alignment
     695             :    to 1. */
     696             : 
     697             : #define FD_TYPE_PACKED __attribute__((packed))
     698             : 
     699             : /* FD_WARN_UNUSED tells the compiler the result (from a function) should
     700             :    be checked. This is useful to force callers to either check the result
     701             :    or deliberately and explicitly ignore it. Good for result codes and
     702             :    errors */
     703             : 
     704             : #define FD_WARN_UNUSED __attribute__ ((warn_unused_result))
     705             : 
     706             : /* FD_FALLTHRU tells the compiler that a case in a switch falls through
     707             :    to the next case. This avoids the compiler complaining, in cases where
     708             :    it is an intentional fall through.
     709             :    The "while(0)" avoids a compiler complaint in the event the case
     710             :    has no statement, example:
     711             :      switch( return_code ) {
     712             :        case RETURN_CASE_1: FD_FALLTHRU;
     713             :        case RETURN_CASE_2: FD_FALLTHRU;
     714             :        case RETURN_CASE_3:
     715             :          case_123();
     716             :        default:
     717             :          case_other();
     718             :      }
     719             : 
     720             :    See C++17 [[fallthrough]] and gcc __attribute__((fallthrough)) */
     721             : 
     722             : #define FD_FALLTHRU while(0) __attribute__((fallthrough))
     723             : 
     724             : /* FD_COMPILER_FORGET(var):  Tells the compiler that it shouldn't use
     725             :    any knowledge it has about the provided register-compatible variable
     726             :    var for optimizations going forward (i.e. the variable has changed in
     727             :    a deterministic but unknown-to-the-compiler way where the actual
     728             :    change is the identity operation).  Useful for inhibiting various
     729             :    branch nest misoptimizations (compilers unfortunately tend to
     730             :    radically underestimate the impact in raw average performance and
     731             :    jitter and the probability of branch mispredicts or the cost to the
     732             :    CPU of having lots of branches).  This is not asm volatile (use
     733             :    UNPREDICTABLE below for that) and has no clobbers.  So if var is not
     734             :    used after the forget, the compiler can optimize the FORGET away
     735             :    (along with operations preceding it used to produce var). */
     736             : 
     737 27014383788 : #define FD_COMPILER_FORGET(var) __asm__( "# FD_COMPILER_FORGET(" #var ")@" FD_SRC_LOCATION : "+r" (var) )
     738             : 
     739             : /* FD_COMPILER_UNPREDICTABLE(var):  Same as FD_COMPILER_FORGET(var) but
     740             :    the provided variable has changed in a non-deterministic way from the
     741             :    compiler's POV (e.g. the value in the variable on output should not
     742             :    be treated as a compile time constant even if it is one
     743             :    linguistically).  Useful for suppressing unwanted
     744             :    compile-time-const-based optimizations like hoisting operations with
     745             :    useful CPU side effects out of a critical loop. */
     746             : 
     747    74083562 : #define FD_COMPILER_UNPREDICTABLE(var) __asm__ __volatile__( "# FD_COMPILER_UNPREDICTABLE(" #var ")@" FD_SRC_LOCATION : "+m,r" (var) )
     748             : 
     749             : /* Atomic tricks ******************************************************/
     750             : 
     751             : /* FD_COMPILER_MFENCE():  Tells the compiler that it can't move any
     752             :    memory operations (load or store) from before the MFENCE to after the
     753             :    MFENCE (and vice versa).  The processor itself might still reorder
     754             :    around the fence though (that requires platform specific fences). */
     755             : 
     756 97883522397 : #define FD_COMPILER_MFENCE() __asm__ __volatile__( "# FD_COMPILER_MFENCE()@" FD_SRC_LOCATION ::: "memory" )
     757             : 
     758             : /* FD_HW_MFENCE():  A full hardware memory fence.  All prior stores
     759             :    are globally visible before any subsequent loads execute.  Use
     760             :    when a compiler fence is insufficient, e.g. epoch-based safe
     761             :    reclamation where a store must be visible to another core before
     762             :    a dependent load on this core (StoreLoad barrier). */
     763             : 
     764             : #if FD_HAS_X86
     765             : #define FD_HW_MFENCE()    __asm__ __volatile__( "lock addl $0, (%%rsp)" ::: "memory", "cc" )
     766           0 : #define FD_HW_MFENCE_LD() FD_COMPILER_MFENCE()
     767             : #define FD_HW_MFENCE_ST() FD_COMPILER_MFENCE()
     768             : #elif FD_HAS_ARM
     769             : #define FD_HW_MFENCE()    __asm__ __volatile__( "dmb ish" ::: "memory" )
     770             : #define FD_HW_MFENCE_LD() __asm__ __volatile__( "dmb ishld" ::: "memory" )
     771             : #define FD_HW_MFENCE_ST() __asm__ __volatile__( "dmb ishst" ::: "memory" )
     772             : #else
     773             : #define FD_HW_MFENCE()    __sync_synchronize()
     774             : #define FD_HW_MFENCE_LD() __sync_synchronize()
     775             : #define FD_HW_MFENCE_ST() __sync_synchronize()
     776             : #endif
     777             : 
     778             : /* FD_SPIN_PAUSE():  Yields the logical core of the calling thread to
     779             :    the other logical cores sharing the same underlying physical core for
     780             :    a few clocks without yielding it to the operating system scheduler.
     781             :    Typically useful for shared memory spin polling loops, especially if
     782             :    hyperthreading is in use.  IMPORTANT SAFETY TIP!  This might act as a
     783             :    FD_COMPILER_MFENCE on some combinations of toolchains and targets
     784             :    (e.g. gcc documents that __builtin_ia32_pause also does a compiler
     785             :    memory fence) but this should not be relied upon for portable code
     786             :    (consider making this a compiler memory fence on all platforms?) */
     787             : 
     788             : #if FD_HAS_X86
     789 16605896077 : #define FD_SPIN_PAUSE() __builtin_ia32_pause()
     790             : #elif FD_HAS_ARM
     791             : #define FD_SPIN_PAUSE() __asm__ __volatile__( "yield" ::: "memory" )
     792             : #else
     793             : #define FD_SPIN_PAUSE() ((void)0)
     794             : #endif
     795             : 
     796             : /* FD_YIELD():  Yields the logical core of the calling thread to the
     797             :    operating system scheduler if a hosted target and does a spin pause
     798             :    otherwise. */
     799             : 
     800             : #if FD_HAS_HOSTED
     801     3578635 : #define FD_YIELD() fd_yield()
     802             : #else
     803             : #define FD_YIELD() FD_SPIN_PAUSE()
     804             : #endif
     805             : 
     806             : /* FD_VOLATILE_CONST(x):  Tells the compiler it is not able to predict the
     807             :    value obtained by dereferencing x and that dereferencing x might have
     808             :    other side effects (e.g. maybe another thread could change the value
     809             :    and the compiler has no way of knowing this).  Generally speaking,
     810             :    the volatile keyword is broken linguistically.  Volatility is not a
     811             :    property of the variable but of the dereferencing of a variable (e.g.
     812             :    what is volatile from the POV of a reader of a shared variable is not
     813             :    necessarily volatile from the POV of a writer of that shared variable in
     814             :    a different thread). */
     815             : 
     816  1496026904 : #define FD_VOLATILE_CONST(x) (*((volatile const __typeof__((x)) *)&(x)))
     817             : 
     818             : /* FD_VOLATILE(x): tells the compiler it is not able to predict the effect
     819             :    of modifying x and that dereferencing x might have other side effects
     820             :    (e.g. maybe another thread is spinning on x waiting for its value to
     821             :    change and the compiler has no way of knowing this). */
     822             : 
     823  1052033829 : #define FD_VOLATILE(x) (*((volatile __typeof__((x)) *)&(x)))
     824             : 
     825             : #if FD_HAS_ATOMIC
     826             : 
     827             : /* FD_ATOMIC_FETCH_AND_{ADD,SUB,OR,AND,XOR}(p,v):
     828             : 
     829             :    FD_ATOMIC_FETCH_AND_ADD(p,v) does
     830             :      f = *p;
     831             :      *p = f + v
     832             :      return f;
     833             :    as a single atomic operation.  Similarly for the other variants. */
     834             : 
     835     6607921 : #define FD_ATOMIC_FETCH_AND_ADD(p,v) __sync_fetch_and_add( (p), (v) )
     836     6615147 : #define FD_ATOMIC_FETCH_AND_SUB(p,v) __sync_fetch_and_sub( (p), (v) )
     837    82305210 : #define FD_ATOMIC_FETCH_AND_OR( p,v) __sync_fetch_and_or(  (p), (v) )
     838             : #define FD_ATOMIC_FETCH_AND_AND(p,v) __sync_fetch_and_and( (p), (v) )
     839             : #define FD_ATOMIC_FETCH_AND_XOR(p,v) __sync_fetch_and_xor( (p), (v) )
     840             : 
     841             : /* FD_ATOMIC_{ADD,SUB,OR,AND,XOR}_AND_FETCH(p,v):
     842             : 
     843             :    FD_ATOMIC_ADD_AND_FETCH(p,v) does
     844             :      r = *p + v;
     845             :      *p = r;
     846             :      return r;
     847             :    as a single atomic operation.  Similarly for the other variants. */
     848             : 
     849             : #define FD_ATOMIC_ADD_AND_FETCH(p,v) __sync_add_and_fetch( (p), (v) )
     850             : #define FD_ATOMIC_SUB_AND_FETCH(p,v) __sync_sub_and_fetch( (p), (v) )
     851             : #define FD_ATOMIC_OR_AND_FETCH( p,v) __sync_or_and_fetch(  (p), (v) )
     852             : #define FD_ATOMIC_AND_AND_FETCH(p,v) __sync_and_and_fetch( (p), (v) )
     853             : #define FD_ATOMIC_XOR_AND_FETCH(p,v) __sync_xor_and_fetch( (p), (v) )
     854             : 
     855             : /* FD_ATOMIC_CAS(p,c,s):
     856             : 
     857             :    o = FD_ATOMIC_CAS(p,c,s) conceptually does:
     858             :      o = *p;
     859             :      if( o==c ) *p = s;
     860             :      return o
     861             :    as a single atomic operation. */
     862             : 
     863   325475925 : #define FD_ATOMIC_CAS(p,c,s) __sync_val_compare_and_swap( (p), (c), (s) )
     864             : 
     865             : /* FD_ATOMIC_XCHG(p,v):
     866             : 
     867             :    o = FD_ATOMIC_XCHG( p, v ) conceptually does:
     868             :      o = *p
     869             :      *p = v
     870             :      return o
     871             :    as a single atomic operation. */
     872             : 
     873    13597806 : #define FD_ATOMIC_XCHG(p,v) __atomic_exchange_n( (p), (v), __ATOMIC_SEQ_CST )
     874             : 
     875             : #endif /* FD_HAS_ATOMIC */
     876             : 
     877             : /* FD_TL:  This indicates that the variable should be thread local.
     878             : 
     879             :    FD_ONCE_{BEGIN,END}:  The block:
     880             : 
     881             :      FD_ONCE_BEGIN {
     882             :        ... code ...
     883             :      } FD_ONCE_END
     884             : 
     885             :    linguistically behaves like:
     886             : 
     887             :      do {
     888             :        ... code ...
     889             :      } while(0)
     890             : 
     891             :    But provides a low overhead guarantee that:
     892             :      - The block will be executed at most once over all threads
     893             :        in a process (i.e. the set of threads which share global
     894             :        variables).
     895             :      - No thread in a process that encounters the block will continue
     896             :        past it until it has executed once.
     897             : 
     898             :    This implies that caller promises a ONCE block will execute in a
     899             :    finite time.  (Meant for doing simple lightweight initializations.)
     900             : 
     901             :    It is okay to nest ONCE blocks.  The thread that executes the
     902             :    outermost will execute all the nested ONCE blocks as part of
     903             :    executing the outermost.
     904             : 
     905             :    A ONCE implicitly provides a compiler memory fence to reduce the risk
     906             :    that the compiler will assume that operations done in the once block
     907             :    on another thread have not been done (e.g. propagating pre-once block
     908             :    variable values into post-once block code).  It is up to the user to
     909             :    provide any necessary hardware fencing (usually not necessary).
     910             : 
     911             :    FD_THREAD_ONCE_{BEGIN,END}:  The block:
     912             : 
     913             :      FD_THREAD_ONCE_BEGIN {
     914             :        ... code ...
     915             :      } FD_THREAD_ONCE_END;
     916             : 
     917             :    is similar except the guarantee is that the block only covers the
     918             :    invoking thread and it does not provide any fencing.  If a thread
     919             :    once begin is nested inside a once begin, that thread once begin will
     920             :    only be executed on the thread that executes the thread once begin.
     921             :    It is similarly okay to nest a ONCE block inside a THREAD_ONCE block.
     922             : 
     923             :    FD_TURNSTILE_{BEGIN,BLOCKED,END} implement a turnstile for all
     924             :    threads in a process.  Only one thread can be in the turnstile at a
     925             :    time.  Usage:
     926             : 
     927             :      FD_TURNSTILE_BEGIN(blocking) {
     928             : 
     929             :        ... At this point, we are the only thread executing this block of
     930             :        ... code.
     931             :        ...
     932             :        ... Do operations that must be done by threads one-at-a-time
     933             :        ... here.
     934             :        ...
     935             :        ... Because compiler memory fences are done just before entering
     936             :        ... and after exiting this block, there is typically no need to
     937             :        ... use any atomics / volatile / fencing here.  That is, we can
     938             :        ... just write "normal" code on platforms where writes to memory
     939             :        ... become visible to other threads in the order in which they
     940             :        ... were issued in the machine code (e.g. x86) as others will not
     941             :        ... proceed with this block until they exit it.  YMMV for non-x86
     942             :        ... platforms (probably need additional hardware store fences in
     943             :        ... these macros).
     944             :        ...
     945             :        ... It is safe to use "break" and/or "continue" within this
     946             :        ... block.  The block will exit with the appropriate compiler
     947             :        ... fencing and unlocking.  Execution will resume immediately
     948             :        ... after FD_TURNSTILE_END.
     949             : 
     950             :        ... IMPORTANT SAFETY TIP!  DO NOT RETURN FROM THIS BLOCK.
     951             : 
     952             :      } FD_TURNSTILE_BLOCKED {
     953             : 
     954             :        ... At this point, there was another thread in the turnstile when
     955             :        ... we tried to enter the turnstile.
     956             :        ...
     957             :        ... Handle blocked here.
     958             :        ...
     959             :        ... On exiting this block, if blocking was zero, we will resume
     960             :        ... execution immediately after FD_TURNSTILE_END.  If blocking
     961             :        ... was non-zero, we will resume execution immediately before
     962             :        ... FD_TURNSTILE_BEGIN (e.g. we will retry again after a short
     963             :        ... spin pause).
     964             :        ...
     965             :        ... It is safe to use "break" and/or "continue" within this
     966             :        ... block.  Both will exit this block and resume execution
     967             :        ... at the location indicated as per what blocking specified
     968             :        ... when the turnstile was entered.
     969             :        ...
     970             :        ... It is technically safe to return from this block but
     971             :        ... also extremely gross.
     972             : 
     973             :      } FD_TURNSTILE_END; */
     974             : 
     975             : #if FD_HAS_THREADS /* Potentially more than one thread in the process */
     976             : 
     977             : #ifndef FD_TL
     978             : #define FD_TL __thread
     979             : #endif
     980             : 
     981        4490 : #define FD_ONCE_BEGIN do {                                                \
     982        4490 :     FD_COMPILER_MFENCE();                                                 \
     983        4490 :     static volatile int _fd_once_block_state = 0;                         \
     984        4490 :     for(;;) {                                                             \
     985        4490 :       int _fd_once_block_tmp = _fd_once_block_state;                      \
     986        4490 :       if( FD_LIKELY( _fd_once_block_tmp>0 ) ) break;                      \
     987        4490 :       if( FD_LIKELY( !_fd_once_block_tmp ) &&                             \
     988        2683 :           FD_LIKELY( !FD_ATOMIC_CAS( &_fd_once_block_state, 0, -1 ) ) ) { \
     989        2683 :         do
     990             : 
     991             : #define FD_ONCE_END               \
     992        2683 :         while(0);                 \
     993        2683 :         FD_COMPILER_MFENCE();     \
     994           0 :         _fd_once_block_state = 1; \
     995           0 :         break;                    \
     996        2683 :       }                           \
     997        2683 :       FD_YIELD();                 \
     998           0 :     }                             \
     999        4490 :   } while(0)
    1000             : 
    1001          36 : #define FD_THREAD_ONCE_BEGIN do {                       \
    1002          36 :     static FD_TL int _fd_thread_once_block_state = 0;   \
    1003          36 :     if( FD_UNLIKELY( !_fd_thread_once_block_state ) ) { \
    1004           9 :       do
    1005             : 
    1006             : #define FD_THREAD_ONCE_END             \
    1007           9 :       while(0);                        \
    1008           9 :       _fd_thread_once_block_state = 1; \
    1009           9 :     }                                  \
    1010          36 :   } while(0)
    1011             : 
    1012           9 : #define FD_TURNSTILE_BEGIN(blocking) do {                               \
    1013           9 :     static volatile int _fd_turnstile_state    = 0;                     \
    1014           9 :     int                 _fd_turnstile_blocking = (blocking);            \
    1015           9 :     for(;;) {                                                           \
    1016           9 :       int _fd_turnstile_tmp = _fd_turnstile_state;                      \
    1017           9 :       if( FD_LIKELY( !_fd_turnstile_tmp ) &&                            \
    1018           9 :           FD_LIKELY( !FD_ATOMIC_CAS( &_fd_turnstile_state, 0, 1 ) ) ) { \
    1019           9 :         FD_COMPILER_MFENCE();                                           \
    1020           9 :         do
    1021             : 
    1022             : #define FD_TURNSTILE_BLOCKED     \
    1023           9 :         while(0);                \
    1024           9 :         FD_COMPILER_MFENCE();    \
    1025           9 :         _fd_turnstile_state = 0; \
    1026           9 :         FD_COMPILER_MFENCE();    \
    1027           9 :         break;                   \
    1028           9 :       }                          \
    1029           9 :       FD_COMPILER_MFENCE();      \
    1030           0 :       do
    1031             : 
    1032             : #define FD_TURNSTILE_END                                             \
    1033           0 :       while(0);                                                      \
    1034           0 :       FD_COMPILER_MFENCE();                                          \
    1035           0 :       if( !_fd_turnstile_blocking ) break; /* likely compile time */ \
    1036           0 :       FD_SPIN_PAUSE();                                               \
    1037           0 :     }                                                                \
    1038           9 :   } while(0)
    1039             : 
    1040             : #else /* Only one thread in the process */
    1041             : 
    1042             : #ifndef FD_TL
    1043             : #define FD_TL /**/
    1044             : #endif
    1045             : 
    1046             : #define FD_ONCE_BEGIN do {                       \
    1047             :     static int _fd_once_block_state = 0;         \
    1048             :     if( FD_UNLIKELY( !_fd_once_block_state ) ) { \
    1049             :       do
    1050             : 
    1051             : #define FD_ONCE_END             \
    1052             :       while(0);                 \
    1053             :       _fd_once_block_state = 1; \
    1054             :     }                           \
    1055             :   } while(0)
    1056             : 
    1057             : #define FD_THREAD_ONCE_BEGIN FD_ONCE_BEGIN
    1058             : #define FD_THREAD_ONCE_END   FD_ONCE_END
    1059             : 
    1060             : #define FD_TURNSTILE_BEGIN(blocking) do { \
    1061             :     (void)(blocking);                     \
    1062             :     FD_COMPILER_MFENCE();                 \
    1063             :     if( 1 ) {                             \
    1064             :       do
    1065             : 
    1066             : #define FD_TURNSTILE_BLOCKED \
    1067             :       while(0);              \
    1068             :     } else {                 \
    1069             :       do
    1070             : 
    1071             : #define FD_TURNSTILE_END  \
    1072             :       while(0);           \
    1073             :     }                     \
    1074             :     FD_COMPILER_MFENCE(); \
    1075             :   } while(0)
    1076             : 
    1077             : #endif
    1078             : 
    1079             : /* An ideal fd_clock_func_t is a function such that:
    1080             : 
    1081             :      long dx = clock( args );
    1082             :      ... stuff ...
    1083             :      dx = clock( args ) - dx;
    1084             : 
    1085             :    yields a strictly positive dx where dx approximates the amount of
    1086             :    wallclock time elapsed on the caller in some clock specific unit
    1087             :    (e.g. nanoseconds, CPU ticks, etc) for a reasonable amount of "stuff"
    1088             :    (including no "stuff").  args allows arbitrary clock specific context
    1089             :    to be passed to the clock implementation.  (clocks that need a non-const
    1090             :    args can cast away the const in the implementation or cast the
    1091             :    function pointer as necessary.) */
    1092             : 
    1093             : typedef long (*fd_clock_func_t)( void const * args );
    1094             : 
    1095             : FD_PROTOTYPES_BEGIN
    1096             : 
    1097             : /* fd_memcpy(d,s,sz):  On modern x86 in some circumstances, rep mov will
    1098             :    be faster than memcpy under the hood (basically due to RFO /
    1099             :    read-for-ownership optimizations in the cache protocol under the hood
    1100             :    that aren't easily done from the ISA ... see Intel docs on enhanced
    1101             :    rep mov).  Compile time configurable though as this is not always
    1102             :    true.  So application can tune to taste.  Hard to beat rep mov for
    1103             :    code density though (2 bytes) and pretty hard to beat in situations
    1104             :    needing a completely generic memcpy.  But it can be beaten in
    1105             :    specialized situations for the usual reasons. */
    1106             : 
    1107             : /* FIXME: CONSIDER MEMCMP TOO! */
    1108             : /* FIXME: CONSIDER MEMCPY RELATED FUNC ATTRS */
    1109             : 
    1110             : #ifndef FD_USE_ARCH_MEMCPY
    1111             : #define FD_USE_ARCH_MEMCPY 0
    1112             : #endif
    1113             : 
    1114             : #if FD_HAS_X86 && FD_USE_ARCH_MEMCPY && !defined(CBMC) && !FD_HAS_DEEPASAN && !FD_HAS_MSAN
    1115             : 
    1116             : static inline void *
    1117             : fd_memcpy( void       * FD_RESTRICT d,
    1118             :            void const * FD_RESTRICT s,
    1119   478934291 :            ulong                    sz ) {
    1120   478934291 :   void * p = d;
    1121   478934291 :   __asm__ __volatile__( "rep movsb" : "+D" (p), "+S" (s), "+c" (sz) :: "memory" );
    1122   478934291 :   return d;
    1123   478934291 : }
    1124             : 
    1125             : #elif FD_HAS_MSAN
    1126             : 
    1127             : void * __msan_memcpy( void * dest, void const * src, ulong n );
    1128             : 
    1129             : static inline void *
    1130             : fd_memcpy( void       * FD_RESTRICT d,
    1131             :            void const * FD_RESTRICT s,
    1132             :            ulong                    sz ) {
    1133             :   return __msan_memcpy( d, s, sz );
    1134             : }
    1135             : 
    1136             : #else
    1137             : 
    1138             : static inline void *
    1139             : fd_memcpy( void       * FD_RESTRICT d,
    1140             :            void const * FD_RESTRICT s,
    1141   857668758 :            ulong                    sz ) {
    1142             : #if defined(CBMC) || FD_HAS_ASAN
    1143             :   if( FD_UNLIKELY( !sz ) ) return d; /* Standard says sz 0 is UB, uncomment if target is insane and doesn't treat sz 0 as a nop */
    1144             : #endif
    1145   857668758 :   return memcpy( d, s, sz );
    1146   857668758 : }
    1147             : 
    1148             : #endif
    1149             : 
    1150             : /* fd_memset(d,c,sz): architecturally optimized memset.  See fd_memcpy
    1151             :    for considerations. */
    1152             : 
    1153             : /* FIXME: CONSIDER MEMSET RELATED FUNC ATTRS */
    1154             : 
    1155             : #ifndef FD_USE_ARCH_MEMSET
    1156             : #define FD_USE_ARCH_MEMSET 0
    1157             : #endif
    1158             : 
    1159             : #if FD_HAS_X86 && FD_USE_ARCH_MEMSET && !defined(CBMC) && !FD_HAS_DEEPASAN && !FD_HAS_MSAN
    1160             : 
    1161             : static inline void *
    1162             : fd_memset( void  * d,
    1163             :            int     c,
    1164   116024976 :            ulong   sz ) {
    1165   116024976 :   void * p = d;
    1166   116024976 :   __asm__ __volatile__( "rep stosb" : "+D" (p), "+c" (sz) : "a" (c) : "memory" );
    1167   116024976 :   return d;
    1168   116024976 : }
    1169             : 
    1170             : #else
    1171             : 
    1172             : static inline void *
    1173             : fd_memset( void  * d,
    1174             :            int     c,
    1175   230968994 :            ulong   sz ) {
    1176             : # ifdef CBMC
    1177             :   if( FD_UNLIKELY( !sz ) ) return d; /* See fd_memcpy note */
    1178             : # endif
    1179   230968994 :   return memset( d, c, sz );
    1180   230968994 : }
    1181             : 
    1182             : #endif
    1183             : 
    1184             : /* Calling fd_memzero_explicit will fill the provided region with zeroes.
    1185             :    It is guaranteed to not be optimized away.  */
    1186             : FD_FN_UNUSED static inline void
    1187             : fd_memzero_explicit( void * d,
    1188     8428251 :                     ulong  sz ) {
    1189             :    /* We don't want to depend on explicit_bzero or memset_s, so the simplest
    1190             :       way to ensure the memset is not optimized away is to use a compiler fence,
    1191             :       identical to how explicit_bzero is implemented.
    1192             :       https://elixir.bootlin.com/glibc/glibc-2.40/source/string/explicit_bzero.c#L33 */
    1193     8428251 :    memset( d, 0, sz );
    1194     8428251 :    __asm__ __volatile__( "" ::: "memory" );
    1195     8428251 : }
    1196             : 
    1197             : /* fd_memeq(s0,s1,sz):  Compares two blocks of memory.  Returns 1 if
    1198             :    equal or sz is zero and 0 otherwise.  No memory accesses made if sz
    1199             :    is zero (pointers may be invalid).  On x86, uses repe cmpsb which is
    1200             :    preferable to __builtin_memcmp in some cases. */
    1201             : 
    1202             : #ifndef FD_USE_ARCH_MEMEQ
    1203             : #define FD_USE_ARCH_MEMEQ 0
    1204             : #endif
    1205             : 
    1206             : #if FD_HAS_X86 && FD_USE_ARCH_MEMEQ && defined(__GCC_ASM_FLAG_OUTPUTS__) && __STDC_VERSION__>=199901L
    1207             : 
    1208             : FD_FN_PURE static inline int
    1209             : fd_memeq( void const * s0,
    1210             :           void const * s1,
    1211             :           ulong        sz ) {
    1212             :   /* ZF flag is set and exported in two cases:
    1213             :       a) size is zero (via test)
    1214             :       b) buffer is equal (via repe cmpsb) */
    1215             :   int r;
    1216             :   __asm__( "test %3, %3;"
    1217             :            "repe cmpsb"
    1218             :          : "=@cce" (r), "+S" (s0), "+D" (s1), "+c" (sz)
    1219             :          : "m" (*(char const (*)[sz]) s0), "m" (*(char const (*)[sz]) s1)
    1220             :          : "cc" );
    1221             :   return r;
    1222             : }
    1223             : 
    1224             : #else
    1225             : 
    1226             : FD_FN_PURE static inline int
    1227             : fd_memeq( void const * s1,
    1228             :           void const * s2,
    1229    15941716 :           ulong        sz ) {
    1230    15941716 :   return 0==memcmp( s1, s2, sz );
    1231    15941716 : }
    1232             : 
    1233             : #endif
    1234             : 
    1235             : /* Returns 1 if all sz bytes starting at s are zero, 0 otherwise. */
    1236             : FD_FN_PURE static inline int
    1237             : fd_mem_iszero( uchar const * s,
    1238         282 :                ulong         sz ) {
    1239         282 :   for( ulong i=0UL; i<sz; i++ ) {
    1240         282 :    if( s[i]!=0 ) return 0;
    1241         282 :   }
    1242           0 :   return 1;
    1243         282 : }
    1244             : 
    1245             : /* fd_hash(seed,buf,sz), fd_hash_memcpy(seed,d,s,sz):  High quality
    1246             :    (full avalanche) high speed variable length buffer -> 64-bit hash
    1247             :    function (fd_hash_memcpy is often as fast as plain memcpy).  Based on
    1248             :    the xxhash-r39 (open source BSD licensed) implementation.  In-place
    1249             :    and out-of-place variants provided (out-of-place variant assumes dst
    1250             :    and src do not overlap).  Caller promises valid input arguments,
    1251             :    cannot fail given valid input arguments.  sz==0 is fine. */
    1252             : 
    1253             : FD_FN_PURE ulong
    1254             : fd_hash( ulong        seed,
    1255             :          void const * buf,
    1256             :          ulong        sz );
    1257             : 
    1258             : ulong
    1259             : fd_hash_memcpy( ulong                    seed,
    1260             :                 void       * FD_RESTRICT d,
    1261             :                 void const * FD_RESTRICT s,
    1262             :                 ulong                    sz );
    1263             : 
    1264             : #ifndef FD_TICKCOUNT_STYLE
    1265             : #if FD_HAS_X86 /* Use RDTSC */
    1266             : #define FD_TICKCOUNT_STYLE 1
    1267             : #elif FD_HAS_ARM /* Use CNTVCT_EL0 */
    1268             : #define FD_TICKCOUNT_STYLE 2
    1269             : #else /* Use portable fallback */
    1270             : #define FD_TICKCOUNT_STYLE 0
    1271             : #endif
    1272             : #endif
    1273             : 
    1274             : #if FD_TICKCOUNT_STYLE==0 /* Portable fallback (slow).  Ticks at 1 ns / tick */
    1275             : 
    1276             : #define fd_tickcount() fd_log_wallclock() /* TODO: fix ugly pre-log usage */
    1277             : 
    1278             : #elif FD_TICKCOUNT_STYLE==1 /* RDTSC (fast) */
    1279             : 
    1280             : /* fd_tickcount:  Reads the hardware invariant tickcounter ("RDTSC").
    1281             :    This monotonically increases at an approximately constant rate
    1282             :    relative to the system wallclock and is synchronous across all CPUs
    1283             :    on a host.
    1284             : 
    1285             :    The rate this ticks at is not precisely defined (see Intel docs for
    1286             :    more details) but it is typically in the ballpark of the CPU base
    1287             :    clock frequency.  The relationship to the wallclock is very well
    1288             :    approximated as linear over short periods of time (i.e. less than a
    1289             :    fraction of a second) and this should not exhibit any sudden changes
    1290             :    in its rate relative to the wallclock.  Notably, its rate is not
    1291             :    directly impacted by CPU clock frequency adaptation / Turbo mode (see
    1292             :    other Intel performance monitoring counters for various CPU cycle
    1293             :    counters).  It can drift over longer periods of time for the usual clock
    1294             :    synchronization reasons.
    1295             : 
    1296             :    This is a reasonably fast O(1) cost (~6-8 ns on recent Intel).
    1297             :    Because of all compiler options and parallel execution going on in
    1298             :    modern CPU cores, other instructions might be reordered around this
    1299             :    by the compiler and/or CPU.  It is up to the user to do lower level
    1300             :    tricks as necessary when the precise location of this in the
    1301             :    execution stream and/or when executed by the CPU is needed.  (This is
    1302             :    often unnecessary as such levels of precision are not frequently
    1303             :    required and often have self-defeating overheads.)
    1304             : 
    1305             :    It is worth noting that RDTSC and/or (even more frequently) lower
    1306             :    level performance counters are often restricted from use in user
    1307             :    space applications.  It is recommended that applications use this
    1308             :    primarily for debugging / performance tuning on unrestricted hosts
    1309             :    and/or when the developer is confident that applications using this
    1310             :    will have appropriate permissions when deployed. */
    1311             : 
    1312  7427714778 : #define fd_tickcount() ((long)__builtin_ia32_rdtsc())
    1313             : 
    1314             : #elif FD_TICKCOUNT_STYLE==2 /* armv8 (fast) */
    1315             : 
    1316             : /* fd_tickcount (ARM): https://developer.arm.com/documentation/ddi0601/2021-12/AArch64-Registers/CNTVCT-EL0--Counter-timer-Virtual-Count-register
    1317             :    Approx 24 MHz on Apple M1. */
    1318             : 
    1319             : static inline long
    1320             : fd_tickcount( void ) {
    1321             :   /* consider using 'isb' */
    1322             :   ulong value;
    1323             :   __asm__ __volatile__ (
    1324             :     "isb\n"
    1325             :     "mrs %0, cntvct_el0\n"
    1326             :     "nop"
    1327             :     : "=r" (value) );
    1328             :   return (long)value;
    1329             : }
    1330             : 
    1331             : #else
    1332             : #error "Unknown FD_TICKCOUNT_STYLE"
    1333             : #endif
    1334             : 
    1335             : long _fd_tickcount( void const * _ ); /* fd_clock_func_t compat */
    1336             : 
    1337             : #if FD_HAS_HOSTED
    1338             : 
    1339             : /* fd_yield yields the calling thread to the operating system scheduler. */
    1340             : 
    1341             : void
    1342             : fd_yield( void );
    1343             : 
    1344             : #endif
    1345             : 
    1346             : #if FD_HAS_ARM
    1347             : 
    1348             : /* fd_arm_stp16 stores two ulongs to a 16-byte memory location.
    1349             :    If LSE2 and p is aligned, is single-copy atomic. */
    1350             : 
    1351             : static inline void
    1352             : fd_arm_stp16( ulong * p,
    1353             :               ulong   a,
    1354             :               ulong   b ) {
    1355             :   __asm__(
    1356             :       "stp %x[a], %x[b], [%[p]]"
    1357             :       :
    1358             :       : [a] "r"(a), [b] "r"(b), [p] "r"(p)
    1359             :       : "memory"
    1360             :   );
    1361             : }
    1362             : 
    1363             : /* fd_arm_ldp16 loads two ulongs from a 16-byte memory location.
    1364             :    If LSE2 and p is aligned, is single-copy atomic. */
    1365             : 
    1366             : #define fd_arm_ldp16(p_,a_,b_)     \
    1367             :   __asm__(                         \
    1368             :       "ldp %x[a], %x[b], [%[p]]"   \
    1369             :       : [a] "=r"(a_), [b] "=r"(b_) \
    1370             :       : [p] "r"(p_)                \
    1371             :       : "memory"                   \
    1372             :   )
    1373             : 
    1374             : /* fd_arm_ldp16_acq_pc is like fd_arm_ldp16, but with Load-AcquirePC
    1375             :    semantics.  Requires RCPC3. */
    1376             : 
    1377             : #define fd_arm_ldp16_acq_pc(p_,a_,b_) \
    1378             :   __asm__(                            \
    1379             :       "ldiapp %x[a], %x[b], [%[p]]"   \
    1380             :       : [a] "=r"(a_), [b] "=r"(b_)    \
    1381             :       : [p] "r"(p_)                   \
    1382             :       : "memory"                      \
    1383             :   )
    1384             : 
    1385             : #endif /* FD_HAS_ARM */
    1386             : 
    1387             : FD_PROTOTYPES_END
    1388             : 
    1389             : #endif /* HEADER_fd_src_util_fd_util_base_h */

Generated by: LCOV version 1.14