1 : #ifndef HEADER_fd_src_util_fd_util_base_h
2 : #define HEADER_fd_src_util_fd_util_base_h
3 :
4 : /* Base development environment */
5 :
6 : /* Compiler checks ****************************************************/
7 :
8 : #ifdef __cplusplus
9 :
10 : #if __cplusplus<201703L
11 : #error "Firedancer requires C++17 or later"
12 : #endif
13 :
14 : #else
15 :
16 : #if __STDC_VERSION__<201710L
17 : #error "Firedancer requires C Standard version C17 or later"
18 : #endif
19 :
20 : #endif //__cplusplus
21 :
22 : /* Build target capabilities ******************************************/
23 :
24 : /* Different build targets often have different levels of support for
25 : various language and hardware features. The presence of various
26 : features can be tested at preprocessor, compile, or run time via the
27 : below capability macros.
28 :
29 : Code written within the base development environment that does not
30 : exploit any of these capabilities should be broadly portable across
31 : a range of build targets, from on-chain virtual machines to
32 : commodity hosts to custom hardware.
33 :
34 : As such, highly portable yet high performance code is possible by
35 : writing generic implementations that do not exploit any of the below
36 : capabilities as a portable fallback along with build target specific
37 : optimized implementations that are invoked when the build target
38 : supports the appropriate capabilities.
39 :
40 : The base development environment itself provides lots of
41 : functionality to help with implementing portable fallbacks while
42 : making very minimal assumptions about the build targets and zero use
43 : of 3rd party libraries (these might make unknown additional
44 : assumptions about the build target, including availability of a
45 : quality implementation of the library on the build target). */
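
/* For example, the intended pattern is a portable fallback paired with
   a capability gated specialization, as in this illustrative sketch
   (fd_example_mul_hi is a hypothetical name; ulong and uint128 are the
   types declared later in this header):

     static inline ulong
     fd_example_mul_hi( ulong x,
                        ulong y ) {  // upper 64 bits of the 128 bit product x*y
     # if FD_HAS_INT128
       return (ulong)( ( (uint128)x*(uint128)y )>>64 );
     # else                          // portable 32x32->64 limb fallback
       ulong xl = x & 0xffffffffUL;  ulong xh = x>>32;
       ulong yl = y & 0xffffffffUL;  ulong yh = y>>32;
       ulong mid = ((xl*yl)>>32) + (xl*yh & 0xffffffffUL) + (xh*yl & 0xffffffffUL);
       return xh*yh + ((xl*yh)>>32) + ((xh*yl)>>32) + (mid>>32);
     # endif
     }

   Callers always see the same API; the build target capabilities just
   select which implementation backs it. */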
46 :
47 : /* FD_HAS_HOSTED: If the build target is hosted (e.g. resides on a host
48 : with a POSIX-ish environment ... practically speaking, stdio.h,
49 : stdlib.h, unistd.h, et al more or less behave normally ...
50 : pedantically XOPEN_SOURCE=700), FD_HAS_HOSTED will be 1. It will be
51 : zero otherwise. */
52 :
53 : #ifndef FD_HAS_HOSTED
54 : #define FD_HAS_HOSTED 0
55 : #endif
56 :
57 : /* FD_HAS_ATOMIC: If the build target supports atomic operations
58 : between threads accessing a common memory region (including threads
59 : that reside in different processes on a host communicating via a
60 : shared memory region with potentially different local virtual
61 : mappings), define FD_HAS_ATOMIC to 1. Practically speaking, does
62 : atomic compare-and-swap et al work? */
63 :
64 : #ifndef FD_HAS_ATOMIC
65 : #define FD_HAS_ATOMIC 0
66 : #endif
67 :
68 : /* FD_HAS_THREADS: If the build target supports a POSIX-ish notion of
69 : threads (e.g. practically speaking, global variables declared within
70 : a compile unit are visible to more than one thread of execution,
71 : pthreads.h / threading parts of C standard, the atomics parts of the
72 : C standard, ... more or less work normally), FD_HAS_THREADS will be
73 : 1. It will be zero otherwise. FD_HAS_THREADS implies FD_HAS_HOSTED
74 : and FD_HAS_ATOMIC. */
75 :
76 : #ifndef FD_HAS_THREADS
77 : #define FD_HAS_THREADS 0
78 : #endif
79 :
80 : /* FD_HAS_INT128: If the build target supports reasonably efficient
81 : 128-bit wide integer operations, define FD_HAS_INT128 to 1 to enable
82 : use of them in implementations. */
83 :
84 : #ifndef FD_HAS_INT128
85 : #define FD_HAS_INT128 0
86 : #endif
87 :
88 : /* FD_HAS_DOUBLE: If the build target supports reasonably efficient
89 : IEEE 754 64-bit wide double precision floating point operations,
90 : define FD_HAS_DOUBLE to 1 to enable use of them in implementations.
91 : Note that even if the build target does not, va_args handling in the
92 : C / C++ language requires promotion of a float in a va_arg list to a
93 : double. Thus, C / C++ language support for IEEE 754 float also
94 : implies a minimum level of support for double (though not necessarily
95 : efficient or IEEE 754). That is, even if a target does not have
96 : FD_HAS_DOUBLE, there might still be limited use of double in va_arg
97 : list handling. */
98 :
99 : #ifndef FD_HAS_DOUBLE
100 : #define FD_HAS_DOUBLE 0
101 : #endif
102 :
103 : /* FD_HAS_ALLOCA: If the build target supports fast alloca-style
104 : dynamic stack memory allocation (e.g. alloca.h / __builtin_alloca
105 : more or less work normally), define FD_HAS_ALLOCA to 1 to enable use
106 : of it in implementations. */
107 :
108 : #ifndef FD_HAS_ALLOCA
109 : #define FD_HAS_ALLOCA 0
110 : #endif
111 :
112 : /* FD_HAS_X86: If the build target supports x86 specific features and
113 : can benefit from x86 specific optimizations, define FD_HAS_X86. Code
114 : needing more specific target features (Intel / AMD / SSE / AVX2 /
115 : AVX512 / etc) can specialize further as necessary with even more
116 : precise capabilities (that in turn imply FD_HAS_X86). */
117 :
118 : #ifndef FD_HAS_X86
119 : #define FD_HAS_X86 0
120 : #endif
121 :
122 : /* These allow even more precise targeting for X86. */
123 :
124 : /* FD_HAS_SSE indicates the target supports Intel SSE4 style SIMD
125 : (basically do the 128-bit wide parts of "x86intrin.h" work).
126 : Recommend using the simd/fd_sse.h APIs instead of raw Intel
127 : intrinsics for readability and to facilitate portability to non-x86
128 : platforms. Implies FD_HAS_X86. */
129 :
130 : #ifndef FD_HAS_SSE
131 : #define FD_HAS_SSE 0
132 : #endif
133 :
134 : /* FD_HAS_AVX indicates the target supports Intel AVX2 style SIMD
135 : (basically do the 256-bit wide parts of "x86intrin.h" work).
136 : Recommend using the simd/fd_avx.h APIs instead of raw Intel
137 : intrinsics for readability and to facilitate portability to non-x86
138 : platforms. Implies FD_HAS_SSE. */
139 :
140 : #ifndef FD_HAS_AVX
141 : #define FD_HAS_AVX 0
142 : #endif
143 :
144 : /* FD_HAS_AVX512 indicates the target supports Intel AVX-512 style SIMD
145 : (basically do the 512-bit wide parts of "x86intrin.h" work).
146 : Recommend using the simd/fd_avx512.h APIs instead of raw Intel
147 : intrinsics for readability and to facilitate portability to non-x86
148 : platforms. Implies FD_HAS_AVX. */
149 :
150 : #ifndef FD_HAS_AVX512
151 : #define FD_HAS_AVX512 0
152 : #endif
153 :
154 : /* FD_HAS_SHANI indicates that the target supports Intel SHA extensions
155 : which accelerate SHA-1 and SHA-256 computation. This extension is
156 : also called SHA-NI or SHA_NI (Secure Hash Algorithm New
157 : Instructions). Although proposed in 2013, they're only supported on
158 : Intel Ice Lake and AMD Zen CPUs and newer. Implies FD_HAS_AVX. */
159 :
160 : #ifndef FD_HAS_SHANI
161 : #define FD_HAS_SHANI 0
162 : #endif
163 :
164 : /* FD_HAS_GFNI indicates that the target supports Intel Galois Field
165 : extensions, which accelerate operations over binary extension fields,
166 : especially GF(2^8). These instructions are supported on Intel Ice
167 : Lake and newer and AMD Zen4 and newer CPUs. Implies FD_HAS_AVX. */
168 :
169 : #ifndef FD_HAS_GFNI
170 : #define FD_HAS_GFNI 0
171 : #endif
172 :
173 : /* FD_HAS_AESNI indicates that the target supports AES-NI extensions,
174 : which accelerate AES encryption and decryption. While AVX predates
175 : the original AES-NI extension, the combination of AES-NI+AVX adds
176 : additional opcodes (such as vaesenc, a more flexible variant of
177 : aesenc). Thus, implies FD_HAS_AVX. A conservative estimate for
178 : minimum platform support is Intel Haswell or AMD Zen. */
179 :
180 : #ifndef FD_HAS_AESNI
181 : #define FD_HAS_AESNI 0
182 : #endif
183 :
184 : /* FD_HAS_ARM: If the build target supports armv8-a specific features
185 : and can benefit from aarch64 specific optimizations, define
186 : FD_HAS_ARM. */
187 :
188 : #ifndef FD_HAS_ARM
189 : #define FD_HAS_ARM 0
190 : #endif
191 :
192 : /* FD_HAS_LZ4 indicates that the target supports LZ4 compression.
193 : Roughly, does "#include <lz4.h>" and the APIs therein work? */
194 :
195 : #ifndef FD_HAS_LZ4
196 : #define FD_HAS_LZ4 0
197 : #endif
198 :
199 : /* FD_HAS_ZSTD indicates that the target supports ZSTD compression.
200 : Roughly, does "#include <zstd.h>" and the APIs therein work? */
201 :
202 : #ifndef FD_HAS_ZSTD
203 : #define FD_HAS_ZSTD 0
204 : #endif
205 :
206 : /* FD_HAS_COVERAGE indicates that the build target is built with coverage instrumentation. */
207 :
208 : #ifndef FD_HAS_COVERAGE
209 : #define FD_HAS_COVERAGE 0
210 : #endif
211 :
212 : /* FD_HAS_ASAN indicates that the build target is using ASAN. */
213 :
214 : #ifndef FD_HAS_ASAN
215 : #define FD_HAS_ASAN 0
216 : #endif
217 :
218 : /* FD_HAS_UBSAN indicates that the build target is using UBSAN. */
219 :
220 : #ifndef FD_HAS_UBSAN
221 : #define FD_HAS_UBSAN 0
222 : #endif
223 :
224 : /* FD_HAS_DEEPASAN indicates that the build target is using ASAN with manual
225 : memory poisoning for fd_alloc, fd_wksp, and fd_scratch. */
226 :
227 : #ifndef FD_HAS_DEEPASAN
228 : #define FD_HAS_DEEPASAN 0
229 : #endif
230 :
231 : /* Base development environment ***************************************/
232 :
233 : /* The functionality provided by these vanilla headers is always
234 : available within the base development environment. Notably, stdio.h
235 : / stdlib.h / et al are not included here as these make lots of
236 : assumptions about the build target that may not be true (especially
237 : for on-chain and custom hardware use). Code should prefer the fd
238 : util equivalents for such functionality when possible. */
239 :
240 : #include <stdalign.h>
241 : #include <string.h>
242 : #include <limits.h>
243 : #include <float.h>
244 :
245 : /* Work around some library naming irregularities */
246 : /* FIXME: Consider this for FLOAT/FLT, DOUBLE/DBL too? */
247 :
248 3 : #define SHORT_MIN SHRT_MIN
249 3 : #define SHORT_MAX SHRT_MAX
250 1052532 : #define USHORT_MAX USHRT_MAX
251 :
252 : /* Primitive types ****************************************************/
253 :
254 : /* These typedefs provide single token regularized names for all the
255 : primitive types in the base development environment:
256 :
257 : char !
258 : schar ! short int long int128 !!
259 : uchar ushort uint ulong uint128 !!
260 : float
261 : double !!!
262 :
263 : ! Does not assume the sign of char. A naked char should be treated
264 : as a cstr character and mathematical operations should be avoided
265 : on it. This is less than ideal as the patterns for integer types in
266 : the C/C++ language spec itself are far more consistent with a naked
267 : char naturally being treated as signed (see above). But there are
268 : lots of conflicts between architectures, languages and standard
269 : libraries about this so any use of a naked char shouldn't assume
270 : the sign ... sigh.
271 :
272 : !! Only available if FD_HAS_INT128 is defined
273 :
274 : !!! Should only be used if FD_HAS_DOUBLE is defined but see note in
275 : FD_HAS_DOUBLE about C/C++ silent promotions of float to double in
276 : va_arg lists.
277 :
278 : Note also that these token names interoperate more naturally with
279 : integer constant declarations, type generic code generation
280 : techniques and printf-style format strings than the stdint.h /
281 : inttypes.h types do.
282 :
283 : To minimize portability issues, unexpected silent type conversion
284 : issues, align with typical developer implicit usage, align with
285 : typical build target usage, ..., assumes char / short / int / long
286 : are 8 / 16 / 32 / 64 bit twos complement integers and float is IEEE-754
287 : single precision. Further assumes little endian, truncating signed
288 : integer division, sign extending (arithmetic) signed right shift and
289 : signed left shift behaves the same as an unsigned left shift from bit
290 : operations point of view (technically the standard says signed left
291 : shift is undefined if the result would overflow). Also, except for
292 : int128/uint128, assumes that aligned access to these will be
293 : naturally atomic. Lastly assumes that unaligned access to these is
294 : functionally valid but does not assume that unaligned access to these
295 : is efficient or atomic.
296 :
297 : For values meant to be held in registers, code should prefer long /
298 : ulong types (improves asm generation given the prevalence of 64-bit
299 : targets and also to avoid lots of tricky bugs with silent promotions
300 : in the language ... e.g. ushort should ideally only be used for
301 : in-memory representations).
302 :
303 : These are currently not prefixed given how often they are used. If
304 : this becomes problematic prefixes can be added as necessary.
305 : Specifically, C++ allows typedefs to be defined multiple times so
306 : long as they are equivalent. Inequivalent collisions are not
307 : supported but should be rare (e.g. if a 3rd party header thinks
308 : "ulong" should be something other an "unsigned long", the 3rd party
309 : header probably should be nuked from orbit). C11 and forward also
310 : allow multiple equivalent typedefs. C99 and earlier don't but this
311 : is typically only a warning and then only if pedantic warnings are
312 : enabled. Thus, if we want to support users using C99 and earlier who
313 : want to do a strict compile and have a superfluous collision with
314 : these types in other libraries, uncomment the below (or do something
315 : equivalent for the compiler). */
316 :
317 : //#pragma GCC diagnostic push
318 : //#pragma GCC diagnostic ignored "-Wpedantic"
319 :
320 : typedef signed char schar; /* See above note of sadness */
321 :
322 : typedef unsigned char uchar;
323 : typedef unsigned short ushort;
324 : typedef unsigned int uint;
325 : typedef unsigned long ulong;
326 :
327 : #if FD_HAS_INT128
328 :
329 : __extension__ typedef __int128 int128;
330 : __extension__ typedef unsigned __int128 uint128;
331 :
332 1200000048 : #define UINT128_MAX (~(uint128)0)
333 6 : #define INT128_MAX ((int128)(UINT128_MAX>>1))
334 3 : #define INT128_MIN (-INT128_MAX-(int128)1)
335 :
336 : #endif
337 :
338 : //#pragma GCC diagnostic pop
339 :
340 : /* Compiler tricks ****************************************************/
341 :
342 : /* FD_STRINGIFY,FD_CONCAT{2,3,4}: Various macros for token
343 : stringification and pasting. FD_STRINGIFY returns the argument as a
344 : cstr (e.g. FD_STRINGIFY(foo) -> "foo"). FD_CONCAT* pastes the tokens
345 : together into a single token (e.g. FD_CONCAT3(a,b,c) -> abc). The
346 : EXPAND variants first expand their arguments and then do the token
347 : operation (e.g. FD_EXPAND_THEN_STRINGIFY(__LINE__) -> "104" if done
348 : on line 104 of the source code file). */
349 :
350 0 : #define FD_STRINGIFY(x)#x
351 7394880 : #define FD_CONCAT2(a,b)a##b
352 17523 : #define FD_CONCAT3(a,b,c)a##b##c
353 0 : #define FD_CONCAT4(a,b,c,d)a##b##c##d
354 :
355 : #define FD_EXPAND_THEN_STRINGIFY(x)FD_STRINGIFY(x)
356 4200096 : #define FD_EXPAND_THEN_CONCAT2(a,b)FD_CONCAT2(a,b)
357 >32338*10^7 : #define FD_EXPAND_THEN_CONCAT3(a,b,c)FD_CONCAT3(a,b,c)
358 5561244 : #define FD_EXPAND_THEN_CONCAT4(a,b,c,d)FD_CONCAT4(a,b,c,d)
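
/* For example, pasting a token against a macro that itself needs
   expanding (like __LINE__) requires the EXPAND variants.  A small
   illustrative sketch (my_var_ is an arbitrary prefix):

     #define MY_UNIQUE_NAME FD_EXPAND_THEN_CONCAT2( my_var_, __LINE__ )

   Used on line 212 of a file, MY_UNIQUE_NAME expands to the single
   token my_var_212, whereas FD_CONCAT2( my_var_, __LINE__ ) would
   paste before expansion and yield the token my_var___LINE__. */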
359 :
360 : /* FD_VA_ARGS_SELECT(__VA_ARGS__,e32,e31,...e1): Macro that expands to
361 : en at compile time where n is number of items in the __VA_ARGS__
362 : list. If __VA_ARGS__ is empty, returns e1. Assumes __VA_ARGS__ has
363 : at most 32 arguments. Useful for making a variadic macro whose
364 : behavior depends on the number of arguments in __VA_ARGS__. */
365 :
366 : #define FD_VA_ARGS_SELECT(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,a,b,c,d,e,f,_,...)_
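
/* For example, FD_VA_ARGS_SELECT can dispatch a variadic macro on its
   argument count.  An illustrative sketch (the MY_MIN* macros are
   hypothetical):

     #define MY_MIN2(a,b)   (((a)<(b)) ? (a) : (b))
     #define MY_MIN3(a,b,c) MY_MIN2( MY_MIN2( (a), (b) ), (c) )
     #define MY_MIN(...)                                                  \
       FD_VA_ARGS_SELECT( __VA_ARGS__, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,   \
                          0,0,0,0,0,0,0,0,0,0,0,0,0,MY_MIN3,MY_MIN2,0 )   \
                        ( __VA_ARGS__ )

   MY_MIN(x,y) expands via MY_MIN2 and MY_MIN(x,y,z) expands via
   MY_MIN3. */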
367 :
368 : /* FD_SRC_LOCATION returns a const cstr holding the source file and
369 : line where FD_SRC_LOCATION was used. */
370 :
371 : #define FD_SRC_LOCATION __FILE__ "(" FD_EXPAND_THEN_STRINGIFY(__LINE__) ")"
372 :
373 : /* FD_STATIC_ASSERT tests at compile time if c is non-zero. If not,
374 : it aborts the compile with an error. err itself should be a token
375 : (e.g. not a string, no whitespace, etc). */
376 :
377 : #ifdef __cplusplus
378 : #define FD_STATIC_ASSERT(c,err) static_assert(c, #err)
379 : #else
380 30360 : #define FD_STATIC_ASSERT(c,err) _Static_assert(c, #err)
381 : #endif
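
/* For example, a typical use is verifying a layout assumption at
   compile time:

     FD_STATIC_ASSERT( sizeof(ulong)==8UL, unexpected_ulong_size );

   This compiles cleanly on supported targets and aborts the compile
   with an error mentioning unexpected_ulong_size otherwise. */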
382 :
383 : /* FD_ADDRESS_OF_PACKED_MEMBER(x): Linguistically does &(x) but without
384 : recent compiler complaints that &x might be unaligned if x is a
385 : member of a packed datastructure. (Often needed for interfacing with
386 : hardware / packets / etc.) */
387 :
388 3 : #define FD_ADDRESS_OF_PACKED_MEMBER( x ) (__extension__({ \
389 3 : char * _fd_aopm = (char *)&(x); \
390 3 : __asm__( "# FD_ADDRESS_OF_PACKED_MEMBER(" #x ") @" FD_SRC_LOCATION : "+r" (_fd_aopm) :: ); \
391 3 : (__typeof__(&(x)))_fd_aopm; \
392 3 : }))
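
/* For example, an illustrative sketch with a packed wire format header
   (my_hdr_t and my_hdr_seq are hypothetical names):

     struct __attribute__((packed)) my_hdr {
       uchar tag;
       ulong seq;    // misaligned within the packed struct
     };
     typedef struct my_hdr my_hdr_t;

     static inline ulong
     my_hdr_seq( my_hdr_t * hdr ) {
       // &hdr->seq directly would trigger -Waddress-of-packed-member
       ulong * p = FD_ADDRESS_OF_PACKED_MEMBER( hdr->seq );
       return *p;
     }

   (As noted above for the primitive types, unaligned accesses like the
   one through p are assumed functionally valid, though not necessarily
   efficient.) */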
393 :
394 : /* FD_PROTOTYPES_{BEGIN,END}: Headers that might be included in C++
395 : source should encapsulate the prototypes of code and globals
396 : contained in compilation units compiled as C with a
397 : FD_PROTOTYPES_{BEGIN,END} pair. */
398 :
399 : #ifdef __cplusplus
400 : #define FD_PROTOTYPES_BEGIN extern "C" {
401 : #else
402 : #define FD_PROTOTYPES_BEGIN
403 : #endif
404 :
405 : #ifdef __cplusplus
406 : #define FD_PROTOTYPES_END }
407 : #else
408 : #define FD_PROTOTYPES_END
409 : #endif
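
/* For example, a header exposing C symbols to both C and C++ callers
   might declare (fd_example_run is a hypothetical function):

     FD_PROTOTYPES_BEGIN

     int
     fd_example_run( ulong iter_max );

     FD_PROTOTYPES_END

   so that C++ compilation units see the declaration with "C" linkage. */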
410 :
411 : /* FD_ASM_LG_ALIGN(lg_n) expands to an alignment assembler directive
412 : appropriate for the current architecture/ABI. The resulting align
413 : is 2^(lg_n) bytes, i.e. FD_ASM_LG_ALIGN(3) aligns by 8 bytes. */
414 :
415 : #if defined(__aarch64__)
416 : #define FD_ASM_LG_ALIGN(lg_n) ".align " #lg_n "\n"
417 : #elif defined(__x86_64__) || defined(__powerpc64__) || defined(__riscv)
418 : #define FD_ASM_LG_ALIGN(lg_n) ".p2align " #lg_n "\n"
419 : #endif
420 :
421 : /* FD_IMPORT declares a variable name and initializes it with the contents
422 : of the file at path (with potentially some assembly directives for
423 : additional footer info). It is equivalent to:
424 :
425 : type const name[] __attribute__((aligned(align))) = {
426 :
427 : ... code that would initialize the contents of name to the
428 : ... raw binary data found in the file at path at compile time
429 : ... (with any appended information as specified by footer)
430 :
431 : };
432 :
433 : ulong const name_sz = ... number of bytes pointed to by name;
434 :
435 : More precisely, this creates a symbol "name" in the object file that
436 : points to a read-only copy of the raw data in the file at "path" as
437 : it was at compile time. lg_align, an unsuffixed decimal integer,
438 : gives the minimum alignment required for the copy's first byte as
439 : 2^lg_align. footer gives assembly commands permitting additional
440 : data to be appended to the copy (use "" for footer if none needed).
441 :
442 : Then it exposes a pointer to this copy in the current compilation
443 : unit as name and the byte size as name_sz. name_sz covers the first
444 : byte of the included data to the last byte of the footer inclusive.
445 :
446 : The dummy linker symbol _fd_import_name_sz will also be created in
447 : the object file as some under the hood magic to make this work. This
448 : should not be used in any compile unit as some compilers (I'm looking
449 : at you clang-15, but apparently not clang-10) will sometimes mangle
450 : its value from what it was set to in the object file even when it is
451 : marked as absolute in the object file.
452 :
453 : This should only be used at global scope and should be done at most
454 : once over all object files / libraries used to make a program. If
455 : other compilation units want to make use of an import in a different
456 : compilation unit, they should declare:
457 :
458 : extern type const name[] __attribute__((aligned(align)));
459 :
460 : and/or:
461 :
462 : extern ulong const name_sz;
463 :
464 : as necessary (that is, do the usual to use name and name_sz as shown
465 : for the pseudo code above).
466 :
467 : Important safety tip! gcc -M will generally not detect the
468 : dependency this creates between the importing file and the imported
469 : file. This can cause incremental builds to miss changes to the
470 : imported file. Ideally, we would have FD_IMPORT automatically do
471 : something like:
472 :
473 : _Pragma( "GCC dependency \"" path "\" )
474 :
475 : This doesn't work as is because _Pragma needs some macro expansion
476 : hacks to accept this (this is doable). After that workaround, this
477 : still doesn't work because, due to tooling limitations, the pragma
478 : path is relative to the source file directory and the FD_IMPORT path
479 : is relative to the make directory (working around this would
480 : require a __FILE__-like directive for the source code directory base
481 : path). Even if that did exist, it might still not work because
482 : out-of-tree builds often require some substitutions to the gcc -M
483 : generated dependencies that this might not pick up (at least not
484 : without some build system surgery). And then it still wouldn't work
485 : because gcc -M seems to ignore all of this anyways (which is the
486 : actual show stopper as this pragma does something subtly different
487 : than what the name suggests and there isn't any obvious support for a
488 : "pseudo-include".) Another reminder that make clean and fast builds
489 : are our friend. */
490 :
491 : #if defined(__ELF__)
492 :
493 : #define FD_IMPORT( name, path, type, lg_align, footer ) \
494 : __asm__( ".section .rodata,\"a\",@progbits\n" \
495 : ".type " #name ",@object\n" \
496 : ".globl " #name "\n" \
497 : FD_ASM_LG_ALIGN(lg_align) \
498 : #name ":\n" \
499 : ".incbin \"" path "\"\n" \
500 : footer "\n" \
501 : ".size " #name ",. - " #name "\n" \
502 : "_fd_import_" #name "_sz = . - " #name "\n" \
503 : ".type " #name "_sz,@object\n" \
504 : ".globl " #name "_sz\n" \
505 : FD_ASM_LG_ALIGN(3) \
506 : #name "_sz:\n" \
507 : ".quad _fd_import_" #name "_sz\n" \
508 : ".size " #name "_sz,8\n" \
509 : ".previous\n" ); \
510 : extern type const name[] __attribute__((aligned(1<<(lg_align)))); \
511 : extern ulong const name##_sz
512 :
513 : #elif defined(__MACH__)
514 :
515 : #define FD_IMPORT( name, path, type, lg_align, footer ) \
516 : __asm__( ".section __DATA,__const\n" \
517 : ".globl _" #name "\n" \
518 : FD_ASM_LG_ALIGN(lg_align) \
519 : "_" #name ":\n" \
520 : ".incbin \"" path "\"\n" \
521 : footer "\n" \
522 : "_fd_import_" #name "_sz = . - _" #name "\n" \
523 : ".globl _" #name "_sz\n" \
524 : FD_ASM_LG_ALIGN(3) \
525 : "_" #name "_sz:\n" \
526 : ".quad _fd_import_" #name "_sz\n" \
527 : ".previous\n" ); \
528 : extern type const name[] __attribute__((aligned(1<<(lg_align)))); \
529 : extern ulong const name##_sz
530 :
531 : #endif
532 :
533 : /* FD_IMPORT_{BINARY,CSTR} are common cases for FD_IMPORT.
534 :
535 : In BINARY, the file is imported into the object file and exposed to
536 : the caller as uchar binary data. name_sz will be the number of
537 : bytes in the file at time of import. name will have 128 byte
538 : alignment.
539 :
540 : In CSTR, the file is imported into the object file with a '\0'
541 : termination appended and exposed to the caller as a cstr. Assuming
542 : the file is text (i.e. has no internal '\0's), strlen(name) will be
543 : the number of bytes in the file and name_sz will be strlen(name)+1.
544 : name can have arbitrary alignment. */
545 :
546 : #ifdef FD_IMPORT
547 : #define FD_IMPORT_BINARY(name, path) FD_IMPORT( name, path, uchar, 7, "" )
548 : #define FD_IMPORT_CSTR( name, path) FD_IMPORT( name, path, char, 1, ".byte 0" )
549 : #endif
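
/* For example, an illustrative sketch of embedding a file at compile
   time (the variable name and path are hypothetical):

     FD_IMPORT_BINARY( my_blob, "src/app/example/my_blob.bin" );

   After this, my_blob is a uchar const array with 128 byte alignment
   holding the file's bytes as of compile time and my_blob_sz is a
   ulong holding the number of bytes. */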
550 :
551 : /* Optimizer tricks ***************************************************/
552 :
553 : /* FD_RESTRICT is a pointer modifier to designate a pointer as
554 : restricted. Hoops jumped because C++17 still doesn't understand
555 : restrict ... sigh */
556 :
557 : #ifndef FD_RESTRICT
558 : #ifdef __cplusplus
559 : #define FD_RESTRICT __restrict
560 : #else
561 : #define FD_RESTRICT restrict
562 : #endif
563 : #endif
564 :
565 : /* fd_type_pun(p), fd_type_pun_const(p): These allow use of type
566 : punning while keeping strict aliasing optimizations enabled and
567 : strict alias checking done (e.g. some UNIX APIs, like the sockaddr
568 : related APIs, depend on type punning; these helpers let such APIs be
569 : used cleanly). */
570 :
571 : static inline void *
572 129261776 : fd_type_pun( void * p ) {
573 129261776 : __asm__( "# fd_type_pun @" FD_SRC_LOCATION : "+r" (p) :: "memory" );
574 129261776 : return p;
575 129261776 : }
576 :
577 : static inline void const *
578 42249855 : fd_type_pun_const( void const * p ) {
579 42249855 : __asm__( "# fd_type_pun_const @" FD_SRC_LOCATION : "+r" (p) :: "memory" );
580 42249855 : return p;
581 42249855 : }
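
/* For example, an illustrative sketch against a sockaddr style UNIX
   API (sockfd is a previously created socket; the socket types and
   calls come from the usual POSIX headers, which this file does not
   include):

     struct sockaddr_in addr = {0};
     addr.sin_family = AF_INET;
     addr.sin_port   = htons( 8080 );

     if( bind( sockfd, fd_type_pun_const( &addr ), sizeof(addr) ) ) {
       // ... handle error ...
     }

   Passing the pointer through fd_type_pun_const keeps the compiler
   from applying strict aliasing assumptions to addr across the call. */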
582 :
583 : /* FD_{LIKELY,UNLIKELY}(c): Evaluates c and returns whether it is
584 : logical true/false as long (1L/0L). It also hints to the optimizer
585 : whether it should optimize for the case of c evaluating as
586 : true/false. */
587 :
588 >12964*10^7 : #define FD_LIKELY(c) __builtin_expect( !!(c), 1L )
589 >50856*10^7 : #define FD_UNLIKELY(c) __builtin_expect( !!(c), 0L )
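
/* For example:

     if( FD_UNLIKELY( !buf ) ) return -1;        // error path, rarely taken

     if( FD_LIKELY( rem>0UL ) ) process( buf );  // hot path

   where the hints bias code layout and branch prediction toward the
   common case (buf, rem and process are hypothetical). */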
590 :
591 : /* FD_FN_PURE hints to the optimizer that the function, roughly
592 : speaking, does not have side effects. As such, the compiler can
593 : replace a call to the function with the result of an earlier call to
594 : that function provided the inputs and memory used haven't changed.
595 :
596 : IMPORTANT SAFETY TIP! Recent compilers seem to take an undocumented
597 : and debatable stance that pure functions do no writes to memory.
598 : This is a sufficient condition for the above but not a necessary one.
599 :
600 : Consider, for example, the real world case of an otherwise pure
601 : function that uses pass-by-reference to return more than one value
602 : (an unpleasant practice that is sadly often necessary because C/C++,
603 : compilers and underlying platform ABIs are very bad at helping
604 : developers simply and clearly express their intent to return multiple
605 : values and then generate good assembly for such).
606 :
607 : If called multiple times sequentially, all but the first call to such
608 : a "pure" function could be optimized away because the non-volatile
609 : memory writes done in all but the 1st call for the
610 : pass-by-reference-returns write the same value to normal memory that
611 : was written on the 1st call. That is, these calls return the same
612 : value for their direct return and do writes that do not have any
613 : visible effect.
614 :
615 : Thus, while it is safe for the compiler to eliminate all but the
616 : first call via techniques like common subexpression elimination, it
617 : is not safe for the compiler to infer that the first call did no
618 : writes.
619 :
620 : But recent compilers seem to do exactly that.
621 :
622 : Sigh ... we can't use FD_FN_PURE on such functions because of all the
623 : above linguistic, compiler, documentation and ABI infinite sadness.
624 :
625 : TL;DR To be safe against the above vagaries, recommend using
626 : FD_FN_PURE to annotate functions that do no memory writes (including
627 : trivial memory writes) and try to design HPC APIs to avoid returning
628 : multiple values as much as possible.
629 :
630 : Followup: FD_FN_PURE expands to nothing by default given additional
631 : confusion between how current languages, compilers, CI, fuzzing, and
632 : developers interpret this function attribute. We keep it around
633 : given it documents the intent of various APIs and so it can be
634 : manually enabled to find implementation surprises during bullet
635 : proofing (e.g. under compiler options like "extra-brutality").
636 : Hopefully, pure function attributes will someday be handled
637 : more consistently across the board. */
638 :
639 : #ifndef FD_FN_PURE
640 : #define FD_FN_PURE
641 : #endif
642 :
643 : /* FD_FN_CONST is like pure but also, even stronger, indicates that the
644 : function does not depend on the state of memory. See note above
645 : about why this expands to nothing by default. */
646 :
647 : #ifndef FD_FN_CONST
648 : #define FD_FN_CONST
649 : #endif
650 :
651 : /* FD_FN_UNUSED indicates that it is okay if the function with static
652 : linkage is not used. Allows working around -Winline in header only
653 : APIs where the compiler decides not to actually inline the function.
654 : (This belief, frequently promulgated by anti-macro cults, that "An
655 : Inline Function is As Fast As a Macro" ... an entire section in gcc's
656 : documentation devoted to it in fact ... remains among the biggest
657 : lies in computer science. Yes, an inline function is as fast as a
658 : macro ... when the compiler actually decides to treat the inline
659 : keyword more than just for entertainment purposes only. Which, as
660 : -Winline proves, it frequently doesn't. Sigh ... force_inline like
661 : compiler extensions might be an alternative here but they have their
662 : own portability issues.) */
663 :
664 48 : #define FD_FN_UNUSED __attribute__((unused))
665 :
666 : /* FD_FN_UNSANITIZED tells the compiler to disable AddressSanitizer and
667 : UndefinedBehaviorSanitizer instrumentation. For some functions, this
668 : can improve instrumented compile time by ~30x. */
669 :
670 : #define FD_FN_UNSANITIZED __attribute__((no_sanitize("address", "undefined")))
671 :
672 : /* FD_FN_SENSITIVE instructs the compiler to harden sensitive functions.
673 : https://eprint.iacr.org/2023/1713 (Sec 3.2)
674 : - Clear all registers with __attribute__((zero_call_used_regs("all")))
675 : - Clear stack with __attribute__((strub)), available in gcc 14+ */
676 :
677 : #if __has_attribute(strub)
678 : #define FD_FN_SENSITIVE __attribute__((strub)) __attribute__((zero_call_used_regs("all")))
679 : #elif __has_attribute(zero_call_used_regs)
680 : #define FD_FN_SENSITIVE __attribute__((zero_call_used_regs("all")))
681 : #else
682 : #define FD_FN_SENSITIVE
683 : #endif
684 :
685 : /* FD_PARAM_UNUSED indicates that it is okay if the function parameter is not
686 : used. */
687 :
688 : #define FD_PARAM_UNUSED __attribute__((unused))
689 :
690 : /* FD_TYPE_PACKED indicates that a type is to be packed, resetting its alignment
691 : to 1. */
692 :
693 : #define FD_TYPE_PACKED __attribute__((packed))
694 :
695 : /* FD_WARN_UNUSED tells the compiler the result (from a function) should
696 : be checked. This is useful to force callers to either check the result
697 : or deliberately and explicitly ignore it. Good for result codes and
698 : errors. */
699 :
700 : #define FD_WARN_UNUSED __attribute__ ((warn_unused_result))
701 :
702 : /* FD_FALLTHRU tells the compiler that a case in a switch falls through
703 : to the next case. This avoids the compiler complaining in cases
704 : where the fall through is intentional.
705 : The "while(0)" avoids a compiler complaint in the event the case
706 : has no statement, example:
707 : switch( return_code ) {
708 : case RETURN_CASE_1: FD_FALLTHRU;
709 : case RETURN_CASE_2: FD_FALLTHRU;
710 : case RETURN_CASE_3:
711 : case_123();
712 : default:
713 : case_other();
714 : }
715 :
716 : See C++17 [[fallthrough]] and gcc __attribute__((fallthrough)) */
717 :
718 : #define FD_FALLTHRU while(0) __attribute__((fallthrough))
719 :
720 : /* FD_COMPILER_FORGET(var): Tells the compiler that it shouldn't use
721 : any knowledge it has about the provided register-compatible variable
722 : var for optimizations going forward (i.e. the variable has changed in
723 : a deterministic but unknown-to-the-compiler way where the actual
724 : change is the identity operation). Useful for inhibiting various
725 : branch nest misoptimizations (compilers unfortunately tend to
726 : radically underestimate the impact in raw average performance and
727 : jitter and the probability of branch mispredicts or the cost to the
728 : CPU of having lots of branches). This is not asm volatile (use
729 : UNPREDICTABLE below for that) and has no clobbers. So if var is not
730 : used after the forget, the compiler can optimize the FORGET away
731 : (along with operations preceding it used to produce var). */
732 :
733 37784120659 : #define FD_COMPILER_FORGET(var) __asm__( "# FD_COMPILER_FORGET(" #var ")@" FD_SRC_LOCATION : "+r" (var) )
734 :
735 : /* FD_COMPILER_UNPREDICTABLE(var): Same as FD_COMPILER_FORGET(var) but
736 : the provided variable has changed in a non-deterministic way from the
737 : compiler's POV (e.g. the value in the variable on output should not
738 : be treated as a compile time constant even if it is one
739 : linguistically). Useful for suppressing unwanted
740 : compile-time-const-based optimizations like hoisting operations with
741 : useful CPU side effects out of a critical loop. */
742 :
743 36723650 : #define FD_COMPILER_UNPREDICTABLE(var) __asm__ __volatile__( "# FD_COMPILER_UNPREDICTABLE(" #var ")@" FD_SRC_LOCATION : "+m,r" (var) )
744 :
745 : /* Atomic tricks ******************************************************/
746 :
747 : /* FD_COMPILER_MFENCE(): Tells the compiler that it can't move any
748 : memory operations (load or store) from before the MFENCE to after the
749 : MFENCE (and vice versa). The processor itself might still reorder
750 : around the fence though (that requires platform specific fences). */
751 :
752 >11981*10^7 : #define FD_COMPILER_MFENCE() __asm__ __volatile__( "# FD_COMPILER_MFENCE()@" FD_SRC_LOCATION ::: "memory" )
753 :
754 : /* FD_SPIN_PAUSE(): Yields the logical core of the calling thread to
755 : the other logical cores sharing the same underlying physical core for
756 : a few clocks without yielding it to the operating system scheduler.
757 : Typically useful for shared memory spin polling loops, especially if
758 : hyperthreading is in use. IMPORTANT SAFETY TIP! This might act as a
759 : FD_COMPILER_MFENCE on some combinations of toolchains and targets
760 : (e.g. gcc documents that __builtin_ia32_pause also does a compiler
761 : memory fence) but this should not be relied upon for portable code
762 : (consider making this a compiler memory fence on all platforms?) */
763 :
764 : #if FD_HAS_X86
765 20778534610 : #define FD_SPIN_PAUSE() __builtin_ia32_pause()
766 : #else
767 : #define FD_SPIN_PAUSE() ((void)0)
768 : #endif
769 :
770 : /* FD_YIELD(): Yields the logical core of the calling thread to the
771 : operating system scheduler if a hosted target and does a spin pause
772 : otherwise. */
773 :
774 : #if FD_HAS_HOSTED
775 26788 : #define FD_YIELD() fd_yield()
776 : #else
777 : #define FD_YIELD() FD_SPIN_PAUSE()
778 : #endif
779 :
780 : /* FD_VOLATILE_CONST(x): Tells the compiler it is not able to predict the
781 : value obtained by dereferencing x and that dereferencing x might have
782 : other side effects (e.g. maybe another thread could change the value
783 : and the compiler has no way of knowing this). Generally speaking,
784 : the volatile keyword is broken linguistically. Volatility is not a
785 : property of the variable but of the dereferencing of a variable (e.g.
786 : what is volatile from the POV of a reader of a shared variable is not
787 : necessarily volatile from the POV a writer of that shared variable in
788 : a different thread). */
789 :
790 1044537039 : #define FD_VOLATILE_CONST(x) (*((volatile const __typeof__((x)) *)&(x)))
791 :
792 : /* FD_VOLATILE(x): Tells the compiler it is not able to predict the effect
793 : of modifying x and that dereferencing x might have other side effects
794 : (e.g. maybe another thread is spinning on x waiting for its value to
795 : change and the compiler has no way of knowing this). */
796 :
797 794346611 : #define FD_VOLATILE(x) (*((volatile __typeof__((x)) *)&(x)))
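
/* For example, an illustrative sketch of a simple flag handshake
   between a producer and a consumer thread (flag is a hypothetical
   shared int, initially zero):

     // consumer: spin until the producer publishes
     while( !FD_VOLATILE_CONST( flag ) ) FD_SPIN_PAUSE();

     // producer: publish
     FD_VOLATILE( flag ) = 1;

   These only keep the compiler from caching or eliding the accesses;
   any compiler / hardware ordering of surrounding memory operations
   needed by the application is up to the caller (see the fences
   above). */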
798 :
799 : #if FD_HAS_ATOMIC
800 :
801 : /* FD_ATOMIC_FETCH_AND_{ADD,SUB,OR,AND,XOR}(p,v):
802 :
803 : FD_ATOMIC_FETCH_AND_ADD(p,v) does
804 : f = *p;
805 : *p = f + v
806 : return f;
807 : as a single atomic operation. Similarly for the other variants. */
808 :
809 6010585 : #define FD_ATOMIC_FETCH_AND_ADD(p,v) __sync_fetch_and_add( (p), (v) )
810 6020547 : #define FD_ATOMIC_FETCH_AND_SUB(p,v) __sync_fetch_and_sub( (p), (v) )
811 65426397 : #define FD_ATOMIC_FETCH_AND_OR( p,v) __sync_fetch_and_or( (p), (v) )
812 : #define FD_ATOMIC_FETCH_AND_AND(p,v) __sync_fetch_and_and( (p), (v) )
813 : #define FD_ATOMIC_FETCH_AND_XOR(p,v) __sync_fetch_and_xor( (p), (v) )
814 :
815 : /* FD_ATOMIC_{ADD,SUB,OR,AND,XOR}_AND_FETCH(p,v):
816 :
817 : FD_ATOMIC_{ADD,SUB,OR,AND,XOR}_AND_FETCH(p,v) does
818 : r = *p + v;
819 : *p = r;
820 : return r;
821 : as a single atomic operation. Similarly for the other variants. */
822 :
823 : #define FD_ATOMIC_ADD_AND_FETCH(p,v) __sync_add_and_fetch( (p), (v) )
824 : #define FD_ATOMIC_SUB_AND_FETCH(p,v) __sync_sub_and_fetch( (p), (v) )
825 : #define FD_ATOMIC_OR_AND_FETCH( p,v) __sync_or_and_fetch( (p), (v) )
826 : #define FD_ATOMIC_AND_AND_FETCH(p,v) __sync_and_and_fetch( (p), (v) )
827 : #define FD_ATOMIC_XOR_AND_FETCH(p,v) __sync_xor_and_fetch( (p), (v) )
828 :
829 : /* FD_ATOMIC_CAS(p,c,s):
830 :
831 : o = FD_ATOMIC_CAS(p,c,s) conceptually does:
832 : o = *p;
833 : if( o==c ) *p = s;
834 : return o
835 : as a single atomic operation. */
836 :
837 329256630 : #define FD_ATOMIC_CAS(p,c,s) __sync_val_compare_and_swap( (p), (c), (s) )
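
/* For example, an illustrative test-and-set style lock sketch built on
   FD_ATOMIC_CAS (lock is a hypothetical shared int, zero when free):

     // acquire
     while( FD_UNLIKELY( FD_ATOMIC_CAS( &lock, 0, 1 ) ) ) FD_SPIN_PAUSE();
     FD_COMPILER_MFENCE();

     // ... critical section ...

     // release
     FD_COMPILER_MFENCE();
     FD_VOLATILE( lock ) = 0;

   (Compare with the FD_ONCE / FD_TURNSTILE implementations below,
   which use the same ingredients.) */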
838 :
839 : /* FD_ATOMIC_XCHG(p,v):
840 :
841 : o = FD_ATOMIC_XCHG( p, v ) conceptually does:
842 : o = *p
843 : *p = v
844 : return o
845 : as a single atomic operation.
846 :
847 : Intel's __sync compiler extensions from the days of yore mysteriously
848 : implemented atomic exchange via the very misleadingly named
849 : __sync_lock_test_and_set. And some implementations (and C++)
850 : debatably then implemented this API according to what the misleading
851 : name implied as opposed to what it actually did. But those
852 : implementations didn't bother to provide a replacement for atomic
853 : exchange functionality (forcing us to emulate atomic exchange more
854 : slowly via CAS there). Sigh ... we do what we can to fix this up. */
855 :
856 : #ifndef FD_ATOMIC_XCHG_STYLE
857 : #if FD_HAS_X86 && !__cplusplus
858 : #define FD_ATOMIC_XCHG_STYLE 1
859 : #else
860 : #define FD_ATOMIC_XCHG_STYLE 0
861 : #endif
862 : #endif
863 :
864 : #if FD_ATOMIC_XCHG_STYLE==0
865 : #define FD_ATOMIC_XCHG(p,v) (__extension__({ \
866 : __typeof__(*(p)) * _fd_atomic_xchg_p = (p); \
867 : __typeof__(*(p)) _fd_atomic_xchg_v = (v); \
868 : __typeof__(*(p)) _fd_atomic_xchg_t; \
869 : for(;;) { \
870 : _fd_atomic_xchg_t = FD_VOLATILE_CONST( *_fd_atomic_xchg_p ); \
871 : if( FD_LIKELY( __sync_bool_compare_and_swap( _fd_atomic_xchg_p, _fd_atomic_xchg_t, _fd_atomic_xchg_v ) ) ) break; \
872 : FD_SPIN_PAUSE(); \
873 : } \
874 : _fd_atomic_xchg_t; \
875 : }))
876 : #elif FD_ATOMIC_XCHG_STYLE==1
877 11900475 : #define FD_ATOMIC_XCHG(p,v) __sync_lock_test_and_set( (p), (v) )
878 : #else
879 : #error "Unknown FD_ATOMIC_XCHG_STYLE"
880 : #endif
881 :
882 : #endif /* FD_HAS_ATOMIC */
883 :
884 : /* FD_TL: This indicates that the variable should be thread local.
885 :
886 : FD_ONCE_{BEGIN,END}: The block:
887 :
888 : FD_ONCE_BEGIN {
889 : ... code ...
890 : } FD_ONCE_END
891 :
892 : linguistically behaves like:
893 :
894 : do {
895 : ... code ...
896 : } while(0)
897 :
898 : But provides a low overhead guarantee that:
899 : - The block will be executed by at most once over all threads
900 : in a process (i.e. the set of threads which share global
901 : variables).
902 : - No thread in a process that encounters the block will continue
903 : past it until it has executed once.
904 :
905 : This implies that the caller promises a ONCE block will execute in a
906 : finite time. (Meant for doing simple lightweight initializations.)
907 :
908 : It is okay to nest ONCE blocks. The thread that executes the
909 : outermost will execute all the nested ones as part of executing the
910 : outermost.
911 :
912 : A ONCE implicitly provides a compiler memory fence to reduce the risk
913 : that the compiler will assume that operations done in the once block
914 : on another thread have not been done (e.g. propagating pre-once block
915 : variable values into post-once block code). It is up to the user to
916 : provide any necessary hardware fencing (usually not necessary).
917 :
918 : FD_THREAD_ONCE_{BEGIN,END}: The block:
919 :
920 : FD_THREAD_ONCE_BEGIN {
921 : ... code ...
922 : } FD_THREAD_ONCE_END;
923 :
924 : is similar except the guarantee is that the block only covers the
925 : invoking thread and it does not provide any fencing. If a thread
926 : once begin is nested inside a once begin, that thread once begin will
927 : only be executed on the thread that executes the thread once begin.
928 : It is similarly okay to nest a ONCE block inside a THREAD_ONCE block.
929 :
930 : FD_TURNSTILE_{BEGIN,BLOCKED,END} implement a turnstile for all
931 : threads in a process. Only one thread can be in the turnstile at a
932 : time. Usage:
933 :
934 : FD_TURNSTILE_BEGIN(blocking) {
935 :
936 : ... At this point, we are the only thread executing this block of
937 : ... code.
938 : ...
939 : ... Do operations that must be done by threads one-at-a-time
940 : ... here.
941 : ...
942 : ... Because compiler memory fences are done just before entering
943 : ... and after exiting this block, there is typically no need to
944 : ... use any atomics / volatile / fencing here. That is, we can
945 : ... just write "normal" code on platforms where writes to memory
946 : ... become visible to other threads in the order in which they
947 : ... were issued in the machine code (e.g. x86) as others will not
948 : ... proceed with this block until they exit it. YMMV for non-x86
949 : ... platforms (probably need additional hardware store fences in
950 : ... these macros).
951 : ...
952 : ... It is safe to use "break" and/or "continue" within this
953 : ... block. The block will exit with the appropriate compiler
954 : ... fencing and unlocking. Execution will resume immediately
955 : ... after FD_TURNSTILE_END.
956 :
957 : ... IMPORTANT SAFETY TIP! DO NOT RETURN FROM THIS BLOCK.
958 :
959 : } FD_TURNSTILE_BLOCKED {
960 :
961 : ... At this point, there was another thread in the turnstile when
962 : ... we tried to enter the turnstile.
963 : ...
964 : ... Handle blocked here.
965 : ...
966 : ... On exiting this block, if blocking was zero, we will resume
967 : ... execution immediately after FD_TURNSTILE_END. If blocking
968 : ... was non-zero, we will resume execution immediately before
969 : ... FD_TURNSTILE_BEGIN (e.g. we will retry again after a short
970 : ... spin pause).
971 : ...
972 : ... It is safe to use "break" and/or "continue" within this
973 : ... block. Both will exit this block and resume execution
974 : ... at the location indicated as per what blocking specified
975 : ... when the turnstile was entered.
976 : ...
977 : ... It is technically safe to return from this block but
978 : ... also extremely gross.
979 :
980 : } FD_TURNSTILE_END; */
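
/* For example, an illustrative one time initialization sketch
   (fd_example_table, fd_example_table_init and fd_example_boot are
   hypothetical):

     static ulong fd_example_table[ 256 ];

     void
     fd_example_boot( void ) {
       FD_ONCE_BEGIN {
         fd_example_table_init( fd_example_table );  // at most once per process
       } FD_ONCE_END;
       // every thread reaching this point sees an initialized table
     }
*/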
981 :
982 : #if FD_HAS_THREADS /* Potentially more than one thread in the process */
983 :
984 : #ifndef FD_TL
985 : #define FD_TL __thread
986 : #endif
987 :
988 5318 : #define FD_ONCE_BEGIN do { \
989 5318 : FD_COMPILER_MFENCE(); \
990 5318 : static volatile int _fd_once_block_state = 0; \
991 5318 : for(;;) { \
992 5318 : int _fd_once_block_tmp = _fd_once_block_state; \
993 5318 : if( FD_LIKELY( _fd_once_block_tmp>0 ) ) break; \
994 5318 : if( FD_LIKELY( !_fd_once_block_tmp ) && \
995 2494 : FD_LIKELY( !FD_ATOMIC_CAS( &_fd_once_block_state, 0, -1 ) ) ) { \
996 2494 : do
997 :
998 : #define FD_ONCE_END \
999 2494 : while(0); \
1000 2494 : FD_COMPILER_MFENCE(); \
1001 0 : _fd_once_block_state = 1; \
1002 0 : break; \
1003 2494 : } \
1004 2494 : FD_YIELD(); \
1005 0 : } \
1006 5318 : } while(0)
1007 :
1008 36 : #define FD_THREAD_ONCE_BEGIN do { \
1009 36 : static FD_TL int _fd_thread_once_block_state = 0; \
1010 36 : if( FD_UNLIKELY( !_fd_thread_once_block_state ) ) { \
1011 9 : do
1012 :
1013 : #define FD_THREAD_ONCE_END \
1014 9 : while(0); \
1015 9 : _fd_thread_once_block_state = 1; \
1016 9 : } \
1017 36 : } while(0)
1018 :
1019 9 : #define FD_TURNSTILE_BEGIN(blocking) do { \
1020 9 : static volatile int _fd_turnstile_state = 0; \
1021 9 : int _fd_turnstile_blocking = (blocking); \
1022 9 : for(;;) { \
1023 9 : int _fd_turnstile_tmp = _fd_turnstile_state; \
1024 9 : if( FD_LIKELY( !_fd_turnstile_tmp ) && \
1025 9 : FD_LIKELY( !FD_ATOMIC_CAS( &_fd_turnstile_state, 0, 1 ) ) ) { \
1026 9 : FD_COMPILER_MFENCE(); \
1027 9 : do
1028 :
1029 : #define FD_TURNSTILE_BLOCKED \
1030 9 : while(0); \
1031 9 : FD_COMPILER_MFENCE(); \
1032 9 : _fd_turnstile_state = 0; \
1033 9 : FD_COMPILER_MFENCE(); \
1034 9 : break; \
1035 9 : } \
1036 9 : FD_COMPILER_MFENCE(); \
1037 0 : do
1038 :
1039 : #define FD_TURNSTILE_END \
1040 0 : while(0); \
1041 0 : FD_COMPILER_MFENCE(); \
1042 0 : if( !_fd_turnstile_blocking ) break; /* likely compile time */ \
1043 0 : FD_SPIN_PAUSE(); \
1044 0 : } \
1045 9 : } while(0)
1046 :
1047 : #else /* Only one thread in the process */
1048 :
1049 : #ifndef FD_TL
1050 : #define FD_TL /**/
1051 : #endif
1052 :
1053 : #define FD_ONCE_BEGIN do { \
1054 : static int _fd_once_block_state = 0; \
1055 : if( FD_UNLIKELY( !_fd_once_block_state ) ) { \
1056 : do
1057 :
1058 : #define FD_ONCE_END \
1059 : while(0); \
1060 : _fd_once_block_state = 1; \
1061 : } \
1062 : } while(0)
1063 :
1064 : #define FD_THREAD_ONCE_BEGIN FD_ONCE_BEGIN
1065 : #define FD_THREAD_ONCE_END FD_ONCE_END
1066 :
1067 : #define FD_TURNSTILE_BEGIN(blocking) do { \
1068 : (void)(blocking); \
1069 : FD_COMPILER_MFENCE(); \
1070 : if( 1 ) { \
1071 : do
1072 :
1073 : #define FD_TURNSTILE_BLOCKED \
1074 : while(0); \
1075 : } else { \
1076 : do
1077 :
1078 : #define FD_TURNSTILE_END \
1079 : while(0); \
1080 : } \
1081 : FD_COMPILER_MFENCE(); \
1082 : } while(0)
1083 :
1084 : #endif
1085 :
1086 : /* An ideal fd_clock_func_t is a function such that:
1087 :
1088 : long dx = clock( args );
1089 : ... stuff ...
1090 : dx = clock( args ) - dx;
1091 :
1092 : yields a strictly positive dx where dx approximates the amount of
1093 : wallclock time elapsed on the caller in some clock specific unit
1094 : (e.g. nanoseconds, CPU ticks, etc) for a reasonable amount of "stuff"
1095 : (including no "stuff"). args allows arbitrary clock specific context
1096 : to be passed to the clock implementation. (clocks that need a non-const
1097 : args can cast away the const in the implementation or cast the
1098 : function pointer as necessary.) */
1099 :
1100 : typedef long (*fd_clock_func_t)( void const * args );
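
/* For example, _fd_tickcount below is an fd_clock_func_t compatible
   wrapper around fd_tickcount, so an interval can be measured through
   a generic clock as sketched here:

     fd_clock_func_t clock_func = _fd_tickcount;
     void const *    clock_args = NULL;

     long dx = clock_func( clock_args );
     // ... stuff ...
     dx = clock_func( clock_args ) - dx;   // elapsed, in clock specific units
*/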
1101 :
1102 : FD_PROTOTYPES_BEGIN
1103 :
1104 : /* fd_memcpy(d,s,sz): On modern x86 in some circumstances, rep mov will
1105 : be faster than memcpy under the hood (basically due to RFO /
1106 : read-for-ownership optimizations in the cache protocol under the hood
1107 : that aren't easily done from the ISA ... see Intel docs on enhanced
1108 : rep mov). Compile time configurable though as this is not always
1109 : true. So applications can tune to taste. Hard to beat rep mov for
1110 : code density though (2 bytes) and pretty hard to beat in situations
1111 : needing a completely generic memcpy. But it can be beaten in
1112 : specialized situations for the usual reasons. */
1113 :
1114 : /* FIXME: CONSIDER MEMCMP TOO! */
1115 : /* FIXME: CONSIDER MEMCPY RELATED FUNC ATTRS */
1116 :
1117 : #ifndef FD_USE_ARCH_MEMCPY
1118 : #define FD_USE_ARCH_MEMCPY 0
1119 : #endif
1120 :
1121 : #if FD_HAS_X86 && FD_USE_ARCH_MEMCPY && !defined(CBMC) && !FD_HAS_DEEPASAN && !FD_HAS_MSAN
1122 :
1123 : static inline void *
1124 : fd_memcpy( void * FD_RESTRICT d,
1125 : void const * FD_RESTRICT s,
1126 523352916 : ulong sz ) {
1127 523352916 : void * p = d;
1128 523352916 : __asm__ __volatile__( "rep movsb" : "+D" (p), "+S" (s), "+c" (sz) :: "memory" );
1129 523352916 : return d;
1130 523352916 : }
1131 :
1132 : #elif FD_HAS_MSAN
1133 :
1134 : void * __msan_memcpy( void * dest, void const * src, ulong n );
1135 :
1136 : static inline void *
1137 : fd_memcpy( void * FD_RESTRICT d,
1138 : void const * FD_RESTRICT s,
1139 : ulong sz ) {
1140 : return __msan_memcpy( d, s, sz );
1141 : }
1142 :
1143 : #else
1144 :
1145 : static inline void *
1146 : fd_memcpy( void * FD_RESTRICT d,
1147 : void const * FD_RESTRICT s,
1148 977151956 : ulong sz ) {
1149 : #if defined(CBMC) || FD_HAS_ASAN
1150 : if( FD_UNLIKELY( !sz ) ) return d; /* Standard says sz 0 is UB, uncomment if target is insane and doesn't treat sz 0 as a nop */
1151 : #endif
1152 977151956 : return memcpy( d, s, sz );
1153 977151956 : }
1154 :
1155 : #endif
1156 :
1157 : /* fd_memset(d,c,sz): architecturally optimized memset. See fd_memcpy
1158 : for considerations. */
1159 :
1160 : /* FIXME: CONSIDER MEMSET RELATED FUNC ATTRS */
1161 :
1162 : #ifndef FD_USE_ARCH_MEMSET
1163 : #define FD_USE_ARCH_MEMSET 0
1164 : #endif
1165 :
1166 : #if FD_HAS_X86 && FD_USE_ARCH_MEMSET && !defined(CBMC) && !FD_HAS_DEEPASAN && !FD_HAS_MSAN
1167 :
1168 : static inline void *
1169 : fd_memset( void * d,
1170 : int c,
1171 121724036 : ulong sz ) {
1172 121724036 : void * p = d;
1173 121724036 : __asm__ __volatile__( "rep stosb" : "+D" (p), "+c" (sz) : "a" (c) : "memory" );
1174 121724036 : return d;
1175 121724036 : }
1176 :
1177 : #else
1178 :
1179 : static inline void *
1180 : fd_memset( void * d,
1181 : int c,
1182 241944234 : ulong sz ) {
1183 : # ifdef CBMC
1184 : if( FD_UNLIKELY( !sz ) ) return d; /* See fd_memcpy note */
1185 : # endif
1186 241944234 : return memset( d, c, sz );
1187 241944234 : }
1188 :
1189 : #endif
1190 :
1191 : /* C23 has memset_explicit, i.e. a memset that can't be removed by the
1192 : optimizer. This is our own equivalent. */
1193 :
1194 : static void * (* volatile fd_memset_explicit)(void *, int, size_t) = memset;
1195 :
1196 : /* fd_memeq(s0,s1,sz): Compares two blocks of memory. Returns 1 if
1197 : equal or sz is zero and 0 otherwise. No memory accesses made if sz
1198 : is zero (pointers may be invalid). On x86, uses repe cmpsb which is
1199 : preferable to __builtin_memcmp in some cases. */
1200 :
1201 : #ifndef FD_USE_ARCH_MEMEQ
1202 : #define FD_USE_ARCH_MEMEQ 0
1203 : #endif
1204 :
1205 : #if FD_HAS_X86 && FD_USE_ARCH_MEMEQ && defined(__GCC_ASM_FLAG_OUTPUTS__) && __STDC_VERSION__>=199901L
1206 :
1207 : FD_FN_PURE static inline int
1208 : fd_memeq( void const * s0,
1209 : void const * s1,
1210 : ulong sz ) {
1211 : /* ZF flag is set and exported in two cases:
1212 : a) size is zero (via test)
1213 : b) buffer is equal (via repe cmpsb) */
1214 : int r;
1215 : __asm__( "test %3, %3;"
1216 : "repe cmpsb"
1217 : : "=@cce" (r), "+S" (s0), "+D" (s1), "+c" (sz)
1218 : : "m" (*(char const (*)[sz]) s0), "m" (*(char const (*)[sz]) s1)
1219 : : "cc" );
1220 : return r;
1221 : }
1222 :
1223 : #else
1224 :
1225 : FD_FN_PURE static inline int
1226 : fd_memeq( void const * s1,
1227 : void const * s2,
1228 15552536 : ulong sz ) {
1229 15552536 : return 0==memcmp( s1, s2, sz );
1230 15552536 : }
1231 :
1232 : #endif
1233 :
1234 : /* fd_hash(seed,buf,sz), fd_hash_memcpy(seed,d,s,sz): High quality
1235 : (full avalanche) high speed variable length buffer -> 64-bit hash
1236 : function (fd_hash_memcpy is often as fast as plain memcpy). Based on
1237 : the xxhash-r39 (open source BSD licensed) implementation. In-place
1238 : and out-of-place variants provided (out-of-place variant assumes dst
1239 : and src do not overlap). Caller promises valid input arguments,
1240 : cannot fail given valid inputs arguments. sz==0 is fine. */
1241 :
1242 : FD_FN_PURE ulong
1243 : fd_hash( ulong seed,
1244 : void const * buf,
1245 : ulong sz );
1246 :
1247 : ulong
1248 : fd_hash_memcpy( ulong seed,
1249 : void * FD_RESTRICT d,
1250 : void const * FD_RESTRICT s,
1251 : ulong sz );
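
/* For example:

     ulong h = fd_hash( 0UL, msg, msg_sz );   // 64-bit hash of the msg_sz bytes at msg

   where different seed values yield different hashes of the same
   input (msg / msg_sz are hypothetical). */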
1252 :
1253 : #ifndef FD_TICKCOUNT_STYLE
1254 : #if FD_HAS_X86 /* Use RDTSC */
1255 : #define FD_TICKCOUNT_STYLE 1
1256 : #elif FD_HAS_ARM /* Use CNTVCT_EL0 */
1257 : #define FD_TICKCOUNT_STYLE 2
1258 : #else /* Use portable fallback */
1259 : #define FD_TICKCOUNT_STYLE 0
1260 : #endif
1261 : #endif
1262 :
1263 : #if FD_TICKCOUNT_STYLE==0 /* Portable fallback (slow). Ticks at 1 ns / tick */
1264 :
1265 : #define fd_tickcount() fd_log_wallclock() /* TODO: fix ugly pre-log usage */
1266 :
1267 : #elif FD_TICKCOUNT_STYLE==1 /* RDTSC (fast) */
1268 :
1269 : /* fd_tickcount: Reads the hardware invariant tickcounter ("RDTSC").
1270 : This monotonically increases at an approximately constant rate
1271 : relative to the system wallclock and is synchronous across all CPUs
1272 : on a host.
1273 :
1274 : The rate this ticks at is not precisely defined (see Intel docs for
1275 : more details) but it is typically in the ballpark of the CPU base
1276 : clock frequency. The relationship to the wallclock is very well
1277 : approximated as linear over short periods of time (i.e. less than a
1278 : fraction of a second) and this should not exhibit any sudden changes
1279 : in its rate relative to the wallclock. Notably, its rate is not
1280 : directly impacted by CPU clock frequency adaptation / Turbo mode (see
1281 : other Intel performance monitoring counters for various CPU cycle
1282 : counters). It can drift over longer period time for the usual clock
1283 : synchronization reasons.
1284 :
1285 : This is a reasonably fast O(1) cost (~6-8 ns on recent Intel).
1286 : Because of all the compiler optimization and parallel execution going
1287 : on in modern CPU cores, other instructions might be reordered around this
1288 : by the compiler and/or CPU. It is up to the user to do lower level
1289 : tricks as necessary when the precise location of this in the
1290 : execution stream and/or when executed by the CPU is needed. (This is
1291 : often unnecessary as such levels of precision are not frequently
1292 : required and often have self-defeating overheads.)
1293 :
1294 : It is worth noting that RDTSC and/or (even more frequently) lower
1295 : level performance counters are often restricted from use in user
1296 : space applications. It is recommended that applications use this
1297 : primarily for debugging / performance tuning on unrestricted hosts
1298 : and/or when the developer is confident that applications using this
1299 : will have appropriate permissions when deployed. */
1300 :
1301 4921489950 : #define fd_tickcount() ((long)__builtin_ia32_rdtsc())
1302 :
1303 : #elif FD_TICKCOUNT_STYLE==2 /* armv8 (fast) */
1304 :
1305 : /* fd_tickcount (ARM): https://developer.arm.com/documentation/ddi0601/2021-12/AArch64-Registers/CNTVCT-EL0--Counter-timer-Virtual-Count-register
1306 : Approx 24 MHz on Apple M1. */
1307 :
1308 : static inline long
1309 : fd_tickcount( void ) {
1310 : /* consider using 'isb' */
1311 : ulong value;
1312 : __asm__ __volatile__ (
1313 : "isb\n"
1314 : "mrs %0, cntvct_el0\n"
1315 : "nop"
1316 : : "=r" (value) );
1317 : return (long)value;
1318 : }
1319 :
1320 : #else
1321 : #error "Unknown FD_TICKCOUNT_STYLE"
1322 : #endif
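
/* For example, an illustrative sketch of timing a region of code:

     long tic = fd_tickcount();
     // ... region to measure ...
     long toc = fd_tickcount();
     long dt  = toc - tic;   // approximate elapsed ticks

   Ticks are in a clock specific unit (see above); converting to
   wallclock time requires knowing the tick rate separately. */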
1323 :
1324 : long _fd_tickcount( void const * _ ); /* fd_clock_func_t compat */
1325 :
1326 : #if FD_HAS_HOSTED
1327 :
1328 : /* fd_yield yields the calling thread to the operating system scheduler. */
1329 :
1330 : void
1331 : fd_yield( void );
1332 :
1333 : #endif
1334 :
1335 : FD_PROTOTYPES_END
1336 :
1337 : #endif /* HEADER_fd_src_util_fd_util_base_h */
|