#ifndef HEADER_fd_src_util_fd_util_base_h
#define HEADER_fd_src_util_fd_util_base_h

/* Base development environment */

/* Compiler checks ****************************************************/

#ifdef __cplusplus

#if __cplusplus<201703L
#error "Firedancer requires C++17 or later"
#endif

#else

#if __STDC_VERSION__<201710L
#error "Firedancer requires C Standard version C17 or later"
#endif

#endif //__cplusplus

/* Build target capabilities ******************************************/

/* Different build targets often have different levels of support for
   various language and hardware features.  The presence of various
   features can be tested at preprocessor, compile, or run time via the
   below capability macros.

   Code that does not exploit any of these capabilities written within
   the base development environment should be broadly portable across a
   range of build targets ranging from on-chain virtual machines to
   commodity hosts to custom hardware.

   As such, highly portable yet high performance code is possible by
   writing generic implementations that do not exploit any of the below
   capabilities as a portable fallback along with build target specific
   optimized implementations that are invoked when the build target
   supports the appropriate capabilities.
   The base development environment itself provides lots of
   functionality to help with implementing portable fallbacks while
   making very minimal assumptions about the build targets and zero use
   of 3rd party libraries (these might make unknown additional
   assumptions about the build target, including availability of a
   quality implementation of the library on the build target). */
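
/* For example (an illustrative sketch of the pattern described above;
   my_op is a hypothetical function), a compilation unit might pair a
   portable fallback with a capability gated implementation:

     #if FD_HAS_AVX
     ... AVX accelerated implementation of my_op ...
     #else
     ... portable fallback implementation of my_op ...
     #endif
*/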

/* FD_HAS_HOSTED: If the build target is hosted (e.g. resides on a host
   with a POSIX-ish environment ... practically speaking, stdio.h,
   stdlib.h, unistd.h, et al more or less behave normally ...
   pedantically XOPEN_SOURCE=700), FD_HAS_HOSTED will be 1.  It will be
   zero otherwise. */

#ifndef FD_HAS_HOSTED
#define FD_HAS_HOSTED 0
#endif

/* FD_HAS_ATOMIC: If the build target supports atomic operations
   between threads accessing a common memory region (including threads
   that reside in different processes on a host communicating via a
   shared memory region with potentially different local virtual
   mappings), FD_HAS_ATOMIC will be 1.  Practically speaking, does
   atomic compare-and-swap et al work? */

#ifndef FD_HAS_ATOMIC
#define FD_HAS_ATOMIC 0
#endif

/* FD_HAS_THREADS: If the build target supports a POSIX-ish notion of
   threads (e.g. practically speaking, global variables declared within
   a compile unit are visible to more than one thread of execution,
   pthreads.h / threading parts of C standard, the atomics parts of the
   C standard, ... more or less work normally), FD_HAS_THREADS will be
   1.  It will be zero otherwise.  FD_HAS_THREADS implies FD_HAS_HOSTED
   and FD_HAS_ATOMIC. */

#ifndef FD_HAS_THREADS
#define FD_HAS_THREADS 0
#endif

/* FD_HAS_INT128: If the build target supports reasonably efficient
   128-bit wide integer operations, define FD_HAS_INT128 to 1 to enable
   use of them in implementations. */

#ifndef FD_HAS_INT128
#define FD_HAS_INT128 0
#endif

/* FD_HAS_DOUBLE: If the build target supports reasonably efficient
   IEEE 754 64-bit wide double precision floating point operations,
   define FD_HAS_DOUBLE to 1 to enable use of them in implementations.
   Note that, even if the build target does not, va_args handling in
   the C / C++ language requires promotion of a float in a va_arg list
   to a double.  Thus, C / C++ language support for IEEE 754 float also
   implies a minimum level of support for double (though not
   necessarily efficient or IEEE 754).  That is, even if a target does
   not have FD_HAS_DOUBLE, there might still be limited use of double
   in va_arg list handling. */

#ifndef FD_HAS_DOUBLE
#define FD_HAS_DOUBLE 0
#endif

/* FD_HAS_ALLOCA: If the build target supports fast alloca-style
   dynamic stack memory allocation (e.g. alloca.h / __builtin_alloca
   more or less work normally), define FD_HAS_ALLOCA to 1 to enable use
   of it in implementations. */

#ifndef FD_HAS_ALLOCA
#define FD_HAS_ALLOCA 0
#endif

/* FD_HAS_X86: If the build target supports x86 specific features and
   can benefit from x86 specific optimizations, define FD_HAS_X86.
   Code needing more specific target features (Intel / AMD / SSE /
   AVX2 / AVX512 / etc) can specialize further as necessary with even
   more precise capabilities (that in turn imply FD_HAS_X86). */

#ifndef FD_HAS_X86
#define FD_HAS_X86 0
#endif

/* These allow even more precise targeting for X86. */

/* FD_HAS_SSE indicates the target supports Intel SSE4 style SIMD
   (basically do the 128-bit wide parts of "x86intrin.h" work).
   Recommend using the simd/fd_sse.h APIs instead of raw Intel
   intrinsics for readability and to facilitate portability to non-x86
   platforms.  Implies FD_HAS_X86. */

#ifndef FD_HAS_SSE
#define FD_HAS_SSE 0
#endif

/* FD_HAS_AVX indicates the target supports Intel AVX2 style SIMD
   (basically do the 256-bit wide parts of "x86intrin.h" work).
   Recommend using the simd/fd_avx.h APIs instead of raw Intel
   intrinsics for readability and to facilitate portability to non-x86
   platforms.  Implies FD_HAS_SSE. */

#ifndef FD_HAS_AVX
#define FD_HAS_AVX 0
#endif

/* FD_HAS_AVX512 indicates the target supports Intel AVX-512 style SIMD
   (basically do the 512-bit wide parts of "x86intrin.h" work).
   Recommend using the simd/fd_avx512.h APIs instead of raw Intel
   intrinsics for readability and to facilitate portability to non-x86
   platforms.  Implies FD_HAS_AVX. */

#ifndef FD_HAS_AVX512
#define FD_HAS_AVX512 0
#endif

/* FD_HAS_SHANI indicates that the target supports Intel SHA extensions
   which accelerate SHA-1 and SHA-256 computation.  This extension is
   also called SHA-NI or SHA_NI (Secure Hash Algorithm New
   Instructions).  Although proposed in 2013, it is only supported on
   Intel Ice Lake and AMD Zen CPUs and newer.  Implies FD_HAS_AVX. */

#ifndef FD_HAS_SHANI
#define FD_HAS_SHANI 0
#endif

/* FD_HAS_GFNI indicates that the target supports Intel Galois Field
   extensions, which accelerate operations over binary extension
   fields, especially GF(2^8).  These instructions are supported on
   Intel Ice Lake and newer and AMD Zen4 and newer CPUs.  Implies
   FD_HAS_AVX. */

#ifndef FD_HAS_GFNI
#define FD_HAS_GFNI 0
#endif

/* FD_HAS_AESNI indicates that the target supports AES-NI extensions,
   which accelerate AES encryption and decryption.  While AVX predates
   the original AES-NI extension, the combination of AES-NI+AVX adds
   additional opcodes (such as vaesenc, a more flexible variant of
   aesenc).  Thus, implies FD_HAS_AVX.  A conservative estimate for
   minimum platform support is Intel Haswell or AMD Zen. */

#ifndef FD_HAS_AESNI
#define FD_HAS_AESNI 0
#endif

/* FD_HAS_LZ4 indicates that the target supports LZ4 compression.
   Roughly, does "#include <lz4.h>" and the APIs therein work? */

#ifndef FD_HAS_LZ4
#define FD_HAS_LZ4 0
#endif

/* FD_HAS_ZSTD indicates that the target supports ZSTD compression.
   Roughly, does "#include <zstd.h>" and the APIs therein work? */

#ifndef FD_HAS_ZSTD
#define FD_HAS_ZSTD 0
#endif

/* FD_HAS_COVERAGE indicates that the build target is built with
   coverage instrumentation. */

#ifndef FD_HAS_COVERAGE
#define FD_HAS_COVERAGE 0
#endif

/* FD_HAS_ASAN indicates that the build target is using ASAN. */

#ifndef FD_HAS_ASAN
#define FD_HAS_ASAN 0
#endif

/* FD_HAS_UBSAN indicates that the build target is using UBSAN. */

#ifndef FD_HAS_UBSAN
#define FD_HAS_UBSAN 0
#endif

/* FD_HAS_DEEPASAN indicates that the build target is using ASAN with
   manual memory poisoning for fd_alloc, fd_wksp, and fd_scratch. */

#ifndef FD_HAS_DEEPASAN
#define FD_HAS_DEEPASAN 0
#endif

/* Base development environment ***************************************/

/* The functionality provided by these vanilla headers is always
   available within the base development environment.  Notably, stdio.h
   / stdlib.h / et al are not included here as these make lots of
   assumptions about the build target that may not be true (especially
   for on-chain and custom hardware use).  Code should prefer the fd
   util equivalents for such functionality when possible. */

#include <stdalign.h>
#include <string.h>
#include <limits.h>
#include <float.h>

/* Work around some library naming irregularities */
/* FIXME: Consider this for FLOAT/FLT, DOUBLE/DBL too? */

#define SHORT_MIN  SHRT_MIN
#define SHORT_MAX  SHRT_MAX
#define USHORT_MAX USHRT_MAX

/* Primitive types ****************************************************/

/* These typedefs provide single token regularized names for all the
   primitive types in the base development environment:

     char !
     schar !   short   int    long    int128 !!
     uchar     ushort  uint   ulong   uint128 !!
     float
     double !!!

   !   Does not assume the sign of char.  A naked char should be
       treated as a cstr character and mathematical operations should
       be avoided on them.  This is less than ideal as the patterns for
       integer types in the C/C++ language spec itself are far more
       consistent with a naked char naturally being treated as signed
       (see above).  But there are lots of conflicts between
       architectures, languages and standard libraries about this so
       any use of a naked char shouldn't assume the sign ... sigh.

   !!  Only available if FD_HAS_INT128 is defined.

   !!! Should only be used if FD_HAS_DOUBLE is defined but see note in
       FD_HAS_DOUBLE about C/C++ silent promotions of float to double
       in va_arg lists.

   Note also that these token names more naturally interoperate with
   integer constant declarations, type generic code generation
   techniques, and printf-style format strings than the stdint.h /
   inttypes.h handling.

   To minimize portability issues, unexpected silent type conversion
   issues, align with typical developer implicit usage, align with
   typical build target usage, ..., assumes char / short / int / long
   are 8 / 16 / 32 / 64-bit twos complement integers and float is
   IEEE-754 single precision.  Further assumes little endian,
   truncating signed integer division, sign extending (arithmetic)
   signed right shift and that signed left shift behaves the same as an
   unsigned left shift from a bit operations point of view (technically
   the standard says signed left shift is undefined if the result would
   overflow).  Also, except for int128/uint128, assumes that aligned
   access to these will be naturally atomic.  Lastly assumes that
   unaligned access to these is functionally valid but does not assume
   that unaligned access to these is efficient or atomic.

   For values meant to be held in registers, code should prefer long /
   ulong types (improves asm generation given the prevalence of 64-bit
   targets and also to avoid lots of tricky bugs with silent promotions
   in the language ... e.g. ushort should ideally only be used for
   in-memory representations).

   These are currently not prefixed given how often they are used.  If
   this becomes problematic prefixes can be added as necessary.
   Specifically, C++ allows typedefs to be defined multiple times so
   long as they are equivalent.  Inequivalent collisions are not
   supported but should be rare (e.g. if a 3rd party header thinks
   "ulong" should be something other than an "unsigned long", the 3rd
   party header probably should be nuked from orbit).  C11 and forward
   also allow multiple equivalent typedefs.  C99 and earlier don't but
   this is typically only a warning and then only if pedantic warnings
   are enabled.  Thus, if we want to support users using C99 and
   earlier who want to do a strict compile and have a superfluous
   collision with these types in other libraries, uncomment the below
   (or do something equivalent for the compiler). */

//#pragma GCC diagnostic push
//#pragma GCC diagnostic ignored "-Wpedantic"

typedef signed char schar; /* See above note of sadness */

typedef unsigned char  uchar;
typedef unsigned short ushort;
typedef unsigned int   uint;
typedef unsigned long  ulong;

#if FD_HAS_INT128

__extension__ typedef          __int128  int128;
__extension__ typedef unsigned __int128 uint128;

#define UINT128_MAX (~(uint128)0)
#define  INT128_MAX ((int128)(UINT128_MAX>>1))
#define  INT128_MIN (-INT128_MAX-(int128)1)

#endif

//#pragma GCC diagnostic pop

/* Compiler tricks ****************************************************/

/* FD_STRINGIFY,FD_CONCAT{2,3,4}: Various macros for token
   stringification and pasting.  FD_STRINGIFY returns the argument as a
   cstr (e.g. FD_STRINGIFY(foo) -> "foo").  FD_CONCAT* pastes the
   tokens together into a single token (e.g. FD_CONCAT3(a,b,c) -> abc).
   The EXPAND variants first expand their arguments and then do the
   token operation (e.g. FD_EXPAND_THEN_STRINGIFY(__LINE__) -> "104" if
   done on line 104 of the source code file). */

#define FD_STRINGIFY(x)#x
#define FD_CONCAT2(a,b)a##b
#define FD_CONCAT3(a,b,c)a##b##c
#define FD_CONCAT4(a,b,c,d)a##b##c##d

#define FD_EXPAND_THEN_STRINGIFY(x)FD_STRINGIFY(x)
#define FD_EXPAND_THEN_CONCAT2(a,b)FD_CONCAT2(a,b)
#define FD_EXPAND_THEN_CONCAT3(a,b,c)FD_CONCAT3(a,b,c)
#define FD_EXPAND_THEN_CONCAT4(a,b,c,d)FD_CONCAT4(a,b,c,d)
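
/* For instance (an illustrative note, not an API): FD_CONCAT2(fd_,__LINE__)
   pastes the literal token __LINE__ (yielding fd___LINE__) while
   FD_EXPAND_THEN_CONCAT2(fd_,__LINE__) expands __LINE__ first,
   yielding e.g. fd_104 on line 104.  This is why the EXPAND variants
   exist. */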

/* FD_VA_ARGS_SELECT(__VA_ARGS__,e32,e31,...e1): Macro that expands to
   en at compile time where n is the number of items in the
   __VA_ARGS__ list.  If __VA_ARGS__ is empty, returns e1.  Assumes
   __VA_ARGS__ has at most 32 arguments.  Useful for making a variadic
   macro whose behavior depends on the number of arguments in
   __VA_ARGS__. */

#define FD_VA_ARGS_SELECT(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,a,b,c,d,e,f,_,...)_
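
/* Example (an illustrative sketch; MY_SUM* are hypothetical): a macro
   that dispatches on whether it is given one or two arguments.  The
   selector list must contain exactly 32 entries; the x's pad out the
   unused argument counts:

     #define MY_SUM_1(a)   (a)
     #define MY_SUM_2(a,b) ((a)+(b))
     #define MY_SUM(...)                        \
       FD_VA_ARGS_SELECT( __VA_ARGS__,          \
         x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,       \
         x,x,x,x,x,x,x,x,x,x,x,x,x,x,           \
         MY_SUM_2, MY_SUM_1 )( __VA_ARGS__ )

   MY_SUM(1) -> (1) and MY_SUM(1,2) -> ((1)+(2)). */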

/* FD_SRC_LOCATION returns a const cstr holding the line of code where
   FD_SRC_LOCATION was used. */

#define FD_SRC_LOCATION __FILE__ "(" FD_EXPAND_THEN_STRINGIFY(__LINE__) ")"

/* FD_STATIC_ASSERT tests at compile time if c is non-zero.  If not,
   it aborts the compile with an error.  err itself should be a token
   (e.g. not a string, no whitespace, etc). */

#ifdef __cplusplus
#define FD_STATIC_ASSERT(c,err) static_assert(c, #err)
#else
#define FD_STATIC_ASSERT(c,err) _Static_assert(c, #err)
#endif
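
/* E.g. FD_STATIC_ASSERT( sizeof(ulong)==8UL, bad_ulong_size ); fails
   the compile on any build target where ulong is not 8 bytes (note
   the err argument is given as a token per the above). */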

/* FD_ADDRESS_OF_PACKED_MEMBER(x): Linguistically does &(x) but without
   recent compiler complaints that &x might be unaligned if x is a
   member of a packed datastructure.  (Often needed for interfacing
   with hardware / packets / etc.) */

#define FD_ADDRESS_OF_PACKED_MEMBER( x ) (__extension__({                                      \
    char * _fd_aopm = (char *)&(x);                                                            \
    __asm__( "# FD_ADDRESS_OF_PACKED_MEMBER(" #x ") @" FD_SRC_LOCATION : "+r" (_fd_aopm) :: ); \
    (__typeof__(&(x)))_fd_aopm;                                                                \
  }))
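
/* Example (illustrative): getting a pointer to a field of a packed
   wire-format header without -Waddress-of-packed-member warnings
   (struct my_hdr is hypothetical):

     struct __attribute__((packed)) my_hdr { uchar tag; uint seq; };
     struct my_hdr h;
     uint * seq = FD_ADDRESS_OF_PACKED_MEMBER( h.seq );

   Note seq above may be unaligned; access it accordingly. */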

/* FD_PROTOTYPES_{BEGIN,END}: Headers that might be included in C++
   source should encapsulate the prototypes of code and globals
   contained in compilation units compiled as C with a
   FD_PROTOTYPES_{BEGIN,END} pair. */

#ifdef __cplusplus
#define FD_PROTOTYPES_BEGIN extern "C" {
#else
#define FD_PROTOTYPES_BEGIN
#endif

#ifdef __cplusplus
#define FD_PROTOTYPES_END }
#else
#define FD_PROTOTYPES_END
#endif

/* FD_ASM_LG_ALIGN(lg_n) expands to an alignment assembler directive
   appropriate for the current architecture/ABI.  The resulting align
   is 2^(lg_n) bytes, i.e. FD_ASM_LG_ALIGN(3) aligns by 8 bytes. */

#if defined(__aarch64__)
#define FD_ASM_LG_ALIGN(lg_n) ".align " #lg_n "\n"
#elif defined(__x86_64__) || defined(__powerpc64__) || defined(__riscv)
#define FD_ASM_LG_ALIGN(lg_n) ".p2align " #lg_n "\n"
#endif

/* FD_IMPORT declares a variable name and initializes with the contents
   of the file at path (with potentially some assembly directives for
   additional footer info).  It is equivalent to:

     type const name[] __attribute__((aligned(align))) = {

       ... code that would initialize the contents of name to the
       ... raw binary data found in the file at path at compile time
       ... (with any appended information as specified by footer)

     };

     ulong const name_sz = ... number of bytes pointed to by name;
   More precisely, this creates a symbol "name" in the object file that
   points to a read-only copy of the raw data in the file at "path" as
   it was at compile time.  2^lg_align specifies the minimum alignment
   required for the copy's first byte as an unsuffixed decimal integer.
   footer gives assembly commands that permit additional data to be
   appended to the copy (use "" for footer if no footer is necessary).

   Then it exposes a pointer to this copy in the current compilation
   unit as name and the byte size as name_sz.  name_sz covers the first
   byte of the included data to the last byte of the footer inclusive.

   The dummy linker symbol _fd_import_name_sz will also be created in
   the object file as part of some under-the-hood magic to make this
   work.  This symbol should not be used in any compile unit as some
   compilers (I'm looking at you clang-15, but apparently not clang-10)
   will sometimes mangle its value from what it was set to in the
   object file, even when marked as absolute in the object file.

   This should only be used at global scope and should be done at most
   once over all object files / libraries used to make a program.  If
   other compilation units want to make use of an import in a different
   compilation unit, they should declare:

     extern type const name[] __attribute__((aligned(align)));

   and/or:

     extern ulong const name_sz;

   as necessary (that is, do the usual to use name and name_sz as shown
   for the pseudo code above).

   Important safety tip!  gcc -M will generally not detect the
   dependency this creates between the importing file and the imported
   file.  This can cause incremental builds to miss changes to the
   imported file.  Ideally, we would have FD_IMPORT automatically do
   something like:

     _Pragma( "GCC dependency \"" path "\"" )

   This doesn't work as is because _Pragma needs some macro expansion
   hacks to accept this (this is doable).  After that workaround, this
   still doesn't work because, due to tooling limitations, the pragma
   path is relative to the source file directory and the FD_IMPORT path
   is relative to the make directory (working around this would require
   a __FILE__-like directive for the source code directory base path).
   Even if that did exist, it might still not work because out-of-tree
   builds often require some substitutions to the gcc -M generated
   dependencies that this might not pick up (at least not without some
   build system surgery).  And then it still wouldn't work because
   gcc -M seems to ignore all of this anyways (which is the actual show
   stopper as this pragma does something subtly different than what the
   name suggests and there isn't any obvious support for a
   "pseudo-include").  Another reminder that make clean and fast builds
   are our friend. */

#if defined(__ELF__)

#define FD_IMPORT( name, path, type, lg_align, footer )  \
  __asm__( ".section .rodata,\"a\",@progbits\n"          \
           ".type " #name ",@object\n"                   \
           ".globl " #name "\n"                          \
           FD_ASM_LG_ALIGN(lg_align)                     \
           #name ":\n"                                   \
           ".incbin \"" path "\"\n"                      \
           footer "\n"                                   \
           ".size " #name ",. - " #name "\n"             \
           "_fd_import_" #name "_sz = . - " #name "\n"   \
           ".type " #name "_sz,@object\n"                \
           ".globl " #name "_sz\n"                       \
           FD_ASM_LG_ALIGN(3)                            \
           #name "_sz:\n"                                \
           ".quad _fd_import_" #name "_sz\n"             \
           ".size " #name "_sz,8\n"                      \
           ".previous\n" );                              \
  extern type const name[] __attribute__((aligned(1<<(lg_align)))); \
  extern ulong const name##_sz

#elif defined(__MACH__)

#define FD_IMPORT( name, path, type, lg_align, footer )  \
  __asm__( ".section __DATA,__const\n"                   \
           ".globl _" #name "\n"                         \
           FD_ASM_LG_ALIGN(lg_align)                     \
           "_" #name ":\n"                               \
           ".incbin \"" path "\"\n"                      \
           footer "\n"                                   \
           "_fd_import_" #name "_sz = . - _" #name "\n"  \
           ".globl _" #name "_sz\n"                      \
           FD_ASM_LG_ALIGN(3)                            \
           "_" #name "_sz:\n"                            \
           ".quad _fd_import_" #name "_sz\n"             \
           ".previous\n" );                              \
  extern type const name[] __attribute__((aligned(1<<(lg_align)))); \
  extern ulong const name##_sz

#endif

/* FD_IMPORT_{BINARY,CSTR} are common cases for FD_IMPORT.

   In BINARY, the file is imported into the object file and exposed to
   the caller as uchar binary data.  name_sz will be the number of
   bytes in the file at time of import.  name will have 128 byte
   alignment.

   In CSTR, the file is imported into the object file with a '\0'
   termination appended and exposed to the caller as a cstr.  Assuming
   the file is text (i.e. has no internal '\0's), strlen(name) will be
   the number of bytes in the file and name_sz will be strlen(name)+1.
   name can have arbitrary alignment. */

#ifdef FD_IMPORT
#define FD_IMPORT_BINARY(name, path) FD_IMPORT( name, path, uchar, 7, ""        )
#define FD_IMPORT_CSTR( name, path) FD_IMPORT( name, path, char,  1, ".byte 0" )
#endif
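
/* Example (an illustrative sketch; the name and path are
   hypothetical):

     FD_IMPORT_BINARY( my_table, "src/app/my_table.bin" );

   then elsewhere in the compilation unit:

     for( ulong i=0UL; i<my_table_sz; i++ ) use( my_table[i] );

   where use is whatever processing the application needs. */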

/* Optimizer tricks ***************************************************/

/* FD_RESTRICT is a pointer modifier used to designate a pointer as
   restricted.  Hoops jumped because C++17 still doesn't understand
   restrict ... sigh */

#ifndef FD_RESTRICT
#ifdef __cplusplus
#define FD_RESTRICT __restrict
#else
#define FD_RESTRICT restrict
#endif
#endif

/* fd_type_pun(p), fd_type_pun_const(p): These allow use of type
   punning while keeping strict aliasing optimizations enabled and
   strict alias checking done (e.g. some UNIX APIs, like the sockaddr
   related APIs, are dependent on type punning; these allow such APIs
   to be used cleanly). */

static inline void *
fd_type_pun( void * p ) {
  __asm__( "# fd_type_pun @" FD_SRC_LOCATION : "+r" (p) :: "memory" );
  return p;
}

static inline void const *
fd_type_pun_const( void const * p ) {
  __asm__( "# fd_type_pun_const @" FD_SRC_LOCATION : "+r" (p) :: "memory" );
  return p;
}

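/* Example (illustrative): passing a sockaddr_in to an API that takes a
   struct sockaddr const * without a strict-aliasing violation (fd and
   addr are hypothetical):

     struct sockaddr_in addr = ...;
     bind( fd, fd_type_pun_const( &addr ), sizeof(addr) );
*/
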
/* FD_{LIKELY,UNLIKELY}(c): Evaluates c and returns whether it is
   logical true/false as long (1L/0L).  It also hints to the optimizer
   whether it should optimize for the case of c evaluating as
   true/false. */

#define FD_LIKELY(c)   __builtin_expect( !!(c), 1L )
#define FD_UNLIKELY(c) __builtin_expect( !!(c), 0L )

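/* E.g. error and corner case paths are typically marked unlikely so
   the optimizer lays out the common path as the fall-through
   (illustrative):

     if( FD_UNLIKELY( !shmem ) ) return NULL;
*/
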
/* FD_FN_PURE hints to the optimizer that the function, roughly
   speaking, does not have side effects.  As such, the compiler can
   replace a call to the function with the result of an earlier call to
   that function provided the inputs and memory used haven't changed.

   IMPORTANT SAFETY TIP!  Recent compilers seem to take an undocumented
   and debatable stance that pure functions do no writes to memory.
   This is a sufficient condition for the above but not a necessary
   one.

   Consider, for example, the real world case of an otherwise pure
   function that uses pass-by-reference to return more than one value
   (an unpleasant practice that is sadly often necessary because C/C++,
   compilers and underlying platform ABIs are very bad at helping
   developers simply and clearly express their intent to return
   multiple values and then generate good assembly for such).

   If called multiple times sequentially, all but the first call to
   such a "pure" function could be optimized away because the
   non-volatile memory writes done in all but the 1st call for the
   pass-by-reference-returns write the same value to normal memory that
   was written on the 1st call.  That is, these calls return the same
   value for their direct return and do writes that do not have any
   visible effect.

   Thus, while it is safe for the compiler to eliminate all but the
   first call via techniques like common subexpression elimination, it
   is not safe for the compiler to infer that the first call did no
   writes.

   But recent compilers seem to do exactly that.

   Sigh ... we can't use FD_FN_PURE on such functions because of all
   the above linguistic, compiler, documentation and ABI infinite
   sadness.

   TL;DR To be safe against the above vagaries, recommend using
   FD_FN_PURE to annotate functions that do no memory writes (including
   trivial memory writes) and try to design HPC APIs to avoid returning
   multiple values as much as possible.

   Followup: FD_FN_PURE expands to nothing by default given additional
   confusion between how current languages, compilers, CI, fuzzing, and
   developers interpret this function attribute.  We keep it around
   given it documents the intent of various APIs and so it can be
   manually enabled to find implementation surprises during bullet
   proofing (e.g. under compiler options like "extra-brutality").
   Hopefully, pure function attributes will someday be handled more
   consistently across the board. */

#ifndef FD_FN_PURE
#define FD_FN_PURE
#endif

/* FD_FN_CONST is like pure but also, even stronger, indicates that the
   function does not depend on the state of memory.  See note above
   about why this expands to nothing by default. */

#ifndef FD_FN_CONST
#define FD_FN_CONST
#endif

/* FD_FN_UNUSED indicates that it is okay if the function with static
   linkage is not used.  Allows working around -Winline in header only
   APIs where the compiler decides not to actually inline the function.
   (This belief, frequently promulgated by anti-macro cults, that "An
   Inline Function is As Fast As a Macro" ... an entire section in
   gcc's documentation devoted to it in fact ... remains among the
   biggest lies in computer science.  Yes, an inline function is as
   fast as a macro ... when the compiler actually decides to treat the
   inline keyword as more than just for entertainment purposes.  Which,
   as -Winline proves, it frequently doesn't.  Sigh ... force_inline
   like compiler extensions might be an alternative here but they have
   their own portability issues.) */

#define FD_FN_UNUSED __attribute__((unused))

/* FD_FN_UNSANITIZED tells the compiler to disable AddressSanitizer and
   UndefinedBehaviorSanitizer instrumentation.  For some functions,
   this can improve instrumented compile time by ~30x. */

#define FD_FN_UNSANITIZED __attribute__((no_sanitize("address", "undefined")))

/* FD_FN_SENSITIVE instructs the compiler to harden sensitive functions
   as described in https://eprint.iacr.org/2023/1713 (Sec 3.2):
   - Clear all registers with __attribute__((zero_call_used_regs("all")))
   - Clear the stack with __attribute__((strub)), available in gcc 14+ */

#if __has_attribute(strub)
#define FD_FN_SENSITIVE __attribute__((strub)) __attribute__((zero_call_used_regs("all")))
#elif __has_attribute(zero_call_used_regs)
#define FD_FN_SENSITIVE __attribute__((zero_call_used_regs("all")))
#else
#define FD_FN_SENSITIVE
#endif

/* FD_PARAM_UNUSED indicates that it is okay if the function parameter
   is not used. */

#define FD_PARAM_UNUSED __attribute__((unused))

/* FD_TYPE_PACKED indicates that a type is to be packed, resetting its
   alignment to 1. */

#define FD_TYPE_PACKED __attribute__((packed))

/* FD_WARN_UNUSED tells the compiler the result (from a function)
   should be checked.  This is useful to force callers to either check
   the result or deliberately and explicitly ignore it.  Good for
   result codes and errors. */

#define FD_WARN_UNUSED __attribute__ ((warn_unused_result))

/* FD_FALLTHRU tells the compiler that a case in a switch falls through
   to the next case.  This avoids the compiler complaining in cases
   where the fall through is intentional.  The "while(0)" avoids a
   compiler complaint in the event the case has no statement.  Example:

     switch( return_code ) {
       case RETURN_CASE_1: FD_FALLTHRU;
       case RETURN_CASE_2: FD_FALLTHRU;
       case RETURN_CASE_3:
         case_123();
       default:
         case_other();
     }

   See C++17 [[fallthrough]] and gcc __attribute__((fallthrough)) */

#define FD_FALLTHRU while(0) __attribute__((fallthrough))

/* FD_COMPILER_FORGET(var): Tells the compiler that it shouldn't use
   any knowledge it has about the provided register-compatible variable
   var for optimizations going forward (i.e. the variable has changed
   in a deterministic but unknown-to-the-compiler way where the actual
   change is the identity operation).  Useful for inhibiting various
   branch nest misoptimizations (compilers unfortunately tend to
   radically underestimate the impact in raw average performance and
   jitter and the probability of branch mispredicts or the cost to the
   CPU of having lots of branches).  This is not asm volatile (use
   UNPREDICTABLE below for that) and has no clobbers.  So if var is not
   used after the forget, the compiler can optimize the FORGET away
   (along with operations preceding it used to produce var). */

#define FD_COMPILER_FORGET(var) __asm__( "# FD_COMPILER_FORGET(" #var ")@" FD_SRC_LOCATION : "+r" (var) )
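
/* Example (an illustrative sketch; compute_flag is hypothetical):
   preventing the compiler from fusing two tests on the same predicate
   into one larger branch nest:

     int flag = compute_flag();
     if( FD_UNLIKELY( flag ) ) { ... rare path A ... }
     FD_COMPILER_FORGET( flag );
     if( FD_UNLIKELY( flag ) ) { ... rare path B ... }

   After the forget, the compiler no longer "knows" the second test has
   the same outcome as the first and will emit it independently. */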

/* FD_COMPILER_UNPREDICTABLE(var): Same as FD_COMPILER_FORGET(var) but
   the provided variable has changed in a non-deterministic way from
   the compiler's POV (e.g. the value in the variable on output should
   not be treated as a compile time constant even if it is one
   linguistically).  Useful for suppressing unwanted
   compile-time-const-based optimizations like hoisting operations with
   useful CPU side effects out of a critical loop. */

#define FD_COMPILER_UNPREDICTABLE(var) __asm__ __volatile__( "# FD_COMPILER_UNPREDICTABLE(" #var ")@" FD_SRC_LOCATION : "+m,r" (var) )

/* Atomic tricks ******************************************************/

/* FD_COMPILER_MFENCE(): Tells the compiler that it can't move any
   memory operations (load or store) from before the MFENCE to after
   the MFENCE (and vice versa).  The processor itself might still
   reorder around the fence though (that requires platform specific
   fences). */

#define FD_COMPILER_MFENCE() __asm__ __volatile__( "# FD_COMPILER_MFENCE()@" FD_SRC_LOCATION ::: "memory" )

/* FD_SPIN_PAUSE(): Yields the logical core of the calling thread to
   the other logical cores sharing the same underlying physical core
   for a few clocks without yielding it to the operating system
   scheduler.  Typically useful for shared memory spin polling loops,
   especially if hyperthreading is in use.  IMPORTANT SAFETY TIP!  This
   might act as a FD_COMPILER_MFENCE on some combinations of toolchains
   and targets (e.g. gcc documents that __builtin_ia32_pause also does
   a compiler memory fence) but this should not be relied upon for
   portable code (consider making this a compiler memory fence on all
   platforms?) */

#if FD_HAS_X86
#define FD_SPIN_PAUSE() __builtin_ia32_pause()
#else
#define FD_SPIN_PAUSE() ((void)0)
#endif

/* FD_YIELD(): Yields the logical core of the calling thread to the
   operating system scheduler if a hosted target and does a spin pause
   otherwise. */

#if FD_HAS_HOSTED
#define FD_YIELD() fd_yield()
#else
#define FD_YIELD() FD_SPIN_PAUSE()
#endif

/* FD_VOLATILE_CONST(x): Tells the compiler that it is not able to
   predict the value obtained by dereferencing x and that dereferencing
   x might have other side effects (e.g. maybe another thread could
   change the value and the compiler has no way of knowing this).
   Generally speaking, the volatile keyword is broken linguistically.
   Volatility is not a property of the variable but of the
   dereferencing of a variable (e.g. what is volatile from the POV of a
   reader of a shared variable is not necessarily volatile from the POV
   of a writer of that shared variable in a different thread). */

#define FD_VOLATILE_CONST(x) (*((volatile const __typeof__((x)) *)&(x)))

/* FD_VOLATILE(x): Tells the compiler that it is not able to predict
   the effect of modifying x and that dereferencing x might have other
   side effects (e.g. maybe another thread is spinning on x waiting for
   its value to change and the compiler has no way of knowing this). */

#define FD_VOLATILE(x) (*((volatile __typeof__((x)) *)&(x)))

#if FD_HAS_ATOMIC

/* FD_ATOMIC_FETCH_AND_{ADD,SUB,OR,AND,XOR}(p,v):

   FD_ATOMIC_FETCH_AND_ADD(p,v) does
     f = *p;
     *p = f + v;
     return f;
   as a single atomic operation.  Similarly for the other variants. */

#define FD_ATOMIC_FETCH_AND_ADD(p,v) __sync_fetch_and_add( (p), (v) )
#define FD_ATOMIC_FETCH_AND_SUB(p,v) __sync_fetch_and_sub( (p), (v) )
#define FD_ATOMIC_FETCH_AND_OR( p,v) __sync_fetch_and_or(  (p), (v) )
#define FD_ATOMIC_FETCH_AND_AND(p,v) __sync_fetch_and_and( (p), (v) )
#define FD_ATOMIC_FETCH_AND_XOR(p,v) __sync_fetch_and_xor( (p), (v) )

/* FD_ATOMIC_{ADD,SUB,OR,AND,XOR}_AND_FETCH(p,v):

   FD_ATOMIC_ADD_AND_FETCH(p,v) does
     r = *p + v;
     *p = r;
     return r;
   as a single atomic operation.  Similarly for the other variants. */

#define FD_ATOMIC_ADD_AND_FETCH(p,v) __sync_add_and_fetch( (p), (v) )
#define FD_ATOMIC_SUB_AND_FETCH(p,v) __sync_sub_and_fetch( (p), (v) )
#define FD_ATOMIC_OR_AND_FETCH( p,v) __sync_or_and_fetch(  (p), (v) )
#define FD_ATOMIC_AND_AND_FETCH(p,v) __sync_and_and_fetch( (p), (v) )
#define FD_ATOMIC_XOR_AND_FETCH(p,v) __sync_xor_and_fetch( (p), (v) )

/* FD_ATOMIC_CAS(p,c,s):

   o = FD_ATOMIC_CAS(p,c,s) conceptually does:
     o = *p;
     if( o==c ) *p = s;
     return o;
   as a single atomic operation. */

#define FD_ATOMIC_CAS(p,c,s) __sync_val_compare_and_swap( (p), (c), (s) )
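
/* Example (an illustrative sketch, not an API): a minimal spin lock
   built on FD_ATOMIC_CAS (lock is a hypothetical int shared between
   threads, initially 0):

     while( FD_ATOMIC_CAS( &lock, 0, 1 ) ) FD_SPIN_PAUSE();  // acquire
     FD_COMPILER_MFENCE();
     ... critical section ...
     FD_COMPILER_MFENCE();
     FD_VOLATILE( lock ) = 0;                                // release

   FD_ATOMIC_CAS returns the observed value, which is 0 exactly when
   this thread won the lock.  (Additional hardware fencing may be
   needed on non-x86 platforms.) */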

/* FD_ATOMIC_XCHG(p,v):

   o = FD_ATOMIC_XCHG( p, v ) conceptually does:
     o = *p;
     *p = v;
     return o;
   as a single atomic operation.

   Intel's __sync compiler extensions from the days of yore
   mysteriously implemented atomic exchange via the very misleadingly
   named __sync_lock_test_and_set.  And some implementations (and C++)
   debatably then implemented this API according to what the misleading
   name implied as opposed to what it actually did.  But those
   implementations didn't bother to provide a replacement for atomic
   exchange functionality (forcing us to emulate atomic exchange more
   slowly via CAS there).  Sigh ... we do what we can to fix this up. */

#ifndef FD_ATOMIC_XCHG_STYLE
#if FD_HAS_X86 && !__cplusplus
#define FD_ATOMIC_XCHG_STYLE 1
#else
#define FD_ATOMIC_XCHG_STYLE 0
#endif
#endif

#if FD_ATOMIC_XCHG_STYLE==0
#define FD_ATOMIC_XCHG(p,v) (__extension__({                                                                             \
    __typeof__(*(p)) * _fd_atomic_xchg_p = (p);                                                                          \
    __typeof__(*(p))   _fd_atomic_xchg_v = (v);                                                                          \
    __typeof__(*(p))   _fd_atomic_xchg_t;                                                                                \
    for(;;) {                                                                                                            \
      _fd_atomic_xchg_t = FD_VOLATILE_CONST( *_fd_atomic_xchg_p );                                                       \
      if( FD_LIKELY( __sync_bool_compare_and_swap( _fd_atomic_xchg_p, _fd_atomic_xchg_t, _fd_atomic_xchg_v ) ) ) break;  \
      FD_SPIN_PAUSE();                                                                                                   \
    }                                                                                                                    \
    _fd_atomic_xchg_t;                                                                                                   \
  }))
#elif FD_ATOMIC_XCHG_STYLE==1
#define FD_ATOMIC_XCHG(p,v) __sync_lock_test_and_set( (p), (v) )
#else
#error "Unknown FD_ATOMIC_XCHG_STYLE"
#endif

#endif /* FD_HAS_ATOMIC */

/* FD_TL: This indicates that the variable should be thread local.

   FD_ONCE_{BEGIN,END}: The block:

     FD_ONCE_BEGIN {
       ... code ...
     } FD_ONCE_END

   linguistically behaves like:

     do {
       ... code ...
     } while(0)

   But provides a low overhead guarantee that:
   - The block will be executed at most once over all threads
     in a process (i.e. the set of threads which share global
     variables).
   - No thread in a process that encounters the block will continue
     past it until it has executed once.

   This implies that the caller promises a ONCE block will execute in a
   finite time.  (Meant for doing simple lightweight initializations.)

   It is okay to nest ONCE blocks.  The thread that executes the
   outermost will execute all the nested once as part of executing the
   outermost.

   A ONCE implicitly provides a compiler memory fence to reduce the
   risk that the compiler will assume that operations done in the once
   block on another thread have not been done (e.g. propagating
   pre-once block variable values into post-once block code).  It is up
   to the user to provide any necessary hardware fencing (usually not
   necessary).

   FD_THREAD_ONCE_{BEGIN,END}: The block:

     FD_THREAD_ONCE_BEGIN {
       ... code ...
     } FD_THREAD_ONCE_END;

   is similar except the guarantee is that the block only covers the
   invoking thread and it does not provide any fencing.  If a thread
   once begin is nested inside a once begin, that thread once begin
   will only be executed on the thread that executes the thread once
   begin.  It is similarly okay to nest a ONCE block inside a
   THREAD_ONCE block.

   FD_TURNSTILE_{BEGIN,BLOCKED,END} implement a turnstile for all
   threads in a process.  Only one thread can be in the turnstile at a
   time.  Usage:

     FD_TURNSTILE_BEGIN(blocking) {

       ... At this point, we are the only thread executing this block
       ... of code.
       ...
       ... Do operations that must be done by threads one-at-a-time
       ... here.
       ...
       ... Because compiler memory fences are done just before entering
       ... and after exiting this block, there is typically no need to
       ... use any atomics / volatile / fencing here.  That is, we can
       ... just write "normal" code on platforms where writes to memory
       ... become visible to other threads in the order in which they
       ... were issued in the machine code (e.g. x86) as others will
       ... not proceed with this block until they exit it.  YMMV for
       ... non-x86 platforms (probably need additional hardware store
       ... fences in these macros).
       ...
       ... It is safe to use "break" and/or "continue" within this
       ... block.  The block will exit with the appropriate compiler
       ... fencing and unlocking.  Execution will resume immediately
       ... after FD_TURNSTILE_END.

       ... IMPORTANT SAFETY TIP!  DO NOT RETURN FROM THIS BLOCK.

     } FD_TURNSTILE_BLOCKED {

       ... At this point, there was another thread in the turnstile
       ... when we tried to enter the turnstile.
       ...
       ... Handle blocked here.
       ...
       ... On exiting this block, if blocking was zero, we will resume
       ... execution immediately after FD_TURNSTILE_END.  If blocking
       ... was non-zero, we will resume execution immediately before
       ... FD_TURNSTILE_BEGIN (e.g. we will retry again after a short
       ... spin pause).
       ...
       ... It is safe to use "break" and/or "continue" within this
       ... block.  Both will exit this block and resume execution
       ... at the location indicated as per what blocking specified
       ... when the turnstile was entered.
       ...
       ... It is technically safe to return from this block but
       ... also extremely gross.

     } FD_TURNSTILE_END; */

#if FD_HAS_THREADS /* Potentially more than one thread in the process */

#ifndef FD_TL
#define FD_TL __thread
#endif

#define FD_ONCE_BEGIN do {                                                \
    FD_COMPILER_MFENCE();                                                 \
    static volatile int _fd_once_block_state = 0;                         \
    for(;;) {                                                             \
      int _fd_once_block_tmp = _fd_once_block_state;                      \
      if( FD_LIKELY( _fd_once_block_tmp>0 ) ) break;                      \
      if( FD_LIKELY( !_fd_once_block_tmp ) &&                             \
          FD_LIKELY( !FD_ATOMIC_CAS( &_fd_once_block_state, 0, -1 ) ) ) { \
        do

#define FD_ONCE_END                                                       \
        while(0);                                                         \
        FD_COMPILER_MFENCE();                                             \
        _fd_once_block_state = 1;                                         \
        break;                                                            \
      }                                                                   \
      FD_YIELD();                                                         \
    }                                                                     \
  } while(0)

#define FD_THREAD_ONCE_BEGIN do {                                         \
    static FD_TL int _fd_thread_once_block_state = 0;                     \
    if( FD_UNLIKELY( !_fd_thread_once_block_state ) ) {                   \
      do

#define FD_THREAD_ONCE_END                                                \
      while(0);                                                           \
      _fd_thread_once_block_state = 1;                                    \
    }                                                                     \
  } while(0)

#define FD_TURNSTILE_BEGIN(blocking) do {                                 \
    static volatile int _fd_turnstile_state = 0;                          \
    int _fd_turnstile_blocking = (blocking);                              \
    for(;;) {                                                             \
      int _fd_turnstile_tmp = _fd_turnstile_state;                        \
      if( FD_LIKELY( !_fd_turnstile_tmp ) &&                              \
          FD_LIKELY( !FD_ATOMIC_CAS( &_fd_turnstile_state, 0, 1 ) ) ) {   \
        FD_COMPILER_MFENCE();                                             \
        do

#define FD_TURNSTILE_BLOCKED                                              \
        while(0);                                                         \
        FD_COMPILER_MFENCE();                                             \
        _fd_turnstile_state = 0;                                          \
        FD_COMPILER_MFENCE();                                             \
        break;                                                            \
      }                                                                   \
      FD_COMPILER_MFENCE();                                               \
      do

#define FD_TURNSTILE_END                                                  \
      while(0);                                                           \
      FD_COMPILER_MFENCE();                                               \
      if( !_fd_turnstile_blocking ) break; /* likely compile time */      \
      FD_SPIN_PAUSE();                                                    \
    }                                                                     \
  } while(0)

#else /* Only one thread in the process */

#ifndef FD_TL
#define FD_TL /**/
#endif

#define FD_ONCE_BEGIN do {                                \
    static int _fd_once_block_state = 0;                  \
    if( FD_UNLIKELY( !_fd_once_block_state ) ) {          \
      do

#define FD_ONCE_END                                       \
      while(0);                                           \
      _fd_once_block_state = 1;                           \
    }                                                     \
  } while(0)

#define FD_THREAD_ONCE_BEGIN FD_ONCE_BEGIN
#define FD_THREAD_ONCE_END   FD_ONCE_END

#define FD_TURNSTILE_BEGIN(blocking) do {                 \
    (void)(blocking);                                     \
    FD_COMPILER_MFENCE();                                 \
    if( 1 ) {                                             \
      do

#define FD_TURNSTILE_BLOCKED                              \
      while(0);                                           \
    } else {                                              \
      do

#define FD_TURNSTILE_END                                  \
      while(0);                                           \
    }                                                     \
    FD_COMPILER_MFENCE();                                 \
  } while(0)

#endif

/* An ideal fd_clock_func_t is a function such that:

     long dx = clock( args );
     ... stuff ...
     dx = clock( args ) - dx;

   yields a strictly positive dx where dx approximates the amount of
   wallclock time elapsed on the caller in some clock specific unit
   (e.g. nanoseconds, CPU ticks, etc) for a reasonable amount of
   "stuff" (including no "stuff").  args allows arbitrary clock
   specific context to be passed to the clock implementation.  (Clocks
   that need a non-const args can cast away the const in the
   implementation or cast the function pointer as necessary.) */

typedef long (*fd_clock_func_t)( void const * args );
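
/* Example (illustrative): _fd_tickcount below is fd_clock_func_t
   compatible, so an interval can be measured generically as:

     fd_clock_func_t clock = _fd_tickcount;
     long dx = clock( NULL );
     ... stuff ...
     dx = clock( NULL ) - dx;
*/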

FD_PROTOTYPES_BEGIN

/* fd_memcpy(d,s,sz): On modern x86 in some circumstances, rep mov will
   be faster than memcpy under the hood (basically due to RFO /
   read-for-ownership optimizations in the cache protocol that aren't
   easily done from the ISA ... see Intel docs on enhanced rep mov).
   Compile time configurable though as this is not always true.  So the
   application can tune to taste.  Hard to beat rep mov for code
   density though (2 bytes) and pretty hard to beat in situations
   needing a completely generic memcpy.  But it can be beaten in
   specialized situations for the usual reasons. */

/* FIXME: CONSIDER MEMCMP TOO! */
/* FIXME: CONSIDER MEMCPY RELATED FUNC ATTRS */

#ifndef FD_USE_ARCH_MEMCPY
#define FD_USE_ARCH_MEMCPY 0
#endif

#if FD_HAS_X86 && FD_USE_ARCH_MEMCPY && !defined(CBMC) && !FD_HAS_DEEPASAN && !FD_HAS_MSAN

static inline void *
fd_memcpy( void       * FD_RESTRICT d,
           void const * FD_RESTRICT s,
           ulong                    sz ) {
  void * p = d;
  __asm__ __volatile__( "rep movsb" : "+D" (p), "+S" (s), "+c" (sz) :: "memory" );
  return d;
}

#elif FD_HAS_MSAN

void * __msan_memcpy( void * dest, void const * src, ulong n );

static inline void *
fd_memcpy( void       * FD_RESTRICT d,
           void const * FD_RESTRICT s,
           ulong                    sz ) {
  return __msan_memcpy( d, s, sz );
}

#else

static inline void *
fd_memcpy( void       * FD_RESTRICT d,
           void const * FD_RESTRICT s,
           ulong                    sz ) {
# if defined(CBMC) || FD_HAS_ASAN
  if( FD_UNLIKELY( !sz ) ) return d; /* Standard says sz 0 is UB, uncomment if target is insane and doesn't treat sz 0 as a nop */
# endif
  return memcpy( d, s, sz );
}

#endif


/* fd_memset(d,c,sz): architecturally optimized memset.  See fd_memcpy
   for considerations. */

/* FIXME: CONSIDER MEMSET RELATED FUNC ATTRS */

#ifndef FD_USE_ARCH_MEMSET
#define FD_USE_ARCH_MEMSET 0
#endif

#if FD_HAS_X86 && FD_USE_ARCH_MEMSET && !defined(CBMC) && !FD_HAS_DEEPASAN && !FD_HAS_MSAN

static inline void *
fd_memset( void * d,
           int    c,
           ulong  sz ) {
  void * p = d;
  __asm__ __volatile__( "rep stosb" : "+D" (p), "+c" (sz) : "a" (c) : "memory" );
  return d;
}

#else

static inline void *
fd_memset( void * d,
           int    c,
           ulong  sz ) {
# ifdef CBMC
  if( FD_UNLIKELY( !sz ) ) return d; /* See fd_memcpy note */
# endif
  return memset( d, c, sz );
}

#endif

/* C23 has memset_explicit, i.e. a memset that can't be removed by the
   optimizer.  This is our own equivalent. */

static void * (* volatile fd_memset_explicit)(void *, int, size_t) = memset;

/* fd_memeq(s0,s1,sz): Compares two blocks of memory.  Returns 1 if
   equal or sz is zero and 0 otherwise.  No memory accesses made if sz
   is zero (pointers may be invalid).  On x86, uses repe cmpsb which is
   preferable to __builtin_memcmp in some cases. */

#ifndef FD_USE_ARCH_MEMEQ
#define FD_USE_ARCH_MEMEQ 0
#endif

#if FD_HAS_X86 && FD_USE_ARCH_MEMEQ && defined(__GCC_ASM_FLAG_OUTPUTS__) && __STDC_VERSION__>=199901L

FD_FN_PURE static inline int
fd_memeq( void const * s0,
          void const * s1,
          ulong        sz ) {
  /* ZF flag is set and exported in two cases:
     a) size is zero (via test)
     b) buffer is equal (via repe cmpsb) */
  int r;
  __asm__( "test %3, %3;"
           "repe cmpsb"
         : "=@cce" (r), "+S" (s0), "+D" (s1), "+c" (sz)
         : "m" (*(char const (*)[sz]) s0), "m" (*(char const (*)[sz]) s1)
         : "cc" );
  return r;
}

#else

FD_FN_PURE static inline int
fd_memeq( void const * s1,
          void const * s2,
          ulong        sz ) {
  return 0==memcmp( s1, s2, sz );
}

#endif


/* fd_hash(seed,buf,sz), fd_hash_memcpy(seed,d,s,sz): High quality
   (full avalanche) high speed variable length buffer -> 64-bit hash
   function (memcpy_hash is often as fast as plain memcpy).  Based on
   the xxhash-r39 (open source BSD licensed) implementation.  In-place
   and out-of-place variants provided (out-of-place variant assumes dst
   and src do not overlap).  Caller promises valid input arguments,
   cannot fail given valid input arguments.  sz==0 is fine. */

FD_FN_PURE ulong
fd_hash( ulong        seed,
         void const * buf,
         ulong        sz );

ulong
fd_hash_memcpy( ulong                    seed,
                void       * FD_RESTRICT d,
                void const * FD_RESTRICT s,
                ulong                    sz );
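
/* E.g. hashing one buffer and chaining the result as the seed for a
   second (buf1/sz1/buf2/sz2 are hypothetical):

     ulong h = fd_hash( 0UL, buf1, sz1 );
     h = fd_hash( h, buf2, sz2 );
*/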

#ifndef FD_TICKCOUNT_STYLE
#if FD_HAS_X86 /* Use RDTSC */
#define FD_TICKCOUNT_STYLE 1
#else /* Use portable fallback */
#define FD_TICKCOUNT_STYLE 0
#endif
#endif

#if FD_TICKCOUNT_STYLE==0 /* Portable fallback (slow).  Ticks at 1 ns / tick */

#define fd_tickcount() fd_log_wallclock() /* TODO: fix ugly pre-log usage */

#elif FD_TICKCOUNT_STYLE==1 /* RDTSC (fast) */

/* fd_tickcount: Reads the hardware invariant tickcounter ("RDTSC").
   This monotonically increases at an approximately constant rate
   relative to the system wallclock and is synchronous across all CPUs
   on a host.

   The rate this ticks at is not precisely defined (see Intel docs for
   more details) but it is typically in the ballpark of the CPU base
   clock frequency.  The relationship to the wallclock is very well
   approximated as linear over short periods of time (i.e. less than a
   fraction of a second) and this should not exhibit any sudden changes
   in its rate relative to the wallclock.  Notably, its rate is not
   directly impacted by CPU clock frequency adaptation / Turbo mode
   (see other Intel performance monitoring counters for various CPU
   cycle counters).  It can drift over longer periods of time for the
   usual clock synchronization reasons.

   This is a reasonably fast O(1) cost (~6-8 ns on recent Intel).
   Because of all the compiler optimizations and parallel execution
   going on in modern CPU cores, other instructions might be reordered
   around this by the compiler and/or CPU.  It is up to the user to do
   lower level tricks as necessary when the precise location of this in
   the execution stream and/or when executed by the CPU is needed.
   (This is often unnecessary as such levels of precision are not
   frequently required and often have self-defeating overheads.)

   It is worth noting that RDTSC and/or (even more frequently) lower
   level performance counters are often restricted from use in user
   space applications.  It is recommended that applications use this
   primarily for debugging / performance tuning on unrestricted hosts
   and/or when the developer is confident that applications using this
   will have appropriate permissions when deployed. */

#define fd_tickcount() ((long)__builtin_ia32_rdtsc())

#else
#error "Unknown FD_TICKCOUNT_STYLE"
#endif

long _fd_tickcount( void const * _ ); /* fd_clock_func_t compat */

#if FD_HAS_HOSTED

/* fd_yield yields the calling thread to the operating system
   scheduler. */

void
fd_yield( void );

#endif

FD_PROTOTYPES_END

#endif /* HEADER_fd_src_util_fd_util_base_h */
|