1 : #ifndef HEADER_fd_src_util_fd_util_base_h
2 : #define HEADER_fd_src_util_fd_util_base_h
3 :
4 : /* Base development environment */
5 :
6 : /* Compiler checks ****************************************************/
7 :
8 : #ifdef __cplusplus
9 :
10 : #if __cplusplus<201703L
11 : #error "Firedancer requires C++17 or later"
12 : #endif
13 :
14 : #else
15 :
16 : #if __STDC_VERSION__<201710L
17 : #error "Firedancer requires C Standard version C17 or later"
18 : #endif
19 :
20 : #endif //__cplusplus
21 :
22 : /* Versioning macros **************************************************/
23 :
24 : /* FD_VERSION_{MAJOR,MINOR,PATCH} programmatically specify the
25 : Firedancer version. */
26 :
27 0 : #define FD_VERSION_MAJOR (0)
28 0 : #define FD_VERSION_MINOR (0)
29 0 : #define FD_VERSION_PATCH (0)
30 :
31 : /* Build target capabilities ******************************************/
32 :
33 : /* Different build targets often have different levels of support for
34 : various language and hardware features. The presence of various
35 : features can be tested at preprocessor, compile, or run time via the
36 : below capability macros.
37 :
38 : Code that does not exploit any of these capabilities written within
39 : the base development environment should be broadly portable across a
40 : range of build targets ranging from on-chain virtual machines to
41 : commodity hosts to custom hardware.
42 :
43 : As such, highly portable yet high performance code is possible by
44 : writing generic implementations that do not exploit any of the below
45 : capabilities as a portable fallback along with build target specific
46 : optimized implementations that are invoked when the build target
47 : supports the appropriate capabilities.
48 :
49 : The base development environment itself provides lots of
50 : functionality to help with implementing portable fallbacks while
51 : making very minimal assumptions about the build targets and zero
52 : use of 3rd party libraries (these might make unknown additional
53 : assumptions about the build target, including availability of a
54 : quality implementation of the library on the build target). */
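
/* For example, capability gated code might provide a portable fallback
   alongside an optimized variant (an illustrative sketch, not part of
   this header; my_sum is a hypothetical function):

     ulong
     my_sum( ulong const * x,
             ulong         cnt ) {
     # if FD_HAS_AVX
       ... AVX accelerated implementation here ...
     # else
       ulong sum = 0UL;
       for( ulong i=0UL; i<cnt; i++ ) sum += x[i];
       return sum;
     # endif
     }

   Builds without FD_HAS_AVX compile only the generic loop. */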
55 :
56 : /* FD_HAS_HOSTED: If the build target is hosted (e.g. resides on a host
57 : with a POSIX-ish environment ... practically speaking, stdio.h,
58 : stdlib.h, unistd.h, et al more or less behave normally ...
59 : pedantically XOPEN_SOURCE=700), FD_HAS_HOSTED will be 1. It will be
60 : zero otherwise. */
61 :
62 : #ifndef FD_HAS_HOSTED
63 : #define FD_HAS_HOSTED 0
64 : #endif
65 :
66 : /* FD_HAS_ATOMIC: If the build target supports atomic operations
67 : between threads accessing a common memory region (including threads
68 : that reside in different processes on a host communicating via a
69 : shared memory region with potentially different local virtual
70 : mappings), FD_HAS_ATOMIC will be 1. Practically speaking, does
71 : atomic compare-and-swap et al work? */
72 :
73 : #ifndef FD_HAS_ATOMIC
74 : #define FD_HAS_ATOMIC 0
75 : #endif
76 :
77 : /* FD_HAS_THREADS: If the build target supports a POSIX-ish notion of
78 : threads (e.g. practically speaking, global variables declared within
79 : a compile unit are visible to more than one thread of execution,
80 : pthread.h / threading parts of the C standard, the atomics parts of the
81 : C standard, ... more or less work normally), FD_HAS_THREADS will be
82 : 1. It will be zero otherwise. FD_HAS_THREADS implies FD_HAS_HOSTED
83 : and FD_HAS_ATOMIC. */
84 :
85 : #ifndef FD_HAS_THREADS
86 : #define FD_HAS_THREADS 0
87 : #endif
88 :
89 : /* FD_HAS_INT128: If the build target supports reasonably efficient
90 : 128-bit wide integer operations, define FD_HAS_INT128 to 1 to enable
91 : use of them in implementations. */
92 :
93 : #ifndef FD_HAS_INT128
94 : #define FD_HAS_INT128 0
95 : #endif
96 :
97 : /* FD_HAS_DOUBLE: If the build target supports reasonably efficient
98 : IEEE 754 64-bit wide double precision floating point operations,
99 : define FD_HAS_DOUBLE to 1 to enable use of them in implementations.
100 : Note that even if the build target does not, va_args handling in the
101 : C / C++ language requires promotion of a float in a va_arg list to a
102 : double. Thus, C / C++ support for IEEE 754 float also implies a
103 : minimum level of support for double (though not necessarily
104 : efficient or IEEE 754). That is, even if a target does not have
105 : FD_HAS_DOUBLE, there might still be limited use of double in va_arg
106 : list handling. */
107 :
108 : #ifndef FD_HAS_DOUBLE
109 : #define FD_HAS_DOUBLE 0
110 : #endif
111 :
112 : /* FD_HAS_ALLOCA: If the build target supports fast alloca-style
113 : dynamic stack memory allocation (e.g. alloca.h / __builtin_alloca
114 : more or less work normally), define FD_HAS_ALLOCA to 1 to enable use
115 : of it in implementations. */
116 :
117 : #ifndef FD_HAS_ALLOCA
118 : #define FD_HAS_ALLOCA 0
119 : #endif
120 :
121 : /* FD_HAS_X86: If the build target supports x86 specific features and
122 : can benefit from x86 specific optimizations, define FD_HAS_X86. Code
123 : needing more specific target features (Intel / AMD / SSE / AVX2 /
124 : AVX512 / etc) can specialize further as necessary with even more
125 : precise capabilities (that in turn imply FD_HAS_X86). */
126 :
127 : #ifndef FD_HAS_X86
128 : #define FD_HAS_X86 0
129 : #endif
130 :
131 : /* These allow even more precise targeting for X86. */
132 :
133 : /* FD_HAS_SSE indicates the target supports Intel SSE4 style SIMD
134 : (basically do the 128-bit wide parts of "x86intrin.h" work).
135 : Recommend using the simd/fd_sse.h APIs instead of raw Intel
136 : intrinsics for readability and to facilitate portability to non-x86
137 : platforms. Implies FD_HAS_X86. */
138 :
139 : #ifndef FD_HAS_SSE
140 : #define FD_HAS_SSE 0
141 : #endif
142 :
143 : /* FD_HAS_AVX indicates the target supports Intel AVX2 style SIMD
144 : (basically do the 256-bit wide parts of "x86intrin.h" work).
145 : Recommend using the simd/fd_avx.h APIs instead of raw Intel
146 : intrinsics for readability and to facilitate portability to non-x86
147 : platforms. Implies FD_HAS_SSE. */
148 :
149 : #ifndef FD_HAS_AVX
150 : #define FD_HAS_AVX 0
151 : #endif
152 :
153 : /* FD_HAS_AVX512 indicates the target supports Intel AVX-512 style SIMD
154 : (basically do the 512-bit wide parts of "x86intrin.h" work).
155 : Recommend using the simd/fd_avx512.h APIs instead of raw Intel
156 : intrinsics for readability and to facilitate portability to non-x86
157 : platforms. Implies FD_HAS_AVX. */
158 :
159 : #ifndef FD_HAS_AVX512
160 : #define FD_HAS_AVX512 0
161 : #endif
162 :
163 : /* FD_HAS_SHANI indicates that the target supports Intel SHA extensions
164 : which accelerate SHA-1 and SHA-256 computation. This extension is
165 : also called SHA-NI or SHA_NI (Secure Hash Algorithm New
166 : Instructions). Although proposed in 2013, they're only supported on
167 : Intel Ice Lake and AMD Zen CPUs and newer. Implies FD_HAS_AVX. */
168 :
169 : #ifndef FD_HAS_SHANI
170 : #define FD_HAS_SHANI 0
171 : #endif
172 :
173 : /* FD_HAS_GFNI indicates that the target supports Intel Galois Field
174 : extensions, which accelerate operations over binary extension fields,
175 : especially GF(2^8). These instructions are supported on Intel Ice
176 : Lake and newer and AMD Zen4 and newer CPUs. Implies FD_HAS_AVX. */
177 :
178 : #ifndef FD_HAS_GFNI
179 : #define FD_HAS_GFNI 0
180 : #endif
181 :
182 : /* FD_HAS_AESNI indicates that the target supports AES-NI extensions,
183 : which accelerate AES encryption and decryption. While AVX predates
184 : the original AES-NI extension, the combination of AES-NI+AVX adds
185 : additional opcodes (such as vaesenc, a more flexible variant of
186 : aesenc). Thus, implies FD_HAS_AVX. A conservative estimate for
187 : minimum platform support is Intel Haswell or AMD Zen. */
188 :
189 : #ifndef FD_HAS_AESNI
190 : #define FD_HAS_AESNI 0
191 : #endif
192 :
193 : /* FD_HAS_LZ4 indicates that the target supports LZ4 compression.
194 : Roughly, does "#include <lz4.h>" and the APIs therein work? */
195 :
196 : #ifndef FD_HAS_LZ4
197 : #define FD_HAS_LZ4 0
198 : #endif
199 :
200 : /* FD_HAS_ZSTD indicates that the target supports ZSTD compression.
201 : Roughly, does "#include <zstd.h>" and the APIs therein work? */
202 :
203 : #ifndef FD_HAS_ZSTD
204 : #define FD_HAS_ZSTD 0
205 : #endif
206 :
207 : /* FD_HAS_COVERAGE indicates that the build target is built with coverage instrumentation. */
208 :
209 : #ifndef FD_HAS_COVERAGE
210 : #define FD_HAS_COVERAGE 0
211 : #endif
212 :
213 : /* FD_HAS_ASAN indicates that the build target is using ASAN. */
214 :
215 : #ifndef FD_HAS_ASAN
216 : #define FD_HAS_ASAN 0
217 : #endif
218 :
219 : /* FD_HAS_UBSAN indicates that the build target is using UBSAN. */
220 :
221 : #ifndef FD_HAS_UBSAN
222 : #define FD_HAS_UBSAN 0
223 : #endif
224 :
225 : /* FD_HAS_DEEPASAN indicates that the build target is using ASAN with manual
226 : memory poisoning for fd_alloc, fd_wksp, and fd_scratch. */
227 :
228 : #ifndef FD_HAS_DEEPASAN
229 : #define FD_HAS_DEEPASAN 0
230 : #endif
231 :
232 : /* Base development environment ***************************************/
233 :
234 : /* The functionality provided by these vanilla headers is always
235 : available within the base development environment. Notably, stdio.h
236 : / stdlib.h / et al are not included here as these make lots of
237 : assumptions about the build target that may not be true (especially
238 : for on-chain and custom hardware use). Code should prefer the fd
239 : util equivalents for such functionality when possible. */
240 :
241 : #include <stdalign.h>
242 : #include <string.h>
243 : #include <limits.h>
244 : #include <float.h>
245 :
246 : /* Work around some library naming irregularities */
247 : /* FIXME: Consider this for FLOAT/FLT, DOUBLE/DBL too? */
248 :
249 3 : #define SHORT_MIN SHRT_MIN
250 3 : #define SHORT_MAX SHRT_MAX
251 777916527 : #define USHORT_MAX USHRT_MAX
252 :
253 : /* Primitive types ****************************************************/
254 :
255 : /* These typedefs provide single token regularized names for all the
256 : primitive types in the base development environment:
257 :
258 : char !
259 : schar ! short int long int128 !!
260 : uchar ushort uint ulong uint128 !!
261 : float
262 : double !!!
263 :
264 : ! Does not assume the sign of char. A naked char should be treated
265 : as a cstr character and mathematical operations should be avoided
266 : on it. This is less than ideal as the patterns for integer types in
267 : the C/C++ language spec itself are far more consistent with a naked
268 : char naturally being treated as signed (see above). But there are
269 : lots of conflicts between architectures, languages and standard
270 : libraries about this so any use of a naked char shouldn't assume
271 : the sign ... sigh.
272 :
273 : !! Only available if FD_HAS_INT128 is defined
274 :
275 : !!! Should only be used if FD_HAS_DOUBLE is defined but see note in
276 : FD_HAS_DOUBLE about C/C++ silent promotions of float to double in
277 : va_arg lists.
278 :
279 : Note also that these token names more naturally interoperate with
280 : integer constant declarations, type generic code generation
281 : techniques, and printf-style format strings than the stdint.h /
282 : inttypes.h equivalents.
283 :
284 : To minimize portability issues, unexpected silent type conversion
285 : issues, align with typical developer implicit usage, align with
286 : typical build target usage, ..., assumes char / short / int / long
287 : are 8 / 16 / 32 / 64 twos complement integers and float is IEEE-754
288 : single precision. Further assumes little endian, truncating signed
289 : integer division, sign extending (arithmetic) signed right shift and
290 : signed left shift behaves the same as an unsigned left shift from bit
291 : operations point of view (technically the standard says signed left
292 : shift is undefined if the result would overflow). Also, except for
293 : int128/uint128, assumes that aligned access to these will be
294 : naturally atomic. Lastly assumes that unaligned access to these is
295 : functionally valid but does not assume that unaligned access to these
296 : is efficient or atomic.
297 :
298 : For values meant to be held in registers, code should prefer long /
299 : ulong types (improves asm generation given the prevalence of 64-bit
300 : targets and also to avoid lots of tricky bugs with silent promotions
301 : in the language ... e.g. ushort should ideally only be used for
302 : in-memory representations).
303 :
304 : These are currently not prefixed given how often they are used. If
305 : this becomes problematic prefixes can be added as necessary.
306 : Specifically, C++ allows typedefs to be defined multiple times so
307 : long as they are equivalent. Inequivalent collisions are not
308 : supported but should be rare (e.g. if a 3rd party header thinks
309 : "ulong" should be something other an "unsigned long", the 3rd party
310 : header probably should be nuked from orbit). C11 and forward also
311 : allow multiple equivalent typedefs. C99 and earlier don't but this
312 : is typically only a warning and then only if pedantic warnings are
313 : enabled. Thus, if we want to support users using C99 and earlier who
314 : want to do a strict compile and have a superfluous collision with
315 : these types in other libraries, uncomment the below (or do something
316 : equivalent for the compiler). */
317 :
318 : //#pragma GCC diagnostic push
319 : //#pragma GCC diagnostic ignored "-Wpedantic"
320 :
321 : typedef signed char schar; /* See above note of sadness */
322 :
323 : typedef unsigned char uchar;
324 : typedef unsigned short ushort;
325 : typedef unsigned int uint;
326 : typedef unsigned long ulong;
327 :
328 : #if FD_HAS_INT128
329 :
330 : __extension__ typedef __int128 int128;
331 : __extension__ typedef unsigned __int128 uint128;
332 :
333 1200000045 : #define UINT128_MAX (~(uint128)0)
334 6 : #define INT128_MAX ((int128)(UINT128_MAX>>1))
335 3 : #define INT128_MIN (-INT128_MAX-(int128)1)
336 :
337 : #endif
338 :
339 : //#pragma GCC diagnostic pop
340 :
341 : /* Compiler tricks ****************************************************/
342 :
343 : /* FD_STRINGIFY,FD_CONCAT{2,3,4}: Various macros for token
344 : stringification and pasting. FD_STRINGIFY returns the argument as a
345 : cstr (e.g. FD_STRINGIFY(foo) -> "foo"). FD_CONCAT* pastes the tokens
346 : together into a single token (e.g. FD_CONCAT3(a,b,c) -> abc). The
347 : EXPAND variants first expand their arguments and then do the token
348 : operation (e.g. FD_EXPAND_THEN_STRINGIFY(__LINE__) -> "104" if done
349 : on line 104 of the source code file). */
350 :
351 : #define FD_STRINGIFY(x)#x
352 4505091 : #define FD_CONCAT2(a,b)a##b
353 17118 : #define FD_CONCAT3(a,b,c)a##b##c
354 5522688 : #define FD_CONCAT4(a,b,c,d)a##b##c##d
355 :
356 : #define FD_EXPAND_THEN_STRINGIFY(x)FD_STRINGIFY(x)
357 4505091 : #define FD_EXPAND_THEN_CONCAT2(a,b)FD_CONCAT2(a,b)
358 >38188*10^7 : #define FD_EXPAND_THEN_CONCAT3(a,b,c)FD_CONCAT3(a,b,c)
359 5522688 : #define FD_EXPAND_THEN_CONCAT4(a,b,c,d)FD_CONCAT4(a,b,c,d)
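
/* For example (illustrative, not part of this header), the EXPAND
   variants are handy for generating unique identifiers:

     #define MY_TMP FD_EXPAND_THEN_CONCAT2( _my_tmp_, __LINE__ )

   Used on line 12, MY_TMP expands to the token _my_tmp_12.  A plain
   FD_CONCAT2 would yield _my_tmp___LINE__ as __LINE__ would not be
   expanded before pasting. */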
360 :
361 : /* FD_VA_ARGS_SELECT(__VA_ARGS__,e32,e31,...e1): Macro that expands to
362 : en at compile time where n is the number of items in the __VA_ARGS__
363 : list. If __VA_ARGS__ is empty, returns e1. Assumes __VA_ARGS__ has
364 : at most 32 arguments. Useful for making a variadic macro whose
365 : behavior depends on the number of arguments in __VA_ARGS__. */
366 :
367 : #define FD_VA_ARGS_SELECT(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,a,b,c,d,e,f,_,...)_
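
/* For example (illustrative sketch; MY and MY_1 through MY_3 are
   hypothetical and the selector list should be padded out to 32
   entries in real usage):

     #define MY(...) FD_VA_ARGS_SELECT( __VA_ARGS__, \
       ... 29 more entries here ..., MY_3, MY_2, MY_1 )( __VA_ARGS__ )

   Then MY(x) dispatches to MY_1(x), MY(x,y) to MY_2(x,y) and
   MY(x,y,z) to MY_3(x,y,z). */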
368 :
369 : /* FD_SRC_LOCATION returns a const cstr holding the source file and
370 : line of code where FD_SRC_LOCATION was used. */
371 :
372 : #define FD_SRC_LOCATION __FILE__ "(" FD_EXPAND_THEN_STRINGIFY(__LINE__) ")"
373 :
374 : /* FD_STATIC_ASSERT tests at compile time if c is non-zero. If not,
375 : it aborts the compile with an error. err itself should be a token
376 : (e.g. not a string, no whitespace, etc). */
377 :
378 : #ifdef __cplusplus
379 : #define FD_STATIC_ASSERT(c,err) static_assert(c, #err)
380 : #else
381 17668 : #define FD_STATIC_ASSERT(c,err) _Static_assert(c, #err)
382 : #endif
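
/* For example (illustrative):

     FD_STATIC_ASSERT( sizeof(ulong)==8UL, bad_ulong_size );

   compiles to nothing when ulong is 8 bytes wide and aborts the
   compile with a message mentioning bad_ulong_size otherwise. */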
383 :
384 : /* FD_ADDRESS_OF_PACKED_MEMBER(x): Linguistically does &(x) but without
385 : recent compiler complaints that &x might be unaligned if x is a
386 : member of a packed datastructure. (Often needed for interfacing with
387 : hardware / packets / etc.) */
388 :
389 3 : #define FD_ADDRESS_OF_PACKED_MEMBER( x ) (__extension__({ \
390 3 : char * _fd_aopm = (char *)&(x); \
391 3 : __asm__( "# FD_ADDRESS_OF_PACKED_MEMBER(" #x ") @" FD_SRC_LOCATION : "+r" (_fd_aopm) :: ); \
392 3 : (__typeof__(&(x)))_fd_aopm; \
393 3 : }))
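
/* For example (illustrative sketch; my_hdr_t is a hypothetical packed
   type with a uint member named val):

     my_hdr_t * hdr = ...;
     uint * pval = FD_ADDRESS_OF_PACKED_MEMBER( hdr->val );

   A direct &hdr->val would draw an address-of-packed-member warning
   on recent compilers. */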
394 :
395 : /* FD_PROTOTYPES_{BEGIN,END}: Headers that might be included in C++
396 : source should encapsulate the prototypes of code and globals
397 : contained in compilation units compiled as C with a
398 : FD_PROTOTYPES_{BEGIN,END} pair. */
399 :
400 : #ifdef __cplusplus
401 : #define FD_PROTOTYPES_BEGIN extern "C" {
402 : #else
403 : #define FD_PROTOTYPES_BEGIN
404 : #endif
405 :
406 : #ifdef __cplusplus
407 : #define FD_PROTOTYPES_END }
408 : #else
409 : #define FD_PROTOTYPES_END
410 : #endif
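
/* For example (illustrative; my_func is hypothetical), a header
   declaring C implemented APIs would wrap its prototypes like:

     FD_PROTOTYPES_BEGIN

     ulong my_func( ulong x );

     FD_PROTOTYPES_END

   so that C++ callers see the C linkage. */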
411 :
412 : /* FD_ASM_LG_ALIGN(lg_n) expands to an alignment assembler directive
413 : appropriate for the current architecture/ABI. The resulting align
414 : is 2^(lg_n) bytes, i.e. FD_ASM_LG_ALIGN(3) aligns by 8 bytes. */
415 :
416 : #if defined(__aarch64__)
417 : #define FD_ASM_LG_ALIGN(lg_n) ".align " #lg_n "\n"
418 : #elif defined(__x86_64__) || defined(__powerpc64__)
419 : #define FD_ASM_LG_ALIGN(lg_n) ".p2align " #lg_n "\n"
420 : #endif
421 :
422 : /* FD_IMPORT declares a variable name and initializes with the contents
423 : of the file at path (with potentially some assembly directives for
424 : additional footer info). It is equivalent to:
425 :
426 : type const name[] __attribute__((aligned(align))) = {
427 :
428 : ... code that would initialize the contents of name to the
429 : ... raw binary data found in the file at path at compile time
430 : ... (with any appended information as specified by footer)
431 :
432 : };
433 :
434 : ulong const name_sz = ... number of bytes pointed to by name;
435 :
436 : More precisely, this creates a symbol "name" in the object file that
437 : points to a read-only copy of the raw data in the file at "path" as
438 : it was at compile time. 2^lg_align specifies the minimum alignment
439 : required for the copy's first byte as an unsuffixed decimal integer.
440 : footer specifies assembly commands to permit additional data to be
441 : appended to the copy (use "" for footer if no footer is necessary).
442 :
443 : Then it exposes a pointer to this copy in the current compilation
444 : unit as name and the byte size as name_sz. name_sz covers the first
445 : byte of the included data to the last byte of the footer inclusive.
446 :
447 : The dummy linker symbol _fd_import_name_sz will also be created in
448 : the object file as some under the hood magic to make this work. This
449 : should not be used in any compile unit as some compilers (I'm looking
450 : at you clang-15, but apparently not clang-10) will sometimes mangle
451 : its value from what it was set to in the object file even when
452 : marked as absolute in the object file.
453 :
454 : This should only be used at global scope and should be done at most
455 : once over all object files / libraries used to make a program. If
456 : other compilation units want to make use of an import in a different
457 : compilation unit, they should declare:
458 :
459 : extern type const name[] __attribute__((aligned(align)));
460 :
461 : and/or:
462 :
463 : extern ulong const name_sz;
464 :
465 : as necessary (that is, do the usual to use name and name_sz as shown
466 : for the pseudo code above).
467 :
468 : Important safety tip! gcc -M will generally not detect the
469 : dependency this creates between the importing file and the imported
470 : file. This can cause incremental builds to miss changes to the
471 : imported file. Ideally, we would have FD_IMPORT automatically do
472 : something like:
473 :
474 : _Pragma( "GCC dependency \"" path "\"" )
475 :
476 : This doesn't work as is because _Pragma needs some macro expansion
477 : hacks to accept this (this is doable). After that workaround, this
478 : still doesn't work because, due to tooling limitations, the pragma
479 : path is relative to the source file directory and the FD_IMPORT path
480 : is relative to the make directory (working around this would
481 : require a __FILE__-like directive for the source code directory base
482 : path). Even if that did exist, it might still not work because
483 : out-of-tree builds often require some substitutions to the gcc -M
484 : generated dependencies that this might not pick up (at least not
485 : without some build system surgery). And then it still wouldn't work
486 : because gcc -M seems to ignore all of this anyways (which is the
487 : actual show stopper as this pragma does something subtly different
488 : than what the name suggests and there isn't any obvious support for a
489 : "pseudo-include".) Another reminder that make clean and fast builds
490 : are our friend. */
491 :
492 : #if defined(__ELF__)
493 :
494 : #define FD_IMPORT( name, path, type, lg_align, footer ) \
495 : __asm__( ".section .rodata,\"a\",@progbits\n" \
496 : ".type " #name ",@object\n" \
497 : ".globl " #name "\n" \
498 : FD_ASM_LG_ALIGN(lg_align) \
499 : #name ":\n" \
500 : ".incbin \"" path "\"\n" \
501 : footer "\n" \
502 : ".size " #name ",. - " #name "\n" \
503 : "_fd_import_" #name "_sz = . - " #name "\n" \
504 : ".type " #name "_sz,@object\n" \
505 : ".globl " #name "_sz\n" \
506 : FD_ASM_LG_ALIGN(3) \
507 : #name "_sz:\n" \
508 : ".quad _fd_import_" #name "_sz\n" \
509 : ".size " #name "_sz,8\n" \
510 : ".previous\n" ); \
511 : extern type const name[] __attribute__((aligned(1<<(lg_align)))); \
512 : extern ulong const name##_sz
513 :
514 : #elif defined(__MACH__)
515 :
516 : #define FD_IMPORT( name, path, type, lg_align, footer ) \
517 : __asm__( ".section __DATA,__const\n" \
518 : ".globl _" #name "\n" \
519 : FD_ASM_LG_ALIGN(lg_align) \
520 : "_" #name ":\n" \
521 : ".incbin \"" path "\"\n" \
522 : footer "\n" \
523 : "_fd_import_" #name "_sz = . - _" #name "\n" \
524 : ".globl _" #name "_sz\n" \
525 : FD_ASM_LG_ALIGN(3) \
526 : "_" #name "_sz:\n" \
527 : ".quad _fd_import_" #name "_sz\n" \
528 : ".previous\n" ); \
529 : extern type const name[] __attribute__((aligned(1<<(lg_align)))); \
530 : extern ulong const name##_sz
531 :
532 : #endif
533 :
534 : /* FD_IMPORT_{BINARY,CSTR} are common cases for FD_IMPORT.
535 :
536 : In BINARY, the file is imported into the object file and exposed to
537 : the caller as uchar binary data. name_sz will be the number of
538 : bytes in the file at time of import. name will have 128 byte
539 : alignment.
540 :
541 : In CSTR, the file is imported into the object file with a '\0'
542 : termination appended and exposed to the caller as a cstr. Assuming
543 : the file is text (i.e. has no internal '\0's), strlen(name) will be
544 : the number of bytes in the file and name_sz will be strlen(name)+1.
545 : name can have arbitrary alignment. */
546 :
547 : #ifdef FD_IMPORT
548 : #define FD_IMPORT_BINARY(name, path) FD_IMPORT( name, path, uchar, 7, "" )
549 : #define FD_IMPORT_CSTR( name, path) FD_IMPORT( name, path, char, 1, ".byte 0" )
550 : #endif
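
/* For example (illustrative; the name and path are hypothetical):

     FD_IMPORT_BINARY( my_blob, "src/app/my_blob.bin" );

   embeds the file's bytes at compile time and exposes them as:

     extern uchar const my_blob[];
     extern ulong const my_blob_sz;

   in the current compilation unit. */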
551 :
552 : /* Optimizer tricks ***************************************************/
553 :
554 : /* FD_RESTRICT is a pointer modifier to designate a pointer as
555 : restricted. Hoops jumped because C++17 still doesn't understand
556 : restrict ... sigh */
557 :
558 : #ifndef FD_RESTRICT
559 : #ifdef __cplusplus
560 : #define FD_RESTRICT __restrict
561 : #else
562 : #define FD_RESTRICT restrict
563 : #endif
564 : #endif
565 :
566 : /* fd_type_pun(p), fd_type_pun_const(p): These allow use of type
567 : punning while keeping strict aliasing optimizations enabled and
568 : strict alias checking done (e.g. some UNIX APIs, like the sockaddr
569 : related APIs, depend on type punning and can be used cleanly via
570 : these helpers). */
571 :
572 : static inline void *
573 55832808 : fd_type_pun( void * p ) {
574 55832808 : __asm__( "# fd_type_pun @" FD_SRC_LOCATION : "+r" (p) :: "memory" );
575 55832808 : return p;
576 55832808 : }
577 :
578 : static inline void const *
579 238578943 : fd_type_pun_const( void const * p ) {
580 238578943 : __asm__( "# fd_type_pun_const @" FD_SRC_LOCATION : "+r" (p) :: "memory" );
581 238578943 : return p;
582 238578943 : }
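
/* For example (illustrative sketch assuming a hosted POSIX target):

     struct sockaddr_in addr_in;
     ... populate addr_in ...
     struct sockaddr * addr = fd_type_pun( &addr_in );

   avoids the undefined behavior risk a raw cast would carry under
   strict aliasing. */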
583 :
584 : /* FD_{LIKELY,UNLIKELY}(c): Evaluates c and returns whether it is
585 : logical true/false as long (1L/0L). It also hints to the optimizer
586 : whether it should optimize for the case of c evaluating as
587 : true/false. */
588 :
589 >14207*10^7 : #define FD_LIKELY(c) __builtin_expect( !!(c), 1L )
590 >61437*10^7 : #define FD_UNLIKELY(c) __builtin_expect( !!(c), 0L )
591 :
592 : /* FD_FN_PURE hints to the optimizer that the function, roughly
593 : speaking, does not have side effects. As such, the compiler can
594 : replace a call to the function with the result of an earlier call to
595 : that function provided the inputs and memory used haven't changed.
596 :
597 : IMPORTANT SAFETY TIP! Recent compilers seem to take an undocumented
598 : and debatable stance that pure functions do no writes to memory.
599 : This is a sufficient condition for the above but not a necessary one.
600 :
601 : Consider, for example, the real world case of an otherwise pure
602 : function that uses pass-by-reference to return more than one value
603 : (an unpleasant practice that is sadly often necessary because C/C++,
604 : compilers and underlying platform ABIs are very bad at helping
605 : developers simply and clearly express their intent to return multiple
606 : values and then generate good assembly for such).
607 :
608 : If called multiple times sequentially, all but the first call to such
609 : a "pure" function could be optimized away because the non-volatile
610 : memory writes done in all but the 1st call for the
611 : pass-by-reference-returns write the same value to normal memory that
612 : was written on the 1st call. That is, these calls return the same
613 : value for their direct return and do writes that do not have any
614 : visible effect.
615 :
616 : Thus, while it is safe for the compiler to eliminate all but the
617 : first call via techniques like common subexpression elimination, it
618 : is not safe for the compiler to infer that the first call did no
619 : writes.
620 :
621 : But recent compilers seem to do exactly that.
622 :
623 : Sigh ... we can't use FD_FN_PURE on such functions because of all the
624 : above linguistic, compiler, documentation and ABI infinite sadness.
625 :
626 : TL;DR To be safe against the above vagaries, recommend using
627 : FD_FN_PURE to annotate functions that do no memory writes (including
628 : trivial memory writes) and try to design HPC APIs to avoid returning
629 : multiple values as much as possible. */
630 :
631 : #define FD_FN_PURE __attribute__((pure))
632 :
633 : /* FD_FN_CONST is like pure but also, even stronger, indicates that the
634 : function does not depend on the state of memory. */
635 :
636 : #define FD_FN_CONST __attribute__((const))
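
/* For example (illustrative; these prototypes are hypothetical):

     FD_FN_PURE  ulong my_buf_hash ( uchar const * buf, ulong sz );
     FD_FN_CONST ulong my_pow2_ceil( ulong x );

   my_buf_hash reads memory but does no writes (safe for PURE per the
   above tip) while my_pow2_ceil depends only on its argument. */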
637 :
638 : /* FD_FN_UNUSED indicates that it is okay if the function with static
639 : linkage is not used. Allows working around -Winline in header only
640 : APIs where the compiler decides not to actually inline the function.
641 : (This belief, frequently promulgated by anti-macro cults, that "An
642 : Inline Function is As Fast As a Macro" ... an entire section in gcc's
643 : documentation devoted to it in fact ... remains among the biggest
644 : lies in computer science. Yes, an inline function is as fast as a
645 : macro ... when the compiler actually decides to treat the inline
646 : keyword more than just for entertainment purposes only. Which, as
647 : -Winline proves, it frequently doesn't. Sigh ... force_inline like
648 : compiler extensions might be an alternative here but they have their
649 : own portability issues.) */
650 :
651 42 : #define FD_FN_UNUSED __attribute__((unused))
652 :
653 : /* FD_FN_UNSANITIZED tells the compiler to disable AddressSanitizer and
654 : UndefinedBehaviorSanitizer instrumentation. For some functions, this
655 : can improve instrumented compile time by ~30x. */
656 :
657 : #define FD_FN_UNSANITIZED __attribute__((no_sanitize("address", "undefined")))
658 :
659 : /* FD_FN_SENSITIVE instructs the compiler to harden sensitive functions.
660 : https://eprint.iacr.org/2023/1713 (Sec 3.2)
661 : - Clear all registers with __attribute__((zero_call_used_regs("all")))
662 : - Clear stack with __attribute__((strub)), available in gcc 14+ */
663 :
664 : #if __has_attribute(strub)
665 : #define FD_FN_SENSITIVE __attribute__((strub)) __attribute__((zero_call_used_regs("all")))
666 : #elif __has_attribute(zero_call_used_regs)
667 : #define FD_FN_SENSITIVE __attribute__((zero_call_used_regs("all")))
668 : #else
669 : #define FD_FN_SENSITIVE
670 : #endif
671 :
672 : /* FD_PARAM_UNUSED indicates that it is okay if the function parameter is not
673 : used. */
674 :
675 : #define FD_PARAM_UNUSED __attribute__((unused))
676 :
677 : /* FD_TYPE_PACKED indicates that a type is to be packed, resetting its alignment
678 : to 1. */
679 :
680 : #define FD_TYPE_PACKED __attribute__((packed))
681 :
682 : /* FD_WARN_UNUSED tells the compiler the result (from a function) should
683 : be checked. This is useful to force callers to either check the result
684 : or deliberately and explicitly ignore it. Good for result codes and
685 : errors. */
686 :
687 : #define FD_WARN_UNUSED __attribute__ ((warn_unused_result))
688 :
689 : /* FD_FALLTHRU tells the compiler that a case in a switch falls through
690 : to the next case. This avoids the compiler complaining in cases
691 : where the fall through is intentional.
692 : The "while(0)" avoids a compiler complaint in the event the case
693 : has no statement, example:
694 : switch( return_code ) {
695 : case RETURN_CASE_1: FD_FALLTHRU;
696 : case RETURN_CASE_2: FD_FALLTHRU;
697 : case RETURN_CASE_3:
698 : case_123();
699 : default:
700 : case_other();
701 : }
702 :
703 : See C++17 [[fallthrough]] and gcc __attribute__((fallthrough)) */
704 :
705 : #define FD_FALLTHRU while(0) __attribute__((fallthrough))
706 :
707 : /* FD_COMPILER_FORGET(var): Tells the compiler that it shouldn't use
708 : any knowledge it has about the provided register-compatible variable
709 : var for optimizations going forward (i.e. the variable has changed in
710 : a deterministic but unknown-to-the-compiler way where the actual
711 : change is the identity operation). Useful for inhibiting various
712 : branch nest misoptimizations (compilers unfortunately tend to
713 : radically underestimate the impact in raw average performance and
714 : jitter and the probability of branch mispredicts or the cost to the
715 : CPU of having lots of branches). This is not asm volatile (use
716 : UNPREDICTABLE below for that) and has no clobbers. So if var is not
717 : used after the forget, the compiler can optimize the FORGET away
718 : (along with operations preceding it used to produce var). */
719 :
720 42548311777 : #define FD_COMPILER_FORGET(var) __asm__( "# FD_COMPILER_FORGET(" #var ")@" FD_SRC_LOCATION : "+r" (var) )
721 :
722 : /* FD_COMPILER_UNPREDICTABLE(var): Same as FD_COMPILER_FORGET(var) but
723 : the provided variable has changed in a non-deterministic way from the
724 : compiler's POV (e.g. the value in the variable on output should not
725 : be treated as a compile time constant even if it is one
726 : linguistically). Useful for suppressing unwanted
727 : compile-time-const-based optimizations like hoisting operations with
728 : useful CPU side effects out of a critical loop. */
729 :
730 31289889 : #define FD_COMPILER_UNPREDICTABLE(var) __asm__ __volatile__( "# FD_COMPILER_UNPREDICTABLE(" #var ")@" FD_SRC_LOCATION : "+r" (var) )
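
/* For example (illustrative benchmark sketch; op_under_test is
   hypothetical):

     ulong x = 1UL;
     FD_COMPILER_UNPREDICTABLE( x );   <- x no longer a compile time constant
     long  t0 = fd_tickcount();
     ulong y  = op_under_test( x );
     FD_COMPILER_FORGET( y );          <- op_under_test can't be elided
     long  dt = fd_tickcount() - t0;

   keeps the compiler from constant folding or dead code eliminating
   the operation being timed. */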
731 :
732 : /* Atomic tricks ******************************************************/
733 :
734 : /* FD_COMPILER_MFENCE(): Tells the compiler that it can't move any
735 : memory operations (load or store) from before the MFENCE to after the
736 : MFENCE (and vice versa). The processor itself might still reorder
737 : around the fence though (that requires platform specific fences). */
738 :
739 >13047*10^7 : #define FD_COMPILER_MFENCE() __asm__ __volatile__( "# FD_COMPILER_MFENCE()@" FD_SRC_LOCATION ::: "memory" )
740 :
741 : /* FD_SPIN_PAUSE(): Yields the logical core of the calling thread to
742 : the other logical cores sharing the same underlying physical core for
743 : a few clocks without yielding it to the operating system scheduler.
744 : Typically useful for shared memory spin polling loops, especially if
745 : hyperthreading is in use. IMPORTANT SAFETY TIP! This might act as a
746 : FD_COMPILER_MFENCE on some combinations of toolchains and targets
747 : (e.g. gcc documents that __builtin_ia32_pause also does a compiler
748 : memory fence) but this should not be relied upon for portable code
749 : (consider making this a compiler memory fence on all platforms?) */
750 :
751 : #if FD_HAS_X86
752 16618720462 : #define FD_SPIN_PAUSE() __builtin_ia32_pause()
753 : #else
754 : #define FD_SPIN_PAUSE() ((void)0)
755 : #endif
756 :
757 : /* FD_YIELD(): Yields the logical core of the calling thread to the
758 : operating system scheduler if a hosted target and does a spin pause
759 : otherwise. */
760 :
761 : #if FD_HAS_HOSTED
762 61767 : #define FD_YIELD() fd_yield()
763 : #else
764 : #define FD_YIELD() FD_SPIN_PAUSE()
765 : #endif
766 :
767 : /* FD_VOLATILE_CONST(x): Tells the compiler it is not able to predict the
768 : value obtained by dereferencing x and that dereferencing x might have
769 : other side effects (e.g. maybe another thread could change the value
770 : and the compiler has no way of knowing this). Generally speaking,
771 : the volatile keyword is broken linguistically. Volatility is not a
772 : property of the variable but of the dereferencing of a variable (e.g.
773 : what is volatile from the POV of a reader of a shared variable is not
774 : necessarily volatile from the POV a writer of that shared variable in
775 : a different thread). */
776 :
777 2188607911 : #define FD_VOLATILE_CONST(x) (*((volatile const __typeof__((x)) *)&(x)))
778 :
779 : /* FD_VOLATILE(x): Tells the compiler it is not able to predict the effect
780 : of modifying x and that dereferencing x might have other side effects
781 : (e.g. maybe another thread is spinning on x waiting for its value to
782 : change and the compiler has no way of knowing this). */
783 :
784 1189982370 : #define FD_VOLATILE(x) (*((volatile __typeof__((x)) *)&(x)))
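
/* For example (illustrative spin sketch; flag points to an int in
   shared memory):

     while( !FD_VOLATILE_CONST( *flag ) ) FD_SPIN_PAUSE();
     FD_VOLATILE( *flag ) = 0;

   Without the volatile dereferences, the compiler could hoist the
   load out of the loop and spin forever. */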
785 :
786 : #if FD_HAS_ATOMIC
787 :
788 : /* FD_ATOMIC_FETCH_AND_{ADD,SUB,OR,AND,XOR}(p,v):
789 :
790 : FD_ATOMIC_FETCH_AND_ADD(p,v) does
791 : f = *p;
792 : *p = f + v;
793 : return f;
794 : as a single atomic operation. Similarly for the other variants. */
795 :
796 130931841 : #define FD_ATOMIC_FETCH_AND_ADD(p,v) __sync_fetch_and_add( (p), (v) )
797 131337885 : #define FD_ATOMIC_FETCH_AND_SUB(p,v) __sync_fetch_and_sub( (p), (v) )
798 45011349 : #define FD_ATOMIC_FETCH_AND_OR( p,v) __sync_fetch_and_or( (p), (v) )
799 : #define FD_ATOMIC_FETCH_AND_AND(p,v) __sync_fetch_and_and( (p), (v) )
800 : #define FD_ATOMIC_FETCH_AND_XOR(p,v) __sync_fetch_and_xor( (p), (v) )
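
/* For example (illustrative; cnt is a ulong in memory shared between
   threads):

     ulong old = FD_ATOMIC_FETCH_AND_ADD( &cnt, 1UL );

   atomically increments cnt and gives the caller its pre-increment
   value (e.g. for ticket style sequencing). */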
801 :
802 : /* FD_ATOMIC_{ADD,SUB,OR,AND,XOR}_AND_FETCH(p,v):
803 :
804 : FD_ATOMIC_ADD_AND_FETCH(p,v) does
805 : r = *p + v;
806 : *p = r;
807 : return r;
808 : as a single atomic operation. Similarly for the other variants. */
809 :
810 : #define FD_ATOMIC_ADD_AND_FETCH(p,v) __sync_add_and_fetch( (p), (v) )
811 : #define FD_ATOMIC_SUB_AND_FETCH(p,v) __sync_sub_and_fetch( (p), (v) )
812 : #define FD_ATOMIC_OR_AND_FETCH( p,v) __sync_or_and_fetch( (p), (v) )
813 : #define FD_ATOMIC_AND_AND_FETCH(p,v) __sync_and_and_fetch( (p), (v) )
814 : #define FD_ATOMIC_XOR_AND_FETCH(p,v) __sync_xor_and_fetch( (p), (v) )
815 :
816 : /* FD_ATOMIC_CAS(p,c,s):
817 :
818 : o = FD_ATOMIC_CAS(p,c,s) conceptually does:
819 : o = *p;
820 : if( o==c ) *p = s;
821 : return o
822 : as a single atomic operation. */
823 :
824 595297881 : #define FD_ATOMIC_CAS(p,c,s) __sync_val_compare_and_swap( (p), (c), (s) )
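
/* For example (illustrative minimal spinlock sketch; lock points to a
   ulong in shared memory where 0UL means unlocked and 1UL locked):

     while( FD_ATOMIC_CAS( lock, 0UL, 1UL ) ) FD_SPIN_PAUSE();
     ... critical section ...
     FD_COMPILER_MFENCE();
     FD_VOLATILE( *lock ) = 0UL;

   The CAS returns 0UL exactly when the caller transitioned the lock
   from unlocked to locked. (Production code would likely want
   additional hardware fencing on non-x86 memory models.) */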
825 :
826 : /* FD_ATOMIC_XCHG(p,v):
827 :
828 : o = FD_ATOMIC_XCHG( p, v ) conceptually does:
829 : o = *p
830 : *p = v
831 : return o
832 : as a single atomic operation.
833 :
834 : Intel's __sync compiler extensions from the days of yore mysteriously
835 : implemented atomic exchange via the very misleadingly named
836 : __sync_lock_test_and_set. And some implementations (and C++)
837 : debatably then implemented this API according to what the misleading
838 : name implied as opposed to what it actually did. But those
839 : implementations didn't bother to provide a replacement for atomic
840 : exchange functionality (forcing us to emulate atomic exchange more
841 : slowly via CAS there). Sigh ... we do what we can to fix this up. */
842 :
843 : #ifndef FD_ATOMIC_XCHG_STYLE
844 : #if FD_HAS_X86 && !__cplusplus
845 : #define FD_ATOMIC_XCHG_STYLE 1
846 : #else
847 : #define FD_ATOMIC_XCHG_STYLE 0
848 : #endif
849 : #endif
850 :
851 : #if FD_ATOMIC_XCHG_STYLE==0
852 : #define FD_ATOMIC_XCHG(p,v) (__extension__({ \
853 : __typeof__(*(p)) * _fd_atomic_xchg_p = (p); \
854 : __typeof__(*(p)) _fd_atomic_xchg_v = (v); \
855 : __typeof__(*(p)) _fd_atomic_xchg_t; \
856 : for(;;) { \
857 : _fd_atomic_xchg_t = FD_VOLATILE_CONST( *_fd_atomic_xchg_p ); \
858 : if( FD_LIKELY( __sync_bool_compare_and_swap( _fd_atomic_xchg_p, _fd_atomic_xchg_t, _fd_atomic_xchg_v ) ) ) break; \
859 : FD_SPIN_PAUSE(); \
860 : } \
861 : _fd_atomic_xchg_t; \
862 : }))
863 : #elif FD_ATOMIC_XCHG_STYLE==1
864 1086995802 : #define FD_ATOMIC_XCHG(p,v) __sync_lock_test_and_set( (p), (v) )
865 : #else
866 : #error "Unknown FD_ATOMIC_XCHG_STYLE"
867 : #endif
868 :
869 : #endif /* FD_HAS_ATOMIC */
870 :
871 : /* FD_TL: This indicates that the variable should be thread local.
872 :
873 : FD_ONCE_{BEGIN,END}: The block:
874 :
875 : FD_ONCE_BEGIN {
876 : ... code ...
877 : } FD_ONCE_END
878 :
879 : linguistically behaves like:
880 :
881 : do {
882 : ... code ...
883 : } while(0)
884 :
885 : But provides a low overhead guarantee that:
886 : - The block will be executed at most once over all threads
887 : in a process (i.e. the set of threads which share global
888 : variables).
889 : - No thread in a process that encounters the block will continue
890 : past it until it has executed once.
891 :
892 : This implies that the caller promises a ONCE block will execute in a
893 : finite time. (Meant for doing simple lightweight initializations.)
894 :
895 : It is okay to nest ONCE blocks. The thread that executes the
896 : outermost will execute all the nested ones as part of executing the
897 : outermost.
898 :
899 : A ONCE implicitly provides a compiler memory fence to reduce the risk
900 : that the compiler will assume that operations done in the once block
901 : on another thread have not been done (e.g. propagating pre-once block
902 : variable values into post-once block code). It is up to the user to
903 : provide any necessary hardware fencing (usually not necessary).
904 :
905 : FD_THREAD_ONCE_{BEGIN,END}: The block:
906 :
907 : FD_THREAD_ONCE_BEGIN {
908 : ... code ...
909 : } FD_THREAD_ONCE_END;
910 :
911 : is similar except the guarantee is that the block only covers the
912 : invoking thread and it does not provide any fencing. If a thread
913 : once begin is nested inside a once begin, that thread once begin will
914 : only be executed on the thread that executes the thread once begin.
915 : It is similarly okay to nest a ONCE block inside a THREAD_ONCE block. */
916 :
917 : #if FD_HAS_THREADS /* Potentially more than one thread in the process */
918 :
919 : #ifndef FD_TL
920 : #define FD_TL __thread
921 : #endif
922 :
923 4722 : #define FD_ONCE_BEGIN do { \
924 4722 : FD_COMPILER_MFENCE(); \
925 4722 : static volatile int _fd_once_block_state = 0; \
926 4722 : for(;;) { \
927 4722 : int _fd_once_block_tmp = _fd_once_block_state; \
928 4722 : if( FD_LIKELY( _fd_once_block_tmp>0 ) ) break; \
929 4722 : if( FD_LIKELY( !_fd_once_block_tmp ) && \
930 2784 : FD_LIKELY( !FD_ATOMIC_CAS( &_fd_once_block_state, 0, -1 ) ) ) { \
931 2784 : do
932 :
933 : #define FD_ONCE_END \
934 2784 : while(0); \
935 2784 : FD_COMPILER_MFENCE(); \
936 2784 : _fd_once_block_state = 1; \
937 2784 : break; \
938 2784 : } \
939 2784 : FD_YIELD(); \
940 0 : } \
941 4722 : } while(0)
942 :
943 36 : #define FD_THREAD_ONCE_BEGIN do { \
944 36 : static FD_TL int _fd_thread_once_block_state = 0; \
945 36 : if( FD_UNLIKELY( !_fd_thread_once_block_state ) ) { \
946 9 : do
947 :
948 : #define FD_THREAD_ONCE_END \
949 9 : while(0); \
950 9 : _fd_thread_once_block_state = 1; \
951 9 : } \
952 36 : } while(0)
953 :
954 : #else /* Only one thread in the process */
955 :
956 : #ifndef FD_TL
957 : #define FD_TL /**/
958 : #endif
959 :
960 : #define FD_ONCE_BEGIN do { \
961 : static int _fd_once_block_state = 0; \
962 : if( FD_UNLIKELY( !_fd_once_block_state ) ) { \
963 : do
964 :
965 : #define FD_ONCE_END \
966 : while(0); \
967 : _fd_once_block_state = 1; \
968 : } \
969 : } while(0)
970 :
971 : #define FD_THREAD_ONCE_BEGIN FD_ONCE_BEGIN
972 : #define FD_THREAD_ONCE_END FD_ONCE_END
973 :
974 : #endif
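
/* For example (illustrative; my_table and my_table_init are
   hypothetical):

     static ulong my_table[ 256UL ];

     void
     my_table_init( void ) {
       FD_ONCE_BEGIN {
         for( ulong i=0UL; i<256UL; i++ ) my_table[ i ] = i*i;
       } FD_ONCE_END;
     }

   Concurrent callers of my_table_init will not proceed past the block
   until the table has been populated exactly once. */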
975 :
976 : FD_PROTOTYPES_BEGIN
977 :
978 : /* fd_memcpy(d,s,sz): On modern x86 in some circumstances, rep mov will
979 : be faster than memcpy under the hood (basically due to RFO /
980 : read-for-ownership optimizations in the cache protocol
981 : that aren't easily done from the ISA ... see Intel docs on enhanced
982 : rep mov). Compile time configurable though as this is not always
983 : true. So applications can tune to taste. Hard to beat rep mov for
984 : code density though (2 bytes) and pretty hard to beat in situations
985 : needing a completely generic memcpy. But it can be beaten in
986 : specialized situations for the usual reasons. */
987 :
988 : /* FIXME: CONSIDER MEMCMP TOO! */
989 : /* FIXME: CONSIDER MEMCPY RELATED FUNC ATTRS */
990 :
991 : #ifndef FD_USE_ARCH_MEMCPY
992 : #define FD_USE_ARCH_MEMCPY 0
993 : #endif
994 :
995 : #if FD_HAS_X86 && FD_USE_ARCH_MEMCPY && !defined(CBMC) && !FD_HAS_DEEPASAN && !FD_HAS_MSAN
996 :
997 : static inline void *
998 : fd_memcpy( void * FD_RESTRICT d,
999 : void const * FD_RESTRICT s,
1000 539575472 : ulong sz ) {
1001 539575472 : void * p = d;
1002 539575472 : __asm__ __volatile__( "rep movsb" : "+D" (p), "+S" (s), "+c" (sz) :: "memory" );
1003 539575472 : return d;
1004 539575472 : }
1005 :
1006 : #elif FD_HAS_MSAN
1007 :
1008 : void * __msan_memcpy( void * dest, void const * src, ulong n );
1009 :
1010 : static inline void *
1011 : fd_memcpy( void * FD_RESTRICT d,
1012 : void const * FD_RESTRICT s,
1013 : ulong sz ) {
1014 : return __msan_memcpy( d, s, sz );
1015 : }
1016 :
1017 : #else
1018 :
1019 : static inline void *
1020 : fd_memcpy( void * FD_RESTRICT d,
1021 : void const * FD_RESTRICT s,
1022 1028923441 : ulong sz ) {
1023 : #if defined(CBMC) || FD_HAS_ASAN
1024 : if( FD_UNLIKELY( !sz ) ) return d; /* Standard says sz 0 is UB, uncomment if target is insane and doesn't treat sz 0 as a nop */
1025 : #endif
1026 1028923441 : return memcpy( d, s, sz );
1027 1028923441 : }
1028 :
1029 : #endif
1030 :
1031 : /* fd_memset(d,c,sz): architecturally optimized memset. See fd_memcpy
1032 : for considerations. */
1033 :
1034 : /* FIXME: CONSIDER MEMSET RELATED FUNC ATTRS */
1035 :
1036 : #ifndef FD_USE_ARCH_MEMSET
1037 : #define FD_USE_ARCH_MEMSET 0
1038 : #endif
1039 :
1040 : #if FD_HAS_X86 && FD_USE_ARCH_MEMSET && !defined(CBMC) && !FD_HAS_DEEPASAN && !FD_HAS_MSAN
1041 :
1042 : static inline void *
1043 : fd_memset( void * d,
1044 : int c,
1045 240369989 : ulong sz ) {
1046 240369989 : void * p = d;
1047 240369989 : __asm__ __volatile__( "rep stosb" : "+D" (p), "+c" (sz) : "a" (c) : "memory" );
1048 240369989 : return d;
1049 240369989 : }
1050 :
1051 : #else
1052 :
1053 : static inline void *
1054 : fd_memset( void * d,
1055 : int c,
1056 470713268 : ulong sz ) {
1057 : # ifdef CBMC
1058 : if( FD_UNLIKELY( !sz ) ) return d; /* See fd_memcpy note */
1059 : # endif
1060 470713268 : return memset( d, c, sz );
1061 470713268 : }
1062 :
1063 : #endif
1064 :
1065 : /* C23 has memset_explicit, i.e. a memset that can't be removed by the
1066 : optimizer. This is our own equivalent. */
1067 :
1068 : static void * (* volatile fd_memset_explicit)(void *, int, size_t) = memset;
1069 :
1070 : /* fd_memeq(s0,s1,sz): Compares two blocks of memory. Returns 1 if
1071 : equal or sz is zero and 0 otherwise. No memory accesses are made if
1072 : sz is zero (pointers may be invalid). On x86, uses repe cmpsb which
1073 : is preferable to __builtin_memcmp in some cases. */
1074 :
1075 : #ifndef FD_USE_ARCH_MEMEQ
1076 : #define FD_USE_ARCH_MEMEQ 0
1077 : #endif
1078 :
1079 : #if FD_HAS_X86 && FD_USE_ARCH_MEMEQ && defined(__GCC_ASM_FLAG_OUTPUTS__) && __STDC_VERSION__>=199901L
1080 :
1081 : FD_FN_PURE static inline int
1082 : fd_memeq( void const * s0,
1083 : void const * s1,
1084 : ulong sz ) {
1085 : /* ZF flag is set and exported in two cases:
1086 : a) size is zero (via test)
1087 : b) buffer is equal (via repe cmpsb) */
1088 : int r;
1089 : __asm__( "test %3, %3;"
1090 : "repe cmpsb"
1091 : : "=@cce" (r), "+S" (s0), "+D" (s1), "+c" (sz)
1092 : : "m" (*(char const (*)[sz]) s0), "m" (*(char const (*)[sz]) s1)
1093 : : "cc" );
1094 : return r;
1095 : }
1096 :
1097 : #else
1098 :
1099 : FD_FN_PURE static inline int
1100 : fd_memeq( void const * s1,
1101 : void const * s2,
1102 15052830 : ulong sz ) {
1103 15052830 : return 0==memcmp( s1, s2, sz );
1104 15052830 : }
1105 :
1106 : #endif
1107 :
1108 : /* fd_hash(seed,buf,sz), fd_hash_memcpy(seed,d,s,sz): High quality
1109 : (full avalanche) high speed variable length buffer -> 64-bit hash
1110 : function (fd_hash_memcpy is often as fast as plain memcpy). Based on
1111 : the xxhash-r39 (open source BSD licensed) implementation. In-place
1112 : and out-of-place variants provided (out-of-place variant assumes dst
1113 : and src do not overlap). Caller promises valid input arguments,
1114 : cannot fail given valid input arguments. sz==0 is fine. */
1115 :
1116 : FD_FN_PURE ulong
1117 : fd_hash( ulong seed,
1118 : void const * buf,
1119 : ulong sz );
1120 :
1121 : ulong
1122 : fd_hash_memcpy( ulong seed,
1123 : void * FD_RESTRICT d,
1124 : void const * FD_RESTRICT s,
1125 : ulong sz );
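
/* For example (illustrative):

     ulong h = fd_hash( 0UL, buf, sz );

   maps the sz bytes at buf to a 64-bit hash; different seed values
   select effectively independent hash functions over the same
   input. */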
1126 :
1127 : #ifndef FD_TICKCOUNT_STYLE
1128 : #if FD_HAS_X86 /* Use RDTSC */
1129 : #define FD_TICKCOUNT_STYLE 1
1130 : #else /* Use portable fallback */
1131 : #define FD_TICKCOUNT_STYLE 0
1132 : #endif
1133 : #endif
1134 :
1135 : #if FD_TICKCOUNT_STYLE==0 /* Portable fallback (slow). Ticks at 1 ns / tick */
1136 :
1137 : #define fd_tickcount() fd_log_wallclock() /* TODO: fix ugly pre-log usage */
1138 :
1139 : #elif FD_TICKCOUNT_STYLE==1 /* RDTSC (fast) */
1140 :
1141 : /* fd_tickcount: Reads the hardware invariant tickcounter ("RDTSC").
1142 : This monotonically increases at an approximately constant rate
1143 : relative to the system wallclock and is synchronous across all CPUs
1144 : on a host.
1145 :
1146 : The rate this ticks at is not precisely defined (see Intel docs for
1147 : more details) but it is typically in the ballpark of the CPU base
1148 : clock frequency. The relationship to the wallclock is very well
1149 : approximated as linear over short periods of time (i.e. less than a
1150 : fraction of a second) and this should not exhibit any sudden changes
1151 : in its rate relative to the wallclock. Notably, its rate is not
1152 : directly impacted by CPU clock frequency adaptation / Turbo mode (see
1153 : other Intel performance monitoring counters for various CPU cycle
1154 : counters). It can drift over longer periods of time for the usual clock
1155 : synchronization reasons.
1156 :
1157 : This is a reasonably fast O(1) cost (~6-8 ns on recent Intel).
1158 : Because of all the compiler optimization and parallel execution
1159 : going on in modern CPU cores, other instructions might be reordered
1160 : around this by the compiler and/or CPU. It is up to the user to do lower level
1161 : tricks as necessary when the precise location of this in the
1162 : execution stream and/or when executed by the CPU is needed. (This is
1163 : often unnecessary as such levels of precision are not frequently
1164 : required and often have self-defeating overheads.)
1165 :
1166 : It is worth noting that RDTSC and/or (even more frequently) lower
1167 : level performance counters are often restricted from use in user
1168 : space applications. It is recommended that applications use this
1169 : primarily for debugging / performance tuning on unrestricted hosts
1170 : and/or when the developer is confident that applications using this
1171 : will have appropriate permissions when deployed. */
1172 :
1173 1290686786 : #define fd_tickcount() ((long)__builtin_ia32_rdtsc())
1174 :
1175 : #else
1176 : #error "Unknown FD_TICKCOUNT_STYLE"
1177 : #endif
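
/* For example (illustrative profiling sketch):

     long t0 = fd_tickcount();
     ... region to profile ...
     long dt = fd_tickcount() - t0;

   dt estimates the region's cost in ticks; converting ticks to
   seconds requires a separately calibrated tick rate. */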
1178 :
1179 : #if FD_HAS_HOSTED
1180 :
1181 : /* fd_yield yields the calling thread to the operating system scheduler. */
1182 :
1183 : void
1184 : fd_yield( void );
1185 :
1186 : #endif
1187 :
1188 : FD_PROTOTYPES_END
1189 :
1190 : #endif /* HEADER_fd_src_util_fd_util_base_h */
|