Line data Source code
1 : #ifndef HEADER_fd_src_util_bits_fd_bits_h
2 : #define HEADER_fd_src_util_bits_fd_bits_h
3 :
4 : /* Bit manipulation APIs */
5 :
6 : #include "../sanitize/fd_sanitize.h"
7 :
8 : FD_PROTOTYPES_BEGIN
9 :
10 : /* fd_ulong_is_pow2 ( x ) returns 1 if x is a positive integral power of 2 and 0 otherwise.
11 : fd_ulong_pow2 ( n ) returns 2^n mod 2^64. U.B. if n is negative.
12 :
13 : fd_ulong_mask_bit ( b ) returns the ulong with bit b set and all other bits 0. U.B. if b is not in [0,64).
14 : fd_ulong_clear_bit ( x, b ) returns x with bit b cleared. U.B. if b is not in [0,64).
15 : fd_ulong_set_bit ( x, b ) returns x with bit b set. U.B. if b is not in [0,64).
16 : fd_ulong_flip_bit ( x, b ) returns x with bit b flipped. U.B. if b is not in [0,64).
17 : fd_ulong_extract_bit( x, b ) returns bit b of x as an int in {0,1}. U.B. if b is not in [0,64).
18 : fd_ulong_insert_bit ( x, b, y ) returns x with bit b set to y. U.B. if b is not in [0,64) and/or y is not in {0,1}.
19 :
20 : fd_ulong_mask_lsb ( n ) returns the ulong bits [0,n) set and all other bits 0. U.B. if n is not in [0,64].
21 : fd_ulong_clear_lsb ( x, n ) returns x with bits [0,n) cleared. U.B. if n is not in [0,64].
22 : fd_ulong_set_lsb ( x, n ) returns x with bits [0,n) set. U.B. if n is not in [0,64].
23 : fd_ulong_flip_lsb ( x, n ) returns x with bits [0,n) flipped. U.B. if n is not in [0,64].
24 : fd_ulong_extract_lsb( x, n ) returns bits [0,n) of x. U.B. if n is not in [0,64].
25 : fd_ulong_insert_lsb ( x, n, y ) returns x with bits [0,n) set to y. U.B. if n is not in [0,64] and/or y is not in [0,2^n).
26 :
27 : fd_ulong_mask ( l, h ) returns the ulong bits [l,h] set and all other bits 0. U.B. if not 0<=l<=h<64.
28 : fd_ulong_clear ( x, l, h ) returns x with bits [l,h] cleared. U.B. if not 0<=l<=h<64.
29 : fd_ulong_set ( x, l, h ) returns x with bits [l,h] set. U.B. if not 0<=l<=h<64.
30 : fd_ulong_flip ( x, l, h ) returns x with bits [l,h] flipped. U.B. if not 0<=l<=h<64.
31 : fd_ulong_extract ( x, l, h ) returns bits [l,h] of x. U.B. if not 0<=l<=h<64.
32 : fd_ulong_insert ( x, l, h, y ) returns x with bits [l,h] set to y.
33             :                                                U.B. if not 0<=l<=h<64 and/or y is not in [0,2^(h-l+1)).
34 :
35 : fd_ulong_lsb ( x ) returns 2^i where i is the index of x's least significant set bit (x = 0 returns 0)
36 : fd_ulong_pop_lsb ( x ) returns x with the least significant set bit cleared (0 returns 0).
37 :
38 : FIXME: CONSIDER HAVING (A,X) INSTEAD OF (X,A)?
39 : fd_ulong_is_aligned ( x, a ) returns 1 if x is an integral multiple of a and 0 otherwise. U.B. if !fd_ulong_is_pow2( a )
40 : fd_ulong_alignment ( x, a ) returns x mod a. U.B. if !fd_ulong_is_pow2( a )
41 : fd_ulong_align_dn ( x, a ) returns x rounded down to the closest multiple of a <= x. U.B. if !fd_ulong_is_pow2( a )
42 : fd_ulong_align_up ( x, a ) returns x rounded up to the closest multiple of a >= x mod 2^64.
43 : U.B. if !fd_ulong_is_pow2( a )
44 :
45             :      fd_ulong_blend      ( m, t, f ) forms a ulong by selecting bits from t where m is 1 and from f where m is 0.
46 : fd_ulong_if ( c, t, f ) returns t if c is 1 and f if c is 0. U.B. if c is not in {0,1}
47 : fd_ulong_store_if ( c, p, v ) if c is non-zero, stores v to *p. Otherwise does nothing.
48 : fd_ulong_abs ( x ) returns |x| as a ulong
49 : fd_ulong_min ( x, y ) returns min(x,y)
50 : fd_ulong_max ( x, y ) returns max(x,y)
51 :
52 : fd_ulong_shift_left ( x, n ) returns x with its bits shifted left n times (n>63 shifts to zero), U.B. if n<0
53 : fd_ulong_shift_right ( x, n ) returns x with its bits shifted right n times (n>63 shifts to zero), U.B. if n<0
54 : fd_ulong_rotate_left ( x, n ) returns x with its bits rotated left n times (negative values rotate right)
55 : fd_ulong_rotate_right( x, n ) returns x with its bits rotated right n times (negative values rotate left)
56 :
57 : fd_ulong_popcnt ( x ) returns the number of bits set in x, in [0,64].
58 : fd_ulong_find_lsb ( x ) returns the index of the least significant bit set in x, in [0,64). U.B. if x is zero.
59 : fd_ulong_find_lsb_w_default( x, d ) returns the index of the least significant bit set in x, in [0,64). d if x is zero.
60 : fd_ulong_find_msb ( x ) returns the index of the most significant bit set in x, in [0,64). U.B. if x is zero.
61 : fd_ulong_find_msb_w_default( x, d ) returns the index of the most significant bit set in x, in [0,64). d if x is zero.
62 : fd_ulong_bswap ( x ) returns x with its bytes swapped
63 : fd_ulong_pow2_up ( x ) returns y mod 2^64 where y is the smallest integer power of 2 >= x. U.B. if x is zero.
64 : (current implementation returns 0 if x==0).
65 : fd_ulong_pow2_dn ( x ) returns the largest integer power of 2 <= x. U.B. if x is zero.
66 : (current implementation returns 1 if x==0).
67 :
68 : Similarly for uchar,ushort,uint,uint128. Note that the signed
69 : versions of shift_left, rotate_left, rotate_right operate on the bit
70 : pattern of the underlying type directly. Signed shift_right is sign
71 : extending while unsigned shift_right is zero padding (such that if x
72 : is negative/non-negative, a large magnitude shift will shift to
73 : -1/0).
74 :
75 : Support for zig-zag encoding is also provided. E.g.
76 :
77             :      fd_long_zz_enc( x ) zig-zag encodes the long x and returns the result as a ulong.
78             :      fd_long_zz_dec( y ) zig-zag decodes the ulong y and returns the result as a long.
79 :
80 : zig-zag encoding losslessly maps a signed integer to an unsigned
81 : integer such that, if the magnitude of the signed integer was small,
82 : the magnitude of the unsigned integer will be small too.
83 :
84 : Note that, though fd_ulong_if and friends look like a completely
85 : superfluous wrapper for the trinary operator, they have subtly
86 : different linguistic meanings. This seemingly trivial difference can
87 : have profound effects on code generation quality, especially on
88 : modern architectures. For example:
89 :
90 : c ? x[i] : x[j];
91 :
92 : linguistically means, if c is non-zero, load the i-th element of
93 : x. Otherwise, load the j-th element of x. But:
94 :
95 : fd_ulong_if( c, x[i], x[j] )
96 :
97 : means load the i-th element of x _and_ load the j-th element of x
98 : _and_ then select the first value if c is non-zero or the second
99 : value otherwise. Further, it explicitly says that c and the loads
100 : can be evaluated in any order.
101 :
102 : As such, in the trinary case, the compiler will be loathe to do
103 : either load before computing c because the language explicitly says
104 : "don't do that". In the unlikely case it overcomes this barrier,
105 : it then has the difficult job of proving that reordering c with the
106 : loads is safe (e.g. evaluating c might affect the value that would be
107 : loaded if c has side effects). In the unlikely case it overcomes
108 : that barrier, it then has the difficult job of proving it is safe to
109 : do the loads before evaluating c (e.g. c might be protecting against
110 : a load that could seg-fault).
111 :
112 : In the fd_ulong_if case though, the compiler has been explicitly told
113 : up front that the loads are safe and c and the loads can be done in
114 : any order. Now the optimizer finds it a lot easier to optimize
115 : because it isn't accidentally over-constrained and doesn't have to
116 : prove anything. E.g. it can use otherwise unused instruction slots
117 : before the operation to hide load latency while computing c in
118            :      parallel via ILP.  Further, it can then ideally use a conditional
119 : move operation to eliminate unnecessary consumption of BTB resources.
120 : And, if c is known at compile time, the compiler can prune
121 : unnecessary code for the unselected option (e.g. the compiler knows
122 : that omitting an unused normal load has no observable effect in the
123 : machine model).
124 :
125 : Faster, more deterministic, less BTB resources consumed and good
126 : compile time behavior. Everything the trinary operator should have
127 : been, rather than the giant pile of linguistic fail that it is.
128 :
129 : Overall, compilers traditionally are much much better at pruning
130 : unneeded operations than speculating execution (especially for code
131 : paths that a language says not to do and doubly so for code paths
132 : that are not obviously safe in general). And most of this is because
133 : languages are not designed correctly to help developers express their
134 : intent and constraints to the compiler.
135 :
136 : This dynamic has had multi-billion dollar commercial impacts though
137 : the cause-and-effect has gone largely unrecognized.
138 :
139 : At one extreme, a major reason why Itanium failed was languages
140 : didn't have the right constructs and machine models for developers to
141 : "do the right thing". Even given such, most devs wouldn't know to
142 : use them because they were erroneously taught to code to a machine
143 : abstraction that hasn't applied to the real world since the early
144 : 1980s. Compilers then were not able to utilize all the speculative
145 : execution / ILP / ... features that were key to performance on that
146 : architecture. The developer community, not being able to see any
147 : benefits (much less large enough benefits to justify short term
148 : switching costs) and not wanting to write tons of hand-tuned
149 : non-portable ASM kernels, shrugged and Itanium withered away.
150 :
151 : At the other extreme, CUDA gave developers a good GPU abstraction and
152 : extended languages and compilers to make it possible for developers
153 : code to that abstraction (e.g. express locality explicitly instead of
154 : the constant lying-by-omission about the importance of locality that
155 : virtually everything else in tech does ... the speed of light isn't
156 : infinite or even all that fast relative to modern CPUs ... stop
157 : pretending that it is). CUDA enabled GPUs have since thrived in
158 : gaming, high performance computing, machine learning, crypto, etc.
159 : Profits had by all (well, by Nvidia at least).
160 :
161 : TL;DR
162 :
163 : * It'd be laughable if it weren't so pathetic that CPU ISAs and
164 : programming languages usually forget to expose one of the most
165 : fundamental and important digital logic circuits to devs ... the
166 : 2:1 mux.
167 :
168 : * Developers will usually do the right thing if languages let them.
169 :
170 : TODO: mask_msb, clear_msb, set_msb, flip_msb, extract_msb,
171 : insert_msb, bitrev, sign, copysign, flipsign, rounding right shift,
172 : ... */
173 :
/* FD_SRC_UTIL_BITS_FD_BITS_IMPL(T,w) instantiates the unsigned bit
   manipulation API described in the header comment above for unsigned
   type T of bit width w.  Implementation notes:
   - pow2 / mask_lsb use ((T)(n<w))<<(n&(w-1)) so that n==w does not
     shift by the full type width (which would be UB); the (n<w) factor
     zeros the result in that case.
   - is_aligned / alignment / align_dn / align_up exploit that a is a
     power of 2 (a-1 is the low-bit mask).
   - The ternaries in if / min / max / shift_* are expected to compile
     to cmovs (branchless), as noted inline.
   - store_if writes to a dummy stack temp when c is zero so the store
     itself is unconditional (cmov selects the destination). */

#define FD_SRC_UTIL_BITS_FD_BITS_IMPL(T,w)                                                                                        \
FD_FN_CONST static inline int  fd_##T##_is_pow2     ( T x               ) { return (!!x) & (!(x & (x-(T)1)));                   } \
FD_FN_CONST static inline T    fd_##T##_pow2        ( int n             ) { return (T)(((T)(n<w))<<(n&(w-1)));                  } \
FD_FN_CONST static inline T    fd_##T##_mask_bit    ( int b             ) { return (T)(((T)1)<<b);                              } \
FD_FN_CONST static inline T    fd_##T##_clear_bit   ( T x, int b        ) { return (T)(x & ~fd_##T##_mask_bit(b));              } \
FD_FN_CONST static inline T    fd_##T##_set_bit     ( T x, int b        ) { return (T)(x | fd_##T##_mask_bit(b));               } \
FD_FN_CONST static inline T    fd_##T##_flip_bit    ( T x, int b        ) { return (T)(x ^ fd_##T##_mask_bit(b));               } \
FD_FN_CONST static inline int  fd_##T##_extract_bit ( T x, int b        ) { return (int)((x>>b) & (T)1);                        } \
FD_FN_CONST static inline T    fd_##T##_insert_bit  ( T x, int b, int y ) { return (T)((x & ~fd_##T##_mask_bit(b))|(((T)y)<<b)); } \
FD_FN_CONST static inline T    fd_##T##_mask_lsb    ( int n             ) { return (T)((((T)(n<w))<<(n&(w-1)))-((T)1));         } \
FD_FN_CONST static inline T    fd_##T##_clear_lsb   ( T x, int n        ) { return (T)(x & ~fd_##T##_mask_lsb(n));              } \
FD_FN_CONST static inline T    fd_##T##_set_lsb     ( T x, int n        ) { return (T)(x | fd_##T##_mask_lsb(n));               } \
FD_FN_CONST static inline T    fd_##T##_flip_lsb    ( T x, int n        ) { return (T)(x ^ fd_##T##_mask_lsb(n));               } \
FD_FN_CONST static inline T    fd_##T##_extract_lsb ( T x, int n        ) { return (T)(x & fd_##T##_mask_lsb(n));               } \
FD_FN_CONST static inline T    fd_##T##_insert_lsb  ( T x, int n, T y   ) { return (T)(fd_##T##_clear_lsb(x,n) | y);            } \
FD_FN_CONST static inline T    fd_##T##_mask        ( int l, int h      ) { return (T)( fd_##T##_mask_lsb(h-l+1) << l );        } \
FD_FN_CONST static inline T    fd_##T##_clear       ( T x, int l, int h ) { return (T)(x & ~fd_##T##_mask(l,h));                } \
FD_FN_CONST static inline T    fd_##T##_set         ( T x, int l, int h ) { return (T)(x | fd_##T##_mask(l,h));                 } \
FD_FN_CONST static inline T    fd_##T##_flip        ( T x, int l, int h ) { return (T)(x ^ fd_##T##_mask(l,h));                 } \
FD_FN_CONST static inline T    fd_##T##_extract     ( T x, int l, int h ) { return (T)( (x>>l) & fd_##T##_mask_lsb(h-l+1) );    } \
FD_FN_CONST static inline T    fd_##T##_insert      ( T x, int l, int h, T y ) { return (T)(fd_##T##_clear(x,l,h) | (y<<l));    } \
FD_FN_CONST static inline T    fd_##T##_lsb         ( T x               ) { return (T)(x ^ (x & (x-(T)1)));                     } \
FD_FN_CONST static inline T    fd_##T##_pop_lsb     ( T x               ) { return (T)(x & (x-(T)1));                           } \
FD_FN_CONST static inline int  fd_##T##_is_aligned  ( T x, T a          ) { a--; return !(x & a);                               } \
FD_FN_CONST static inline T    fd_##T##_alignment   ( T x, T a          ) { a--; return (T)( x &  a);                           } \
FD_FN_CONST static inline T    fd_##T##_align_dn    ( T x, T a          ) { a--; return (T)( x & ~a);                           } \
FD_FN_CONST static inline T    fd_##T##_align_up    ( T x, T a          ) { a--; return (T)((x+a) & ~a);                        } \
FD_FN_CONST static inline T    fd_##T##_blend       ( T m, T t, T f     ) { return (T)((t & m) | (f & ~m));                     } \
FD_FN_CONST static inline T    fd_##T##_if          ( int c, T t, T f   ) { return c ? t : f; /* cmov */                        } \
/* */       static inline void fd_##T##_store_if    ( int c, T * p, T v ) { T _[ 1 ]; *( c ? p : _ ) = v; /* cmov */            } \
FD_FN_CONST static inline T    fd_##T##_abs         ( T x               ) { return x;                                           } \
FD_FN_CONST static inline T    fd_##T##_min         ( T x, T y          ) { return (x<y) ? x : y; /* cmov */                    } \
FD_FN_CONST static inline T    fd_##T##_max         ( T x, T y          ) { return (x>y) ? x : y; /* cmov */                    } \
FD_FN_CONST static inline T    fd_##T##_shift_left  ( T x, int n        ) { return (T)(((n>(w-1)) ? ((T)0) : x) << (n&(w-1)));  } \
FD_FN_CONST static inline T    fd_##T##_shift_right ( T x, int n        ) { return (T)(((n>(w-1)) ? ((T)0) : x) >> (n&(w-1)));  } \
FD_FN_CONST static inline T    fd_##T##_rotate_left ( T x, int n        ) { return (T)((x << (n&(w-1))) | (x >> ((-n)&(w-1)))); } \
FD_FN_CONST static inline T    fd_##T##_rotate_right( T x, int n        ) { return (T)((x >> (n&(w-1))) | (x << ((-n)&(w-1)))); }

FD_SRC_UTIL_BITS_FD_BITS_IMPL(uchar,   8)
FD_SRC_UTIL_BITS_FD_BITS_IMPL(ushort, 16)
FD_SRC_UTIL_BITS_FD_BITS_IMPL(uint,   32)
FD_SRC_UTIL_BITS_FD_BITS_IMPL(ulong,  64)

#if FD_HAS_INT128 /* FIXME: These probably could benefit from x86 specializations */
FD_SRC_UTIL_BITS_FD_BITS_IMPL(uint128,128)
#endif

#undef FD_SRC_UTIL_BITS_FD_BITS_IMPL
222 :
/* Population counts (number of set bits).  The narrow types are
   zero-extended to uint, so the 32-bit builtin suffices; ulong uses
   the 64-bit builtin. */

FD_FN_CONST static inline int fd_uchar_popcnt ( uchar  x ) { return __builtin_popcount ( (uint)x ); }
FD_FN_CONST static inline int fd_ushort_popcnt( ushort x ) { return __builtin_popcount ( (uint)x ); }
FD_FN_CONST static inline int fd_uint_popcnt  ( uint   x ) { return __builtin_popcount ( x ); }
FD_FN_CONST static inline int fd_ulong_popcnt ( ulong  x ) { return __builtin_popcountl( x ); }
227 :
#if FD_HAS_INT128
/* Population count of a 128-bit value: the sum of the popcounts of
   its two 64-bit halves. */
FD_FN_CONST static inline int
fd_uint128_popcnt( uint128 x ) {
  ulong lo = (ulong) x;
  ulong hi = (ulong)(x>>64);
  return __builtin_popcountl( lo ) + __builtin_popcountl( hi );
}
#endif
234 :
235 : #include "fd_bits_find_lsb.h" /* Provides find_lsb_w_default too */
236 : #include "fd_bits_find_msb.h" /* Provides find_msb_w_default too */ /* Note that find_msb==floor( log2( x ) ) for non-zero x */
237 :
/* Byte swaps (endianness reversal).  uchar is a single byte, hence
   the identity. */

FD_FN_CONST static inline uchar  fd_uchar_bswap ( uchar  x ) { return x; }
FD_FN_CONST static inline ushort fd_ushort_bswap( ushort x ) { return __builtin_bswap16( x ); }
FD_FN_CONST static inline uint   fd_uint_bswap  ( uint   x ) { return __builtin_bswap32( x ); }
FD_FN_CONST static inline ulong  fd_ulong_bswap ( ulong  x ) { return __builtin_bswap64( x ); }
242 :
#if FD_HAS_INT128
/* 128-bit byte swap: byte-swap each 64-bit half and exchange the
   halves. */
FD_FN_CONST static inline uint128
fd_uint128_bswap( uint128 x ) {
  uint128 lo_swapped = (uint128)fd_ulong_bswap( (ulong) x       );
  uint128 hi_swapped = (uint128)fd_ulong_bswap( (ulong)(x>>64) );
  return (lo_swapped<<64) | hi_swapped;
}
#endif
251 :
252 : /* FIXME: consider find_msb based solution (probably not as the combination
253 : of FD_FN_CONST and the use of inline asm for find_msb on some targets
254 : is probably less than ideal). */
255 :
256 : FD_FN_CONST static inline uchar
257 111 : fd_uchar_pow2_up( uchar _x ) {
258 111 : uint x = (uint)_x;
259 111 : x--;
260 111 : x |= (x>> 1);
261 111 : x |= (x>> 2);
262 111 : x |= (x>> 4);
263 111 : x++;
264 111 : return (uchar)x;
265 111 : }
266 :
267 : FD_FN_CONST static inline uchar
268 111 : fd_uchar_pow2_dn( uchar _x ) {
269 111 : uint x = (uint)_x;
270 111 : x >>= 1;
271 111 : x |= (x>> 1);
272 111 : x |= (x>> 2);
273 111 : x |= (x>> 4);
274 111 : x++;
275 111 : return (uchar)x;
276 111 : }
277 :
278 : FD_FN_CONST static inline ushort
279 411 : fd_ushort_pow2_up( ushort _x ) {
280 411 : uint x = (uint)_x;
281 411 : x--;
282 411 : x |= (x>> 1);
283 411 : x |= (x>> 2);
284 411 : x |= (x>> 4);
285 411 : x |= (x>> 8);
286 411 : x++;
287 411 : return (ushort)x;
288 411 : }
289 :
290 : FD_FN_CONST static inline ushort
291 411 : fd_ushort_pow2_dn( ushort _x ) {
292 411 : uint x = (uint)_x;
293 411 : x >>= 1;
294 411 : x |= (x>> 1);
295 411 : x |= (x>> 2);
296 411 : x |= (x>> 4);
297 411 : x |= (x>> 8);
298 411 : x++;
299 411 : return (ushort)x;
300 411 : }
301 :
302 : FD_FN_CONST static inline uint
303 1587 : fd_uint_pow2_up( uint x ) {
304 1587 : x--;
305 1587 : x |= (x>> 1);
306 1587 : x |= (x>> 2);
307 1587 : x |= (x>> 4);
308 1587 : x |= (x>> 8);
309 1587 : x |= (x>>16);
310 1587 : x++;
311 1587 : return x;
312 1587 : }
313 :
314 : FD_FN_CONST static inline uint
315 1587 : fd_uint_pow2_dn( uint x ) {
316 1587 : x >>= 1;
317 1587 : x |= (x>> 1);
318 1587 : x |= (x>> 2);
319 1587 : x |= (x>> 4);
320 1587 : x |= (x>> 8);
321 1587 : x |= (x>>16);
322 1587 : x++;
323 1587 : return x;
324 1587 : }
325 :
326 : FD_FN_CONST static inline ulong
327 21738735 : fd_ulong_pow2_up( ulong x ) {
328 21738735 : x--;
329 21738735 : x |= (x>> 1);
330 21738735 : x |= (x>> 2);
331 21738735 : x |= (x>> 4);
332 21738735 : x |= (x>> 8);
333 21738735 : x |= (x>>16);
334 21738735 : x |= (x>>32);
335 21738735 : x++;
336 21738735 : return x;
337 21738735 : }
338 :
339 : FD_FN_CONST static inline ulong
340 15157425 : fd_ulong_pow2_dn( ulong x ) {
341 15157425 : x >>= 1;
342 15157425 : x |= (x>> 1);
343 15157425 : x |= (x>> 2);
344 15157425 : x |= (x>> 4);
345 15157425 : x |= (x>> 8);
346 15157425 : x |= (x>>16);
347 15157425 : x |= (x>>32);
348 15157425 : x++;
349 15157425 : return x;
350 15157425 : }
351 :
#if FD_HAS_INT128
/* 128-bit variants of pow2_up / pow2_dn via bit smear (seven doubling
   steps cover all 128 bit positions). */

FD_FN_CONST static inline uint128
fd_uint128_pow2_up( uint128 x ) {
  uint128 y = x - (uint128)1;
  y |= (y>> 1);
  y |= (y>> 2);
  y |= (y>> 4);
  y |= (y>> 8);
  y |= (y>>16);
  y |= (y>>32);
  y |= (y>>64);
  return y + (uint128)1;
}

FD_FN_CONST static inline uint128
fd_uint128_pow2_dn( uint128 x ) {
  uint128 y = x >> 1;
  y |= (y>> 1);
  y |= (y>> 2);
  y |= (y>> 4);
  y |= (y>> 8);
  y |= (y>>16);
  y |= (y>>32);
  y |= (y>>64);
  return y + (uint128)1;
}
#endif
381 :
382 : /* Brokenness of indeterminant char sign strikes again ... sigh. We
383 : explicitly provide the unsigned variant of the token to these
384 : macros because the uchar token is not related to the schar token
385 : by simply prepending u to schar. */
386 :
387 : /* Note: the implementations of abs, right_shift and zz_enc below do not
388 : exploit the sign extending right shift behavior specified by the
389 : machine model (and thus can be used safely in more general machine
390 : models) but are slightly more expensive.
391 :
392 : FD_FN_CONST static inline UT fd_##T##_abs( T x ) { UT u = (UT)x; UT m = (UT)-(u>>(w-1)); return (UT)((u+m)^m); }
393 :
394 : FD_FN_CONST static inline T
395 : fd_##T##_shift_right( T x,
396 : int n ) {
397 : UT u = (UT)x;
398 : UT m = (UT)-(u >> (w-1));
399 : return (T)(fd_##UT##_shift_right( u ^ m, n ) ^ m);
400 : }
401 :
402 : FD_FN_CONST static inline UT fd_##T##_zz_enc( T x ) { UT u = (UT)x; return (UT)((-(u>>(w-1))) ^ (u<<1)); }
403 : */
404 :
/* FD_SRC_UTIL_BITS_FD_BITS_IMPL(T,UT,w) instantiates the signed-type
   API for signed type T, its unsigned counterpart UT and bit width w.
   Per the note above, abs, shift_right and zz_enc below DO rely on the
   machine model's sign-extending right shift of signed values (the
   portable alternatives are sketched in the comment block above).
   shift_left / rotate_left / rotate_right operate on the underlying
   bit pattern by bouncing through the unsigned implementation. */

#define FD_SRC_UTIL_BITS_FD_BITS_IMPL(T,UT,w)                                                                                     \
FD_FN_CONST static inline T    fd_##T##_if          ( int c, T t, T f   ) { return c ? t : f; /* cmov */                        } \
/* */       static inline void fd_##T##_store_if    ( int c, T * p, T v ) { T _[ 1 ]; *( c ? p : _ ) = v; /* cmov */            } \
FD_FN_CONST static inline UT   fd_##T##_abs         ( T x               ) { UT m = (UT)(x >> (w-1)); return (UT)((((UT)x)+m)^m); } \
FD_FN_CONST static inline T    fd_##T##_min         ( T x, T y          ) { return (x<=y) ? x : y; /* cmov */                   } \
FD_FN_CONST static inline T    fd_##T##_max         ( T x, T y          ) { return (x>=y) ? x : y; /* cmov */                   } \
FD_FN_CONST static inline T    fd_##T##_shift_left  ( T x, int n        ) { return (T)fd_##UT##_shift_left ( (UT)x, n );        } \
FD_FN_CONST static inline T    fd_##T##_shift_right ( T x, int n        ) { return (T)(x >> ((n>(w-1)) ? (w-1) : n)); /* cmov */ } \
FD_FN_CONST static inline T    fd_##T##_rotate_left ( T x, int n        ) { return (T)fd_##UT##_rotate_left ( (UT)x, n );       } \
FD_FN_CONST static inline T    fd_##T##_rotate_right( T x, int n        ) { return (T)fd_##UT##_rotate_right( (UT)x, n );       } \
FD_FN_CONST static inline UT   fd_##T##_zz_enc      ( T x               ) { return (UT)(((UT)(x>>(w-1))) ^ (((UT)x)<<1));       } \
FD_FN_CONST static inline T    fd_##T##_zz_dec      ( UT x              ) { return (T)((x>>1) ^ (-(x & (UT)1)));                }

FD_SRC_UTIL_BITS_FD_BITS_IMPL(schar, uchar,   8)
FD_SRC_UTIL_BITS_FD_BITS_IMPL(short, ushort, 16)
FD_SRC_UTIL_BITS_FD_BITS_IMPL(int,   uint,   32)
FD_SRC_UTIL_BITS_FD_BITS_IMPL(long,  ulong,  64)
#if FD_HAS_INT128
FD_SRC_UTIL_BITS_FD_BITS_IMPL(int128,uint128,128)
#endif

#undef FD_SRC_UTIL_BITS_FD_BITS_IMPL
427 :
428 : /* Brokenness of indeterminant char sign strikes again ... sigh. We
429 : can't provide a char_min/char_max between platforms as they don't
430 : necessarily produce the same results. Likewise, we don't provide a
431 : fd_char_abs because it will not produce equivalent results between
432 : platforms. But it is useful to have a char_if and char_store_if to
433 : help with making branchless string operation implementations. */
434 :
/* Branchless select / conditional store for char (only these two are
   provided for char; see the note on char signedness above). */

FD_FN_CONST static inline char fd_char_if( int c, char t, char f ) { return c ? t : f; }

static inline void fd_char_store_if( int c, char * p, char v ) { char _[ 1 ]; *( c ? p : _ ) = v; /* cmov */ }
438 :
439 : /* FIXME: ADD HASHING PAIRS FOR UCHAR AND USHORT? */
440 :
441 : /* High quality (full avalanche) high speed integer to integer hashing.
442 : fd_uint_hash has the properties that [0,2^32) hashes to a random
443 : looking permutation of [0,2^32) and hash(0)==0. Similarly for
444 : fd_ulong_hash. Based on Google's Murmur3 hash finalizer (public
445 : domain). Not cryptographically secure but passes various strict
446 : tests of randomness when used as a PRNG. */
447 :
/* Murmur3 fmix32 finalizer: three xor-shifts interleaved with two odd
   multiplies give full avalanche.  Bijective on [0,2^32) with
   hash(0)==0. */
FD_FN_CONST static inline uint
fd_uint_hash( uint x ) {
  x ^= x >> 16;
  x *= 0x85ebca6bU;
  x ^= x >> 13;
  x *= 0xc2b2ae35U;
  x ^= x >> 16;
  return x;
}

/* Murmur3 fmix64 finalizer: 64-bit analogue of the above.  Bijective
   on [0,2^64) with hash(0)==0. */
FD_FN_CONST static inline ulong
fd_ulong_hash( ulong x ) {
  x ^= x >> 33;
  x *= 0xff51afd7ed558ccdUL;
  x ^= x >> 33;
  x *= 0xc4ceb9fe1a85ec53UL;
  x ^= x >> 33;
  return x;
}
467 :
468 : /* Inverses of the above. E.g.:
469 : fd_uint_hash_inverse( fd_uint_hash( (uint)x ) )==(uint)x
470 : and:
471 : fd_uint_hash( fd_uint_hash_inverse( (uint)x ) )==(uint)x
472 : Similarly for fd_ulong_hash_inverse. These by themselves are similar
473 : quality hashes to the above and having the inverses of the above can
474 : be useful. The fact these have (nearly) identical operations /
475 : operation counts concretely demonstrates that none of these are
476 : standalone cryptographically secure. */
477 :
/* Inverse of fd_uint_hash.  Each multiply uses the modular inverse of
   the corresponding forward constant mod 2^32.  A shift-by-s xor is
   its own inverse when 2*s >= width; the 13-bit shift needs the extra
   (x>>26) term to fully invert. */
FD_FN_CONST static inline uint
fd_uint_hash_inverse( uint x ) {
  x ^= x >> 16;
  x *= 0x7ed1b41dU;
  x ^= (x >> 13) ^ (x >> 26);
  x *= 0xa5cb9243U;
  x ^= x >> 16;
  return x;
}

/* Inverse of fd_ulong_hash (multiplies are the mod 2^64 inverses of
   the forward constants; 33-bit xor-shifts are self-inverse at 64-bit
   width). */
FD_FN_CONST static inline ulong
fd_ulong_hash_inverse( ulong x ) {
  x ^= x >> 33;
  x *= 0x9cb4b2f8129337dbUL;
  x ^= x >> 33;
  x *= 0x4f74430c22a54005UL;
  x ^= x >> 33;
  return x;
}
497 :
498 : /* fd_ulong_base10_dig_cnt returns the number of digits in the base 10
499 : representation of x. FIXME: USE BETTER ALGO. */
500 :
/* fd_T_base10_dig_cnt( x ) counts the base-10 digits of x (always at
   least 1) by walking successive powers of 10.  M caps the iteration
   at the widest possible value of T, which also keeps thresh from
   overflowing ulong for the ulong instantiation. */
#define FD_SRC_UTIL_BITS_FD_BITS_IMPL(T,M)       \
FD_FN_CONST static inline ulong                  \
fd_##T##_base10_dig_cnt( T _x ) {                \
  ulong x      = (ulong)_x;                      \
  ulong cnt    = 1UL;                            \
  ulong thresh = 10UL;                           \
  do {                                           \
    if( FD_LIKELY( x<thresh ) ) break;           \
    cnt++;                                       \
    thresh *= 10UL;                              \
  } while( FD_LIKELY( cnt<M ) );                 \
  return cnt;                                    \
}

FD_SRC_UTIL_BITS_FD_BITS_IMPL(uchar,   3UL) /*                  255 ->  3 dig */
FD_SRC_UTIL_BITS_FD_BITS_IMPL(ushort,  5UL) /*                65535 ->  5 dig */
FD_SRC_UTIL_BITS_FD_BITS_IMPL(uint,   10UL) /*           4294967295 -> 10 dig */
FD_SRC_UTIL_BITS_FD_BITS_IMPL(ulong,  20UL) /* 18446744073709551615 -> 20 dig */

#undef FD_SRC_UTIL_BITS_FD_BITS_IMPL
521 :
522 : /* fd_float_if, fd_float_store_if, fd_float_abs are described above.
523 : Ideally, the system will implement fd_float_abs by just clearing the
524 : sign bit. fd_float_eq tests to floating point values for whether or
525 : not their bit representations are identical. Useful when IEEE
526 : handling of equality with +/-0 or nan are not desired (e.g. can test
527 : if nans have different signs or syndromes). */
528 :
FD_FN_CONST static inline float fd_float_if      ( int c, float t, float f ) { return c ? t : f; }
/* */       static inline void  fd_float_store_if( int c, float * p, float v ) { float _[ 1 ]; *( c ? p : _ ) = v; }
FD_FN_CONST static inline float fd_float_abs     ( float x ) { return __builtin_fabsf( x ); }
/* Bitwise float equality via a union type pun: +0.f and -0.f compare
   unequal, and NaNs compare equal only when sign/payload bits match
   exactly (unlike IEEE ==). */
FD_FN_CONST static inline int
fd_float_eq( float x,
             float y ) {
  union { float f; uint u; } tx, ty;
  tx.f = x;
  ty.f = y;
  return tx.u==ty.u;
}
540 :
541 : /* fd_double_if, fd_double_store_if, fd_double_abs and fd_double_eq are
542 : double precision versions of the above. */
543 :
#if FD_HAS_DOUBLE
FD_FN_CONST static inline double fd_double_if      ( int c, double t, double f ) { return c ? t : f; }
/* */       static inline void   fd_double_store_if( int c, double * p, double v ) { double _[ 1 ]; *( c ? p : _ ) = v; }
FD_FN_CONST static inline double fd_double_abs     ( double x ) { return __builtin_fabs( x ); }
/* Bitwise double equality via a union type pun (same caveats re +/-0
   and NaN as fd_float_eq). */
FD_FN_CONST static inline int
fd_double_eq( double x,
              double y ) {
  union { double f; ulong u; } tx, ty;
  tx.f = x;
  ty.f = y;
  return tx.u==ty.u;
}
#endif
557 :
558 : /* fd_swap swaps the values in a and b. Assumes a and b have the same
559 : plain-old-data type. Best if a and b are primitive types (e.g.
560 : ulong). This macro is robust (it evaluates a and b the minimal
561            :      number of times).  Note that compilers are pretty good about identifying
562 : swap operations and replacing them with more optimal assembly for
563 : types and architectures where alternative implementations (like xor
564 : tricks) might be faster. */
565 :
566 658066659 : #define fd_swap(a,b) do { __typeof__((a)) _fd_swap_tmp = (a); (a) = (b); (b) = _fd_swap_tmp; } while(0)
567 :
568 : /* fd_swap_if swaps the values in a and b if c is non-zero and leaves
569 : them unchanged otherwise. Assumes a and b have the same
570 : plain-old-data type. Best if a and b are primitive types (e.g.
571 : ulong) as the compiler will likely replace the trinaries below with
572 : cmovs, making this branchless. This macro is robust (it evaluates a,
573 : b and c the minimal number of times). */
574 :
/* Robust: c, a and b are each evaluated exactly once; the two
   trinaries are expected to compile to cmovs (branchless). */
#define fd_swap_if(c,a,b) do {                                       \
    int             _fd_swap_if_c = (c);                             \
    __typeof__((a)) _fd_swap_if_a = (a);                             \
    __typeof__((b)) _fd_swap_if_b = (b);                             \
    (a) = _fd_swap_if_c ? _fd_swap_if_b : _fd_swap_if_a; /* cmov */  \
    (b) = _fd_swap_if_c ? _fd_swap_if_a : _fd_swap_if_b; /* cmov */  \
  } while(0)
582 :
583 : /* fd_ptr_if is a generic version of the above for pointers. This macro
584 : is robust.
585 :
586 : IMPORTANT SAFETY TIP! The output type is the type of t. Thus, if
587 : the condition is false and t and f have different pointer types
588 : (which is inherently gross and we might want to consider outright
589 : rejecting in the future via, unfortunately non-standard, compiler
590 : builtins), this would implicitly cast the pointer f to the type of t.
591 : As such, strict aliasing rules in the language also imply mixed usage
592 : cases need to be wrapped in a fd_type_pun. In short, mixing pointer
593 : types between t and f is strongly discouraged. */
594 :
#if FD_HAS_UBSAN
/* NOTE(review): under UBSAN a plain ternary is used instead of
   fd_ulong_if — presumably to keep the pointer<->ulong round trip out
   of the instrumented helper; confirm rationale. */
#define fd_ptr_if(c,t,f) ((__typeof__((t)))( (c) ? (ulong)(t) : (ulong)(f) ))
#else
#define fd_ptr_if(c,t,f) ((__typeof__((t)))fd_ulong_if( (c), (ulong)(t), (ulong)(f) ))
#endif
600 :
601 : /* FD_ULONG_{MASK_LSB,MASK_MSB,ALIGN_UP} are the same as
602 : fd_ulong_{mask_lsb,mask_msb,align_up} but can be used at compile
603 : time. The tradeoff is n/a must be safe against multiple evaluation
604 : at compile time. x should be ulong compatible and n/a should be int
605 : compatible. */
606 :
/* MASK_LSB: the ((n)<=63) factor zeros the shifted value when n==64
   (avoiding a full-width shift, which would be UB) so the -1UL then
   wraps to all ones, the correct 64-bit mask. */
#define FD_ULONG_MASK_LSB( n )    ((((ulong)((n)<=63)) << ((n) & 63)) - 1UL)
#define FD_ULONG_MASK_MSB( n )    (~FD_ULONG_MASK_LSB(64-(n)))
/* Assumes a is a positive integral power of 2 (a-1 is the low-bit mask). */
#define FD_ULONG_ALIGN_UP( x, a ) (((x)+((a)-1UL)) & (~((a)-1UL)))
610 :
611 : /* Unaligned access annotations.
612 :
613 : FD_LOAD( T, src ) is equivalent to:
614 : return (*(T const *)(src))
615 : but src can have arbitrary alignment.
616 :
617 : FD_STORE( T, dst, val ) is equivalent to:
618 : T * ptr = (T *)(dst);
619 : *ptr = (val);
620 : return ptr
621 : but dst can have arbitrary alignment.
622 :
623 : Note: Ideally, we would infer the type T in FD_LOAD from src (e.g.
624 : use typeof(*(src)). But there are some nasty linguistic and
625 : optimizer interactions when src is a constant pointer in a truly
626 : generic implementation. Similarly for FD_STORE.
627 :
628 : fd_T_load_n( src ) where T is in [uchar,ushort,uint,ulong] loads n
629 : bytes into the least significant n bytes of a T, zeros any remaining
630 : bytes and returns the result. fd_T_load_n_fast is the same but
631 : assumes it is safe to tail read a couple of bytes past the end of src
632 : if such is beneficial for higher performance.
633 :
634 : Accesses that would normally be atomic (e.g. an aligned access to a
635 : primitive type like a ulong) are not guaranteed to be atomic if done
636 : through these annotations. */
637 :
638 : #ifndef FD_UNALIGNED_ACCESS_STYLE
639 : #if FD_HAS_X86
640 : #define FD_UNALIGNED_ACCESS_STYLE 0 /* 1 is broken ... */
641 : #else
642 : #define FD_UNALIGNED_ACCESS_STYLE 0
643 : #endif
644 : #endif
645 :
646 : #if FD_UNALIGNED_ACCESS_STYLE==0 /* memcpy elision based */
647 :
648 : /* This implementation does not assume it is safe to access unaligned
649 : memory directly (and thus can be used on platforms outside the
650 : development environment's machine model) but it does still assume
651 : little endian byte ordering.
652 :
653 : It is based on memcpy and, in principle, the compiler should elide
654 : the memcpy and replace this with optimized asm on platforms where
655 : this is safe (which is virtually all commercially viable platforms as
656 : packet processing deal heavily with unaligned accesses and virtually
657 : all platforms are near universally networked and networking needs to
658 : do packet processing efficiently). But this fails often enough in
659 : practice that this should not be relied upon, especially if
660 : performance is important as performance is glacial when the compiler
661 : mucks up. (fd_memcpy is an especially bad idea here because the
662 : risk of the compiler mucking it up is much greater.)
663 :
664 : It is also more than little bizarre that this is an encouraged
665 : practice nowadays. That is, practically, we are using a low level
666 : language (C/C++) that have language constructs (dereference a
667 : pointer to a primitive type) that map directly onto low level
668 : hardware operations (asm load operation) that are actually supported
669 : by the target hardware here (fine if pointer is not aligned to
670 : width of the type).
671 :
672 : But instead of encouraging developers to write short, readable,
673 : library-independent code that generates fast and ultra compact asm,
674 : they are encouraged to write long, obtuse, library-dependent code
675 : that naively would generate slow bloated asm in hopes the compiler
676 : will realize can be turned into the simple implementation and turn it
677 : back into the developer's original intent and then generate good asm.
678 : Hmmm. */
679 :
/* memcpy with a compile time constant size: the compiler is expected
   to elide the call and emit a single unaligned-tolerant load / store
   (see the discussion above for caveats).  FD_STORE evaluates to a
   pointer to the destination (memcpy returns its first argument). */
#define FD_LOAD( T, src ) \
  (__extension__({ T _fd_load_tmp; memcpy( &_fd_load_tmp, (T const *)(src), sizeof(T) ); _fd_load_tmp; }))

#define FD_STORE( T, dst, val ) \
  (__extension__({ T _fd_store_tmp = (val); (T *)memcpy( (T *)(dst), &_fd_store_tmp, sizeof(T) ); }))
685 :
686 12288 : FD_FN_PURE static inline uchar fd_uchar_load_1 ( void const * p ) { return *(uchar const *)p; }
687 :
688 12282 : FD_FN_PURE static inline ushort fd_ushort_load_1 ( void const * p ) { return (ushort)*(uchar const *)p; }
689 12282 : FD_FN_PURE static inline ushort fd_ushort_load_2 ( void const * p ) { ushort t; memcpy( &t, p, 2UL ); return t; }
690 :
691 12270 : FD_FN_PURE static inline uint fd_uint_load_1 ( void const * p ) { return (uint )*(uchar const *)p; }
692 12270 : FD_FN_PURE static inline uint fd_uint_load_2 ( void const * p ) { ushort t; memcpy( &t, p, 2UL ); return (uint )t; }
693 30243 : FD_FN_PURE static inline uint fd_uint_load_3 ( void const * p ) { uint t = 0UL; memcpy( &t, p, 3UL ); return (uint )t; }
694 161021151 : FD_FN_PURE static inline uint fd_uint_load_4 ( void const * p ) { uint t; memcpy( &t, p, 4UL ); return t; }
695 :
696 2790050589 : FD_FN_PURE static inline ulong fd_ulong_load_1 ( void const * p ) { return (ulong )*(uchar const *)p; }
697 1383821027 : FD_FN_PURE static inline ulong fd_ulong_load_2 ( void const * p ) { ushort t; memcpy( &t, p, 2UL ); return (ulong)t; }
698 6123 : FD_FN_PURE static inline ulong fd_ulong_load_3 ( void const * p ) { uint t = 0UL; memcpy( &t, p, 3UL ); return (ulong)t; }
699 66184270 : FD_FN_PURE static inline ulong fd_ulong_load_4 ( void const * p ) { uint t; memcpy( &t, p, 4UL ); return (ulong)t; }
700 6123 : FD_FN_PURE static inline ulong fd_ulong_load_5 ( void const * p ) { ulong t = 0UL; memcpy( &t, p, 5UL ); return t; }
701 6141 : FD_FN_PURE static inline ulong fd_ulong_load_6 ( void const * p ) { ulong t = 0UL; memcpy( &t, p, 6UL ); return t; }
702 6123 : FD_FN_PURE static inline ulong fd_ulong_load_7 ( void const * p ) { ulong t = 0UL; memcpy( &t, p, 7UL ); return t; }
703 2180937362 : FD_FN_PURE static inline ulong fd_ulong_load_8 ( void const * p ) { ulong t; memcpy( &t, p, 8UL ); return t; }
704 :
/* "fast" variants may tail read a few bytes past the end of src; exact
   width loads just alias the exact implementations. */

#define fd_uchar_load_1_fast fd_uchar_load_1

#define fd_ushort_load_1_fast fd_ushort_load_1
#define fd_ushort_load_2_fast fd_ushort_load_2

#define fd_uint_load_1_fast fd_uint_load_1
#define fd_uint_load_2_fast fd_uint_load_2
/* Reads 4 bytes (tail read 1B) and masks off the extra high byte. */
FD_FN_PURE static inline uint fd_uint_load_3_fast ( void const * p ) { uint t; memcpy( &t, p, 4UL ); return ((uint )t) & 0x00ffffffU; }
#define fd_uint_load_4_fast fd_uint_load_4

#define fd_ulong_load_1_fast fd_ulong_load_1
#define fd_ulong_load_2_fast fd_ulong_load_2
/* Reads 4 bytes (tail read 1B). */
FD_FN_PURE static inline ulong fd_ulong_load_3_fast ( void const * p ) { uint t; memcpy( &t, p, 4UL ); return ((ulong)t) & 0x0000000000ffffffUL; }
#define fd_ulong_load_4_fast fd_ulong_load_4
/* The below read 8 bytes (tail read 3B / 2B / 1B respectively). */
FD_FN_PURE static inline ulong fd_ulong_load_5_fast ( void const * p ) { ulong t; memcpy( &t, p, 8UL ); return t & 0x000000ffffffffffUL; }
FD_FN_PURE static inline ulong fd_ulong_load_6_fast ( void const * p ) { ulong t; memcpy( &t, p, 8UL ); return t & 0x0000ffffffffffffUL; }
FD_FN_PURE static inline ulong fd_ulong_load_7_fast ( void const * p ) { ulong t; memcpy( &t, p, 8UL ); return t & 0x00ffffffffffffffUL; }
#define fd_ulong_load_8_fast fd_ulong_load_8
723 :
724 : #elif FD_UNALIGNED_ACCESS_STYLE==1 /* direct access */
725 :
/* Direct dereference load.  Fix: the temporary was confusingly named
   _fd_store_tmp inside a LOAD macro; rename to _fd_load_tmp so it
   matches the style-0 FD_LOAD and cannot be confused with (or appear
   to shadow) FD_STORE's temporary when nested. */
#define FD_LOAD( T, src ) (__extension__({                            \
    T const * _fd_load_tmp = (T const *)(src);                        \
    FD_COMPILER_FORGET( _fd_load_tmp );                               \
    *_fd_load_tmp;                                                    \
  }))
731 :
/* Direct dereference store.  NOTE(review): FD_COMPILER_MFENCE is
   presumably here to keep the optimizer from reordering the unaligned
   store -- confirm intent before touching. */
#define FD_STORE( T, dst, val ) (__extension__({                      \
    T * _fd_store_tmp = (T *)fd_type_pun( (void *)(dst) );            \
    *_fd_store_tmp = (val);                                           \
    FD_COMPILER_MFENCE();                                             \
    _fd_store_tmp;                                                    \
  }))

/* Odd widths are synthesized from narrower loads and shifts (little
   endian).  FD_COMPILER_FORGET blocks the compiler from "optimizing"
   these into forms that assume alignment. */

FD_FN_PURE static inline uchar fd_uchar_load_1 ( void const * p ) { FD_COMPILER_FORGET( p ) ; return ( *(uchar const *)p); }

FD_FN_PURE static inline ushort fd_ushort_load_1 ( void const * p ) { FD_COMPILER_FORGET( p ); return ((ushort)*(uchar const *)p); }
FD_FN_PURE static inline ushort fd_ushort_load_2 ( void const * p ) { FD_COMPILER_FORGET( p ); return ( *(ushort const *)p); }

FD_FN_PURE static inline uint fd_uint_load_1 ( void const * p ) { FD_COMPILER_FORGET( p ); return ((uint )*(uchar const *)p); }
FD_FN_PURE static inline uint fd_uint_load_2 ( void const * p ) { FD_COMPILER_FORGET( p ); return ((uint )*(ushort const *)p); }
FD_FN_PURE static inline uint fd_uint_load_3 ( void const * p ) { FD_COMPILER_FORGET( p ); return fd_uint_load_2 (p) | (fd_uint_load_1 (((uchar const *)p)+2UL)<<16); }
FD_FN_PURE static inline uint fd_uint_load_4 ( void const * p ) { FD_COMPILER_FORGET( p ); return ( *(uint const *)p); }

FD_FN_PURE static inline ulong fd_ulong_load_1 ( void const * p ) { FD_COMPILER_FORGET( p ); return ((ulong )*(uchar const *)p); }
FD_FN_PURE static inline ulong fd_ulong_load_2 ( void const * p ) { FD_COMPILER_FORGET( p ); return ((ulong )*(ushort const *)p); }
FD_FN_PURE static inline ulong fd_ulong_load_3 ( void const * p ) { FD_COMPILER_FORGET( p ); return fd_ulong_load_2(p) | (fd_ulong_load_1(((uchar const *)p)+2UL)<<16); }
FD_FN_PURE static inline ulong fd_ulong_load_4 ( void const * p ) { FD_COMPILER_FORGET( p ); return ((ulong )*(uint const *)p); }
FD_FN_PURE static inline ulong fd_ulong_load_5 ( void const * p ) { FD_COMPILER_FORGET( p ); return fd_ulong_load_4(p) | (fd_ulong_load_1(((uchar const *)p)+4UL)<<32); }
FD_FN_PURE static inline ulong fd_ulong_load_6 ( void const * p ) { FD_COMPILER_FORGET( p ); return fd_ulong_load_4(p) | (fd_ulong_load_2(((uchar const *)p)+4UL)<<32); }
FD_FN_PURE static inline ulong fd_ulong_load_7 ( void const * p ) { FD_COMPILER_FORGET( p ); return fd_ulong_load_6(p) | (fd_ulong_load_1(((uchar const *)p)+6UL)<<48); }
FD_FN_PURE static inline ulong fd_ulong_load_8 ( void const * p ) { FD_COMPILER_FORGET( p ); return ( *(ulong const *)p); }

#define fd_uchar_load_1_fast fd_uchar_load_1

#define fd_ushort_load_1_fast fd_ushort_load_1
#define fd_ushort_load_2_fast fd_ushort_load_2

#define fd_uint_load_1_fast fd_uint_load_1
#define fd_uint_load_2_fast fd_uint_load_2
FD_FN_PURE static inline uint fd_uint_load_3_fast ( void const * p ) { FD_COMPILER_FORGET( p ); return ( *(uint const *)p) & 0x00ffffffU; } /* Tail read 1B */
#define fd_uint_load_4_fast fd_uint_load_4

#define fd_ulong_load_1_fast fd_ulong_load_1
#define fd_ulong_load_2_fast fd_ulong_load_2
FD_FN_PURE static inline ulong fd_ulong_load_3_fast ( void const * p ) { FD_COMPILER_FORGET( p ); return ((ulong)*(uint const *)p) & 0x0000000000ffffffUL; } /* Tail read 1B */
#define fd_ulong_load_4_fast fd_ulong_load_4
FD_FN_PURE static inline ulong fd_ulong_load_5_fast ( void const * p ) { FD_COMPILER_FORGET( p ); return ( *(ulong const *)p) & 0x000000ffffffffffUL; } /* Tail read 3B */
FD_FN_PURE static inline ulong fd_ulong_load_6_fast ( void const * p ) { FD_COMPILER_FORGET( p ); return ( *(ulong const *)p) & 0x0000ffffffffffffUL; } /* Tail read 2B */
FD_FN_PURE static inline ulong fd_ulong_load_7_fast ( void const * p ) { FD_COMPILER_FORGET( p ); return ( *(ulong const *)p) & 0x00ffffffffffffffUL; } /* Tail read 1B */
#define fd_ulong_load_8_fast fd_ulong_load_8
776 :
777 : #else
778 : #error "Unsupported FD_UNALIGNED_ACCESS_STYLE"
779 : #endif
780 :
781 : /* fd_ulong_svw_enc_sz returns the number of bytes needed to encode
782 : x as a symmetric variable width encoded integer. This is at most
783 : FD_ULONG_SVW_ENC_MAX (9). Result will be in {1,2,3,4,5,8,9}. */
784 :
785 : #define FD_ULONG_SVW_ENC_MAX (9UL) /* For compile time use */
786 :
787 : FD_FN_UNUSED FD_FN_CONST static ulong /* Work around -Winline */
788 746104836 : fd_ulong_svw_enc_sz( ulong x ) {
789 : /* FIXME: CONSIDER FIND_MSB BASED TABLE LOOKUP? */
790 746104836 : if( FD_LIKELY( x<(1UL<< 6) ) ) return 1UL;
791 298580496 : if( FD_LIKELY( x<(1UL<<10) ) ) return 2UL;
792 44062746 : if( FD_LIKELY( x<(1UL<<18) ) ) return 3UL;
793 35686500 : if( FD_LIKELY( x<(1UL<<24) ) ) return 4UL;
794 30959472 : if( FD_LIKELY( x<(1UL<<32) ) ) return 5UL;
795 24571452 : if( FD_LIKELY( x<(1UL<<56) ) ) return 8UL;
796 5695980 : return 9UL;
797 24571452 : }
798 :
/* fd_ulong_svw_enc appends x to the byte stream b as a symmetric
   variable width encoded integer.  b should have room for
   fd_ulong_svw_enc_sz(x) bytes (note that 9 is sufficient for all
   possible x).  Returns the next location in the byte stream. */
803 :
/* Each encoding carries the width tag symmetrically in the lowest and
   highest bits of the encoded bytes (see the trailing comments: tag |
   payload bits | mirrored tag) so the footprint can be recovered
   scanning from either end of the stream. */
FD_FN_UNUSED static uchar * /* Work around -Winline */
fd_ulong_svw_enc( uchar * b,
                  ulong x ) {
  if( FD_LIKELY( x<(1UL<< 6) ) ) { b[0] = (uchar) (x<< 1); b+=1; } /* 0 | x( 6) | 0 */
  else if( FD_LIKELY( x<(1UL<<10) ) ) { FD_STORE( ushort, b, (ushort)( 0x8001UL | (x<<3)) ); b+=2; } /* 100 | x(10) | 001 */
  else if( FD_LIKELY( x<(1UL<<18) ) ) { FD_STORE( ushort, b, (ushort)( 0x5UL | (x<<3)) ); b[2] = (uchar)(0xa0UL | (x>>13)); b+=3; } /* 101 | x(18) | 101 */
  else if( FD_LIKELY( x<(1UL<<24) ) ) { FD_STORE( uint, b, (uint )( 0xc0000003UL | (x<<4)) ); b+=4; } /* 1100 | x(24) | 0011 */
  else if( FD_LIKELY( x<(1UL<<32) ) ) { FD_STORE( uint, b, (uint )( 0xbUL | (x<<4)) ); b[4] = (uchar)(0xd0UL | (x>>28)); b+=5; } /* 1101 | x(32) | 1011 */
  else if( FD_LIKELY( x<(1UL<<56) ) ) { FD_STORE( ulong, b, 0xe000000000000007UL | (x<<4) ); b+=8; } /* 1110 | x(56) | 0111 */
  else { FD_STORE( ulong, b, 0xfUL | (x<<4) ); b[8] = (uchar)(0xf0UL | (x>>60)); b+=9; } /* 1111 | x(64) | 1111 */
  return b;
}
816 :
817 : /* fd_ulong_svw_enc_fixed appends x to the byte stream b as a symmetric
818 : csz width encoded integer. csz is assumed to be in {1,2,3,4,5,8,9}.
   b should have room for csz bytes and x should be known a priori to be
820 : compatible with csz. Useful for updating in place an existing
821 : encoded integer to a value that is <= the current value. Returns
822 : b+csz. */
823 :
824 : FD_FN_UNUSED static uchar * /* Work around -Winline */
825 : fd_ulong_svw_enc_fixed( uchar * b,
826 : ulong csz,
827 1099798905 : ulong x ) {
828 1099798905 : if( FD_LIKELY( csz==1UL ) ) { b[0] = (uchar) (x<< 1); } /* 0 | x( 6) | 0 */
829 408062199 : else if( FD_LIKELY( csz==2UL ) ) { FD_STORE( ushort, b, (ushort)( 0x8001UL | (x<<3)) ); } /* 100 | x(10) | 001 */
830 51097707 : else if( FD_LIKELY( csz==3UL ) ) { FD_STORE( ushort, b, (ushort)( 0x5UL | (x<<3)) ); b[2] = (uchar)(0xa0UL | (x>>13)); } /* 101 | x(18) | 101 */
831 35391201 : else if( FD_LIKELY( csz==4UL ) ) { FD_STORE( uint, b, (uint )( 0xc0000003UL | (x<<4)) ); } /* 1100 | x(24) | 0011 */
832 30664593 : else if( FD_LIKELY( csz==5UL ) ) { FD_STORE( uint, b, (uint )( 0xbUL | (x<<4)) ); b[4] = (uchar)(0xd0UL | (x>>28)); } /* 1101 | x(32) | 1011 */
833 24375081 : else if( FD_LIKELY( csz==8UL ) ) { FD_STORE( ulong, b, 0xe000000000000007UL | (x<<4) ); } /* 1110 | x(56) | 0111 */
834 5500425 : else /* csz==9UL */ { FD_STORE( ulong, b, 0xfUL | (x<<4) ); b[8] = (uchar)(0xf0UL | (x>>60)); } /* 1111 | x(64) | 1111 */
835 1099798905 : return b+csz;
836 1099798905 : }
837 :
838 : /* fd_ulong_svw_dec_sz returns the number of bytes representing an svw
839 : encoded integer. b points to the first byte of the encoded integer.
840 : Result will be in {1,2,3,4,5,8,9}. */
841 :
/* Branchless: the 16 possible low nibbles of the leading byte each map
   to a footprint packed one hex digit per entry into a single ulong
   constant, indexed by (nibble*4) bit shifts. */
FD_FN_PURE static inline ulong
fd_ulong_svw_dec_sz( uchar const * b ) {

  /* LSB:       Compressed size
     xxxx|xxx0 -> 1B
     xxxx|x001 -> 2B
     xxxx|x101 -> 3B
     xxxx|0011 -> 4B
     xxxx|1011 -> 5B
     xxxx|0111 -> 8B
     xxxx|1111 -> 9B

        15   14   13   12   11   10    9    8    7    6    5    4    3    2    1    0
      1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000
         9    1    3    1    5    1    2    1    8    1    3    1    4    1    2    1 */

  return (0x9131512181314121UL >> ((((ulong)b[0]) & 15UL) << 2)) & 15UL;
}
860 :
861 : /* fd_ulong_svw_dec_tail_sz returns the number of bytes representing an
862 : svw encoded integer. b points to one after the last byte of the
863 : encoded integer. Result will be in {1,2,3,4,5,8,9}. */
864 :
/* Branchless mirror of fd_ulong_svw_dec_sz: indexes the packed table
   by the HIGH nibble of the byte just before b (the mirrored tag). */
FD_FN_PURE static inline ulong
fd_ulong_svw_dec_tail_sz( uchar const * b ) {

  /* MSB:       Compressed size
     0xxx|xxxx -> 1B
     100x|xxxx -> 2B
     101x|xxxx -> 3B
     1100|xxxx -> 4B
     1101|xxxx -> 5B
     1110|xxxx -> 8B
     1111|xxxx -> 9B

        15   14   13   12   11   10    9    8    7    6    5    4    3    2    1    0
      1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000
         9    8    5    4    3    3    2    2    1    1    1    1    1    1    1    1 */

  return (0x9854332211111111UL >> ((((ulong)b[-1]) >> 4) << 2)) & 15UL;
}
883 :
884 : /* fd_ulong_svw_dec_fixed decodes a ulong encoded as a symmetric
885 : variable width encoded integer whose width is known. b points to the
886 : first byte of the encoded integer and the encoded integer is csz
887 : byte. csz is assumed to be in {1,2,3,4,5,8,9}. */
888 :
889 : FD_FN_UNUSED static ulong /* Work around -Winline */
890 : fd_ulong_svw_dec_fixed( uchar const * b,
891 4386418218 : ulong csz ) {
892 4386418218 : if( FD_LIKELY( csz==1UL ) ) return (fd_ulong_load_1( b ) >> 1);
893 1596379875 : if( FD_LIKELY( csz==2UL ) ) return (fd_ulong_load_2( b ) >> 3) & 1023UL;
894 303209841 : if( FD_LIKELY( csz==3UL ) ) return (fd_ulong_load_2( b ) >> 3) | ((((ulong)b[2]) & 0x1fUL) << 13);
895 212571103 : if( FD_LIKELY( csz==4UL ) ) return (fd_ulong_load_4( b ) >> 4) & 16777215UL;
896 184211082 : if( FD_LIKELY( csz==5UL ) ) return (fd_ulong_load_4( b ) >> 4) | ((((ulong)b[4]) & 0x0fUL) << 28);
897 146399088 : if( FD_LIKELY( csz==8UL ) ) return (fd_ulong_load_8( b ) >> 4) & 72057594037927935UL;
898 33150531 : return /*csz==9UL*/ (fd_ulong_load_8( b ) >> 4) | ( ((ulong)b[8]) << 60);
899 146399088 : }
900 :
901 : /* fd_ulong_svw_dec decodes a ulong encoded as a symmetric variable
902 : width encoded integer. b points to the first byte of the encoded
903 : integer. Returns a pointer to the first byte after the symvarint and
904 : *_x will hold the uncompressed value on return. If the byte stream
   might be corrupt, it should be safe to read up to 9 bytes starting at
906 : b. */
907 :
908 : static inline uchar const *
909 : fd_ulong_svw_dec( uchar const * b,
910 101036571 : ulong * _x ) {
911 101036571 : ulong csz = fd_ulong_svw_dec_sz( b );
912 101036571 : *_x = fd_ulong_svw_dec_fixed( b, csz ); b += csz;
913 101036571 : return b;
914 101036571 : }
915 :
916 : /* fd_ulong_svw_dec_tail decodes a ulong encoded as a symmetric variable
917 : width encoded integer. b points to the first byte after the encoded
918 : integer. Returns a pointer to the first byte of the encoded integer
   and *_x will hold the uncompressed value on return.  If the
920 : byte stream might be corrupt, it should be safe to read up to 9 bytes
921 : immediately before b. */
922 :
923 : static inline uchar const *
924 : fd_ulong_svw_dec_tail( uchar const * b,
925 100663296 : ulong * _x ) {
926 100663296 : ulong csz = fd_ulong_svw_dec_tail_sz( b );
927 100663296 : b -= csz; *_x = fd_ulong_svw_dec_fixed( b, csz );
928 100663296 : return b;
929 100663296 : }
930 :
931 : /* FD_LAYOUT_{INIT,APPEND,FINI} are useful for compile time
932 : determination of the required footprint of shared memory regions with
933 : dynamic sizes and complex alignment restrictions.
934 :
935 : FD_LAYOUT_INIT starts a layout. Returns a handle to the layout.
936 :
937 : FD_LAYOUT_APPEND appends a s byte region of alignment a to a layout
938 : where l is an in progress layout.
939 :
940 : FD_LAYOUT_FINI returns the final layout footprint. a is the
941 : alignment to be used for the overall layout. It should be the
942 : alignment of all appends. The final footprint will be a multiple of
943 : a.
944 :
945 : All arguments should be ulong compatible. All alignment should be a
946 : positive integer power of 2 and safe against multiple evaluation.
947 :
   The caller further promises the layout is not so unreasonably large
   that overflow might be an issue (i.e. the final size will be at most
   fd_ulong_align_dn(ULONG_MAX,a) where a is the alignment used for
   FINI).
951 :
952 : Example usage:
953 :
954 : FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT,
955 : align0, size0 ),
956 : align1, size1 ),
957 : page_sz )
958 :
959 : would return the number of pages as a page_sz multiple for a shared
960 : memory region that starts with a initial/final region of size0/size1
961 : bytes and alignment align0/align1. Correct operation requires
962 : page_sz>=max(align0,align1), page_sz, align0 and align1 be positive
963 : integer powers of 2, page_sz, size0, align0 and align1 should be
964 : ulong compatible, page_sz, align0 and align1 be safe against multiple
965 : evaluation, and the final size be at most ULONG_MAX-page_sz+1. */
966 :
/* l is the running layout footprint (a byte count), a an alignment
   (positive integer power of 2, safe against multiple evaluation per
   the note above) and s a size in bytes.  All expand to compile time
   constants when their arguments are. */
#define FD_LAYOUT_INIT (0UL)
#define FD_LAYOUT_APPEND( l, a, s ) (FD_ULONG_ALIGN_UP( (l), (a) ) + (s))
#define FD_LAYOUT_FINI( l, a ) FD_ULONG_ALIGN_UP( (l), (a) )
970 :
971 : /* FD_SCRATCH_ALLOC_{INIT,APPEND,FINI} are utility macros for allocating
972 : sub-regions of memory out of one larger region of memory. These are
973 : intentionally parallel to FD_LAYOUT, but for use at runtime, not
974 : compile time, and when you want to know the actual addresses, and not
975 : just the total footprint.
976 :
977 : FD_SCRATCH_ALLOC_INIT begins a scratch allocation operation called
978 : layout with starting base address of base.
979 :
980 : FD_SCRATCH_ALLOC_APPEND returns the next address in the allocation
981 : operation `layout` with the required alignment and advances the
982 : allocation operation by the provided size.
983 :
984 : FD_SCRATCH_ALLOC_FINI finalizes a scratch allocation operation with
985 : the name given by `layout` and returns the next address with the
986 : requested alignment.
987 :
988 : align must be a power of 2, and layout should be an identifier name.
989 : The macros are robust otherwise.
990 :
991 : Example usage:
992 : FD_SCRATCH_ALLOC_INIT( foo, scratch_base );
993 : int * arr1 = FD_SCRATCH_ALLOC_APPEND( foo, alignof(int), 100*sizeof(int) );
994 : ulong * arr2 = FD_SCRATCH_ALLOC_APPEND( foo, alignof(ulong), 25*sizeof(ulong) );
995 : FD_SCRATCH_ALLOC_FINI( foo, 32UL );
996 : */
/* _##layout tracks the current allocation cursor as a ulong address.
   Fix: the documentation above promises these macros are robust
   (arguments evaluated once), but APPEND evaluated align twice (once
   into _align, again inside fd_ulong_align_up) and FINI evaluated
   align twice via the FD_ULONG_ALIGN_UP macro.  Both now evaluate
   align exactly once; FINI uses the runtime fd_ulong_align_up (these
   macros are runtime-only, per the comment above). */
#define FD_SCRATCH_ALLOC_INIT( layout, base ) ulong _##layout = (ulong)(base)
#define FD_SCRATCH_ALLOC_APPEND( layout, align, sz ) (__extension__({                          \
    ulong _align = (align);                                                                    \
    ulong _sz    = (sz);                                                                       \
    ulong _scratch_alloc = fd_ulong_align_up( _##layout, _align );                             \
    if( FD_UNLIKELY( _scratch_alloc+_sz<_scratch_alloc ) )                                     \
      FD_LOG_ERR(( "FD_SCRATCH_ALLOC_APPEND( %s, %lu, %lu ) overflowed", #layout, _align, _sz )); \
    _##layout = _scratch_alloc + _sz;                                                          \
    (void *)_scratch_alloc;                                                                    \
  }))
#define FD_SCRATCH_ALLOC_FINI( layout, align ) (_##layout = fd_ulong_align_up( _##layout, (align) ))
1008 :
/* Finalizes the scratch allocation at byte granularity and, when
   fd_scratch_publish_is_safe reports the end address is safe,
   publishes it via fd_scratch_publish.  Evaluates to 1 if published
   and 0 otherwise (nothing published on failure). */
#define FD_SCRATCH_ALLOC_PUBLISH( layout ) (__extension__({            \
    void * end = (void *)FD_SCRATCH_ALLOC_FINI( layout, 1UL );         \
    int ok = fd_scratch_publish_is_safe( end );                        \
    if( ok ) fd_scratch_publish( end );                                \
    ok;                                                                \
  }))
1015 :
1016 : /* fd_ulong_approx_sqrt( x ) returns an approximation to square root of
1017 : x that is accurate to +/- ~0.4% for all ulong x in fast O(1)
1018 : operations and is cross platform deterministic.
1019 :
1020 : fd_ulong_floor_sqrt returns floor( sqrt( x ) ) exactly in fast-ish O(1)
1021 :
1022 : fd_ulong_round_sqrt returns round( sqrt( x ) ) exactly in fast-ish O(1)
1023 :
1024 : fd_ulong_ceil_sqrt returns ceil ( sqrt( x ) ) exactly in fast-ish O(1)
1025 :
1026 : fd_ulong_{approx,floor,round,ceil}_cbrt have similar behavior as
1027 : their sqrt variants above but compute the cube root instead of the
1028 : square root. The approximate cube root accurate to +/- ~0.8%.
1029 :
1030 : These are similar in spirit to the implementations in fd_sqrt.h and
1031 : fd_fxp.h but take generic 64-bit inputs, have a fast approximation
1032 : support, support all rounding modes for these inputs, are performance
1033 : optimized for the case of a call with a moderate magnitude input
1034 : getting called O(1) times (e.g. computing the optimal number of
1035 : threads / cores needed for a parallel algorithm), support cube roots
1036 : in addition to square roots.
1037 :
1038 : FIXME: Consider making a TG wrapper for these too? (Limited benefit
1039 : in making custom approximations for narrower types given
1040 : implementations used under the hood.) */
1041 :
1042 : FD_FN_CONST ulong fd_ulong_approx_sqrt( ulong x );
1043 : FD_FN_CONST ulong fd_ulong_floor_sqrt ( ulong x );
1044 : FD_FN_CONST ulong fd_ulong_round_sqrt ( ulong x );
1045 : FD_FN_CONST ulong fd_ulong_ceil_sqrt ( ulong x );
1046 :
1047 : FD_FN_CONST ulong fd_ulong_approx_cbrt( ulong x );
1048 : FD_FN_CONST ulong fd_ulong_floor_cbrt ( ulong x );
1049 : FD_FN_CONST ulong fd_ulong_round_cbrt ( ulong x );
1050 : FD_FN_CONST ulong fd_ulong_ceil_cbrt ( ulong x );
1051 :
1052 : FD_PROTOTYPES_END
1053 :
1054 : /* Include type generic versions of much of the above */
1055 :
1056 : #include "fd_bits_tg.h"
1057 :
1058 : #endif /* HEADER_fd_src_util_bits_fd_bits_h */
|