Line data Source code
1 : #ifndef HEADER_fd_src_util_bits_fd_uwide_h
2 : #define HEADER_fd_src_util_bits_fd_uwide_h
3 :
4 : /* Useful operations for unsigned 128-bit integer operations on
5 : platforms without 128-bit wide integer support. A 128-bit wide
6 : number is represented as a pair of ulong. In the notation below
7 : <xh,xl> == xh 2^64 + xl where xh and xl are ulong's. Imported from
8 : pyth. */
9 :
10 : #include "fd_bits.h"
11 :
12 : FD_PROTOTYPES_BEGIN
13 :
14 : /* fd_uwide_add computes co 2^128 + <zh,zl> = <xh,xl> + <yh,yl> + ci
15 : exactly. Returns the carry out. Note that the carry in / carry
16 : operations should be compile time optimized out if ci is 0 at compile
17 : time on input and/or return value is not used. Assumes _zh and _zl
18 : are valid (e.g. non-NULL and non-overlapping). Ignoring carry
19 : in/out related operations costs 4 u64 adds, 1 u64 compare and 1 u64
20 : conditional increment. */
21 :
22 : static inline ulong
23 : fd_uwide_add( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
24 : ulong xh, ulong xl,
25 : ulong yh, ulong yl,
26 1500000000 : ulong ci ) {
27 1500000000 : ulong zh = xh; ulong zl = xl;
28 1500000000 : ulong ct; ulong co;
29 1500000000 : zl += ci; ct = (ulong)(zl<ci);
30 1500000000 : zl += yl; ct += (ulong)(zl<yl);
31 1500000000 : zh += ct; co = (ulong)(zh<ct);
32 1500000000 : zh += yh; co += (ulong)(zh<yh);
33 1500000000 : *_zh = zh; *_zl = zl; return co;
34 1500000000 : }
35 :
36 : /* fd_uwide_inc computes <zh,zl> = (<xh,xl> + y) mod 2^128 exactly (a
37 : common use of the above) */
38 :
39 : static inline void
40 : fd_uwide_inc( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
41 : ulong xh, ulong xl,
42 1500000300 : ulong y ) {
43 1500000300 : ulong zl = xl + y;
44 1500000300 : ulong zh = xh + (zl<xl);
45 1500000300 : *_zh = zh; *_zl = zl;
46 1500000300 : }
47 :
48 : /* fd_uwide_sub compute <zh,zl> = bo 2^128 + <xh,xl> - <yh,yl> - bi
49 : exactly. Returns the borrow out. Note that the borrow in / borrow
50 : operations should be compile time optimized out if b is 0 at compile
51 : time on input and/or return value is not used. Assumes _zh and _zl
52 : are valid (e.g. non-NULL and non-overlapping). Ignoring borrow
53 : in/out related operations costs 4 u64 subs, 1 u64 compare and 1 u64
54 : conditional decrement. */
55 :
56 : static inline ulong
57 : fd_uwide_sub( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
58 : ulong xh, ulong xl,
59 : ulong yh, ulong yl,
60 5190965268 : ulong bi ) {
61 5190965268 : ulong zh = xh; ulong zl = xl;
62 5190965268 : ulong bt; ulong bo;
63 5190965268 : bt = (ulong)(zl<bi); zl -= bi;
64 5190965268 : bt += (ulong)(zl<yl); zl -= yl;
65 5190965268 : bo = (ulong)(zh<bt); zh -= bt;
66 5190965268 : bo += (ulong)(zh<yh); zh -= yh;
67 5190965268 : *_zh = zh; *_zl = zl; return bo;
68 5190965268 : }
69 :
70 : /* fd_uwide_dec computes <zh,zl> = (<xh,xl> - y) mod 2^128 exactly (a
71 : common use of the above) */
72 :
73 : static inline void
74 : fd_uwide_dec( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
75 : ulong xh, ulong xl,
76 1500000000 : ulong y ) {
77 1500000000 : ulong zh = xh - (ulong)(y>xl);
78 1500000000 : ulong zl = xl - y;
79 1500000000 : *_zh = zh; *_zl = zl;
80 1500000000 : }
81 :
82 : /* fd_uwide_mul computes <zh,zl> = x*y exactly, will be in
83 : [0,2^128-2^65+1]. Assumes _zh and _zl are valid (e.g. non-NULL and
84 : non-overlapping). Cost is 4 u32*u32->u64 muls, 4 u64 adds, 2 u64
85 : compares, 2 u64 conditional increments, 8 u64 u32 word extractions. */
86 :
87 : static inline void
88 : fd_uwide_mul( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
89 : ulong x,
90 6722084211 : ulong y ) {
91 6722084211 : #if FD_HAS_INT128
92 : /* Compiles to one mulx instruction */
93 6722084211 : uint128 res = (uint128)x * (uint128)y;
94 6722084211 : *_zh = (ulong)(res>>64);
95 6722084211 : *_zl = (ulong) res;
96 : #else
97 : ulong x1 = x>>32; ulong x0 = (ulong)(uint)x; /* both 2^32-1 @ worst case (x==y==2^64-1) */
98 : ulong y1 = y>>32; ulong y0 = (ulong)(uint)y; /* both 2^32-1 @ worst case */
99 :
100 : ulong w0 = x0*y0; ulong w1 = x0*y1; /* both 2^64-2^33+1 @ worst case */
101 : ulong w2 = x1*y0; ulong w3 = x1*y1; /* both 2^64-2^33+1 @ worst case */
102 :
103 : ulong w1h = w1>>32; ulong w1l = (ulong)(uint)w1; /* w1h 2^32-2, w1l 1 @ worst case */
104 : ulong w2h = w2>>32; ulong w2l = (ulong)(uint)w2; /* w2h 2^32-2, w2l 1 @ worst case */
105 :
106 : ulong zh = w1h + w2h + w3; /* 2^64-3 @ worst case */
107 : ulong t0 = w0 + (w1l<<32); zh += (ulong)(t0<w0); /* t 2^64-2^32+1, zh 2^64 - 3 @ worst case */
108 : ulong zl = t0 + (w2l<<32); zh += (ulong)(zl<t0); /* t 1, zh 2^64 - 2 @ worst case */
109 : /* zh 2^64 + zl == 2^128-2^65+1 @ worst case */
110 :
111 : *_zh = zh; *_zl = zl;
112 : #endif
113 6722084211 : }
114 :
115 : /* fd_uwide_find_msb returns floor( log2 <xh,xl> ) exactly. Assumes
116 : <xh,xl> is not 0. */
117 :
118 : static inline int
119 : fd_uwide_find_msb( ulong xh,
120 2994142743 : ulong xl ) {
121 2994142743 : int off = 0;
122 2994142743 : if( xh ) { off = 64; xl = xh; } /* FIXME: BRANCHLESS? */
123 2994142743 : return off + fd_ulong_find_msb( xl );
124 2994142743 : }
125 :
126 : /* fd_uwide_find_msb_def same as the fd_uwide_find_msb but returns def
127 : is <xh,xl> is 0. FIXME: BRANCHLESS? */
128 :
129 1500000000 : static inline int fd_uwide_find_msb_def( ulong xh, ulong xl, int def ) { return (xh|xl) ? fd_uwide_find_msb(xh,xl) : def; }
130 :
131 : /* fd_uwide_sl computes <zh,zl> = <xh,xl> << s. Assumes _zh and _zl are
132 : valid (e.g. non-NULL and non-overlapping) and s is non-negative.
133 : Large values of s are fine (shifts to zero). Returns the inexact
134 : flag (will be 0 or 1) which indicates if any non-zero bits of <xh,xl>
135 : were lost in the process. Note that inexact handling and various
136 : cases should be compile time optimized out if s is known at compile
137 : time on input and/or return value is not used. Ignoring inexact
138 : handling and assuming compile time s, for the worst case s, cost is 3
139 : u64 shifts and 1 u64 bit or. FIXME: CONSIDER HAVING AN INVALID FLAG
140 : FOR NEGATIVE S? FIXME: BRANCHLESS? */
141 :
142 : static inline int
143 : fd_uwide_sl( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
144 : ulong xh, ulong xl,
145 2920175808 : int s ) {
146 2920175808 : if( s>=128 ) { *_zh = 0UL; *_zl = 0UL; return !!(xh| xl ); }
147 2914315608 : if( s> 64 ) { s -= 64; int t = 64-s; *_zh = xl<<s; *_zl = 0UL; return !!(xh|(xl>>t)); }
148 2176039776 : if( s== 64 ) { *_zh = xl; *_zl = 0UL; return !! xh; }
149 2164326528 : if( s> 0 ) { int t = 64-s; *_zh = (xh<<s)|(xl>>t); *_zl = xl<<s; return !!(xh>>t); }
150 28931007 : /* s== 0 */ *_zh = xh; *_zl = xl; return 0;
151 2164326528 : }
152 :
153 : /* fd_uwide_sr compute <zh,zl> = <xh,xl> >> s. Assumes _zh and _zl are
154 : valid (e.g. non-NULL and non-overlapping) and s is non-negative.
155 : Large values of s are fine (shifts to zero). Returns the inexact
156 : flag (will be 0 or 1) which indicates if any non-zero bits of <xh,xl>
157 : were lost in the process. Note that inexact handling and various
158 : cases should be compile time optimized out if s is known at compile
159 : time on input and/or return value is not used. Ignoring inexact
160 : handling and assuming compile time s, for the worst case s, cost is 3
161 : u64 shifts and 1 u64 bit or. (FIXME: CONSIDER HAVING AN INVALID FLAG
162 : FOR NEGATIVE S AND/OR MORE DETAILED INEXACT FLAGS TO SIMPLIFY
163 : IMPLEMENTING FIXED AND FLOATING POINT ROUNDING MODES?) */
164 :
165 : static inline int
166 : fd_uwide_sr( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
167 : ulong xh, ulong xl,
168 1546160214 : int s ) {
169 1546160214 : if( s>=128 ) { *_zh = 0UL; *_zl = 0UL; return !!( xh |xl); }
170 1540300014 : if( s> 64 ) { s -= 64; int t = 64-s; *_zh = 0UL; *_zl = xh>>s; return !!((xh<<t)|xl); }
171 802024182 : if( s== 64 ) { *_zh = 0UL; *_zl = xh; return !! xl; }
172 790310934 : if( s> 0 ) { int t = 64-s; *_zh = xh>>s; *_zl = (xl>>s)|(xh<<t); return !!( xl<<t ); }
173 28929495 : /* s== 0 */ *_zh = xh; *_zl = xl; return 0;
174 790310934 : }
175 :
176 : /* FIXME: LOOK FOR IMPROVED DIV ALGORITHMICS HERE! */
177 :
178 : /* fd_uwide_private_div_approx_init/fd_uwide_private_div_approx computes
179 : a ~32-bit accurate approximation qa of q = floor(n 2^64 / d) where d
180 : is in [2^63,2^64) cheaply. Approximation will be at most q.
181 :
182 : In the general case, qa is up to 65 bits wide (worst case is n=2^64-1
183 : and d=2^63 such that q is 2^65-2 and qa is precise enough to need 65
184 : bits too). The implementation here assumes n is is less than d so
185 : that q and qa are both known to fit within 64 bits.
186 :
187 : Cost to setup for a given d is approximately a 1 u64 u32 extract, 1
188 : u64 increment, 1 u64 neg, 1 u64/u64 div, 1 u64 add. Cost per n/d
189 : afterward is 2 u32*u32->u64 mul, 2 u64 add, 1 u64 u32 extract.
190 :
191 : Theory: Let q d + r = n 2^64 where q = floor( n 2^64 / d ). Note r
192 : is in [0,d). Break d into d = dh 2^32 + dl where dh and dl are
193 : 32-bit words:
194 : q+r/d = n 2^64 / d = n 2^64 / ( dh 2^32 + dl )
195 : = n 2^64 / ( (dh+1) 2^32 - (2^32-dl) ) ... dh+1 can be computed without overflow, 2^32-dl is positive
196 : > n 2^64 / ( (dh+1) 2^32 )
197 : >= n floor( 2^64/(dh+1) ) / 2^32
198 : Note that floor( 2^64/(dh+1) ) is in [2^32,2^33-4]. This suggests
199 : letting:
200 : 2^32 + m = floor( 2^64/(dh+1) )
201 : where m is in [0,2^32-4] (fits in a uint). Then we have:
202 : q+r/d > n (2^32 + m) / 2^32
203 : = n + n m / 2^32
204 : Similarly breaking n into n = nh 2^32 + nl:
205 : q+r/d > n + (nh m) + (nl m/2^32)
206 : >= n + nh m + floor( nl m / 2^32 )
207 : We get:
208 : q+r/d > n + nh*m + ((nl*m)>>32)
209 : And, as q is integral, r/d is less than 1 and the right hand side is
210 : integral:
211 : q >= qa
212 : where:
213 : qa = n + nh*m + (((nl*m)>>32)
214 : and:
215 : m = floor( 2^64 / (dh+1) ) - 2^32
216 :
217 : To compute m efficiently, note:
218 : m = floor( 1 + (2^64-(dh+1))/(dh+1) ) - 2^32
219 : = floor( (2^64-(dh+1))/(dh+1) ) - (2^32-1)
220 : and in "C style" modulo 2^64 arithmetic, 2^64 - x = -x. This yields:
221 : m = (-(dh+1))/(dh+1) - (2^32-1)
222 :
223 : Applications should avoid using these directly. (They might be
224 : removed in the future, different approximations might get used for
225 : different targets, might be changed to be faster but less accurate or
226 : slower and more accurate, etc.) */
227 :
228 : static inline ulong /* In [0,2^32) */
229 1420175808 : fd_uwide_private_div_approx_init( ulong d ) { /* d in [2^63,2^64) */
230 1420175808 : ulong m = (d>>32) + 1UL; /* m = dh+1 and is in (2^31,2^32] ... exact */
231 1420175808 : return ((-m)/m) - (ulong)UINT_MAX; /* m = floor( 2^64/(dh+1) ) - 2^32 ... exact */
232 1420175808 : }
233 :
234 : static inline ulong /* In [n,2^64) and <= floor(n 2^64/d) */
235 : fd_uwide_private_div_approx( ulong n, /* In [0,d) */
236 3690965268 : ulong m ) { /* Output of fd_uwide_private_div_approx_init for the desired d */
237 3690965268 : ulong nh = n>>32;
238 3690965268 : ulong nl = (ulong)(uint)n;
239 3690965268 : return n + nh*m + ((nl*m)>>32);
240 3690965268 : }
241 :
242 : /* fd_uwide_div computes zh 2^64 + zl = floor( (xh 2^64 + xl) / y ).
243 : Assumes _zh and _zl are valid (e.g. non-NULL and non-overlapping).
244 : Requires y to be non-zero. Returns the exception flag if y is 0
245 : (<zh,zl>=0 in this case). This is not very cheap and cost is highly
246 : variable depending on properties of both n and d. Worst case is
247 : roughly ~3 u64/u64 divides, ~24 u64*u64->u64 muls plus other minor
248 : ops.
249 :
250 : Breakdown in worst case 1 u64 log2, 2 u64/u64 div, 1 u64*u64->u64
251 : mul, 1 u64 sub, 1 int sub, 1 u128 variable shift, 1 1 u64 variable
252 : shift, 1 u64 div approx init, 4*(1 u64 div approx, 1 u64*u64->u128
253 : mul, 1 u128 sub) plus various operations to facilitating shortcutting
254 : (e.g. when xh is zero, cost is 1 u64/u64 div). */
255 :
256 : FD_FN_UNUSED static int /* Work around -Winline */
257 : fd_uwide_div( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
258 : ulong xh, ulong xl,
259 1500000000 : ulong y ) {
260 :
261 : /* Simple cases (y==0, x==0 and y==2^n) */
262 :
263 1500000000 : if( FD_UNLIKELY( !y ) ) { *_zh = 0UL; *_zl = 0UL; return 1; }
264 1500000000 : if( FD_UNLIKELY( !xh ) ) { *_zh = 0UL; *_zl = xl / y; return 0; }
265 :
266 738224304 : int n = fd_ulong_find_msb( y );
267 738224304 : if( FD_UNLIKELY( !fd_ulong_pop_lsb(y) ) ) { fd_uwide_sr( _zh,_zl, xh,xl, n ); return 0; }
268 :
269 : /* General case ... compute zh noting that:
270 : <zh,zl> = floor( (xh 2^64 + xl) / y )
271 : = floor( ((qh y + rh) 2^64 + xl) / y )
272 : where xh = qh y + rh and qh is floor(xh/y), rh = xh%y and rh is in
273 : [0,y). Thus:
274 : = qh 2^64 + floor( (rh 2^64 + xl) / y )
275 : where the right term is less that 2^64 such that zh is qh and
276 : zl = floor( (rh 2^64 + xl) / y ). */
277 :
278 715144197 : ulong qh = xh / y;
279 715144197 : ulong rh = xh - qh*y;
280 :
281 : /* Simple zl shortcut */
282 :
283 715144197 : if( FD_UNLIKELY( !rh ) ) { *_zh = qh; *_zl = xl / y; return 0; }
284 :
285 : /* At this point, zh is qh and zl is floor( (rh 2^64 + xl) / y ) where
286 : rh is in (0,y) and y is a positive non-power of 2.
287 :
288 : Normalize this by noting for:
289 : q=n/d; r=n-q*d=n%d
290 : we have:
291 : -> n = q d + r where q = floor(n/d) and r in [0,d).
292 : -> n w = q d w + r w for some integer w>0
293 : That is, if we scale up both n and d by w, it doesn't affect q (it
294 : just scales the remainder). We use a scale factor of 2^s on
295 : <rh,xl> and y such that y will have its most significant bit set.
296 : This will not cause <rh,xl> to overflow as rh is less than y at
297 : this point. */
298 :
299 710087904 : int s = 63-n;
300 710087904 : ulong nh, nl; fd_uwide_sl( &nh,&nl, rh,xl, s );
301 710087904 : ulong d = y << s;
302 :
303 : /* At this point zh = qh and zl is floor( (nh 2^64 + nl) / d ) where
304 : nh is in (0,d) and d is in (2^63,2^64). */
305 :
306 710087904 : ulong m = fd_uwide_private_div_approx_init( d );
307 710087904 : ulong eh = nh; ulong el = nl;
308 710087904 : ulong ql = 0UL;
309 1845482634 : do {
310 :
311 : /* At this point, n-ql*d has an error of <eh,el> such that:
312 : d < 2^64 <= <eh,el>
313 : (i.e. eh is non-zero). If we increment ql by the correction
314 : floor(<eh,el>/d), we'd be at the solution but computing this is
315 : as hard as the original problem. We do have the ability to
316 : very quickly compute a ~32-bit accurate estimate dqest of
317 : floor(<eh,0>/d) such that:
318 : 1 <= eh <= dqest <= floor(<eh,0>/d) <= floor(<eh,el)/d).
319 : so we use that as our increment. Practically, this loop will
320 : require at most ~4 iterations. */
321 :
322 1845482634 : ql += fd_uwide_private_div_approx( eh, m ); /* Guaranteed to make progress if eh > 0 */
323 1845482634 : fd_uwide_mul( &eh,&el, ql, d ); /* <eh,el> = ql*d */
324 1845482634 : fd_uwide_sub( &eh,&el, nh,nl, eh,el, 0UL ); /* <eh,el> = <nh,nl> - ql d */
325 1845482634 : } while( eh );
326 :
327 : /* At this point, n - ql*d has an error less than 2^64 so we can
328 : directly compute the remaining correction. */
329 :
330 710087904 : ql += el/d;
331 :
332 710087904 : *_zh = qh; *_zl = ql; return 0;
333 715144197 : }
334 :
335 : /* fd_uwide_divrem is same as the above but returns the value of the
336 : remainder too. For non-zero y, remainder will be in [0,y). If y==0,
337 : returns <zh,zl>=0 with a remainder of ULONG_MAX (to signal error). */
338 :
339 : static inline ulong
340 : fd_uwide_divrem( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
341 : ulong xh, ulong xl,
342 1500000000 : ulong y ) {
343 :
344 1500000000 : if( FD_UNLIKELY( !y ) ) { *_zh = 0UL; *_zl = 0UL; return ULONG_MAX; }
345 1500000000 : if( FD_UNLIKELY( !xh ) ) { ulong ql = xl / y; ulong r = xl - ql*y; *_zh = 0UL; *_zl = ql; return r; }
346 :
347 738224304 : int n = fd_ulong_find_msb( y );
348 738224304 : if( FD_UNLIKELY( !fd_ulong_pop_lsb(y) ) ) { int s = 64-n; fd_uwide_sr( _zh,_zl, xh,xl, n ); return n ? ((xl << s) >> s) : 0UL; }
349 :
350 715144197 : ulong qh = xh / y;
351 715144197 : ulong rh = xh - qh*y;
352 :
353 715144197 : if( FD_UNLIKELY( !rh ) ) { ulong ql = xl / y; ulong r = xl - ql*y; *_zh = qh; *_zl = ql; return r; }
354 :
355 710087904 : int s = 63-n;
356 710087904 : ulong nh, nl; fd_uwide_sl( &nh,&nl, rh,xl, s );
357 710087904 : ulong d = y << s;
358 :
359 710087904 : ulong m = fd_uwide_private_div_approx_init( d );
360 710087904 : ulong eh = nh; ulong el = nl;
361 710087904 : ulong ql = 0UL;
362 1845482634 : do {
363 1845482634 : ql += fd_uwide_private_div_approx( eh, m );
364 1845482634 : fd_uwide_mul( &eh,&el, ql, d );
365 1845482634 : fd_uwide_sub( &eh,&el, nh,nl, eh,el, 0UL );
366 1845482634 : } while( eh );
367 :
368 710087904 : ulong dq = el / d;
369 710087904 : ulong r = (el - dq*d) >> s;
370 710087904 : ql += dq;
371 :
372 710087904 : *_zh = qh; *_zl = ql; return r;
373 715144197 : }
374 :
375 : FD_PROTOTYPES_END
376 :
377 : #endif /* HEADER_fd_src_util_bits_fd_uwide_h */
378 :
|