Line data Source code
1 : #ifndef HEADER_fd_src_util_bits_fd_uwide_h
2 : #define HEADER_fd_src_util_bits_fd_uwide_h
3 :
4 : /* Useful operations for unsigned 128-bit integer operations on
5 : platforms without 128-bit wide integer support. A 128-bit wide
6 : number is represented as a pair of ulong. In the notation below
7 : <xh,xl> == xh 2^64 + xl where xh and xl are ulong's. Imported from
8 : pyth. */
9 :
10 : #include "fd_bits.h"
11 :
12 : FD_PROTOTYPES_BEGIN
13 :
14 : /* fd_uwide_add computes co 2^128 + <zh,zl> = <xh,xl> + <yh,yl> + ci
15 : exactly. Returns the carry out. Note that the carry in / carry
16 : operations should be compile time optimized out if ci is 0 at compile
17 : time on input and/or return value is not used. Assumes _zh and _zl
18 : are valid (e.g. non-NULL and non-overlapping). Ignoring carry
19 : in/out related operations costs 4 u64 adds, 1 u64 compare and 1 u64
20 : conditional increment. */
21 :
22 : static inline ulong
23 : fd_uwide_add( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
24 : ulong xh, ulong xl,
25 : ulong yh, ulong yl,
26 1500000030 : ulong ci ) {
27 1500000030 : ulong zh = xh; ulong zl = xl;
28 1500000030 : ulong ct; ulong co;
29 1500000030 : zl += ci; ct = (ulong)(zl<ci);
30 1500000030 : zl += yl; ct += (ulong)(zl<yl);
31 1500000030 : zh += ct; co = (ulong)(zh<ct);
32 1500000030 : zh += yh; co += (ulong)(zh<yh);
33 1500000030 : *_zh = zh; *_zl = zl; return co;
34 1500000030 : }
35 :
36 : /* fd_uwide_inc computes <zh,zl> = (<xh,xl> + y) mod 2^128 exactly (a
37 : common use of the above) */
38 :
39 : static inline void
40 : fd_uwide_inc( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
41 : ulong xh, ulong xl,
42 1501273962 : ulong y ) {
43 1501273962 : ulong zl = xl + y;
44 1501273962 : ulong zh = xh + (zl<xl);
45 1501273962 : *_zh = zh; *_zl = zl;
46 1501273962 : }
47 :
48 : /* fd_uwide_sub compute <zh,zl> = bo 2^128 + <xh,xl> - <yh,yl> - bi
49 : exactly. Returns the borrow out. Note that the borrow in / borrow
50 : operations should be compile time optimized out if b is 0 at compile
51 : time on input and/or return value is not used. Assumes _zh and _zl
52 : are valid (e.g. non-NULL and non-overlapping). Ignoring borrow
53 : in/out related operations costs 4 u64 subs, 1 u64 compare and 1 u64
54 : conditional decrement. */
55 :
56 : static inline ulong
57 : fd_uwide_sub( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
58 : ulong xh, ulong xl,
59 : ulong yh, ulong yl,
60 5190965313 : ulong bi ) {
61 5190965313 : ulong zh = xh; ulong zl = xl;
62 5190965313 : ulong bt; ulong bo;
63 5190965313 : bt = (ulong)(zl<bi); zl -= bi;
64 5190965313 : bt += (ulong)(zl<yl); zl -= yl;
65 5190965313 : bo = (ulong)(zh<bt); zh -= bt;
66 5190965313 : bo += (ulong)(zh<yh); zh -= yh;
67 5190965313 : *_zh = zh; *_zl = zl; return bo;
68 5190965313 : }
69 :
70 : /* fd_uwide_dec computes <zh,zl> = (<xh,xl> - y) mod 2^128 exactly (a
71 : common use of the above) */
72 :
73 : static inline void
74 : fd_uwide_dec( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
75 : ulong xh, ulong xl,
76 1500000015 : ulong y ) {
77 1500000015 : ulong zh = xh - (ulong)(y>xl);
78 1500000015 : ulong zl = xl - y;
79 1500000015 : *_zh = zh; *_zl = zl;
80 1500000015 : }
81 :
82 : /* fd_uwide_mul computes <zh,zl> = x*y exactly, will be in
83 : [0,2^128-2^65+1]. Assumes _zh and _zl are valid (e.g. non-NULL and
84 : non-overlapping). Cost is 4 u32*u32->u64 muls, 4 u64 adds, 2 u64
85 : compares, 2 u64 conditional increments, 8 u64 u32 word extractions. */
86 :
87 : static inline void
88 : fd_uwide_mul( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
89 : ulong x,
90 5190965343 : ulong y ) {
91 5190965343 : ulong x1 = x>>32; ulong x0 = (ulong)(uint)x; /* both 2^32-1 @ worst case (x==y==2^64-1) */
92 5190965343 : ulong y1 = y>>32; ulong y0 = (ulong)(uint)y; /* both 2^32-1 @ worst case */
93 :
94 5190965343 : ulong w0 = x0*y0; ulong w1 = x0*y1; /* both 2^64-2^33+1 @ worst case */
95 5190965343 : ulong w2 = x1*y0; ulong w3 = x1*y1; /* both 2^64-2^33+1 @ worst case */
96 :
97 5190965343 : ulong w1h = w1>>32; ulong w1l = (ulong)(uint)w1; /* w1h 2^32-2, w1l 1 @ worst case */
98 5190965343 : ulong w2h = w2>>32; ulong w2l = (ulong)(uint)w2; /* w2h 2^32-2, w2l 1 @ worst case */
99 :
100 5190965343 : ulong zh = w1h + w2h + w3; /* 2^64-3 @ worst case */
101 5190965343 : ulong t0 = w0 + (w1l<<32); zh += (ulong)(t0<w0); /* t 2^64-2^32+1, zh 2^64 - 3 @ worst case */
102 5190965343 : ulong zl = t0 + (w2l<<32); zh += (ulong)(zl<t0); /* t 1, zh 2^64 - 2 @ worst case */
103 : /* zh 2^64 + zl == 2^128-2^65+1 @ worst case */
104 :
105 5190965343 : *_zh = zh; *_zl = zl;
106 5190965343 : }
107 :
108 : /* fd_uwide_find_msb returns floor( log2 <xh,xl> ) exactly. Assumes
109 : <xh,xl> is not 0. */
110 :
111 : static inline int
112 : fd_uwide_find_msb( ulong xh,
113 2994142743 : ulong xl ) {
114 2994142743 : int off = 0;
115 2994142743 : if( xh ) { off = 64; xl = xh; } /* FIXME: BRANCHLESS? */
116 2994142743 : return off + fd_ulong_find_msb( xl );
117 2994142743 : }
118 :
119 : /* fd_uwide_find_msb_def same as the fd_uwide_find_msb but returns def
120 : is <xh,xl> is 0. FIXME: BRANCHLESS? */
121 :
122 1500000000 : static inline int fd_uwide_find_msb_def( ulong xh, ulong xl, int def ) { return (xh|xl) ? fd_uwide_find_msb(xh,xl) : def; }
123 :
124 : /* fd_uwide_sl computes <zh,zl> = <xh,xl> << s. Assumes _zh and _zl are
125 : valid (e.g. non-NULL and non-overlapping) and s is non-negative.
126 : Large values of s are fine (shifts to zero). Returns the inexact
127 : flag (will be 0 or 1) which indicates if any non-zero bits of <xh,xl>
128 : were lost in the process. Note that inexact handling and various
129 : cases should be compile time optimized out if if s is known at
130 : compile time on input and/or return value is not used. Ignoring
131 : inexact handling and assuming compile time s, for the worst case s,
132 : cost is 3 u64 shifts and 1 u64 bit or. FIXME: CONSIDER HAVING AN
133 : INVALID FLAG FOR NEGATIVE S? FIXME: BRANCHLESS? */
134 :
135 : static inline int
136 : fd_uwide_sl( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
137 : ulong xh, ulong xl,
138 2920175823 : int s ) {
139 2920175823 : if( s>=128 ) { *_zh = 0UL; *_zl = 0UL; return !!(xh| xl ); }
140 2914315623 : if( s> 64 ) { s -= 64; int t = 64-s; *_zh = xl<<s; *_zl = 0UL; return !!(xh|(xl>>t)); }
141 2176039791 : if( s== 64 ) { *_zh = xl; *_zl = 0UL; return !! xh; }
142 2164326543 : if( s> 0 ) { int t = 64-s; *_zh = (xh<<s)|(xl>>t); *_zl = xl<<s; return !!(xh>>t); }
143 28931016 : /* s== 0 */ *_zh = xh; *_zl = xl; return 0;
144 2164326543 : }
145 :
146 : /* fd_uwide_sr compute <zh,zl> = <xh,xl> >> s. Assumes _zh and _zl are
147 : valid (e.g. non-NULL and non-overlapping) and s is non-negative.
148 : Large values of s are fine (shifts to zero). Returns the inexact
149 : flag (will be 0 or 1) which indicates if any non-zero bits of <xh,xl>
150 : were lost in the process. Note that inexact handling and various
151 : cases should be compile time optimized out if if s is known at
152 : compile time on input and/or return value is not used. Ignoring
153 : inexact handling and assuming compile time s, for the worst case s,
154 : cost is 3 u64 shifts and 1 u64 bit or. (FIXME: CONSIDER HAVING AN
155 : INVALID FLAG FOR NEGATIVE S AND/OR MORE DETAILED INEXACT FLAGS TO
156 : SIMPLIFY IMPLEMENTING FIXED AND FLOATING POINT ROUNDING MODES?) */
157 :
158 : static inline int
159 : fd_uwide_sr( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
160 : ulong xh, ulong xl,
161 1546160214 : int s ) {
162 1546160214 : if( s>=128 ) { *_zh = 0UL; *_zl = 0UL; return !!( xh |xl); }
163 1540300014 : if( s> 64 ) { s -= 64; int t = 64-s; *_zh = 0UL; *_zl = xh>>s; return !!((xh<<t)|xl); }
164 802024182 : if( s== 64 ) { *_zh = 0UL; *_zl = xh; return !! xl; }
165 790310934 : if( s> 0 ) { int t = 64-s; *_zh = xh>>s; *_zl = (xl>>s)|(xh<<t); return !!( xl<<t ); }
166 28929495 : /* s== 0 */ *_zh = xh; *_zl = xl; return 0;
167 790310934 : }
168 :
169 : /* FIXME: LOOK FOR IMPROVED DIV ALGORITHMICS HERE! */
170 :
171 : /* fd_uwide_private_div_approx_init/fd_uwide_private_div_approx computes
172 : a ~32-bit accurate approximation qa of q = floor(n 2^64 / d) where d
173 : is in [2^63,2^64) cheaply. Approximation will be at most q.
174 :
175 : In the general case, qa is up to 65 bits wide (worst case is n=2^64-1
176 : and d=2^63 such that q is 2^65-2 and qa is precise enough to need 65
177 : bits too). The implementation here assumes n is is less than d so
178 : that q and qa are both known to fit within 64 bits.
179 :
180 : Cost to setup for a given d is approximately a 1 u64 u32 extract, 1
181 : u64 increment, 1 u64 neg, 1 u64/u64 div, 1 u64 add. Cost per n/d
182 : afterward is 2 u32*u32->u64 mul, 2 u64 add, 1 u64 u32 extract.
183 :
184 : Theory: Let q d + r = n 2^64 where q = floor( n 2^64 / d ). Note r
185 : is in [0,d). Break d into d = dh 2^32 + dl where dh and dl are
186 : 32-bit words:
187 : q+r/d = n 2^64 / d = n 2^64 / ( dh 2^32 + dl )
188 : = n 2^64 / ( (dh+1) 2^32 - (2^32-dl) ) ... dh+1 can be computed without overflow, 2^32-dl is positive
189 : > n 2^64 / ( (dh+1) 2^32 )
190 : >= n floor( 2^64/(dh+1) ) / 2^32
191 : Note that floor( 2^64/(dh+1) ) is in [2^32,2^33-4]. This suggests
192 : letting:
193 : 2^32 + m = floor( 2^64/(dh+1) )
194 : where m is in [0,2^32-4] (fits in a uint). Then we have:
195 : q+r/d > n (2^32 + m) / 2^32
196 : = n + n m / 2^32
197 : Similarly breaking n into n = nh 2^32 + nl:
198 : q+r/d > n + (nh m) + (nl m/2^32)
199 : >= n + nh m + floor( nl m / 2^32 )
200 : We get:
201 : q+r/d > n + nh*m + ((nl*m)>>32)
202 : And, as q is integral, r/d is less than 1 and the right hand side is
203 : integral:
204 : q >= qa
205 : where:
206 : qa = n + nh*m + (((nl*m)>>32)
207 : and:
208 : m = floor( 2^64 / (dh+1) ) - 2^32
209 :
210 : To compute m efficiently, note:
211 : m = floor( 1 + (2^64-(dh+1))/(dh+1) ) - 2^32
212 : = floor( (2^64-(dh+1))/(dh+1) ) - (2^32-1)
213 : and in "C style" modulo 2^64 arithmetic, 2^64 - x = -x. This yields:
214 : m = (-(dh+1))/(dh+1) - (2^32-1)
215 :
216 : Applications should avoid using these directly. (They might be
217 : removed in the future, different approximations might get used for
218 : different targets, might be changed to be faster but less accurate or
219 : slower and more accurate, etc.) */
220 :
221 : static inline ulong /* In [0,2^32) */
222 1420175823 : fd_uwide_private_div_approx_init( ulong d ) { /* d in [2^63,2^64) */
223 1420175823 : ulong m = (d>>32) + 1UL; /* m = dh+1 and is in (2^31,2^32] ... exact */
224 1420175823 : return ((-m)/m) - (ulong)UINT_MAX; /* m = floor( 2^64/(dh+1) ) - 2^32 ... exact */
225 1420175823 : }
226 :
227 : static inline ulong /* In [n,2^64) and <= floor(n 2^64/d) */
228 : fd_uwide_private_div_approx( ulong n, /* In [0,d) */
229 3690965313 : ulong m ) { /* Output of fd_uwide_private_div_approx_init for the desired d */
230 3690965313 : ulong nh = n>>32;
231 3690965313 : ulong nl = (ulong)(uint)n;
232 3690965313 : return n + nh*m + ((nl*m)>>32);
233 3690965313 : }
234 :
235 : /* fd_uwide_div computes zh 2^64 + zl = floor( (xh 2^64 + xl) / y ).
236 : Assumes _zh and _zl are valid (e.g. non-NULL and non-overlapping).
237 : Requires y to be non-zero. Returns the exception flag if y is 0
238 : (<zh,zl>=0 in this case). This is not very cheap and cost is highly
239 : variable depending on properties of both n and d. Worst case is
240 : roughly ~3 u64/u64 divides, ~24 u64*u64->u64 muls plus other minor
241 : ops.
242 :
243 : Breakdown in worst case 1 u64 log2, 2 u64/u64 div, 1 u64*u64->u64
244 : mul, 1 u64 sub, 1 int sub, 1 u128 variable shift, 1 1 u64 variable
245 : shift, 1 u64 div approx init, 4*(1 u64 div approx, 1 u64*u64->u128
246 : mul, 1 u128 sub) plus various operations to facilitating shortcutting
247 : (e.g. when xh is zero, cost is 1 u64/u64 div). */
248 :
249 : FD_FN_UNUSED static int /* Work around -Winline */
250 : fd_uwide_div( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
251 : ulong xh, ulong xl,
252 1500000015 : ulong y ) {
253 :
254 : /* Simple cases (y==0, x==0 and y==2^n) */
255 :
256 1500000015 : if( FD_UNLIKELY( !y ) ) { *_zh = 0UL; *_zl = 0UL; return 1; }
257 1500000015 : if( FD_UNLIKELY( !xh ) ) { *_zh = 0UL; *_zl = xl / y; return 0; }
258 :
259 738224319 : int n = fd_ulong_find_msb( y );
260 738224319 : if( FD_UNLIKELY( !fd_ulong_pop_lsb(y) ) ) { fd_uwide_sr( _zh,_zl, xh,xl, n ); return 0; }
261 :
262 : /* General case ... compute zh noting that:
263 : <zh,zl> = floor( (xh 2^64 + xl) / y )
264 : = floor( ((qh y + rh) 2^64 + xl) / y )
265 : where xh = qh y + rh and qh is floor(xh/y), rh = xh%y and rh is in
266 : [0,y). Thus:
267 : = qh 2^64 + floor( (rh 2^64 + xl) / y )
268 : where the right term is less that 2^64 such that zh is qh and
269 : zl = floor( (rh 2^64 + xl) / y ). */
270 :
271 715144212 : ulong qh = xh / y;
272 715144212 : ulong rh = xh - qh*y;
273 :
274 : /* Simple zl shortcut */
275 :
276 715144212 : if( FD_UNLIKELY( !rh ) ) { *_zh = qh; *_zl = xl / y; return 0; }
277 :
278 : /* At this point, zh is qh and zl is floor( (rh 2^64 + xl) / y ) where
279 : rh is in (0,y) and y is a positive non-power of 2.
280 :
281 : Normalize this by noting for:
282 : q=n/d; r=n-q*d=n%d
283 : we have:
284 : -> n = q d + r where q = floor(n/d) and r in [0,d).
285 : -> n w = q d w + r w for some integer w>0
286 : That is, if we scale up both n and d by w, it doesn't affect q (it
287 : just scales the remainder). We use a scale factor of 2^s on
288 : <rh,xl> and y such that y will have its most significant bit set.
289 : This will not cause <rh,xl> to overflow as rh is less than y at
290 : this point. */
291 :
292 710087919 : int s = 63-n;
293 710087919 : ulong nh, nl; fd_uwide_sl( &nh,&nl, rh,xl, s );
294 710087919 : ulong d = y << s;
295 :
296 : /* At this point zh = qh and zl is floor( (nh 2^64 + nl) / d ) where
297 : nh is in (0,d) and d is in (2^63,2^64). */
298 :
299 710087919 : ulong m = fd_uwide_private_div_approx_init( d );
300 710087919 : ulong eh = nh; ulong el = nl;
301 710087919 : ulong ql = 0UL;
302 1845482679 : do {
303 :
304 : /* At this point, n-ql*d has an error of <eh,el> such that:
305 : d < 2^64 <= <eh,el>
306 : (i.e. eh is non-zero). If we increment ql by the correction
307 : floor(<eh,el>/d), we'd be at the solution but computing this is
308 : as hard as the original problem. We do have the ability to
309 : very quickly compute a ~32-bit accurate estimate dqest of
310 : floor(<eh,0>/d) such that:
311 : 1 <= eh <= dqest <= floor(<eh,0>/d) <= floor(<eh,el)/d).
312 : so we use that as our increment. Practically, this loop will
313 : require at most ~4 iterations. */
314 :
315 1845482679 : ql += fd_uwide_private_div_approx( eh, m ); /* Guaranteed to make progress if eh > 0 */
316 1845482679 : fd_uwide_mul( &eh,&el, ql, d ); /* <eh,el> = ql*d */
317 1845482679 : fd_uwide_sub( &eh,&el, nh,nl, eh,el, 0UL ); /* <eh,el> = <nh,nl> - ql d */
318 1845482679 : } while( eh );
319 :
320 : /* At this point, n - ql*d has an error less than 2^64 so we can
321 : directly compute the remaining correction. */
322 :
323 710087919 : ql += el/d;
324 :
325 710087919 : *_zh = qh; *_zl = ql; return 0;
326 715144212 : }
327 :
328 : /* fd_uwide_divrem is same as the above but returns the value of the
329 : remainder too. For non-zero y, remainder will be in [0,y). If y==0,
330 : returns <zh,zl>=0 with a remainder of ULONG_MAX (to signal error). */
331 :
332 : static inline ulong
333 : fd_uwide_divrem( ulong * FD_RESTRICT _zh, ulong * FD_RESTRICT _zl,
334 : ulong xh, ulong xl,
335 1500000000 : ulong y ) {
336 :
337 1500000000 : if( FD_UNLIKELY( !y ) ) { *_zh = 0UL; *_zl = 0UL; return ULONG_MAX; }
338 1500000000 : if( FD_UNLIKELY( !xh ) ) { ulong ql = xl / y; ulong r = xl - ql*y; *_zh = 0UL; *_zl = ql; return r; }
339 :
340 738224304 : int n = fd_ulong_find_msb( y );
341 738224304 : if( FD_UNLIKELY( !fd_ulong_pop_lsb(y) ) ) { int s = 64-n; fd_uwide_sr( _zh,_zl, xh,xl, n ); return n ? ((xl << s) >> s) : 0UL; }
342 :
343 715144197 : ulong qh = xh / y;
344 715144197 : ulong rh = xh - qh*y;
345 :
346 715144197 : if( FD_UNLIKELY( !rh ) ) { ulong ql = xl / y; ulong r = xl - ql*y; *_zh = qh; *_zl = ql; return r; }
347 :
348 710087904 : int s = 63-n;
349 710087904 : ulong nh, nl; fd_uwide_sl( &nh,&nl, rh,xl, s );
350 710087904 : ulong d = y << s;
351 :
352 710087904 : ulong m = fd_uwide_private_div_approx_init( d );
353 710087904 : ulong eh = nh; ulong el = nl;
354 710087904 : ulong ql = 0UL;
355 1845482634 : do {
356 1845482634 : ql += fd_uwide_private_div_approx( eh, m );
357 1845482634 : fd_uwide_mul( &eh,&el, ql, d );
358 1845482634 : fd_uwide_sub( &eh,&el, nh,nl, eh,el, 0UL );
359 1845482634 : } while( eh );
360 :
361 710087904 : ulong dq = el / d;
362 710087904 : ulong r = (el - dq*d) >> s;
363 710087904 : ql += dq;
364 :
365 710087904 : *_zh = qh; *_zl = ql; return r;
366 715144197 : }
367 :
368 : FD_PROTOTYPES_END
369 :
370 : #endif /* HEADER_fd_src_util_bits_fd_uwide_h */
371 :
|