Line data Source code
1 : #include "fd_bits.h"
2 :
3 : ulong
4 1190663193 : fd_ulong_approx_sqrt( ulong x ) {
5 :
6 : /* Theory:
7 :
8 : Break a non-zero x exactly into:
9 :
10 : x = 2^e ( 1 + m/2^63 )
11 :
12 : where e is 6-bit and m is 63-bit. Further, break e exactly into:
13 :
14 : e = 2 q + r
15 :
16 : where q is 5-bit and r in 1-bit . Then, break m exactly into:
17 :
18 : m = u 2^61 + h 2^29 + l
19 :
20 : where u is 2-bit, h is 32-bit and l is 29-bit. Then:
21 :
22 : y = sqrt( x )
23 : = 2^q 2^(r/2) sqrt( 1 + u/4 + h/2^34 + l/2^63 )
24 :
25 : We ignore the l term as very small. For given r and u, y is
26 : roughly linearly with respect to h. Let i be a 3-bit table index
27 : such that:
28 :
29 : i = 4 r + u
30 :
31 : and approximate:
32 :
33 : y ~ 2^q (alpha_i + h beta_i / 2^32)
34 :
35 : where:
36 :
37 : alpha_i = 2^(r/2) sqrt( 1 + u/4 )
38 : beta_i = alpha_{i+1} - alpha_i
39 :
40 : The right hand term is in [1,2] for h in [0,2^32) after rounding.
41 : To compute this in 64-bit integer math then, we scale the right
42 : hand term by 2^62 yielding:
43 :
44 : y ~ round( 2^62 (alpha_i + beta_i h) / 2^(62-q) )
45 : ~ floor( (a_i + b_i h) / 2^(62-q) + 0.5 )
46 : ~ (a_i + b_i h + 2^(61-q)) / 2^(62-q)
47 :
48 : where:
49 :
50 : a_i = round( 2^62 alpha_i )
51 : b_i = round( 2^30 beta_i ) */
52 :
53 1190663193 : if( FD_UNLIKELY( !x ) ) return 0UL;
54 :
55 1187550924 : int e = fd_ulong_find_msb( x ); /* In [0,63] */
56 1187550924 : ulong m = x << (63-e); /* In [2^63,2^64) */
57 1187550924 : int q = e >> 1; /* In [0,31] */
58 1187550924 : int r = e & 1; /* In [0,1] */
59 1187550924 : int i = (4*r) + (int)((m >> 61) & 3UL); /* In [0,8) */
60 1187550924 : ulong h = (ulong)(uint)(m >> 29); /* In [0,2^32) */
61 :
62 : /* These tables are autogenerated using long double arithmetic */
63 :
64 1187550924 : static ulong const a[8] = { 4611686018427387904UL, 5156021714044493574UL, 5648138799537240564UL, 6100687164736251614UL,
65 1187550924 : 6521908912666391106UL, 7291715835891894856UL, 7987674492471257551UL, 8627674528165471361UL };
66 1187550924 : static ulong const b[8] = { 126738030UL, 114579938UL, 105367127UL, 98073331UL,
67 1187550924 : 179234641UL, 162040502UL, 149011620UL, 138696634UL };
68 :
69 1187550924 : return (a[i] + b[i]*h + (1UL<<(61-q))) >> (62-q);
70 1190663193 : }
71 :
72 : ulong
73 300000000 : fd_ulong_round_sqrt( ulong x ) {
74 :
75 : /* Theory:
76 :
77 : To find the integer y such that y = round( sqrt(x) ) with ties
78 : rounding toward zero, note:
79 :
80 : (y-0.5)^2 < x <= (y+0.5)^2
81 : -> y^2 - y + 0.25 < x <= y^2 + y + 0.25
82 :
83 : Since x and y are integral, ties can never occur. Thus:
84 :
85 : -> y^2 - y + 0.25 < x < y^2 + y + 0.25
86 :
87 : Similarly, integral x and y means we can eliminate the 0.25 adds:
88 :
89 : -> y^2 - y + 1 <= x < y^2 + y + 1
90 :
91 : Let y^2 - y + 1 = x - r.
92 :
93 : -> x - r <= x < x - r + 2 y
94 : -> 0 <= r < 2 y
95 :
96 : That is, if y is round( sqrt(x) ) we must have:
97 :
98 : y^2 - y + 1 = x - r for some r in [0,2 y).
99 :
100 : Suppose we have a reasonable initial guess. Applying
101 : Newton-Raphson to:
102 :
103 : f(y) = y^2 - x
104 :
105 : for real valued x and y yields:
106 :
107 : y1 = y0 + (x - y0^2)/(2 y0)
108 :
109 : Naively generalizing to integral x and y yields:
110 :
111 : y1 = y0 + floor( (x - y0^2)/(2 y0) )
112 :
113 : If this converges, at convergence we have:
114 :
115 : (x - y^2 - r) / (2 y) = 0 for some r in [0,2 y)
116 : -> y^2 = x - r for some r in [0,2 y)
117 :
118 : This the naive implementation doesn't converge to the desired y.
119 : But if we tweak the N-R iteration to:
120 :
121 : y1 = y0 + floor( (x - y0^2 + y0 - 1) / (2 y0) )
122 :
123 : At convergence we have:
124 :
125 : x - y^2 + y - 1 - r = 0 for some r in [0,2 y)
126 : -> y^2 - y + 1 = x - r for some r in [0,2 y).
127 :
128 : which is the desired solution.
129 :
130 : N-R converges quadratically (i.e. the number of correct bits
131 : roughly doubles each iteration given a reasonable initial guess).
132 : The initial guess here is accurate to 4+ bits and the solution
133 : requires ~32 bits accuracy. So this will complete with at most ~3
134 : divisions. Though it is slightly faster for large x to use an
135 : unrolled implementation (e.g. iterate 3 quickly), we use a variable
136 : iteration count to be absolutely sure of convergence and to be
137 : faster when fed smaller inputs (as is done in typical usage ...
138 : where O(1) call is made with a moderate sized x).
139 :
140 : Since our initial guess is close, y^2 fits within 65 bits. Thus,
141 : the subtraction to compute x - y^2 will "do the right thing" with
142 : no special handling needed for wrapping.
143 :
144 : Since modern C/C++ use round toward zero for integer division, we
145 : have to tweak the numerator when it is negative to convert to floor
146 : style division. */
147 :
148 300000000 : if( FD_UNLIKELY( !x ) ) return 0UL;
149 :
150 296887731 : ulong y = fd_ulong_approx_sqrt( x );
151 :
152 588777390 : for(;;) {
153 588777390 : long den = (long)(y << 1);
154 588777390 : long num = (long)(x - y*y + y - 1UL);
155 588777390 : if( ((0L<=num) & (num<den)) ) break; /* Branch prob depends on x's distribution */
156 291889659 : y += (ulong)((num - fd_long_if( num>=0L, 0L, den-1L )) / den);
157 291889659 : }
158 :
159 296887731 : return y;
160 300000000 : }
161 :
162 : ulong
163 300000000 : fd_ulong_floor_sqrt( ulong x ) {
164 :
165 : /* Theory:
166 :
167 : To find the integer y such that y = floor( sqrt(x) ), note:
168 :
169 : y^2 <= x < (y+1)^2
170 : -> y^2 <= x < y^2 + 2 y + 1
171 :
172 : Let y^2 = x - r.
173 :
174 : -> x - r <= x < x - r + 2 y + 1
175 : -> 0 <= r < 2 y + 1
176 :
177 : That is, if y is floor( sqrt(x) ) we must have:
178 :
179 : y^2 = x - r for some r in [0,2 y + 1).
180 :
181 : Like the round implementation, naively generalizing N-R
182 : to integer doesn't converge to the correct y. But if
183 : we tweak the N-R iteration to:
184 :
185 : y1 = y0 + floor( (x - y0^2) / (2 y0 + 1) )
186 :
187 : At convergence we have:
188 :
189 : x - y^2 - r = 0 for some r in [0,2 y+1)
190 : -> y^2 = x - r for some r in [0,2 y).
191 :
192 : which is the desired solution.
193 :
194 : Similar considerations for convergence, wrapping and trunc vs floor
195 : division as the above. */
196 :
197 300000000 : if( FD_UNLIKELY( !x ) ) return 0UL;
198 :
199 296887731 : ulong y = fd_ulong_approx_sqrt( x );
200 :
201 547129983 : for(;;) {
202 547129983 : long den = (long)((y << 1) + 1UL);
203 547129983 : long num = (long)(x - y*y);
204 547129983 : if( ((0L<=num) & (num<den)) ) break; /* Branch prob depends on x's distribution */
205 250242252 : y += (ulong)((num - fd_long_if( num>=0L, 0L, den-1L )) / den);
206 250242252 : }
207 :
208 296887731 : return y;
209 300000000 : }
210 :
211 : ulong
212 300000000 : fd_ulong_ceil_sqrt( ulong x ) {
213 :
214 : /* Theory:
215 :
216 : To find the integer y such that y = ceil( sqrt(x) ), note that
217 : positive x:
218 :
219 : (y-1)^2 < x <= y
220 : -> y^2 - 2 y + 1 < x <= y^2
221 :
222 : Let y = x + r.
223 :
224 : -> x + r - 2 y + 1 < x <= x + r
225 : -> -2 y + 1 < -r <= 0
226 : -> 0 <= r < 2 y - 1
227 :
228 : That is, if y is ceil( sqrt(x) ) we must have:
229 :
230 : y^2 = x + r for some r in [0,2 y - 1).
231 :
232 : Like the round implementation, naively generalizing N-R
233 : to integer doesn't converge to the correct y. But if
234 : we tweak the N-R iteration to:
235 :
236 : y1 = y0 + ceil ( (x - y0^2 ) / (2 y0 - 1) )
237 : = y0 + floor( (x - y0^2 + 2 y0 - 2) / (2 y0 - 1) )
238 :
239 : At convergence we have:
240 :
241 : x - y^2 + r = 0 for some r in [0,2 y-1)
242 : -> y^2 = x + r for some r in [0,2 y-1).
243 :
244 : which is the desired solution.
245 :
246 : Similar considerations for convergence, wrapping and trunc vs floor
247 : division as the above. */
248 :
249 300000000 : if( FD_UNLIKELY( !x ) ) return 0UL;
250 :
251 296887731 : ulong y = fd_ulong_approx_sqrt( x );
252 :
253 625390167 : for(;;) {
254 625390167 : ulong tmp = y << 1;
255 625390167 : long den = (long)(tmp - 1UL);
256 625390167 : long num = (long)(x - y*y + tmp - 2UL);
257 625390167 : if( ((0L<=num) & (num<den)) ) break; /* Branch prob depends on x's distribution */
258 328502436 : y += (ulong)((num - fd_long_if( num>=0L, 0L, den-1L )) / den);
259 328502436 : }
260 :
261 296887731 : return y;
262 300000000 : }
263 :
264 : ulong
265 1190663193 : fd_ulong_approx_cbrt( ulong x ) {
266 :
267 : /* This is basically the same as the approx_sqrt but only uses 1
268 : significant bit for u. */
269 :
270 1190663193 : if( FD_UNLIKELY( !x ) ) return 0UL;
271 :
272 1187550924 : int e = fd_ulong_find_msb( x ); /* In [0,63] */
273 1187550924 : ulong m = x << (63-e); /* In [2^63,2^64) */
274 1187550924 : int q = (int)(((uint)e) / 3U); /* In [0,21] */
275 1187550924 : int r = (int)(((uint)e) % 3U); /* In [0,2) */
276 1187550924 : int i = 2*r + (int)((m >> 62) & 1UL); /* In [0,6) */
277 1187550924 : ulong h = (ulong)(uint)(m >> 30); /* In [0,2^32) */
278 :
279 1187550924 : static ulong const a[6] = { 4611686018427387904UL, 5279062667477898215UL, 5810360290122541960UL,
280 1187550924 : 6651202178469583219UL, 7320595236998672907UL, 8379989631760464848UL };
281 1187550924 : static ulong const b[6] = { 155385734UL, 123702367UL, 195773758UL,
282 1187550924 : 155855216UL, 246659478UL, 196365268UL };
283 :
284 1187550924 : return (a[i] + b[i]*h + (1UL<<(61-q))) >> (62-q);
285 1190663193 : }
286 :
287 : ulong
288 300000000 : fd_ulong_round_cbrt( ulong x ) {
289 :
290 : /* This works the same as the above with:
291 : (y-0.5)^3 <= x < (y+0.5)^3
292 : -> 4 y^3 - 6 y^2 + 3 y = 4 x - r for some r in [0,12 y^2 + 1)
293 : and Newton-Raphson generalized to integer arithmetic applied to:
294 : f(y) = y^3 - x
295 : yielding:
296 : y1 = y0 + floor( 4 x - 4 y0^3 + 6 y0^2 - 3 y0) / (12 y0^2 + 1) ) */
297 :
298 300000000 : if( FD_UNLIKELY( !x ) ) return 0UL;
299 :
300 296887731 : ulong y = fd_ulong_approx_cbrt( x );
301 :
302 483466503 : for(;;) {
303 483466503 : ulong ysq = y*y;
304 483466503 : long num = (long)(4UL*(x - y*ysq) + 6UL*ysq - 3UL*y);
305 483466503 : long den = (long)(12UL*ysq + 1UL);
306 483466503 : if( ((0L<=num) & (num<den)) ) break; /* Branch prob depends on x's distribution */
307 186578772 : y += (ulong)((num - fd_long_if( num>=0L, 0L, den-1L )) / den);
308 186578772 : }
309 :
310 296887731 : return y;
311 300000000 : }
312 :
313 : ulong
314 300000000 : fd_ulong_floor_cbrt( ulong x ) {
315 :
316 : /* This works the same as the above with:
317 : y^3 <= x < (y+1)^3
318 : -> y^3 = x - r for some r in [0,3 y^2 + 3 y + 1)
319 : and Newton-Raphson generalized to integer arithmetic applied to:
320 : f(y) = y^3 - x
321 : yielding:
322 : y1 = y0 + floor( (x - y^3) / (3 y^2 + 3 y + 1) ) */
323 :
324 300000000 : if( FD_UNLIKELY( !x ) ) return 0UL;
325 :
326 296887731 : ulong y = fd_ulong_approx_cbrt( x );
327 :
328 540024249 : for(;;) {
329 540024249 : ulong ysq = y*y;
330 540024249 : long num = (long)(x - y*ysq);
331 540024249 : long den = (long)(3UL*(ysq+y) + 1UL);
332 540024249 : if( ((0L<=num) & (num<den)) ) break; /* Branch prob depends on x's distribution */
333 243136518 : y += (ulong)((num - fd_long_if( num>=0L, 0L, den-1L )) / den);
334 243136518 : }
335 :
336 296887731 : return y;
337 300000000 : }
338 :
339 : ulong
340 300000000 : fd_ulong_ceil_cbrt( ulong x ) {
341 :
342 : /* This works the same as the above with:
343 : (y-1)^3 < x <= y^3
344 : -> y^3 = x + r for some r in [0,3 y^2 - 3 y + 1)
345 : and Newton-Raphson generalized to integer arithmetic applied to:
346 : f(y) = y^3 - x
347 : yielding:
348 : y1 = y0 + ceil ( (x - y^3 ) / (3 y^2 - 3 y + 1) )
349 : = y0 + floor( (x - y^3 + 3 y^2 - 3 y) / (3 y^2 - 3 y + 1) ) */
350 :
351 300000000 : if( FD_UNLIKELY( !x ) ) return 0UL;
352 :
353 296887731 : ulong y = fd_ulong_approx_cbrt( x );
354 :
355 545419869 : for(;;) {
356 545419869 : ulong ysq = y*y;
357 545419869 : ulong tmp = 3UL*(ysq-y);
358 545419869 : long num = (long)(x - y*ysq + tmp);
359 545419869 : long den = (long)(tmp + 1UL);
360 545419869 : if( ((0L<=num) & (num<den)) ) break; /* Branch prob depends on x's distribution */
361 248532138 : y += (ulong)((num - fd_long_if( num>=0L, 0L, den-1L )) / den);
362 248532138 : }
363 :
364 296887731 : return y;
365 300000000 : }
|