LCOV - code coverage report
Current view: top level - util/math - fd_fxp.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 202 232 87.1 %
Date: 2025-01-08 12:08:44 Functions: 40 70 57.1 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_util_math_fd_fxp_h
       2             : #define HEADER_fd_src_util_math_fd_fxp_h
       3             : 
       4             : /* Large set of primitives for portable fixed point arithmetic with
       5             :    correct rounding and/or overflow detection.  Strongly targeted at
       6             :    platforms with reasonable performance C-style 64b unsigned integer
       7             :    arithmetic under the hood.  Likewise, strongly targets unsigned fixed
       8             :    point representations that fit within 64b and have 30 fractional
       9             :    bits.  Adapted from the Pyth Oracle. */
      10             : 
      11             : #include "fd_sqrt.h"
      12             : 
      13             : #if !FD_HAS_INT128
      14             : #include "../bits/fd_uwide.h"
      15             : #endif
      16             : 
      17             : FD_PROTOTYPES_BEGIN
      18             : 
      19             : /* Private API ********************************************************/
      20             : 
      21             : #if FD_HAS_INT128 /* See the uwide-based implementations for details how these work */
      22             : 
      23  2559380148 : FD_FN_CONST static inline uint128 fd_fxp_private_expand( ulong x ) { return ((uint128)x)<<30; }
      24  1800000000 : static inline ulong fd_fxp_private_contract( uint128 x, ulong * _c ) { x >>= 30; *_c = (ulong)(x>>64); return (ulong)x; }
      25             : 
      26             : /* Return the low 64-bits of a uint128 and store the high 64-bits at *_h */
      27             : 
      28  1785933144 : static inline ulong fd_fxp_private_split( uint128 x, ulong * _h ) { *_h = (ulong)(x>>64); return (ulong)x; }
      29             : 
      30             : #else /* uwide based implementations from pyth */
      31             : 
      32             : /* Helper used by the below that computes
      33             :      2^64 yh + yl = 2^30 x
      34             :    (i.e. widen a 64b number to 128b and shift it left by 30.)  Exact.
      35             :    yh<2^30. */
      36             : 
      37             : static inline void fd_fxp_private_expand( ulong * _yh, ulong * _yl, ulong x ) { *_yh = x>>34; *_yl = x<<30; }
      38             : 
      39             : /* Helper used by the below that computes
      40             :      2^64 c + y = floor( (2^64 xh + xl) / 2^30 )
      41             :    (i.e. shift a 128b uint right by 30).  Exact.  c<2^34. */
      42             : 
      43             : static inline ulong fd_fxp_private_contract( ulong xh, ulong xl, ulong * _c ) { *_c = xh>>30; return (xh<<34) | (xl>>30); }
      44             : 
      45             : #endif
      46             : 
      47             : /* FIXED POINT ADDITION ***********************************************/
      48             : 
      49             : /* Compute:
      50             :      (c 2^64 + z)/2^30 = x/2^30 + y/2^30
      51             :    Exact.  c will be in [0,1].  Fast variant assumes that the user knows
      52             :    c is zero or is not needed. */
      53             : 
      54   300000000 : static inline ulong fd_fxp_add( ulong x, ulong y, ulong * _c ) { *_c = (ulong)(x>(~y)); return x + y; }
      55   300000000 : FD_FN_CONST static inline ulong fd_fxp_add_fast( ulong x, ulong y ) { return x + y; }
      56             : 
      57             : /* FIXED POINT SUBTRACTION ********************************************/
      58             : 
      59             : /* Compute:
      60             :      z/2^30 = (2^64 b + x)/2^30 - y/2^30
      61             :    Exact.  b will be in [0,1].  Fast variant assumes that the user knows
      62             :    b is zero (i.e. x>=y) or is not needed. */
      63             : 
      64   300000000 : static inline ulong fd_fxp_sub( ulong x, ulong y, ulong * _b ) { *_b = (ulong)(x<y); return x - y; }
      65   300000000 : FD_FN_CONST static inline ulong fd_fxp_sub_fast( ulong x, ulong y ) { return x - y; }
      66             : 
      67             : /* FIXED POINT MULTIPLICATION *****************************************/
      68             : 
      69             : /* Compute:
      70             :      (2^64 c + z)/2^30 ~ (x/2^30)(y/2^30)
      71             :    under various rounding modes.  c<2^34. */
      72             : 
      73             : #if FD_HAS_INT128 /* See the uwide-based implementations for details how these work */
      74             : 
      75             : static inline ulong
      76             : fd_fxp_mul_rtz( ulong   x,
      77             :                 ulong   y,
      78   300000000 :                 ulong * _c ) {
      79   300000000 :   return fd_fxp_private_contract( ((uint128)x)*((uint128)y), _c );
      80   300000000 : }
      81             : 
      82             : static inline ulong
      83             : fd_fxp_mul_raz( ulong   x,
      84             :                 ulong   y,
      85   300000000 :                 ulong * _c ) {
      86   300000000 :   return fd_fxp_private_contract( ((uint128)x)*((uint128)y)+(uint128)((1UL<<30)-1UL), _c );
      87   300000000 : }
      88             : 
      89             : static inline ulong
      90             : fd_fxp_mul_rnz( ulong   x,
      91             :                 ulong   y,
      92   300000000 :                 ulong * _c ) {
      93   300000000 :   return fd_fxp_private_contract( ((uint128)x)*((uint128)y)+(uint128)((1UL<<29)-1UL), _c );
      94   300000000 : }
      95             : 
      96             : static inline ulong
      97             : fd_fxp_mul_rna( ulong   x,
      98             :                 ulong   y,
      99   300000000 :                 ulong * _c ) {
     100   300000000 :   return fd_fxp_private_contract( ((uint128)x)*((uint128)y)+(uint128)(1UL<<29), _c );
     101   300000000 : }
     102             : 
     103             : static inline ulong
     104             : fd_fxp_mul_rne( ulong   x,
     105             :                 ulong   y,
     106   300000000 :                 ulong * _c ) {
     107   300000000 :   uint128 z = ((uint128)x)*((uint128)y);
     108   300000000 :   return fd_fxp_private_contract( z + (uint128)(((1UL<<29)-1UL) + (((ulong)(z>>30)) & 1UL)), _c );
     109   300000000 : }
     110             : 
     111             : static inline ulong
     112             : fd_fxp_mul_rno( ulong   x,
     113             :                 ulong   y,
     114   300000000 :                 ulong * _c ) {
     115   300000000 :   uint128 z = ((uint128)x)*((uint128)y);
     116   300000000 :   return fd_fxp_private_contract( z + (uint128)((1UL<<29) - (((ulong)(z>>30)) & 1UL)), _c );
     117   300000000 : }
     118             : 
     119             : #else /* uwide based implementations from pyth */
     120             : 
     121             : /* rtz -> Round toward zero (aka truncate rounding)
     122             :    Fast variant assumes x*y < 2^64 (i.e. exact result < ~2^4).
     123             :    Based on:
     124             :         z/2^30 ~ (x/2^30)(y/2^30)
     125             :      -> z      ~ x y / 2^30
     126             :    With RTZ rounding: 
     127             :         z      = floor( x y / 2^30 )
     128             :                = (x*y) >> 30
     129             :    As x*y might overflow 64 bits, we need to do a 64b*64b->128b
     130             :    multiplication (fd_uwide_mul) and a 128b shift right by 30
     131             :    (fd_fxp_private_contract).
     132             :    
     133             :    Fastest style of rounding.  Rounding error in [0,1) ulp.
     134             :    (ulp==2^-30). */
     135             : 
     136             : static inline ulong
     137             : fd_fxp_mul_rtz( ulong   x,
     138             :                 ulong   y,
     139             :                 ulong * _c ) {
     140             :   ulong zh,zl; fd_uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 so no overflow */
     141             :   return fd_fxp_private_contract( zh,zl, _c );
     142             : }
     143             : 
     144             : /* raz -> Round away from zero
     145             :    Fast variant assumes x*y < 2^64-2^30+1 (i.e. exact result < ~2^4)
     146             :    Based on:
     147             :         z/2^30 ~ (x/2^30)(y/2^30)
     148             :      -> z      ~ x y / 2^30
     149             :    With RAZ rounding: 
     150             :         z      = ceil( x y / 2^30 )
     151             :                = floor( (x y + 2^30 - 1) / 2^30 )
     152             :                = (x*y + 2^30 -1) >> 30
     153             :    As x*y might overflow 64 bits, we need to do a 64b*64b->128b
     154             :    multiplication (fd_uwide_mul), a 64b increment of a 128b
     155             :    (fd_uwide_inc) and a 128b shift right by 30
     156             :    (fd_fxp_private_contract).
     157             :    
     158             :    Slightly more expensive (one fd_uwide_inc) than RTZ rounding.
     159             :    Rounding error in (-1,0] ulp.  (ulp==2^-30). */
     160             : 
     161             : static inline ulong
     162             : fd_fxp_mul_raz( ulong   x,
     163             :                 ulong   y,
     164             :                 ulong * _c ) {
     165             :   ulong zh,zl; fd_uwide_mul( &zh,&zl, x, y );    /* <= 2^128 - 2^65 + 1 so no overflow */
     166             :   fd_uwide_inc( &zh,&zl, zh,zl, (1UL<<30)-1UL ); /* <= 2^128 - 2^65 + 2^30 so no overflow */
     167             :   return fd_fxp_private_contract( zh,zl, _c );
     168             : }
     169             : 
     170             : /* rnz -> Round nearest with ties toward zero
     171             :    Fast variant assumes x*y < 2^64-2^29+1 (i.e. exact result < ~2^4)
     172             :    Based on:
     173             :         z/2^30 ~ (x/2^30)(y/2^30)
     174             :      -> z      ~ x y / 2^30
     175             :    Let frac be the least significant 30 bits of x*y.  If frac<2^29
     176             :    (frac>2^29), we should round down (up).  frac==2^29 is a tie and we
     177             :    should round down.  If we add 2^29-1 to frac, the result will be
     178             :    <2^30 when frac<=2^29 and will be >=2^30.  Thus bit 30 of frac+2^29-1
     179             :    indicates whether we need to round up or down.  This yields:
     180             :         z = floor( x y + 2^29 - 1 ) / 2^30 )
     181             :           = (x*y + 2^29 - 1) >> 30
     182             :    As x*y might overflow 64 bits, we need to do a 64b*64b->128b
     183             :    multiplication (fd_uwide_mul), a 64b increment of a 128b
     184             :    (fd_uwide_inc) and a 128b shift right by 30
     185             :    (fd_fxp_private_contract).
     186             : 
     187             :    Slightly more expensive (one fd_uwide_inc) than RTZ rounding.
     188             :    Rounding error in (-1/2,1/2] ulp.  (ulp==2^-30). */
     189             : 
     190             : static inline ulong
     191             : fd_fxp_mul_rnz( ulong   x,
     192             :                 ulong   y,
     193             :                 ulong * _c ) {
     194             :   ulong zh,zl; fd_uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 so no overflow */
     195             :   fd_uwide_inc( &zh,&zl, zh,zl, (1UL<<29)-1UL ); /* <= 2^128 - 2^65 + 2^29 so no overflow */
     196             :   return fd_fxp_private_contract( zh,zl, _c );
     197             : }
     198             : 
     199             : /* rna -> Round nearest with ties away from zero (aka grade school rounding)
     200             :    Fast variant assumes x*y < 2^64-2^29 (i.e. exact result < ~2^4)
     201             :    Based on:
     202             :         z/2^30 ~ (x/2^30)(y/2^30)
     203             :      -> z      ~ x y / 2^30
     204             :    Let frac be the least significant 30 bits of x*y.  If frac<2^29
     205             :    (frac>2^29), we should round down (up).  frac==2^29 is a tie and we
     206             :    should round up.  If we add 2^29-1 to frac, the result will be <2^30
     207             :    when frac<=2^29 and will be >=2^30.  Thus bit 30 of frac+2^29
     208             :    indicates whether we need to round up or down.  This yields:
     209             :         z = floor( x y + 2^29 ) / 2^30 )
     210             :           = (x*y + 2^29) >> 30
     211             :    As x*y might overflow 64 bits, we need to do a 64b*64b->128b
     212             :    multiplication (fd_uwide_mul), a 64b increment of a 128b
     213             :    (fd_uwide_inc) and a 128b shift right by 30
     214             :    (fd_fxp_private_contract).
     215             : 
     216             :    Slightly more expensive (one fd_uwide_inc) than RTZ rounding.
     217             :    Rounding error in [-1/2,1/2) ulp.  (ulp==2^-30). */
     218             : 
     219             : static inline ulong
     220             : fd_fxp_mul_rna( ulong   x,
     221             :                 ulong   y,
     222             :                 ulong * _c ) {
     223             :   ulong zh,zl; fd_uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 so no overflow */
     224             :   fd_uwide_inc( &zh,&zl, zh,zl, 1UL<<29 );    /* <= 2^128 - 2^65 + 2^29 so no overflow */
     225             :   return fd_fxp_private_contract( zh,zl, _c );
     226             : }
     227             : 
     228             : /* rne -> Round toward nearest with ties toward even (aka banker's rounding / IEEE style rounding)
     229             :    Fast variant assumes x*y < 2^64-2^29 (i.e. exact result < ~2^4)
     230             :    Based on the observation that rnz / rna rounding should be used when
     231             :    floor(x*y/2^30) is even/odd.  That is, use the rnz / rna increment of
     232             :    2^29-1 / 2^29 when bit 30 of x*y is 0 / 1.  As x*y might overflow 64
     233             :    bits, we need to do a 64b*64b->128b multiplication (fd_uwide_mul), a
     234             :    64b increment of a 128b (fd_uwide_inc) and a 128b shift right by 30
     235             :    (fd_fxp_private_contract).
     236             :    
     237             :    The most accurate style of rounding usually and somewhat more
     238             :    expensive (some sequentially dependent bit ops and one fd_uwide_inc)
     239             :    than RTZ rounding.  Rounding error in [-1/2,1/2] ulp (unbiased).
     240             :    (ulp==2^-30). */
     241             : 
     242             : static inline ulong
     243             : fd_fxp_mul_rne( ulong   x,
     244             :                 ulong   y,
     245             :                 ulong * _c ) {
     246             :   ulong zh,zl; fd_uwide_mul( &zh,&zl, x, y );   /* <= 2^128 - 2^65 + 1 */
     247             :   ulong t = ((1UL<<29)-1UL) + ((zl>>30) & 1UL); /* t = 2^29-1 / 2^29 when bit 30 of x*y is 0 / 1 */
     248             :   fd_uwide_inc( &zh,&zl, zh,zl, t );            /* <= 2^128 - 2^65 + 2^29 */
     249             :   return fd_fxp_private_contract( zh,zl, _c );
     250             : }
     251             : 
     252             : /* rno -> Round toward nearest with ties toward odd
     253             :    Fast variant assumes x*y < 2^64-2^29 (i.e. exact result < ~2^4)
     254             :    Same as rne with the parity flipped for the increment.  As x*y might
     255             :    overflow 64 bits, we need to do a 64b*64b->128b multiplication
     256             :    (fd_uwide_mul), a 64b increment of a 128b (fd_uwide_inc) and a 128b
     257             :    shift right by 30 (fd_fxp_private_contract).
     258             :    
     259             :    Somewhat more expensive (some sequentially dependent bit ops and one
     260             :    fd_uwide_inc) than RTZ rounding.  Rounding error in [-1/2,1/2] ulp
     261             :    (unbiased).  (ulp==2^-30). */
     262             : 
     263             : static inline ulong
     264             : fd_fxp_mul_rno( ulong   x,
     265             :                 ulong   y,
     266             :                 ulong * _c ) {
     267             :   ulong zh,zl; fd_uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 */
     268             :   ulong t = (1UL<<29) - ((zl>>30) & 1UL);     /* t = 2^29 / 2^29-1 when bit 30 of x*y is 0 / 1 */
     269             :   fd_uwide_inc( &zh,&zl, zh,zl, t );          /* <= 2^128 - 2^65 + 2^29 */
     270             :   return fd_fxp_private_contract( zh,zl, _c );
     271             : }
     272             : 
     273             : #endif
     274             : 
     275   300000000 : FD_FN_CONST static inline ulong fd_fxp_mul_rtz_fast( ulong x, ulong y ) { return (x*y)                 >> 30; }
     276   300000000 : FD_FN_CONST static inline ulong fd_fxp_mul_raz_fast( ulong x, ulong y ) { return (x*y+((1UL<<30)-1UL)) >> 30; }
     277   300000000 : FD_FN_CONST static inline ulong fd_fxp_mul_rnz_fast( ulong x, ulong y ) { return (x*y+((1UL<<29)-1UL)) >> 30; }
     278   300000000 : FD_FN_CONST static inline ulong fd_fxp_mul_rna_fast( ulong x, ulong y ) { return (x*y+ (1UL<<29))      >> 30; }
     279             : 
     280             : FD_FN_CONST static inline ulong
     281             : fd_fxp_mul_rne_fast( ulong x,
     282   300000000 :                      ulong y ) {
     283   300000000 :   ulong z = x*y;
     284   300000000 :   ulong t = ((1UL<<29)-1UL) + ((z>>30) & 1UL); /* t = 2^29-1 / 2^29 when bit 30 of x*y is 0 / 1 */
     285   300000000 :   return (z + t) >> 30;
     286   300000000 : }
     287             : 
     288             : FD_FN_CONST static inline ulong
     289             : fd_fxp_mul_rno_fast( ulong x,
     290   300000000 :                      ulong y ) {
     291   300000000 :   ulong z = x*y;
     292   300000000 :   ulong t = (1UL<<29) - ((z>>30) & 1UL); /* t = 2^29-1 / 2^29 when bit 30 of x*y is 0 / 1 */
     293   300000000 :   return (z + t) >> 30;
     294   300000000 : }
     295             : 
     296             : /* Other rounding modes:
     297             :      rdn -> Round down                   / toward floor / toward -inf ... same as rtz for unsigned
     298             :      rup -> Round up                     / toward ceil  / toward +inf ... same as raz for unsigned
     299             :      rnd -> Round nearest with ties down / toward floor / toward -inf ... same as rnz for unsigned
     300             :      rnu -> Round nearest with ties up   / toward ceil  / toward -inf ... same as rna for unsigned */
     301             : 
     302           0 : static inline ulong fd_fxp_mul_rdn( ulong x, ulong y, ulong * _c ) { return fd_fxp_mul_rtz( x, y, _c ); }
     303           0 : static inline ulong fd_fxp_mul_rup( ulong x, ulong y, ulong * _c ) { return fd_fxp_mul_raz( x, y, _c ); }
     304           0 : static inline ulong fd_fxp_mul_rnd( ulong x, ulong y, ulong * _c ) { return fd_fxp_mul_rnz( x, y, _c ); }
     305           0 : static inline ulong fd_fxp_mul_rnu( ulong x, ulong y, ulong * _c ) { return fd_fxp_mul_rna( x, y, _c ); }
     306             : 
     307           0 : FD_FN_CONST static inline ulong fd_fxp_mul_rdn_fast( ulong x, ulong y ) { return fd_fxp_mul_rtz_fast( x, y ); }
     308           0 : FD_FN_CONST static inline ulong fd_fxp_mul_rup_fast( ulong x, ulong y ) { return fd_fxp_mul_raz_fast( x, y ); }
     309           0 : FD_FN_CONST static inline ulong fd_fxp_mul_rnd_fast( ulong x, ulong y ) { return fd_fxp_mul_rnz_fast( x, y ); }
     310           0 : FD_FN_CONST static inline ulong fd_fxp_mul_rnu_fast( ulong x, ulong y ) { return fd_fxp_mul_rna_fast( x, y ); }
     311             : 
     312             : /* FIXED POINT DIVISION ***********************************************/
     313             : 
     314             : /* Compute:
     315             :      (2^64 c + z)/2^30 ~ (x/2^30)/(y/2^30)
     316             :    under various rounding modes.  c<2^30 if y is non-zero.  Returns
     317             :    c=ULONG_MAX,z=0 if y is zero. */
     318             : 
     319             : #if FD_HAS_INT128 /* See the uwide-based implementations for details how these work */
     320             : 
     321             : static inline ulong
     322             : fd_fxp_div_rtz( ulong   x,
     323             :                 ulong   y,
     324   300000000 :                 ulong * _c ) {
     325   300000000 :   if( !y ) { *_c = ULONG_MAX; return 0UL; }
     326   297655524 :   return fd_fxp_private_split( fd_fxp_private_expand( x ) / (uint128)y, _c );
     327   300000000 : }
     328             : 
     329             : static inline ulong
     330             : fd_fxp_div_raz( ulong   x,
     331             :                 ulong   y,
     332   300000000 :                 ulong * _c ) {
     333   300000000 :   if( !y ) { *_c = ULONG_MAX; return 0UL; }
     334   297655524 :   return fd_fxp_private_split( (fd_fxp_private_expand( x )+(uint128)(y-1UL)) / (uint128)y, _c );
     335   300000000 : }
     336             : 
     337             : static inline ulong
     338             : fd_fxp_div_rnz( ulong   x,
     339             :                 ulong   y,
     340   300000000 :                 ulong * _c ) {
     341   300000000 :   if( !y ) { *_c = ULONG_MAX; return 0UL; }
     342   297655524 :   return fd_fxp_private_split( (fd_fxp_private_expand( x )+(uint128)((y-1UL)>>1)) / (uint128)y, _c );
     343   300000000 : }
     344             : 
     345             : static inline ulong
     346             : fd_fxp_div_rna( ulong   x,
     347             :                 ulong   y,
     348   300000000 :                 ulong * _c ) {
     349   300000000 :   if( !y ) { *_c = ULONG_MAX; return 0UL; }
     350   297655524 :   return fd_fxp_private_split( (fd_fxp_private_expand( x )+(uint128)(y>>1)) / (uint128)y, _c );
     351   300000000 : }
     352             : 
     353             : static inline ulong
     354             : fd_fxp_div_rne( ulong   x,
     355             :                 ulong   y,
     356   300000000 :                 ulong * _c ) {
     357   300000000 :   if( !y ) { *_c = ULONG_MAX; return 0UL; }
     358   297655524 :   uint128 n    = fd_fxp_private_expand( x );
     359   297655524 :   uint128 q    = n / (uint128)y;
     360   297655524 :   ulong   r    = (ulong)(n - q*y);
     361   297655524 :   ulong   flhy = y>>1;
     362   297655524 :   return fd_fxp_private_split( q + (uint128)(ulong)( (r>flhy) | ((r==flhy) & !!((~y) & ((ulong)q) & 1UL)) ), _c );
     363   300000000 : }
     364             : 
     365             : static inline ulong
     366             : fd_fxp_div_rno( ulong   x,
     367             :                 ulong   y,
     368   300000000 :                 ulong * _c ) {
     369   300000000 :   if( !y ) { *_c = ULONG_MAX; return 0UL; }
     370   297655524 :   uint128 n    = fd_fxp_private_expand( x );
     371   297655524 :   uint128 q    = n / (uint128)y;
     372   297655524 :   ulong   r    = (ulong)(n - q*y);
     373   297655524 :   ulong   flhy = y>>1;
     374   297655524 :   return fd_fxp_private_split( q + (uint128)(ulong)( (r>flhy) | ((r==flhy) & !!((~y) & (~(ulong)q) & 1UL)) ), _c );
     375   300000000 : }
     376             : 
     377             : #else /* uwide based implementations from pyth */
     378             : 
     379             : /* rtz -> Round toward zero (aka truncate rounding)
     380             :    Fast variant assumes y!=0 and x<2^34 (i.e. exact result < ~2^34)
     381             :    Based on:
     382             :         z/2^30 ~ (x/2^30) / (y/2^30)
     383             :      -> z      ~ 2^30 x / y
     384             :    With RTZ rounding: 
     385             :         z      = floor( 2^30 x / y )
     386             :    As 2^30 x might overflow 64 bits, we need to expand x
     387             :    (fd_fxp_private_expand) and then use a 128b/64b -> 128b divider.
     388             :    (fd_uwide_div).
     389             : 
     390             :    Fastest style of rounding.  Rounding error in [0,1) ulp.
     391             :    (ulp==2^-30). */
     392             : 
     393             : static inline ulong
     394             : fd_fxp_div_rtz( ulong   x,
     395             :                 ulong   y,
     396             :                 ulong * _c ) {
     397             :   if( !y ) { *_c = ULONG_MAX; return 0UL; }         /* Handle divide by zero */
     398             :   ulong zh,zl; fd_fxp_private_expand( &zh,&zl, x ); /* 2^30 x  <= 2^94-2^30 so no overflow */
     399             :   fd_uwide_div( &zh,&zl, zh,zl, y );                /* <zh,zl> <= 2^94-2^30 so no overflow */
     400             :   *_c = zh; return zl;
     401             : }
     402             : 
     403             : /* raz -> Round away from zero
     404             :    Fast variant assumes y!=0 and 2^30*x+y-1<2^64 (i.e. exact result < ~2^34)
     405             :    Based on:
     406             :         z/2^30 ~ (x/2^30) / (y/2^30)
     407             :      -> z      ~ 2^30 x / y
     408             :    With RAZ rounding: 
     409             :         z      = ceil( 2^30 x / y )
     410             :                = floor( (2^30 x + y - 1) / y )
     411             :    As 2^30 x might overflow 64 bits, we need to expand x
     412             :    (fd_fxp_private_expand), increment it by the 64b y-1 (fd_uwide_inc)
     413             :    and then use a 128b/64b->128b divider (fd_uwide_div).
     414             : 
     415             :    Slightly more expensive (one fd_uwide_inc) than RTZ rounding.
     416             :    Rounding error in (-1,0] ulp. (ulp==2^-30). */
     417             : 
     418             : static inline ulong
     419             : fd_fxp_div_raz( ulong   x,
     420             :                 ulong   y,
     421             :                 ulong * _c ) {
     422             :   if( !y ) { *_c = ULONG_MAX; return 0UL; }         /* Handle divide by zero */
     423             :   ulong zh,zl; fd_fxp_private_expand( &zh,&zl, x ); /* 2^30 x  <= 2^94-2^30 so no overflow */
     424             :   fd_uwide_inc( &zh,&zl, zh,zl, y-1UL );            /* <zh,zl> <= 2^94+2^64-2^30-2 so no overflow */
     425             :   fd_uwide_div( &zh,&zl, zh,zl, y );                /* <zh,zl> = ceil( 2^30 x / y ) <= 2^94-2^30 so no overflow */
     426             :   *_c = zh; return zl;
     427             : }
     428             : 
     429             : /* rnz -> Round nearest with ties toward zero
     430             :    Fast variant assumes y!=0 and 2^30*x+floor((y-1)/2)<2^64 (i.e. exact result < ~2^34)
     431             : 
     432             :    Consider:
     433             :      z = floor( (2^30 x + floor( (y-1)/2 )) / y )
     434             :    where y>0.
     435             :    
     436             :    If y is even:                                   odd
     437             :      z       = floor( (2^30 x + (y/2) - 1) / y )     = floor( (2^30 x + (y-1)/2) / y )
     438             :    or:
     439             :      z y + r = 2^30 x + (y/2)-1                      = 2^30 x + (y-1)/2
     440             :    for some r in [0,y-1].  Or:
     441             :      z y     = 2^30 x + delta                        = 2^30 x + delta                        
     442             :    where:
     443             :      delta   in [-y/2,y/2-1]                         in [-y/2+1/2,y/2-1/2]
     444             :    or:
     445             :      z       = 2^30 x / y + epsilon                  = 2^30 x / y + epsilon
     446             :    where:
     447             :      epsilon in [-1/2,1/2-1/y]                       in [-1/2+1/(2y),1/2-1/(2y)]
     448             :    Thus we have:
     449             :      2^30 x/y - 1/2 <= z < 2^30 x/y + 1/2            2^30 x/y - 1/2 < z < 2^30 x/y + 1/2
     450             : 
     451             :    Combining yields:
     452             : 
     453             :      2^30 x/y - 1/2 <= z < 2^30 x/y + 1/2
     454             : 
     455             :    Thus, the z computed as per the above is the RNZ rounded result.  As
     456             :    2^30 x might overflow 64 bits, we need to expand x
     457             :    (fd_fxp_private_expand), increment it by the 64b (y-1)>>1
     458             :    (fd_uwide_inc) and then use a 128b/64b->128b divider (fd_uwide_div).
     459             : 
     460             :    Slightly more expensive (one fd_uwide_inc) than RTZ rounding.
     461             :    Rounding error in (-1/2,1/2] ulp. (ulp==2^-30). */
     462             : 
     463             : static inline ulong
     464             : fd_fxp_div_rnz( ulong   x,
     465             :                 ulong   y,
     466             :                 ulong * _c ) {
     467             :   if( !y ) { *_c = ULONG_MAX; return 0UL; }         /* Handle divide by zero */
     468             :   ulong zh,zl; fd_fxp_private_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */
     469             :   fd_uwide_inc( &zh,&zl, zh,zl, (y-1UL)>>1 );       /* <zh,zl> <= 2^94-2^30 + 2^63-1 so no overflow */
     470             :   fd_uwide_div( &zh,&zl, zh,zl, y );                /* <zh,zl> <= ceil(2^30 x/y) <= 2^94-2^30 so no overflow */
     471             :   *_c = zh; return zl;
     472             : }
     473             : 
     474             : /* rna -> Round nearest with ties away from zero (aka grade school rounding)
     475             :    Fast variant assumes y!=0 and 2^30*x+floor(y/2)<2^64 (i.e. exact result < ~2^34)
     476             : 
     477             :    Consider:
     478             :      z = floor( (2^30 x + floor( y/2 )) / y )
     479             :    where y>0.
     480             :    
     481             :    If y is even:                                   odd
     482             :      z       = floor( (2^30 x + (y/2)) / y )         = floor( (2^30 x + (y-1)/2) / y )
     483             :    or:
     484             :      z y + r = 2^30 x + (y/2)                        = 2^30 x + (y-1)/2
     485             :    for some r in [0,y-1].  Or:
     486             :      z y     = 2^30 x + delta                        = 2^30 x + delta                        
     487             :    where:
     488             :      delta   in [-y/2+1,y/2]                         in [-y/2+1/2,y/2-1/2]
     489             :    or:
     490             :      z       = 2^30 x / y + epsilon                  = 2^30 x / y + epsilon
     491             :    where:
     492             :      epsilon in [-1/2+1/y,1/2]                       in [-1/2+1/(2y),1/2-1/(2y)]
     493             :    Thus we have:
     494             :      2^30 x/y - 1/2 < z <= 2^30 x/y + 1/2            2^30 x/y - 1/2 < z < 2^30 x/y + 1/2
     495             : 
     496             :    Combining yields:
     497             : 
     498             :      2^30 x/y - 1/2 < z <= 2^30 x/y + 1/2
     499             : 
     500             :    Thus, the z computed as per the above is the RNA rounded result.  As
     501             :    2^30 x might overflow 64 bits, we need to expand x
     502             :    (fd_fxp_private_expand), increment it by the 64b y>>1 (fd_uwide_inc)
     503             :    and then use a 128b/64b->128b divider (fd_uwide_div).
     504             : 
     505             :    Slightly more expensive (one fd_uwide_inc) than RTZ rounding.
     506             :    Rounding error in [-1/2,1/2) ulp. (ulp==2^-30).
     507             :    
     508             :    Probably worth noting that if y has its least significant bit set,
     509             :    all the rnz/rna/rne/rno modes yield the same result (as ties aren't
     510             :    possible) and this is the cheapest of the round nearest modes.*/
     511             : 
     512             : static inline ulong
     513             : fd_fxp_div_rna( ulong   x,
     514             :                 ulong   y,
     515             :                 ulong * _c ) {
     516             :   if( !y ) { *_c = ULONG_MAX; return 0UL; }         /* Handle divide by zero */
     517             :   ulong zh,zl; fd_fxp_private_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */
     518             :   fd_uwide_inc( &zh,&zl, zh,zl, y>>1 );             /* <zh,zl> <= 2^94-2^30 + 2^63-1 so no overflow */
     519             :   fd_uwide_div( &zh,&zl, zh,zl, y );                /* <zh,zl> <= ceil(2^30 x/y) <= 2^94-2^30 so no overflow */
     520             :   *_c = zh; return zl;
     521             : }
     522             : 
     523             : /* rne -> Round nearest with ties toward even (aka banker's rounding / IEEE style rounding)
     524             :    Fast variant assumes y!=0 and 2^30 x < 2^64 (i.e. exact result < ~2^34)
     525             : 
     526             :    Based on computing (when y>0):
     527             : 
     528             :      q y + r = 2^30 x
     529             : 
     530             :    where q = floor( 2^30 x / y ) and r is in [0,y-1].
     531             : 
     532             :    If r < y/2, the result should round down.  And if r > y/2 the result
     533             :    should round up.  If r==y/2 (which is only possible if y is even),
     534             :    the result should round down / up when q is even / odd.
     535             :    
     536             :    Combining yields we need to round up when:
     537             : 
     538             :      r>floor(y/2) or (r==floor(y/2) and y is even and q is odd)
     539             : 
     540             :    As 2^30 x might overflow 64 bits, we need to expand x
     541             :    (fd_fxp_private_expand).  Since we need both the 128b quotient and
     542             :    the 64b remainder, we need a 128b/64b->128b,64b divider
     543             :    (fd_uwide_divrem ... if there was a way to quickly determine if
     544             :    floor( 2^30 x / y ) is even or odd, we wouldn't need the remainder
     545             :    and could select the appropriate RNZ/RNA based fd_uwide_inc
     546             :    increment) and then a 128b conditional increment (fd_uwide_inc).
     547             : 
     548             :    The most accurate style of rounding usually and somewhat more
     549             :    expensive (needs remainder, some sequentially dependent bit ops and
     550             :    one fd_uwide_inc) than RTZ rounding.  Rounding error in [-1/2,1/2]
     551             :    ulp (unbiased).  (ulp==2^-30). */
     552             : 
     553             : static inline ulong
     554             : fd_fxp_div_rne( ulong   x,
     555             :                 ulong   y,
     556             :                 ulong * _c ) {
     557             :   if( !y ) { *_c = ULONG_MAX; return 0UL; }          /* Handle divide by zero */
     558             :   ulong zh,zl; fd_fxp_private_expand( &zh,&zl, x );  /* 2^30 x <= 2^94-2^30 so no overflow */
     559             :   ulong r    = fd_uwide_divrem( &zh,&zl, zh,zl, y ); /* <zh,zl>*y + r = 2^30 x where r is in [0,y-1] so no overflow */
     560             :   ulong flhy = y>>1;                                 /* floor(y/2) so no overflow */
     561             :   fd_uwide_inc( &zh,&zl, zh,zl, (ulong)( (r>flhy) | ((r==flhy) & !!((~y) & zl & 1UL)) ) );
     562             :   /* <zh,zl> <= ceil( 2^30 x / y ) <= 2^94-2^30 so no overflow */
     563             :   *_c = zh; return zl;
     564             : }
     565             : 
     566             : /* rno -> Round nearest with ties toward odd
     567             :    Fast variant assumes y!=0 and 2^30 x < 2^64 (i.e. exact result < ~2^34)
     568             : 
     569             :    Similar considerations as RNE with the parity for rounding on ties
     570             :    swapped.
     571             : 
     572             :    Somewhat more expensive (needs remainder, some sequentially dependent
     573             :    bit ops and one fd_uwide_inc) than RTZ rounding.  Rounding error in
     574             :    [-1/2,1/2] ulp (unbiased).  (ulp==2^-30). */
     575             : 
     576             : static inline ulong
     577             : fd_fxp_div_rno( ulong   x,
     578             :                 ulong   y,
     579             :                 ulong * _c ) {
     580             :   if( !y ) { *_c = ULONG_MAX; return 0UL; }          /* Handle divide by zero */
     581             :   ulong zh,zl; fd_fxp_private_expand( &zh,&zl, x );  /* 2^30 x <= 2^94-2^30 so no overflow */
     582             :   ulong r    = fd_uwide_divrem( &zh,&zl, zh,zl, y ); /* <zh,zl>*y + r = 2^30 x where r is in [0,y-1] so no overflow */
     583             :   ulong flhy = y>>1;                                 /* floor(y/2) so no overflow */
     584             :   fd_uwide_inc( &zh,&zl, zh,zl, (ulong)( (r>flhy) | ((r==flhy) & !!((~y) & (~zl) & 1UL)) ) );
     585             :   /* <zh,zl> <= ceil( 2^30 x / y ) <= 2^94-2^30 so no overflow */
     586             :   *_c = zh; return zl;
     587             : }
     588             : 
     589             : #endif
     590             : 
     591   297655524 : FD_FN_CONST static inline ulong fd_fxp_div_rtz_fast( ulong x, ulong y ) { return ( x<<30              ) / y; }
     592   297655524 : FD_FN_CONST static inline ulong fd_fxp_div_raz_fast( ulong x, ulong y ) { return ((x<<30)+ (y-1UL)    ) / y; }
     593   297655524 : FD_FN_CONST static inline ulong fd_fxp_div_rnz_fast( ulong x, ulong y ) { return ((x<<30)+((y-1UL)>>1)) / y; }
     594   297655524 : FD_FN_CONST static inline ulong fd_fxp_div_rna_fast( ulong x, ulong y ) { return ((x<<30)+ (y     >>1)) / y; }
     595             : 
     596             : FD_FN_CONST static inline ulong
     597             : fd_fxp_div_rne_fast( ulong x,
     598   297655524 :                      ulong y ) {
     599   297655524 :   ulong n    = x << 30;
     600   297655524 :   ulong q    = n / y;
     601   297655524 :   ulong r    = n - q*y;
     602   297655524 :   ulong flhy = y>>1;
     603   297655524 :   return q + (ulong)( (r>flhy) | ((r==flhy) & !!((~y) & q & 1UL)) );
     604   297655524 : }
     605             : 
     606             : FD_FN_CONST static inline ulong
     607             : fd_fxp_div_rno_fast( ulong x,
     608   297655524 :                      ulong y ) {
     609   297655524 :   ulong n    = x << 30;
     610   297655524 :   ulong q    = n / y;
     611   297655524 :   ulong r    = n - q*y;
     612   297655524 :   ulong flhy = y>>1;
     613   297655524 :   return q + (ulong)( (r>flhy) | ((r==flhy) & !!((~y) & (~q) & 1UL)) );
     614   297655524 : }
     615             : 
     616             : /* Other rounding modes:
     617             :      rdn -> Round down                   / toward floor / toward -inf ... same as rtz for unsigned
     618             :      rup -> Round up                     / toward ceil  / toward +inf ... same as raz for unsigned
     619             :      rnd -> Round nearest with ties down / toward floor / toward -inf ... same as rnz for unsigned
     620             :      rnu -> Round nearest with ties up   / toward ceil  / toward -inf ... same as rna for unsigned */
     621             : 
     622           0 : static inline ulong fd_fxp_div_rdn( ulong x, ulong y, ulong * _c ) { return fd_fxp_div_rtz( x, y, _c ); }
     623           0 : static inline ulong fd_fxp_div_rup( ulong x, ulong y, ulong * _c ) { return fd_fxp_div_raz( x, y, _c ); }
     624           0 : static inline ulong fd_fxp_div_rnd( ulong x, ulong y, ulong * _c ) { return fd_fxp_div_rnz( x, y, _c ); }
     625           0 : static inline ulong fd_fxp_div_rnu( ulong x, ulong y, ulong * _c ) { return fd_fxp_div_rna( x, y, _c ); }
     626             : 
     627           0 : FD_FN_CONST static inline ulong fd_fxp_div_rdn_fast( ulong x, ulong y ) { return fd_fxp_div_rtz_fast( x, y ); }
     628           0 : FD_FN_CONST static inline ulong fd_fxp_div_rup_fast( ulong x, ulong y ) { return fd_fxp_div_raz_fast( x, y ); }
     629           0 : FD_FN_CONST static inline ulong fd_fxp_div_rnd_fast( ulong x, ulong y ) { return fd_fxp_div_rnz_fast( x, y ); }
     630           0 : FD_FN_CONST static inline ulong fd_fxp_div_rnu_fast( ulong x, ulong y ) { return fd_fxp_div_rna_fast( x, y ); }
     631             : 
     632             : /* FIXED POINT SQRT ***************************************************/
     633             : 
     634             : /* Compute:
     635             :      z/2^30 ~ sqrt( x/2^30 )
     636             :    under various rounding modes. */
     637             : 
     638             : #if FD_HAS_INT128 /* See the uwide-based implementations for details how these work */
     639             : 
     640             : /* FIXME: USE X86 FPU TRICKS FOR BETTER INITIAL APPROXIMATION HERE? */
     641             : 
     642             : FD_FN_CONST static inline ulong
     643   300000000 : fd_fxp_sqrt_rtz( ulong x ) {
     644   300000000 :   if( !x ) return 0UL;
     645   297659577 :   int s = (63-fd_ulong_find_msb( x )) >> 1;
     646   297659577 :   if( s>15 ) s = 15;
     647   297659577 :   ulong y = fd_ulong_sqrt( x << (s<<1) ) << (15-s);
     648   297659577 :   if( s==15 ) return y;
     649             : 
     650   257815668 :   uint128 _x = fd_fxp_private_expand( x );
     651   257815668 :   uint128 _y =  (uint128)y;
     652   512105205 :   for(;;) {
     653   512105205 :     uint128 _z = (_y*_y + _y + _x) / ((_y<<1)+(uint128)1);
     654   512105205 :     if( _z==_y ) break;
     655   254289537 :     _y = _z;
     656   254289537 :   }
     657   257815668 :   return (ulong)_y;
     658   297659577 : }
     659             : 
     660             : FD_FN_CONST static inline ulong
     661   300000000 : fd_fxp_sqrt_raz( ulong x ) {
     662   300000000 :   if( !x ) return 0UL;
     663   297659577 :   int s = (63-fd_ulong_find_msb( x )) >> 1;
     664   297659577 :   if( s>15 ) s = 15;
     665   297659577 :   ulong xl = x << (s<<1);
     666   297659577 :   ulong y = fd_ulong_sqrt( xl ) << (15-s);
     667   297659577 :   if( s==15 ) return y + (ulong)!!(xl-y*y);
     668             : 
     669   257815668 :   uint128 _x = fd_fxp_private_expand( x ) - (uint128)2;
     670   257815668 :   uint128 _y =  (uint128)y;
     671   517091919 :   for(;;) {
     672   517091919 :     uint128 _z = (_y*_y + _y + _x) / ((_y<<1)-(uint128)1);
     673   517091919 :     if( _z==_y ) break;
     674   259276251 :     _y = _z;
     675   259276251 :   }
     676   257815668 :   return (ulong)_y;
     677   297659577 : }
     678             : 
     679             : FD_FN_CONST static inline ulong
     680   300000000 : fd_fxp_sqrt_rnz( ulong x ) {
     681   300000000 :   if( !x ) return 0UL;
     682   297659577 :   int s = (63-fd_ulong_find_msb( x )) >> 1;
     683   297659577 :   if( s>15 ) s = 15;
     684   297659577 :   ulong xl = x << (s<<1);
     685   297659577 :   ulong y = fd_ulong_sqrt( xl ) << (15-s);
     686   297659577 :   if( s==15 ) return y + (ulong)((xl-y*y)>y);
     687             : 
     688   257815668 :   uint128 _x = fd_fxp_private_expand( x ) -(uint128)1;
     689   257815668 :   uint128 _y =  (uint128)y;
     690   513280803 :   for(;;) {
     691   513280803 :     uint128 _z = (_y*_y + _y + _x) / (_y<<1);
     692   513280803 :     if( _z==_y ) break;
     693   255465135 :     _y = _z;
     694   255465135 :   }
     695   257815668 :   return (ulong)_y;
     696   297659577 : }
     697             : 
     698             : #else /* uwide based implementations from pyth */
     699             : 
     700             : /* rtz -> Round toward zero (aka truncate rounding)
     701             :    Fast variant assumes x<2^34
     702             :    Based on:
     703             :         z/2^30 ~ sqrt( x/2^30)
     704             :      -> z      ~ sqrt( 2^30 x )
     705             :    With RTZ rounding:
     706             :         z      = floor( sqrt( 2^30 x ) )
     707             :    Fastest style of rounding.  Rounding error in [0,1) ulp.
     708             :    (ulp==2^-30). */
     709             : 
     710             : FD_FN_CONST static inline ulong
     711             : fd_fxp_sqrt_rtz( ulong x ) {
     712             : 
     713             :   /* Initial guess.  Want to compute
     714             :        y = sqrt( x 2^30 )
     715             :      but x 2^30 does not fit into 64-bits at this point.  So we instead
     716             :      approximate:
     717             :        y = sqrt( x 2^(2s) 2^(30-2s) )
     718             :          = sqrt( x 2^(2s) ) 2^(15-s)
     719             :          ~ floor( sqrt( x 2^(2s) ) ) 2^(15-s)
     720             :      where s is the largest integer such that x 2^(2s) does not
     721             :      overflow. */
     722             : 
     723             :   int s = (63-fd_ulong_find_msb( x )) >> 1;         /* lg x in [34,63], 63-lg x in [0,29], s in [0,14] when x>=2^34 */
     724             :   if( s>15 ) s = 15;                                /* s==15 when x<2^34 */
     725             :   ulong y = fd_ulong_sqrt( x << (s<<1) ) << (15-s); /* All shifts well defined */
     726             :   if( s==15 ) return y;                             /* No iteration if x<2^34 */
     727             : 
     728             :   /* Expand x to 2^30 x for the fixed point iteration */
     729             :   ulong xh,xl; fd_fxp_private_expand( &xh,&xl, x );
     730             :   for(;;) {
     731             : 
     732             :     /* Iterate y' = floor( (y(y+1) + 2^30 x) / (2y+1) ).  This is the
     733             :        same iteration as sqrt_uint{8,16,32,64} (which converges on the
     734             :        floor(sqrt(x)) but applied to the (wider than 64b) quantity
     735             :        2^30*x and then starting from an exceptionally good guess (such
     736             :        that ~2 iterations should be needed at most). */
     737             : 
     738             :     ulong yh,yl;
     739             :     fd_uwide_mul( &yh,&yl, y,y+1UL );
     740             :     fd_uwide_add( &yh,&yl, yh,yl, xh,xl, 0UL );
     741             :     fd_uwide_div( &yh,&yl, yh,yl, (y<<1)+1UL );
     742             :     if( yl==y ) break;
     743             :     y = yl;
     744             :   }
     745             : 
     746             :   return y;
     747             : }
     748             : 
     749             : /* raz -> Round away zero
     750             :    Fast variant assumes x<2^34
     751             :    Based on:
     752             :         z/2^30 ~ sqrt( x/2^30)
     753             :      -> z      ~ sqrt( 2^30 x )
     754             :    Let y be the RTZ rounded result:
     755             :         y = floor( sqrt( 2^30 x ) )
     756             :    and consider the residual:
     757             :         r = 2^30 x - y^2
     758             :    which, given the above, will be in [0,2y].  If r==0, the result
     759             :    is exact and thus already correctly rounded.  Otherwise, we need
     760             :    to round up.  We note that the residual of the RTZ iteration is
     761             :    the same as this residual at convergence:
     762             :         y = floor( (y^2 + y + 2^30 x) / (2y+1) )
     763             :           = (y^2 + y + 2^30 x - r') / (2y+1)
     764             :    where r' in [0,2y]:
     765             :         2y^2 + y = y^2 + y + 2^30 x - r'
     766             :         y^2 = 2^30 x - r'
     767             :         r' = 2^30 x - y^2
     768             :      -> r' = r
     769             :    Thus we can use explicitly compute the remainder or use
     770             :    fd_uwide_divrem in the iteration itself to produce the needed
     771             :    residual.
     772             : 
     773             :    Alternatively, the iteration
     774             :         y = floor( (y^2 + y - 2 + 2^30 x) / (2y-1) )
     775             :           = floor( (y(y+1) + (2^30 x-2)) / (2y-1) )
     776             :    should converge on the RAZ rounded result as:
     777             :         y = (y^2 + y - 2 + 2^30 x - r'') / (2y-1)
     778             :    where r'' in [0,2y-2]
     779             :         2y^2 - y = y^2 + y - 2 + 2^30 x - r''
     780             :         y^2 - 2 y + 2 + r'' = 2^30 x
     781             :    Thus at r'' = 0:
     782             :         (y-1)^2 + 1 = 2^30 x
     783             :      -> (y-1)^2 < 2^30 x
     784             :    and at r'' = 2y-2
     785             :         y^2 = 2^30 x
     786             :    such that:
     787             :         (y-1)^2 < 2^30 x <= y^2
     788             :    which means y is the correctly rounded result.
     789             : 
     790             :    Slightly more expensive than RTZ rounding.  Rounding error in (-1,0]
     791             :    ulp.  (ulp==2^-30). */
     792             : 
     793             : FD_FN_CONST static inline ulong
     794             : fd_fxp_sqrt_raz( ulong x ) {
     795             :   ulong xh, xl;
     796             : 
     797             :   /* Same guess as rtz rounding */
     798             :   int s = (63-fd_ulong_find_msb( x )) >> 1;
     799             :   if( s>15 ) s = 15;
     800             :   xl = x << (s<<1);
     801             :   ulong y = fd_ulong_sqrt( xl ) << (15-s);
     802             :   if( s==15 ) return y + (ulong)!!(xl-y*y); /* Explicitly compute residual to round when no iteration needed */
     803             : 
     804             :   /* Use the modified iteration to converge on raz rounded result */
     805             :   fd_fxp_private_expand( &xh,&xl, x );
     806             :   fd_uwide_dec( &xh,&xl, xh, xl, 2UL );
     807             :   for(;;) {
     808             :     ulong yh,yl;
     809             :     fd_uwide_mul( &yh,&yl, y, y+1UL );
     810             :     fd_uwide_add( &yh,&yl, yh,yl, xh,xl, 0UL );
     811             :     fd_uwide_div( &yh,&yl, yh,yl, (y<<1)-1UL );
     812             :     if( yl==y ) break;
     813             :     y = yl;
     814             :   }
     815             : 
     816             :   return y;
     817             : }
     818             : 
     819             : /* rnz/rna/rne/rno -> Round nearest with ties toward zero/away zero/toward even/toward odd
     820             :    Fast variant assumes x<2^34
     821             :    Based on:
     822             :         z/2^30 ~ sqrt( x/2^30)
     823             :      -> z      ~ sqrt( 2^30 x )
     824             :    Assuming there are no ties, we want to integer z such that:
     825             :         (z-1/2)^2 < 2^30 x < (z+1/2)^2
     826             :         z^2 - z + 1/4 < 2^30 x < z^2 + z + 1/4
     827             :    since z is integral, this is equivalent to finding a z such that:
     828             :      -> z^2 - z + 1 <= 2^30 x < z^2 + z + 1
     829             :      -> r = 2^30 x - (z^2 - z + 1) and is in [0,2z)
     830             :    This suggests using the iteration:
     831             :         z = floor( (z^2 + z - 1 + 2^30 x) / (2z) )
     832             :           = floor( (z(z+1) + (2^30 x-1)) / (2z) )
     833             :    which, at convergence, has:
     834             :         2z^2 = z^2 + z - 1 + 2^30 x - r'
     835             :    where r' is in [0,2z).  Solving for r', at convergence:
     836             :         r' = 2^30 x - (z^2 - z + 1)
     837             :         r' = r
     838             :    Thus, this iteration converges to the correctly rounded z when there
     839             :    are no ties.  But this also demonstrates that no ties are possible
     840             :    when z is integral ... the above derivation hold when either endpoint
     841             :    of the initial inequality is closed because the endpoint values are
     842             :    fraction and cannot be exactly met for any integral z.  As such,
     843             :    there are no ties and all round nearest styles can use the same
     844             :    iteration for the sqrt function.
     845             : 
     846             :    For computing a faster result for small x, let y be the RTZ rounded
     847             :    result:
     848             :         y = floor( sqrt( 2^30 x ) )
     849             :    and consider the residual:
     850             :         r'' = 2^30 x - y^2
     851             :    which, given the above, will be in [0,2y].  If r''==0, the result is
     852             :    exact and thus already correctly rounded.  Otherwise, let:
     853             :         z = y when r''<=y and y+1 when when r''>y
     854             :    Consider r''' from the above for this z.
     855             :         r''' = 2^30 x - z^2
     856             :              = 2^30 x - y^2 when r''<=y and 2^30 x - (y+1)^2 o.w.
     857             :              = r''' when r''<=y and r'' - 2y - 1 o.w.
     858             :      -> r''' in [0,y] when r''<=y and in [y+1,2y]-2y-1 o.w.
     859             :      -> r''' in [0,y] when r''<=y and in [-y,-1]-2y-1 o.w.
     860             :      -> r''' in [-y,y] and is negative when r''>y
     861             :      -> r''' in [-z+1,z]
     862             :    This implies that  we have for
     863             :         2^30 x - (z^2-z+1) = r''' + z-1 is in [0,2z)
     864             :    As such, z computed by this method is also correctly rounded.  Thus
     865             :    we can use explicitly compute the remainder or use fd_uwide_divrem in
     866             :    the iteration itself to produce the needed residual.
     867             : 
     868             :    Very slightly more expensive than RTZ rounding.  Rounding error in
     869             :    (-1/2,1/2) ulp.  (ulp==2^-30). */
     870             : 
     871             : FD_FN_CONST static inline ulong
     872             : fd_fxp_sqrt_rnz( ulong x ) {
     873             :   ulong xh, xl;
     874             : 
     875             :   /* Same guess as rtz rounding */
     876             :   int s = (63-fd_ulong_find_msb( x )) >> 1;
     877             :   if( s>15 ) s = 15;
     878             :   xl = x << (s<<1);
     879             :   ulong y = fd_ulong_sqrt( xl ) << (15-s);
     880             :   if( s==15 ) return y + (ulong)((xl-y*y)>y); /* Explicitly compute residual to round when no iteration needed */
     881             : 
     882             :   /* Use the modified iteration to converge on rnz rounded result */
     883             :   fd_fxp_private_expand( &xh,&xl, x );          /* 2^30 x */
     884             :   fd_uwide_dec( &xh,&xl, xh,xl, 1UL );          /* 2^30 x - 1 */
     885             :   for(;;) {
     886             :     ulong yh,yl;
     887             :     fd_uwide_mul( &yh,&yl, y,y+1UL );           /* y^2 + y */
     888             :     fd_uwide_add( &yh,&yl, yh,yl, xh,xl, 0UL ); /* y^2 + y - 1 + 2^30 x */
     889             :     fd_uwide_div( &yh,&yl, yh,yl, y<<1 );
     890             :     if( yl==y ) break;
     891             :     y = yl;
     892             :   }
     893             : 
     894             :   return y;
     895             : }
     896             : 
     897             : #endif
     898             : 
     899           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rna( ulong x ) { return fd_fxp_sqrt_rnz( x ); }
     900           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rne( ulong x ) { return fd_fxp_sqrt_rnz( x ); }
     901           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rno( ulong x ) { return fd_fxp_sqrt_rnz( x ); }
     902             : 
     903   300000000 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rtz_fast( ulong x ) { return fd_ulong_sqrt( x<<30 ); }
     904             : 
     905             : FD_FN_CONST static inline ulong
     906   300000000 : fd_fxp_sqrt_raz_fast( ulong x ) {
     907   300000000 :   ulong xl = x<<30;
     908   300000000 :   ulong y  = fd_ulong_sqrt( xl );
     909   300000000 :   ulong r  = xl - y*y;
     910   300000000 :   return y + (ulong)!!r;
     911   300000000 : }
     912             : 
     913             : FD_FN_CONST static inline ulong
     914   300000000 : fd_fxp_sqrt_rnz_fast( ulong x ) {
     915   300000000 :   ulong xl = x<<30;
     916   300000000 :   ulong y  = fd_ulong_sqrt( xl );
     917   300000000 :   ulong r  = xl - y*y;
     918   300000000 :   return y + (ulong)(r>y);
     919   300000000 : }
     920             : 
     921           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rna_fast( ulong x ) { return fd_fxp_sqrt_rnz_fast( x ); }
     922           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rne_fast( ulong x ) { return fd_fxp_sqrt_rnz_fast( x ); }
     923           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rno_fast( ulong x ) { return fd_fxp_sqrt_rnz_fast( x ); }
     924             : 
     925             : /* Other rounding modes:
     926             :      rdn -> Round down                   / toward floor / toward -inf ... same as rtz for unsigned
     927             :      rup -> Round up                     / toward ceil  / toward +inf ... same as raz for unsigned
     928             :      rnd -> Round nearest with ties down / toward floor / toward -inf ... same as rnz for unsigned
     929             :      rnu -> Round nearest with ties up   / toward ceil  / toward -inf ... same as rna for unsigned */
     930             : 
     931           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rdn( ulong x ) { return fd_fxp_sqrt_rtz( x ); }
     932           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rup( ulong x ) { return fd_fxp_sqrt_raz( x ); }
     933           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rnd( ulong x ) { return fd_fxp_sqrt_rnz( x ); }
     934           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rnu( ulong x ) { return fd_fxp_sqrt_rnz( x ); }
     935             : 
     936           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rdn_fast( ulong x ) { return fd_fxp_sqrt_rtz_fast( x ); }
     937           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rup_fast( ulong x ) { return fd_fxp_sqrt_raz_fast( x ); }
     938           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rnd_fast( ulong x ) { return fd_fxp_sqrt_rnz_fast( x ); }
     939           0 : FD_FN_CONST static inline ulong fd_fxp_sqrt_rnu_fast( ulong x ) { return fd_fxp_sqrt_rnz_fast( x ); }
     940             : 
     941             : /* FIXED POINT LOG2 ***************************************************/
     942             : 
     943             : /* Compute:
     944             : 
     945             :      e + f/2^30 ~ log2( x/2^30 )
     946             : 
     947             :    If x is non-zero, e will be in [-30,33] and f will be in [0,2^30]
     948             : 
     949             :    Note: This is not guaranteed to be exactly rounded and thus this
     950             :    doesn't have variants for every rounding mode under the sun.  The
     951             :    current implementation has <=2 ulp error with a round-nearest flavor
     952             :    though.
     953             : 
     954             :    Given the round-nearest flavor, it is possible to have a f/2^30=1
     955             :    exactly (e.g. log2_approx( ULONG_MAX ) will have e=33 and f/2^30=1
     956             :    such that e still is exactly determined by the index of x's most
     957             :    significant bit but the fractional part is 1 after rounding up.
     958             : 
     959             :    It is possible modify this while retaining a round-nearest flavor
     960             :    such that e is strictly in [-30,34] and f/2^30 is strictly in [0,1)
     961             :    (e will no longer strictly be determined the index of x's most
     962             :    significant bit so technically this retains less info than the
     963             :    above).
     964             : 
     965             :    Likewise, it is possible to modify this to use a round-toward-zero
     966             :    flavor such that e will be in [-30,33] and f/2^30 is in [0,1) always.
     967             :    The resulting approximation accuracy would be slightly lower and
     968             :    slightly biased.
     969             : 
     970             :    If x is zero, the result is undefined mathematically (the limit
     971             :    x->zero+ is -inf ... this implementation returns i=INT_MIN<<-30 and
     972             :    f=0 for specificity). */
     973             : /* FIXME: CONSIDER MAKING THIS A FUNCTION CALL? */
     974             : 
     975             : static inline ulong              /* f, in [0,2^30] (yes, closed ... see note above) */
     976             : fd_fxp_log2_approx( ulong x,
     977   300000000 :                     int * _e ) { /* e, in [-30,33] (==fd_ulong_find_msb(x)-30 ... see note above) */
     978             : 
     979             :   /* Handle bad inputs */
     980             : 
     981   300000000 :   if( !x ) { *_e = INT_MIN; return 0UL; }
     982             : 
     983             :   /* Crack x into:
     984             : 
     985             :        x = 2^i ( 1 + y/2^63 )
     986             : 
     987             :      where y is in [0,2^63).  This can always be done _exactly_ for
     988             :      non-zero x.  That is, i is the index of x's most significant
     989             :      non-zero bit (in [0,63]) and y is trailing i bits shifted up to be
     990             :      63b wide. */
     991             : 
     992   297659577 :   int   i = fd_ulong_find_msb( x );    /* In [0,63] */
     993   297659577 :   ulong y = (x << (63-i)) - (1UL<<63); /* In [0,2^63) */
     994             : 
     995             :   /* Convert this to a fixed point approximation of x:
     996             : 
     997             :        x ~ 2^i ( 1 + d/2^30 )
     998             : 
     999             :      via:
    1000             : 
    1001             :        d = floor( (y+2^32) / 2^33 )
    1002             : 
    1003             :      This representation is still exact when i <= 30 and at most 1/2 ulp
    1004             :      error in d when i>30 (rna / round nearest with ties away from zero
    1005             :      rounding ... consider using ties toward even or truncate rounding
    1006             :      here as per note above).  Given the use of a round nearest style, d
    1007             :      is in [0,2^30] (closed at both ends). */
    1008             : 
    1009   297659577 :   ulong d = (y + (1UL<<32)) >> 33;
    1010             : 
    1011             :   /* Given this, we have:
    1012             : 
    1013             :        e + f/2^30 = log2( x/2^30 )
    1014             :                   = log2( x ) - 30
    1015             :                   ~ log2( 2^i ( 1+ d/2^30 ) ) - 30
    1016             :                   = i-30 + log2( 1 + d/2^30 )
    1017             : 
    1018             :      From this, we identify:
    1019             : 
    1020             :        e      = i-30               (exact)
    1021             :        f/2^30 ~ log2( 1 + d/2^30 ) (approximate)
    1022             : 
    1023             :      Thus, f of a 30b fixed point lg1p calculator with a 30b fixed point
    1024             :      input.  The below is automatically generated code for a fixed point
    1025             :      implementation of a minimax Pade(4,3) approximant to log2(1+x) over
    1026             :      the domain [0,1].  In exact math, the approximation has an accuracy
    1027             :      better than 1/2 ulp over the whole domain and is exact at the
    1028             :      endpoints.  As implemented, the accuracy is O(1) ulp over the whole
    1029             :      domain (with round nearest flavored rounding), monotonic and still
    1030             :      exact at the endpoints. */
    1031             : 
    1032   297659577 :   ulong f;
    1033   297659577 :   ulong g;
    1034             : 
    1035             :   /* BEGIN AUTOGENERATED CODE */
    1036             :   /* bits 31.8 rms_aerr 1.9e-10 rms_rerr 1.3e-10 max_aerr 2.7e-10 max_rerr 2.7e-10 */
    1037             : 
    1038   297659577 :   f = 0x0000000245c36b35UL;               /* scale 41 bout 34 bmul  - */
    1039   297659577 :   f = 0x000000029c5b8e15UL + ((f*d)>>36); /* scale 35 bout 34 bmul 64 */
    1040   297659577 :   f = 0x0000000303d59639UL + ((f*d)>>32); /* scale 33 bout 34 bmul 64 */
    1041   297659577 :   f = 0x00000001715475ccUL + ((f*d)>>31); /* scale 32 bout 34 bmul 64 */
    1042   297659577 :   f =                         (f*d);      /* scale 62 bout 64 bmul 64 */
    1043             :   /* f max 0xd1fb651800000000 */
    1044             : 
    1045   297659577 :   g = 0x000000024357c946UL;               /* scale 37 bout 34 bmul  - */
    1046   297659577 :   g = 0x00000002a94e3723UL + ((g*d)>>33); /* scale 34 bout 34 bmul 64 */
    1047   297659577 :   g = 0x000000018b7f484dUL + ((g*d)>>32); /* scale 32 bout 34 bmul 64 */
    1048   297659577 :   g = 0x0000000100000000UL + ((g*d)>>30); /* scale 32 bout 34 bmul 64 */
    1049             :   /* g max 0x0000000347ed945f */
    1050             : 
    1051   297659577 :   f = (f + (g>>1)) / g; /* RNA style rounding */
    1052             :   /* END AUTOGENERATED CODE */
    1053             : 
    1054   297659577 :   *_e = i-30; return f;
    1055   300000000 : }
    1056             : 
    1057             : /* FIXED POINT EXP2 / REXP2 *******************************************/
    1058             : 
    1059             : /* fd_fxp_exp2_approx computes:
    1060             : 
    1061             :      y/2^30 ~ exp2( x/2^30 )
    1062             : 
    1063             :    with an error of O(1) ulp for x/2^30 < ~1.  This uses a minimax
    1064             :    polynomial that is better than 0.5 ulp accurate in exact arithmetic.
    1065             :    As implemented, this is +/-1 ulp of the correctly rounded RNE result
    1066             :    when x<=2^30, has the leading ~30 bits correct for larger x and is
    1067             :    exact for input values that yield exactly representable outputs.
    1068             :    Returns ULONG_MAX if output would overflow the 34.30u output. */
    1069             : /* FIXME: CONSIDER MAKING THIS A FUNCTION CALL? */
    1070             : 
    1071             : FD_FN_CONST static inline ulong
    1072   300000000 : fd_fxp_exp2_approx( ulong x ) {
    1073   300000000 :   ulong i = x >> 30;
    1074   300000000 :   if( i>=34UL ) return ULONG_MAX;
    1075    43428828 :   ulong d = x & ((1UL<<30)-1UL);
    1076    43428828 :   ulong y;
    1077             :   /* BEGIN AUTOGENERATED CODE */
    1078             :   /* bits 33.8 rms_aerr 4.7e-11 rms_rerr 3.5e-11 max_aerr 6.7e-11 max_rerr 6.6e-11 */
    1079    43428828 :   y = 0x00000002d6e2cc42UL;               /* scale 49 bout 34 bmul  - */
    1080    43428828 :   y = 0x0000000257c0894cUL + ((y*d)>>33); /* scale 46 bout 34 bmul 64 */
    1081    43428828 :   y = 0x00000002c01421b9UL + ((y*d)>>33); /* scale 43 bout 34 bmul 64 */
    1082    43428828 :   y = 0x000000027609e3a4UL + ((y*d)>>33); /* scale 40 bout 34 bmul 64 */
    1083    43428828 :   y = 0x00000001c6b2ea70UL + ((y*d)>>33); /* scale 37 bout 34 bmul 64 */
    1084    43428828 :   y = 0x00000001ebfbce13UL + ((y*d)>>32); /* scale 35 bout 34 bmul 64 */
    1085    43428828 :   y = 0x00000002c5c8603bUL + ((y*d)>>31); /* scale 34 bout 34 bmul 64 */
    1086    43428828 :   y =                         (y*d);      /* scale 64 bout 64 bmul 64 */
    1087             :   /* END AUTOGENERATED CODE */
    1088    43428828 :   int s = 34-(int)i;
    1089    43428828 :   return ((y + (1UL<<(s-1))) >> s) + (1UL<<(64-s));
    1090   300000000 : }
    1091             : 
    1092             : /* fd_fxp_rexp2_approx computes:
    1093             : 
    1094             :      y/2^30 ~ exp2( -x/2^30 )
    1095             : 
    1096             :    with an error of O(1) ulp everywhere.  This uses a minimax polynomial
    1097             :    that is better than 0.5 ulp accurate in exact arithmetic.  As
    1098             :    implemented, this is +/-1 ulp of the correctly rounded RNE result
    1099             :    everywhere and exact for input values that have exactly representable
    1100             :    outputs. */
    1101             : /* FIXME: CONSIDER MAKING THIS A FUNCTION CALL? */
    1102             : 
    1103             : FD_FN_CONST static inline ulong
    1104   300000000 : fd_fxp_rexp2_approx( ulong x ) {
    1105   300000000 :   ulong i = x >> 30;
    1106   300000000 :   if( i>=31UL ) return 0UL;
    1107    43282515 :   ulong d = x & ((1UL<<30)-1UL);
    1108    43282515 :   ulong y;
    1109             :   /* BEGIN AUTOGENERATED CODE */
    1110             :   /* bits 35.4 rms_aerr 2.4e-11 rms_rerr 1.4e-11 max_aerr 3.3e-11 max_rerr 2.2e-11 */
    1111    43282515 :   y = 0x00000002d6e2c6a2UL;               /* scale 50 bout 34 bmul  - */
    1112    43282515 :   y = 0x0000000269e37ccbUL - ((y*d)>>34); /* scale 46 bout 34 bmul 64 */
    1113    43282515 :   y = 0x00000002b83379a8UL - ((y*d)>>33); /* scale 43 bout 34 bmul 64 */
    1114    43282515 :   y = 0x00000002762c0ceaUL - ((y*d)>>33); /* scale 40 bout 34 bmul 64 */
    1115    43282515 :   y = 0x00000001c6af4b81UL - ((y*d)>>33); /* scale 37 bout 34 bmul 64 */
    1116    43282515 :   y = 0x00000001ebfbd6aaUL - ((y*d)>>32); /* scale 35 bout 34 bmul 64 */
    1117    43282515 :   y = 0x00000002c5c85fb1UL - ((y*d)>>31); /* scale 34 bout 34 bmul 64 */
    1118    43282515 :   y = 0x8000000000000000UL - ((y*d)>> 1); /* scale 63 bout 64 bmul 64 */
    1119             :   /* END AUTOGENERATED CODE */
    1120    43282515 :   int s = 33+(int)i;
    1121    43282515 :   return (y + (1UL<<(s-1))) >> s;
    1122   300000000 : }
    1123             : 
    1124             : FD_PROTOTYPES_END
    1125             : 
    1126             : #endif /* HEADER_fd_src_util_math_fd_fxp_h */
    1127             : 

Generated by: LCOV version 1.14