Line data Source code
1 : #ifndef HEADER_fd_src_ballet_bigint_uint256_h
2 : #error "Do not include this directly; use fd_uint256.h"
3 : #endif
4 :
5 : /* Implementation of uint256 Montgomery mul.
6 :
7 : TODOs:
8 : - efficient sqr, Alg. 5 https://eprint.iacr.org/2022/1400
9 : - regular CIOS (the current impl *probably* won't work for secp256k1/r1)
10 :
11 : This is used under the hood to implement:
12 : - bn254 field arithmetic
13 : - bn254 scalar field arithmetic
14 : - (TODO) ed25519 scalar field arithmetic
15 : - (TODO) secp256k1 field and scalar arithmetic
16 : - (TODO) secp256r1 field and scalar arithmetic
17 : - ...
18 : */
19 :
20 : #define INLINE static inline __attribute__((always_inline))
21 :
22 : #ifdef FD_USING_GCC
23 : #define OPTIMIZE __attribute__((optimize("unroll-loops")))
24 : #else
25 : #define OPTIMIZE
26 : #endif
27 :
28 : #if FD_HAS_X86
29 : #include <x86intrin.h>
30 : #endif
31 :
32 : /* Utility functions for fd_uint256_mul_mod_p.
33 : Implementation is based on uint128.
34 : The implementations WITHOUT uint128 are just for completeness, we
35 : should avoid using them (and, e.g., rely on fiat-crypto mul instead). */
36 :
37 : INLINE void
38 : fd_ulong_sub_borrow(
39 : ulong * r, /* out (r=a-b) */
40 : int * b, /* out borrow flag */
41 : ulong a0,
42 : ulong a1,
43 : int bi /* in borrow flag */
44 15 : ) {
45 15 : # if FD_HAS_X86
46 15 : *b = (uchar)_subborrow_u64( (uchar)bi, a0, a1, (unsigned long long *)r );
47 : # else
48 : a1 += !!bi;
49 : *r = a0 - a1;
50 : *b = a0 < a1;
51 : # endif
52 15 : }
53 :
54 : #if FD_HAS_INT128
55 :
56 : INLINE void
57 0 : fd_ulong_vec_mul( ulong l[4], ulong h[4], ulong const a[4], ulong b ) {
58 0 : uint128 r0 = ((uint128)a[0]) * ((uint128)b);
59 0 : uint128 r1 = ((uint128)a[1]) * ((uint128)b);
60 0 : uint128 r2 = ((uint128)a[2]) * ((uint128)b);
61 0 : uint128 r3 = ((uint128)a[3]) * ((uint128)b);
62 0 : l[0] = (ulong)r0; h[0] = (ulong)(r0 >> 64);
63 0 : l[1] = (ulong)r1; h[1] = (ulong)(r1 >> 64);
64 0 : l[2] = (ulong)r2; h[2] = (ulong)(r2 >> 64);
65 0 : l[3] = (ulong)r3; h[3] = (ulong)(r3 >> 64);
66 0 : }
67 :
68 : INLINE void
69 0 : fd_ulong_add_carry4( ulong *l, uchar *h, ulong a0, ulong a1, ulong a2, uchar a3 ) {
70 0 : uint128 r = ((uint128)a0) + ((uint128)a1) + ((uint128)a2) + ((uint128)a3);
71 0 : *l = (ulong)r;
72 0 : *h = (uchar)(r >> 64);
73 0 : }
74 :
75 : #else
76 :
77 : INLINE void
78 : fd_ulong_mul128( ulong * l, ulong * h, ulong const a, ulong const b ) {
79 : /* First calculate all of the cross products. */
80 : ulong lo_lo = (a & 0xFFFFFFFF) * (b & 0xFFFFFFFF);
81 : ulong hi_lo = (a >> 32) * (b & 0xFFFFFFFF);
82 : ulong lo_hi = (a & 0xFFFFFFFF) * (b >> 32);
83 : ulong hi_hi = (a >> 32) * (b >> 32);
84 :
85 : /* Now add the products together. These will never overflow. */
86 : ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
87 : ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
88 :
89 : *h = upper;
90 : *l = (cross << 32) | (lo_lo & 0xFFFFFFFF);
91 : }
92 :
93 : INLINE void
94 : fd_ulong_vec_mul( ulong l[4], ulong h[4], ulong const a[4], ulong b ) {
95 : fd_ulong_mul128( &l[0], &h[0], a[0], b );
96 : fd_ulong_mul128( &l[1], &h[1], a[1], b );
97 : fd_ulong_mul128( &l[2], &h[2], a[2], b );
98 : fd_ulong_mul128( &l[3], &h[3], a[3], b );
99 : }
100 :
101 : INLINE void
102 : fd_ulong_add_carry4( ulong *l, uchar *h, ulong a0, ulong a1, ulong a2, uchar a3 ) {
103 : ulong r0 = a0 + a1;
104 : uchar c0 = r0 < a0;
105 :
106 : ulong r1 = a2 + a3;
107 : uchar c1 = r1 < a2;
108 :
109 : *l = r0 + r1;
110 : *h = (uchar)((*l < r0) + c0 + c1);
111 : }
112 :
113 : #endif
114 :
115 : INLINE fd_uint256_t *
116 : fd_uint256_add(fd_uint256_t * r,
117 : fd_uint256_t const * a,
118 0 : fd_uint256_t const * b ) {
119 0 : uchar c0;
120 0 : fd_ulong_add_carry4( &r->limbs[0], &c0, a->limbs[0], b->limbs[0], 0, 0 );
121 0 : fd_ulong_add_carry4( &r->limbs[1], &c0, a->limbs[1], b->limbs[1], 0, c0 );
122 0 : fd_ulong_add_carry4( &r->limbs[2], &c0, a->limbs[2], b->limbs[2], 0, c0 );
123 0 : fd_ulong_add_carry4( &r->limbs[3], &c0, a->limbs[3], b->limbs[3], 0, c0 );
124 0 : return r;
125 0 : }
126 :
127 : /* fd_uint256_mul_mod_p computes r = a * b mod p, using the CIOS method.
128 : r, a, b are in Montgomery representation (p is not).
129 :
130 : This is an efficient implementation of CIOS that works when (circa) p < 2^255
131 : (precisely, p->limbs[3] < (2^64-1)/2 - 1).
132 : Alg. 2, https://eprint.iacr.org/2022/1400
133 : Code example, for bn254: https://github.com/Consensys/gnark-crypto/blob/v0.12.1/ecc/bn254/fp/element_ops_purego.go#L66
134 :
135 : In go lang, bits.Add64 has carry 0, 1.
136 : We allow the carry to be a uchar, so we can dp a single add chain after each mul.
137 :
138 : This function is intended to be wrapped into a fd_<field>_mul( r, a, b ).
139 : Experimentally we found that:
140 : 1. We have to force inlining for this function, otherwise compilers tend to reuse
141 : the function, introducing overhead.
142 : 2. In GCC, we have to force loop unrolling optimization *in the outer fd_<field>_mul()*
143 : function, otherwise performance degrades significantly.
144 : For this we added the macro FD_UINT256_FP_MUL_IMPL. */
145 :
146 : INLINE fd_uint256_t *
147 : fd_uint256_mul_mod_p( fd_uint256_t * r,
148 : fd_uint256_t const * a,
149 : fd_uint256_t const * b,
150 : fd_uint256_t const * p,
151 0 : ulong const p_inv ) {
152 0 : ulong FD_ALIGNED t[4] = { 0 };
153 0 : ulong FD_ALIGNED u[4];
154 0 : ulong FD_ALIGNED h[4];
155 0 : ulong FD_ALIGNED l[4];
156 0 : uchar c0, c1;
157 0 : ulong tmp;
158 0 : ulong m;
159 :
160 0 : for( int i=0; i<4; i++ ) {
161 0 : fd_ulong_vec_mul( l, u, a->limbs, b->limbs[i] );
162 0 : fd_ulong_add_carry4( &t[0], &c0, t[0], l[0], 0, 0 );
163 0 : fd_ulong_add_carry4( &t[1], &c0, t[1], l[1], u[0], c0 );
164 0 : fd_ulong_add_carry4( &t[2], &c0, t[2], l[2], u[1], c0 );
165 0 : fd_ulong_add_carry4( &t[3], &c0, t[3], l[3], u[2], c0 );
166 :
167 0 : m = t[0] * p_inv;
168 :
169 0 : fd_ulong_vec_mul( l, h, p->limbs, m );
170 0 : fd_ulong_add_carry4( &tmp, &c1, t[0], l[0], 0, 0 );
171 0 : fd_ulong_add_carry4( &t[0], &c1, t[1], l[1], h[0], c1 );
172 0 : fd_ulong_add_carry4( &t[1], &c1, t[2], l[2], h[1], c1 );
173 0 : fd_ulong_add_carry4( &t[2], &c1, t[3], l[3], h[2], c1 );
174 0 : t[3] = u[3] + h[3] + c0 + c1;
175 0 : }
176 :
177 0 : r->limbs[0] = t[0];
178 0 : r->limbs[1] = t[1];
179 0 : r->limbs[2] = t[2];
180 0 : r->limbs[3] = t[3];
181 :
182 0 : if( fd_uint256_cmp( r, p ) >= 0 ) {
183 0 : int b = 0;
184 0 : fd_ulong_sub_borrow( &r->limbs[0], &b, r->limbs[0], p->limbs[0], b );
185 0 : fd_ulong_sub_borrow( &r->limbs[1], &b, r->limbs[1], p->limbs[1], b );
186 0 : fd_ulong_sub_borrow( &r->limbs[2], &b, r->limbs[2], p->limbs[2], b );
187 0 : fd_ulong_sub_borrow( &r->limbs[3], &b, r->limbs[3], p->limbs[3], b );
188 0 : }
189 0 : return r;
190 0 : }
191 :
/* FD_UINT256_FP_MUL_IMPL(fp, p, p_inv) emits the thin wrapper
       static inline fp##_t * fp##_mul( r, a, b )
   forwarding to fd_uint256_mul_mod_p() with the field's modulus p and
   the constant p_inv baked in (presumably p_inv = -p^-1 mod 2^64 per
   the CIOS algorithm cited above -- confirm against each field's
   constants).  OPTIMIZE forces loop unrolling under GCC: per the note
   above fd_uint256_mul_mod_p(), the unroll must happen in this outer
   wrapper or performance degrades significantly. */
#define FD_UINT256_FP_MUL_IMPL(fp, p, p_inv)  \
  static inline fp ## _t * OPTIMIZE           \
  fp ## _mul( fp ## _t * r,                   \
              fp ## _t const * a,             \
              fp ## _t const * b ) {          \
    return fd_uint256_mul_mod_p( r, a, b, p, p_inv ); \
  }
|