LCOV - code coverage report
Current view: top level - ballet/bigint - fd_uint256_mul.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 79 79 100.0 %
Date: 2026-05-15 07:18:56 Functions: 19 181 10.5 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_ballet_bigint_fd_uint256_h
       2             : #error "Do not include this directly; use fd_uint256.h"
       3             : #endif
       4             : 
       5             : /* Implementation of uint256 Montgomery mul.
       6             : 
       7             :    TODOs:
       8             :    - efficient sqr, Alg. 5 https://eprint.iacr.org/2022/1400
       9             :    - regular CIOS (the current impl *probably* won't work for secp256k1/r1)
      10             : 
      11             :    This is used under the hood to implement:
      12             :    - bn254 field arithmetic
      13             :    - bn254 scalar field arithmetic
      14             :    - (TODO) ed25519 scalar field arithmetic
      15             :    - (TODO) secp256k1 field and scalar arithmetic
      16             :    - (TODO) secp256r1 field and scalar arithmetic
      17             :    - ...
      18             :  */
      19             : 
      20             : #define INLINE static inline __attribute__((always_inline))
      21             : 
      22             : #ifdef FD_USING_GCC
      23             : #define OPTIMIZE __attribute__((optimize("unroll-loops")))
      24             : #else
      25             : #define OPTIMIZE
      26             : #endif
      27             : 
      28             : #if FD_HAS_X86
      29             : #include <x86intrin.h>
      30             : #endif
      31             : 
      32             : /* Utility functions for fd_uint256_mul_mod_p.
      33             :    Implementation is based on uint128.
      34             :    The implementations WITHOUT uint128 are just for completeness, we
      35             :    should avoid using them (and, e.g., rely on fiat-crypto mul instead). */
      36             : 
      37             : INLINE void
      38             : fd_ulong_sub_borrow(
      39             :     ulong * r,   /* out (r=a-b) */
      40             :     int *   b,   /* out borrow flag */
      41             :     ulong   a0,
      42             :     ulong   a1,
      43             :     int     bi   /* in borrow flag */
      44    34393295 : ) {
      45    34393295 : # if FD_HAS_X86
      46    34393295 :   *b = (uchar)_subborrow_u64( (uchar)bi, a0, a1, (unsigned long long *)r );
      47             : # else
      48             :   a1 += !!bi;
      49             :   *r = a0 - a1;
      50             :   *b = a0 < a1;
      51             : # endif
      52    34393295 : }
      53             : 
      54             : #if FD_HAS_INT128
      55             : 
      56             : INLINE void
      57  1031193856 : fd_ulong_vec_mul( ulong l[4], ulong h[4], ulong const a[4], ulong b ) {
      58  1031193856 :   uint128 r0 = ((uint128)a[0]) * ((uint128)b);
      59  1031193856 :   uint128 r1 = ((uint128)a[1]) * ((uint128)b);
      60  1031193856 :   uint128 r2 = ((uint128)a[2]) * ((uint128)b);
      61  1031193856 :   uint128 r3 = ((uint128)a[3]) * ((uint128)b);
      62  1031193856 :   l[0] = (ulong)r0;  h[0] = (ulong)(r0 >> 64);
      63  1031193856 :   l[1] = (ulong)r1;  h[1] = (ulong)(r1 >> 64);
      64  1031193856 :   l[2] = (ulong)r2;  h[2] = (ulong)(r2 >> 64);
      65  1031193856 :   l[3] = (ulong)r3;  h[3] = (ulong)(r3 >> 64);
      66  1031193856 : }
      67             : 
      68             : INLINE void
      69  4125672448 : fd_ulong_add_carry4( ulong *l, uchar *h, ulong a0, ulong a1, ulong a2, uchar a3 ) {
      70  4125672448 :   uint128 r = ((uint128)a0) + ((uint128)a1) + ((uint128)a2) + ((uint128)a3);
      71  4125672448 :   *l = (ulong)r;
      72  4125672448 :   *h = (uchar)(r >> 64);
      73  4125672448 : }
      74             : 
      75             : #else
      76             : 
      77             : INLINE void
      78             : fd_ulong_mul128( ulong * l, ulong * h, ulong const a, ulong const b ) {
      79             :     /* First calculate all of the cross products. */
      80             :     ulong lo_lo = (a & 0xFFFFFFFF) * (b & 0xFFFFFFFF);
      81             :     ulong hi_lo = (a >> 32)        * (b & 0xFFFFFFFF);
      82             :     ulong lo_hi = (a & 0xFFFFFFFF) * (b >> 32);
      83             :     ulong hi_hi = (a >> 32)        * (b >> 32);
      84             : 
      85             :     /* Now add the products together. These will never overflow. */
      86             :     ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
      87             :     ulong upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
      88             : 
      89             :     *h = upper;
      90             :     *l = (cross << 32) | (lo_lo & 0xFFFFFFFF);
      91             :   }
      92             : 
      93             : INLINE void
      94             : fd_ulong_vec_mul( ulong l[4], ulong h[4], ulong const a[4], ulong b ) {
      95             :   fd_ulong_mul128( &l[0], &h[0], a[0], b );
      96             :   fd_ulong_mul128( &l[1], &h[1], a[1], b );
      97             :   fd_ulong_mul128( &l[2], &h[2], a[2], b );
      98             :   fd_ulong_mul128( &l[3], &h[3], a[3], b );
      99             : }
     100             : 
     101             : INLINE void
     102             : fd_ulong_add_carry4( ulong *l, uchar *h, ulong a0, ulong a1, ulong a2, uchar a3 ) {
     103             :   ulong r0 = a0 + a1;
     104             :   uchar c0 = r0 < a0;
     105             : 
     106             :   ulong r1 = a2 + a3;
     107             :   uchar c1 = r1 < a2;
     108             : 
     109             :   *l = r0 + r1;
     110             :   *h = (uchar)((*l < r0) + c0 + c1);
     111             : }
     112             : 
     113             : #endif
     114             : 
     115             : INLINE fd_uint256_t *
     116             : fd_uint256_add(fd_uint256_t *       r,
     117             :                fd_uint256_t const * a,
     118      224256 :                fd_uint256_t const * b ) {
     119      224256 :   uchar c0;
     120      224256 :   fd_ulong_add_carry4( &r->limbs[0], &c0, a->limbs[0], b->limbs[0], 0, 0 );
     121      224256 :   fd_ulong_add_carry4( &r->limbs[1], &c0, a->limbs[1], b->limbs[1], 0, c0 );
     122      224256 :   fd_ulong_add_carry4( &r->limbs[2], &c0, a->limbs[2], b->limbs[2], 0, c0 );
     123      224256 :   fd_ulong_add_carry4( &r->limbs[3], &c0, a->limbs[3], b->limbs[3], 0, c0 );
     124      224256 :   return r;
     125      224256 : }
     126             : 
     127             : /* fd_uint256_mul_mod_p computes r = a * b mod p, using the CIOS method.
     128             :    r, a, b are in Montgomery representation (p is not).
     129             : 
     130             :    This is an efficient implementation of CIOS that works when (circa) p < 2^255
     131             :    (precisely, p->limbs[3] < (2^64-1)/2 - 1).
     132             :    Alg. 2, https://eprint.iacr.org/2022/1400
     133             :    Code example, for bn254: https://github.com/Consensys/gnark-crypto/blob/v0.12.1/ecc/bn254/fp/element_ops_purego.go#L66
     134             : 
     135             :    In go lang, bits.Add64 has carry 0, 1.
     136             :    We allow the carry to be a uchar, so we can dp a single add chain after each mul.
     137             : 
     138             :    This function is intended to be wrapped into a fd_<field>_mul( r, a, b ).
     139             :    Experimentally we found that:
     140             :    1. We have to force inlining for this function, otherwise compilers tend to reuse
     141             :       the function, introducing overhead.
     142             :    2. In GCC, we have to force loop unrolling optimization *in the outer fd_<field>_mul()*
     143             :       function, otherwise performance degrades significantly.
     144             :       For this we added the macro FD_UINT256_FP_MUL_IMPL. */
     145             : 
     146             : #if FD_HAS_X86 && defined(__BMI2__) && defined(__ADX__)
     147             : __asm__( ".include \"src/ballet/bigint/fd_uint256_mul.inc\"" );
     148             : #endif
     149             : 
     150             : INLINE fd_uint256_t *
     151             : fd_uint256_mul_mod_p( fd_uint256_t *       r,
     152             :                       fd_uint256_t const * a,
     153             :                       fd_uint256_t const * b,
     154             :                       fd_uint256_t const * p,
     155   193348848 :                       ulong const          p_inv ) {
     156    64449616 : #if FD_HAS_X86 && defined(__BMI2__) && defined(__ADX__)
     157    64449616 :   register fd_uint256_t *       _r    __asm__("rdi") = r;
     158    64449616 :   register fd_uint256_t const * _a    __asm__("rsi") = a;
     159    64449616 :   register fd_uint256_t const * _b    __asm__("rdx") = b;
     160    64449616 :   register fd_uint256_t const * _p    __asm__("rcx") = p;
     161    64449616 :   register ulong                _pinv __asm__("r8")  = p_inv;
     162    64449616 :   __asm__ __volatile__ (
     163    64449616 :     "_fd_uint256_mul_mod_p %[r], %[a], %[b], %[p], %[pinv]"
     164    64449616 :     : [r]"+r"(_r), [a]"+r"(_a), [b]"+r"(_b), [pinv]"+r"(_pinv)
     165    64449616 :     : [p]"r"(_p)
     166    64449616 :     : "rax", "r9", "r10", "r11", "cc", "memory"
     167    64449616 :   );
     168    64449616 :   r = _r;
     169             : #else
     170   128899232 :   ulong FD_ALIGNED t[4] = { 0 };
     171   128899232 :   ulong FD_ALIGNED u[4];
     172   128899232 :   ulong FD_ALIGNED h[4];
     173   128899232 :   ulong FD_ALIGNED l[4];
     174   128899232 :   uchar c0, c1;
     175   128899232 :   ulong tmp;
     176   128899232 :   ulong m;
     177             : 
     178   644496160 :   for( int i=0; i<4; i++ ) {
     179   515596928 :     fd_ulong_vec_mul( l, u, a->limbs, b->limbs[i] );
     180   515596928 :     fd_ulong_add_carry4( &t[0], &c0, t[0], l[0], 0, 0 );
     181   515596928 :     fd_ulong_add_carry4( &t[1], &c0, t[1], l[1], u[0], c0 );
     182   515596928 :     fd_ulong_add_carry4( &t[2], &c0, t[2], l[2], u[1], c0 );
     183   515596928 :     fd_ulong_add_carry4( &t[3], &c0, t[3], l[3], u[2], c0 );
     184             : 
     185   515596928 :     m = t[0] * p_inv;
     186             : 
     187   515596928 :     fd_ulong_vec_mul( l, h, p->limbs, m );
     188   515596928 :     fd_ulong_add_carry4( &tmp,  &c1, t[0], l[0], 0, 0 );
     189   515596928 :     fd_ulong_add_carry4( &t[0], &c1, t[1], l[1], h[0], c1 );
     190   515596928 :     fd_ulong_add_carry4( &t[1], &c1, t[2], l[2], h[1], c1 );
     191   515596928 :     fd_ulong_add_carry4( &t[2], &c1, t[3], l[3], h[2], c1 );
     192   515596928 :     t[3] = u[3] + h[3] + c0 + c1;
     193   515596928 :   }
     194             : 
     195   128899232 :   r->limbs[0] = t[0];
     196   128899232 :   r->limbs[1] = t[1];
     197   128899232 :   r->limbs[2] = t[2];
     198   128899232 :   r->limbs[3] = t[3];
     199             : 
     200   128899232 :   if( fd_uint256_cmp( r, p ) >= 0 ) {
     201     5584694 :     int b = 0;
     202     5584694 :     fd_ulong_sub_borrow( &r->limbs[0], &b, r->limbs[0], p->limbs[0], b );
     203     5584694 :     fd_ulong_sub_borrow( &r->limbs[1], &b, r->limbs[1], p->limbs[1], b );
     204     5584694 :     fd_ulong_sub_borrow( &r->limbs[2], &b, r->limbs[2], p->limbs[2], b );
     205     5584694 :     fd_ulong_sub_borrow( &r->limbs[3], &b, r->limbs[3], p->limbs[3], b );
     206     5584694 :   }
     207   128899232 : #endif
     208   193348848 :   return r;
     209   193348848 : }
     210             : 
     211             : /* FD_UINT256_FP_MUL_IMPL macro to properly implement Fp mul based on
     212             :    fd_uint256_mul_mod_p().
     213             :    In GCC we need to explicitly force loop unroll. */
     214             : #define FD_UINT256_FP_MUL_IMPL(fp, p, p_inv)          \
     215             :   static inline fp ## _t * OPTIMIZE                   \
     216             :   fp ## _mul( fp ## _t * r,                           \
     217             :               fp ## _t const * a,                     \
     218   193348848 :               fp ## _t const * b ) {                  \
     219   193348848 :     return fd_uint256_mul_mod_p( r, a, b, p, p_inv ); \
     220   193348848 :   }

Generated by: LCOV version 1.14