LCOV - code coverage report
Current view: top level - ballet/bigint - fd_uint256_mul.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 65 65 100.0 %
Date: 2024-11-13 11:58:15 Functions: 12 36 33.3 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_ballet_bigint_uint256_h
       2             : #error "Do not include this directly; use fd_uint256.h"
       3             : #endif
       4             : 
       5             : /* Implementation of uint256 Montgomery mul.
       6             : 
       7             :    TODOs:
       8             :    - efficient sqr, Alg. 5 https://eprint.iacr.org/2022/1400
       9             :    - regular CIOS (the current impl *probably* won't work for secp256k1/r1)
      10             : 
      11             :    This is used under the hood to implement:
      12             :    - bn254 field arithmetic
      13             :    - bn254 scalar field arithmetic
      14             :    - (TODO) ed25519 scalar field arithmetic
      15             :    - (TODO) secp256k1 field and scalar arithmetic
      16             :    - (TODO) secp256r1 field and scalar arithmetic
      17             :    - ...
      18             :  */
      19             : 
      20             : #define INLINE static inline __attribute__((always_inline))
      21             : 
      22             : #ifdef FD_USING_GCC
      23             : #define OPTIMIZE __attribute__((optimize("unroll-loops")))
      24             : #else
      25             : #define OPTIMIZE
      26             : #endif
      27             : 
      28             : /* Utility functions for fd_uint256_mul_mod_p.
      29             :    Implementation is based on uint128.
      30             :    The implementations WITHOUT uint128 are just for completeness, we
      31             :    should avoid using them (and, e.g., rely on fiat-crypto mul instead). */
      32             : 
      33             : INLINE void
      34             : fd_ulong_sub_borrow(
      35             :     ulong * r,   /* out (r=a-b) */
      36             :     int *   b,   /* out borrow flag */
      37             :     ulong   a0,
      38             :     ulong   a1,
      39             :     int     bi   /* in borrow flag */
      40     1396587 : ) {
      41     1396587 : # if FD_HAS_X86
      42     1396587 :   *b = (uchar)_subborrow_u64( (uchar)bi, a0, a1, (unsigned long long *)r );
      43             : # else
      44             :   a1 += !!bi;
      45             :   *r = a0 - a1;
      46             :   *b = a0 < a1;
      47             : # endif
      48     1396587 : }
      49             : 
      50             : #if FD_HAS_INT128
      51             : 
      52             : INLINE void
      53    65204328 : fd_ulong_vec_mul( ulong l[4], ulong h[4], ulong const a[4], ulong b ) {
      54    65204328 :   uint128 r0 = ((uint128)a[0]) * ((uint128)b);
      55    65204328 :   uint128 r1 = ((uint128)a[1]) * ((uint128)b);
      56    65204328 :   uint128 r2 = ((uint128)a[2]) * ((uint128)b);
      57    65204328 :   uint128 r3 = ((uint128)a[3]) * ((uint128)b);
      58    65204328 :   l[0] = (ulong)r0;  h[0] = (ulong)(r0 >> 64);
      59    65204328 :   l[1] = (ulong)r1;  h[1] = (ulong)(r1 >> 64);
      60    65204328 :   l[2] = (ulong)r2;  h[2] = (ulong)(r2 >> 64);
      61    65204328 :   l[3] = (ulong)r3;  h[3] = (ulong)(r3 >> 64);
      62    65204328 : }
      63             : 
      64             : INLINE void
      65   261075360 : fd_ulong_add_carry4( ulong *l, uchar *h, ulong a0, ulong a1, ulong a2, uchar a3 ) {
      66   261075360 :   uint128 r = ((uint128)a0) + ((uint128)a1) + ((uint128)a2) + ((uint128)a3);
      67   261075360 :   *l = (ulong)r;
      68   261075360 :   *h = (uchar)(r >> 64);
      69   261075360 : }
      70             : 
      71             : #else
      72             : 
      73             : INLINE void
      74             : fd_ulong_mul128( ulong * l, ulong * h, ulong const a, ulong const b ) {
      75             :     /* First calculate all of the cross products. */
      76             :     ulong lo_lo = (a & 0xFFFFFFFF) * (b & 0xFFFFFFFF);
      77             :     ulong hi_lo = (a >> 32)        * (b & 0xFFFFFFFF);
      78             :     ulong lo_hi = (a & 0xFFFFFFFF) * (b >> 32);
      79             :     ulong hi_hi = (a >> 32)        * (b >> 32);
      80             : 
      81             :     /* Now add the products together. These will never overflow. */
      82             :     ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
      83             :     ulong upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
      84             : 
      85             :     *h = upper;
      86             :     *l = (cross << 32) | (lo_lo & 0xFFFFFFFF);
      87             :   }
      88             : 
      89             : INLINE void
      90             : fd_ulong_vec_mul( ulong l[4], ulong h[4], ulong const a[4], ulong b ) {
      91             :   fd_ulong_mul128( &l[0], &h[0], a[0], b );
      92             :   fd_ulong_mul128( &l[1], &h[1], a[1], b );
      93             :   fd_ulong_mul128( &l[2], &h[2], a[2], b );
      94             :   fd_ulong_mul128( &l[3], &h[3], a[3], b );
      95             : }
      96             : 
      97             : INLINE void
      98             : fd_ulong_add_carry4( ulong *l, uchar *h, ulong a0, ulong a1, ulong a2, uchar a3 ) {
      99             :   ulong r0 = a0 + a1;
     100             :   uchar c0 = r0 < a0;
     101             : 
     102             :   ulong r1 = a2 + a3;
     103             :   uchar c1 = r1 < a2;
     104             : 
     105             :   *l = r0 + r1;
     106             :   *h = (uchar)((*l < r0) + c0 + c1);
     107             : }
     108             : 
     109             : #endif
     110             : 
     111             : INLINE fd_uint256_t *
     112             : fd_uint256_add(fd_uint256_t *       r,
     113             :                fd_uint256_t const * a,
     114       64512 :                fd_uint256_t const * b ) {
     115       64512 :   uchar c0;
     116       64512 :   fd_ulong_add_carry4( &r->limbs[0], &c0, a->limbs[0], b->limbs[0], 0, 0 );
     117       64512 :   fd_ulong_add_carry4( &r->limbs[1], &c0, a->limbs[1], b->limbs[1], 0, c0 );
     118       64512 :   fd_ulong_add_carry4( &r->limbs[2], &c0, a->limbs[2], b->limbs[2], 0, c0 );
     119       64512 :   fd_ulong_add_carry4( &r->limbs[3], &c0, a->limbs[3], b->limbs[3], 0, c0 );
     120       64512 :   return r;
     121       64512 : }
     122             : 
     123             : /* fd_uint256_mul_mod_p computes r = a * b mod p, using the CIOS method.
     124             :    r, a, b are in Montgomery representation (p is not).
     125             : 
     126             :    This is an efficient implementation of CIOS that works when (circa) p < 2^255
     127             :    (precisely, p->limbs[3] < (2^64-1)/2 - 1).
     128             :    Alg. 2, https://eprint.iacr.org/2022/1400
     129             :    Code example, for bn254: https://github.com/Consensys/gnark-crypto/blob/v0.12.1/ecc/bn254/fp/element_ops_purego.go#L66
     130             : 
     131             :    In go lang, bits.Add64 has carry 0, 1.
     132             :    We allow the carry to be a uchar, so we can dp a single add chain after each mul.
     133             : 
     134             :    This function is intended to be wrapped into a fd_<field>_mul( r, a, b ).
     135             :    Experimentally we found that:
     136             :    1. We have to force inlining for this function, otherwise compilers tend to reuse
     137             :       the function, introducing overhead.
     138             :    2. In GCC, we have to force loop unrolling optimization *in the outer fd_<field>_mul()*
     139             :       function, otherwise performance degrades significantly.
     140             :       For this we added the macro FD_UINT256_FP_MUL_IMPL. */
     141             : 
     142             : INLINE fd_uint256_t *
     143             : fd_uint256_mul_mod_p( fd_uint256_t *       r,
     144             :                       fd_uint256_t const * a,
     145             :                       fd_uint256_t const * b,
     146             :                       fd_uint256_t const * p,
     147     8150541 :                       ulong const          p_inv ) {
     148     8150541 :   ulong FD_ALIGNED t[4] = { 0 };
     149     8150541 :   ulong FD_ALIGNED u[4];
     150     8150541 :   ulong FD_ALIGNED h[4];
     151     8150541 :   ulong FD_ALIGNED l[4];
     152     8150541 :   uchar c0, c1;
     153     8150541 :   ulong tmp;
     154     8150541 :   ulong m;
     155             : 
     156    40752705 :   for( int i=0; i<4; i++ ) {
     157    32602164 :     fd_ulong_vec_mul( l, u, a->limbs, b->limbs[i] );
     158    32602164 :     fd_ulong_add_carry4( &t[0], &c0, t[0], l[0], 0, 0 );
     159    32602164 :     fd_ulong_add_carry4( &t[1], &c0, t[1], l[1], u[0], c0 );
     160    32602164 :     fd_ulong_add_carry4( &t[2], &c0, t[2], l[2], u[1], c0 );
     161    32602164 :     fd_ulong_add_carry4( &t[3], &c0, t[3], l[3], u[2], c0 );
     162             : 
     163    32602164 :     m = t[0] * p_inv;
     164             : 
     165    32602164 :     fd_ulong_vec_mul( l, h, p->limbs, m );
     166    32602164 :     fd_ulong_add_carry4( &tmp,  &c1, t[0], l[0], 0, 0 );
     167    32602164 :     fd_ulong_add_carry4( &t[0], &c1, t[1], l[1], h[0], c1 );
     168    32602164 :     fd_ulong_add_carry4( &t[1], &c1, t[2], l[2], h[1], c1 );
     169    32602164 :     fd_ulong_add_carry4( &t[2], &c1, t[3], l[3], h[2], c1 );
     170    32602164 :     t[3] = u[3] + h[3] + c0 + c1;
     171    32602164 :   }
     172             : 
     173     8150541 :   r->limbs[0] = t[0];
     174     8150541 :   r->limbs[1] = t[1];
     175     8150541 :   r->limbs[2] = t[2];
     176     8150541 :   r->limbs[3] = t[3];
     177             : 
     178     8150541 :   if( fd_uint256_cmp( r, p ) >= 0 ) {
     179      349143 :     int b = 0;
     180      349143 :     fd_ulong_sub_borrow( &r->limbs[0], &b, r->limbs[0], p->limbs[0], b );
     181      349143 :     fd_ulong_sub_borrow( &r->limbs[1], &b, r->limbs[1], p->limbs[1], b );
     182      349143 :     fd_ulong_sub_borrow( &r->limbs[2], &b, r->limbs[2], p->limbs[2], b );
     183      349143 :     fd_ulong_sub_borrow( &r->limbs[3], &b, r->limbs[3], p->limbs[3], b );
     184      349143 :   }
     185     8150541 :   return r;
     186     8150541 : }
     187             : 
     188             : /* FD_UINT256_FP_MUL_IMPL macro to properly implement Fp mul based on
     189             :    fd_uint256_mul_mod_p().
     190             :    In GCC we need to explicitly force loop unroll. */
     191             : #define FD_UINT256_FP_MUL_IMPL(fp, p, p_inv)          \
     192             :   static inline fp ## _t * OPTIMIZE                   \
     193             :   fp ## _mul( fp ## _t * r,                           \
     194             :               fp ## _t const * a,                     \
     195     8150541 :               fp ## _t const * b ) {                  \
     196     8150541 :     return fd_uint256_mul_mod_p( r, a, b, p, p_inv ); \
     197     8150541 :   }

Generated by: LCOV version 1.14