Line data Source code
1 : #ifndef HEADER_fd_src_ballet_bigint_uint256_h
2 : #error "Do not include this directly; use fd_uint256.h"
3 : #endif
4 :
5 : /* Implementation of uint256 Montgomery mul.
6 :
7 : TODOs:
8 : - efficient sqr, Alg. 5 https://eprint.iacr.org/2022/1400
9 : - regular CIOS (the current impl *probably* won't work for secp256k1/r1)
10 :
11 : This is used under the hood to implement:
12 : - bn254 field arithmetic
13 : - bn254 scalar field arithmetic
14 : - (TODO) ed25519 scalar field arithmetic
15 : - (TODO) secp256k1 field and scalar arithmetic
16 : - (TODO) secp256r1 field and scalar arithmetic
17 : - ...
18 : */
19 :
20 : #define INLINE static inline __attribute__((always_inline))
21 :
22 : #ifdef FD_USING_GCC
23 : #define OPTIMIZE __attribute__((optimize("unroll-loops")))
24 : #else
25 : #define OPTIMIZE
26 : #endif
27 :
28 : /* Utility functions for fd_uint256_mul_mod_p.
29 : Implementation is based on uint128.
30 : The implementations WITHOUT uint128 are just for completeness, we
31 : should avoid using them (and, e.g., rely on fiat-crypto mul instead). */
32 :
33 : INLINE void
34 : fd_ulong_sub_borrow(
35 : ulong * r, /* out (r=a-b) */
36 : int * b, /* out borrow flag */
37 : ulong a0,
38 : ulong a1,
39 : int bi /* in borrow flag */
40 1396587 : ) {
41 1396587 : # if FD_HAS_X86
42 1396587 : *b = (uchar)_subborrow_u64( (uchar)bi, a0, a1, (unsigned long long *)r );
43 : # else
44 : a1 += !!bi;
45 : *r = a0 - a1;
46 : *b = a0 < a1;
47 : # endif
48 1396587 : }
49 :
50 : #if FD_HAS_INT128
51 :
52 : INLINE void
53 65204328 : fd_ulong_vec_mul( ulong l[4], ulong h[4], ulong const a[4], ulong b ) {
54 65204328 : uint128 r0 = ((uint128)a[0]) * ((uint128)b);
55 65204328 : uint128 r1 = ((uint128)a[1]) * ((uint128)b);
56 65204328 : uint128 r2 = ((uint128)a[2]) * ((uint128)b);
57 65204328 : uint128 r3 = ((uint128)a[3]) * ((uint128)b);
58 65204328 : l[0] = (ulong)r0; h[0] = (ulong)(r0 >> 64);
59 65204328 : l[1] = (ulong)r1; h[1] = (ulong)(r1 >> 64);
60 65204328 : l[2] = (ulong)r2; h[2] = (ulong)(r2 >> 64);
61 65204328 : l[3] = (ulong)r3; h[3] = (ulong)(r3 >> 64);
62 65204328 : }
63 :
64 : INLINE void
65 261075360 : fd_ulong_add_carry4( ulong *l, uchar *h, ulong a0, ulong a1, ulong a2, uchar a3 ) {
66 261075360 : uint128 r = ((uint128)a0) + ((uint128)a1) + ((uint128)a2) + ((uint128)a3);
67 261075360 : *l = (ulong)r;
68 261075360 : *h = (uchar)(r >> 64);
69 261075360 : }
70 :
71 : #else
72 :
73 : INLINE void
74 : fd_ulong_mul128( ulong * l, ulong * h, ulong const a, ulong const b ) {
75 : /* First calculate all of the cross products. */
76 : ulong lo_lo = (a & 0xFFFFFFFF) * (b & 0xFFFFFFFF);
77 : ulong hi_lo = (a >> 32) * (b & 0xFFFFFFFF);
78 : ulong lo_hi = (a & 0xFFFFFFFF) * (b >> 32);
79 : ulong hi_hi = (a >> 32) * (b >> 32);
80 :
81 : /* Now add the products together. These will never overflow. */
82 : ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
83 : ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
84 :
85 : *h = upper;
86 : *l = (cross << 32) | (lo_lo & 0xFFFFFFFF);
87 : }
88 :
89 : INLINE void
90 : fd_ulong_vec_mul( ulong l[4], ulong h[4], ulong const a[4], ulong b ) {
91 : fd_ulong_mul128( &l[0], &h[0], a[0], b );
92 : fd_ulong_mul128( &l[1], &h[1], a[1], b );
93 : fd_ulong_mul128( &l[2], &h[2], a[2], b );
94 : fd_ulong_mul128( &l[3], &h[3], a[3], b );
95 : }
96 :
97 : INLINE void
98 : fd_ulong_add_carry4( ulong *l, uchar *h, ulong a0, ulong a1, ulong a2, uchar a3 ) {
99 : ulong r0 = a0 + a1;
100 : uchar c0 = r0 < a0;
101 :
102 : ulong r1 = a2 + a3;
103 : uchar c1 = r1 < a2;
104 :
105 : *l = r0 + r1;
106 : *h = (uchar)((*l < r0) + c0 + c1);
107 : }
108 :
109 : #endif
110 :
111 : INLINE fd_uint256_t *
112 : fd_uint256_add(fd_uint256_t * r,
113 : fd_uint256_t const * a,
114 64512 : fd_uint256_t const * b ) {
115 64512 : uchar c0;
116 64512 : fd_ulong_add_carry4( &r->limbs[0], &c0, a->limbs[0], b->limbs[0], 0, 0 );
117 64512 : fd_ulong_add_carry4( &r->limbs[1], &c0, a->limbs[1], b->limbs[1], 0, c0 );
118 64512 : fd_ulong_add_carry4( &r->limbs[2], &c0, a->limbs[2], b->limbs[2], 0, c0 );
119 64512 : fd_ulong_add_carry4( &r->limbs[3], &c0, a->limbs[3], b->limbs[3], 0, c0 );
120 64512 : return r;
121 64512 : }
122 :
123 : /* fd_uint256_mul_mod_p computes r = a * b mod p, using the CIOS method.
124 : r, a, b are in Montgomery representation (p is not).
125 :
126 : This is an efficient implementation of CIOS that works when (circa) p < 2^255
127 : (precisely, p->limbs[3] < (2^64-1)/2 - 1).
128 : Alg. 2, https://eprint.iacr.org/2022/1400
129 : Code example, for bn254: https://github.com/Consensys/gnark-crypto/blob/v0.12.1/ecc/bn254/fp/element_ops_purego.go#L66
130 :
131 : In go lang, bits.Add64 has carry 0, 1.
132 : We allow the carry to be a uchar, so we can dp a single add chain after each mul.
133 :
134 : This function is intended to be wrapped into a fd_<field>_mul( r, a, b ).
135 : Experimentally we found that:
136 : 1. We have to force inlining for this function, otherwise compilers tend to reuse
137 : the function, introducing overhead.
138 : 2. In GCC, we have to force loop unrolling optimization *in the outer fd_<field>_mul()*
139 : function, otherwise performance degrades significantly.
140 : For this we added the macro FD_UINT256_FP_MUL_IMPL. */
141 :
142 : INLINE fd_uint256_t *
143 : fd_uint256_mul_mod_p( fd_uint256_t * r,
144 : fd_uint256_t const * a,
145 : fd_uint256_t const * b,
146 : fd_uint256_t const * p,
147 8150541 : ulong const p_inv ) {
148 8150541 : ulong FD_ALIGNED t[4] = { 0 };
149 8150541 : ulong FD_ALIGNED u[4];
150 8150541 : ulong FD_ALIGNED h[4];
151 8150541 : ulong FD_ALIGNED l[4];
152 8150541 : uchar c0, c1;
153 8150541 : ulong tmp;
154 8150541 : ulong m;
155 :
156 40752705 : for( int i=0; i<4; i++ ) {
157 32602164 : fd_ulong_vec_mul( l, u, a->limbs, b->limbs[i] );
158 32602164 : fd_ulong_add_carry4( &t[0], &c0, t[0], l[0], 0, 0 );
159 32602164 : fd_ulong_add_carry4( &t[1], &c0, t[1], l[1], u[0], c0 );
160 32602164 : fd_ulong_add_carry4( &t[2], &c0, t[2], l[2], u[1], c0 );
161 32602164 : fd_ulong_add_carry4( &t[3], &c0, t[3], l[3], u[2], c0 );
162 :
163 32602164 : m = t[0] * p_inv;
164 :
165 32602164 : fd_ulong_vec_mul( l, h, p->limbs, m );
166 32602164 : fd_ulong_add_carry4( &tmp, &c1, t[0], l[0], 0, 0 );
167 32602164 : fd_ulong_add_carry4( &t[0], &c1, t[1], l[1], h[0], c1 );
168 32602164 : fd_ulong_add_carry4( &t[1], &c1, t[2], l[2], h[1], c1 );
169 32602164 : fd_ulong_add_carry4( &t[2], &c1, t[3], l[3], h[2], c1 );
170 32602164 : t[3] = u[3] + h[3] + c0 + c1;
171 32602164 : }
172 :
173 8150541 : r->limbs[0] = t[0];
174 8150541 : r->limbs[1] = t[1];
175 8150541 : r->limbs[2] = t[2];
176 8150541 : r->limbs[3] = t[3];
177 :
178 8150541 : if( fd_uint256_cmp( r, p ) >= 0 ) {
179 349143 : int b = 0;
180 349143 : fd_ulong_sub_borrow( &r->limbs[0], &b, r->limbs[0], p->limbs[0], b );
181 349143 : fd_ulong_sub_borrow( &r->limbs[1], &b, r->limbs[1], p->limbs[1], b );
182 349143 : fd_ulong_sub_borrow( &r->limbs[2], &b, r->limbs[2], p->limbs[2], b );
183 349143 : fd_ulong_sub_borrow( &r->limbs[3], &b, r->limbs[3], p->limbs[3], b );
184 349143 : }
185 8150541 : return r;
186 8150541 : }
187 :
188 : /* FD_UINT256_FP_MUL_IMPL macro to properly implement Fp mul based on
189 : fd_uint256_mul_mod_p().
190 : In GCC we need to explicitly force loop unroll. */
191 : #define FD_UINT256_FP_MUL_IMPL(fp, p, p_inv) \
192 : static inline fp ## _t * OPTIMIZE \
193 : fp ## _mul( fp ## _t * r, \
194 : fp ## _t const * a, \
195 8150541 : fp ## _t const * b ) { \
196 8150541 : return fd_uint256_mul_mod_p( r, a, b, p, p_inv ); \
197 8150541 : }
|