Line data Source code
1 : #ifndef HEADER_fd_src_ballet_bigint_uint256_h
2 : #error "Do not include this directly; use fd_uint256.h"
3 : #endif
4 :
5 : /* Implementation of uint256 Montgomery mul.
6 :
7 : TODOs:
8 : - efficient sqr, Alg. 5 https://eprint.iacr.org/2022/1400
9 : - regular CIOS (the current impl *probably* won't work for secp256k1/r1)
10 :
11 : This is used under the hood to implement:
12 : - bn254 field arithmetic
13 : - bn254 scalar field arithmetic
14 : - (TODO) ed25519 scalar field arithmetic
15 : - (TODO) secp256k1 field and scalar arithmetic
16 : - (TODO) secp256r1 field and scalar arithmetic
17 : - ...
18 : */
19 :
20 : #define INLINE static inline __attribute__((always_inline))
21 :
22 : #ifdef FD_USING_GCC
23 : #define OPTIMIZE __attribute__((optimize("unroll-loops")))
24 : #else
25 : #define OPTIMIZE
26 : #endif
27 :
28 : #if FD_HAS_X86
29 : #include <x86intrin.h>
30 : #endif
31 :
32 : /* Utility functions for fd_uint256_mul_mod_p.
33 : Implementation is based on uint128.
34 : The implementations WITHOUT uint128 are just for completeness, we
35 : should avoid using them (and, e.g., rely on fiat-crypto mul instead). */
36 :
37 : INLINE void
38 : fd_ulong_sub_borrow(
39 : ulong * r, /* out (r=a-b) */
40 : int * b, /* out borrow flag */
41 : ulong a0,
42 : ulong a1,
43 : int bi /* in borrow flag */
44 15 : ) {
45 15 : # if FD_HAS_X86
46 15 : *b = (uchar)_subborrow_u64( (uchar)bi, a0, a1, (unsigned long long *)r );
47 : # else
48 : a1 += !!bi;
49 : *r = a0 - a1;
50 : *b = a0 < a1;
51 : # endif
52 15 : }
53 :
54 : #if FD_HAS_INT128
55 :
56 : INLINE void
57 0 : fd_ulong_vec_mul( ulong l[4], ulong h[4], ulong const a[4], ulong b ) {
58 0 : uint128 r0 = ((uint128)a[0]) * ((uint128)b);
59 0 : uint128 r1 = ((uint128)a[1]) * ((uint128)b);
60 0 : uint128 r2 = ((uint128)a[2]) * ((uint128)b);
61 0 : uint128 r3 = ((uint128)a[3]) * ((uint128)b);
62 0 : l[0] = (ulong)r0; h[0] = (ulong)(r0 >> 64);
63 0 : l[1] = (ulong)r1; h[1] = (ulong)(r1 >> 64);
64 0 : l[2] = (ulong)r2; h[2] = (ulong)(r2 >> 64);
65 0 : l[3] = (ulong)r3; h[3] = (ulong)(r3 >> 64);
66 0 : }
67 :
68 : INLINE void
69 0 : fd_ulong_add_carry4( ulong *l, uchar *h, ulong a0, ulong a1, ulong a2, uchar a3 ) {
70 0 : uint128 r = ((uint128)a0) + ((uint128)a1) + ((uint128)a2) + ((uint128)a3);
71 0 : *l = (ulong)r;
72 0 : *h = (uchar)(r >> 64);
73 0 : }
74 :
75 : #else
76 :
77 : INLINE void
78 : fd_ulong_mul128( ulong * l, ulong * h, ulong const a, ulong const b ) {
79 : /* First calculate all of the cross products. */
80 : ulong lo_lo = (a & 0xFFFFFFFF) * (b & 0xFFFFFFFF);
81 : ulong hi_lo = (a >> 32) * (b & 0xFFFFFFFF);
82 : ulong lo_hi = (a & 0xFFFFFFFF) * (b >> 32);
83 : ulong hi_hi = (a >> 32) * (b >> 32);
84 :
85 : /* Now add the products together. These will never overflow. */
86 : ulong cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
87 : ulong upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
88 :
89 : *h = upper;
90 : *l = (cross << 32) | (lo_lo & 0xFFFFFFFF);
91 : }
92 :
93 : INLINE void
94 : fd_ulong_vec_mul( ulong l[4], ulong h[4], ulong const a[4], ulong b ) {
95 : fd_ulong_mul128( &l[0], &h[0], a[0], b );
96 : fd_ulong_mul128( &l[1], &h[1], a[1], b );
97 : fd_ulong_mul128( &l[2], &h[2], a[2], b );
98 : fd_ulong_mul128( &l[3], &h[3], a[3], b );
99 : }
100 :
101 : INLINE void
102 : fd_ulong_add_carry4( ulong *l, uchar *h, ulong a0, ulong a1, ulong a2, uchar a3 ) {
103 : ulong r0 = a0 + a1;
104 : uchar c0 = r0 < a0;
105 :
106 : ulong r1 = a2 + a3;
107 : uchar c1 = r1 < a2;
108 :
109 : *l = r0 + r1;
110 : *h = (uchar)((*l < r0) + c0 + c1);
111 : }
112 :
113 : #endif
114 :
115 : INLINE fd_uint256_t *
116 : fd_uint256_add(fd_uint256_t * r,
117 : fd_uint256_t const * a,
118 0 : fd_uint256_t const * b ) {
119 0 : uchar c0;
120 0 : fd_ulong_add_carry4( &r->limbs[0], &c0, a->limbs[0], b->limbs[0], 0, 0 );
121 0 : fd_ulong_add_carry4( &r->limbs[1], &c0, a->limbs[1], b->limbs[1], 0, c0 );
122 0 : fd_ulong_add_carry4( &r->limbs[2], &c0, a->limbs[2], b->limbs[2], 0, c0 );
123 0 : fd_ulong_add_carry4( &r->limbs[3], &c0, a->limbs[3], b->limbs[3], 0, c0 );
124 0 : return r;
125 0 : }
126 :
127 : /* fd_uint256_mul_mod_p computes r = a * b mod p, using the CIOS method.
128 : r, a, b are in Montgomery representation (p is not).
129 :
130 : This is an efficient implementation of CIOS that works when (circa) p < 2^255
131 : (precisely, p->limbs[3] < (2^64-1)/2 - 1).
132 : Alg. 2, https://eprint.iacr.org/2022/1400
133 : Code example, for bn254: https://github.com/Consensys/gnark-crypto/blob/v0.12.1/ecc/bn254/fp/element_ops_purego.go#L66
134 :
135 : In go lang, bits.Add64 has carry 0, 1.
136 : We allow the carry to be a uchar, so we can dp a single add chain after each mul.
137 :
138 : This function is intended to be wrapped into a fd_<field>_mul( r, a, b ).
139 : Experimentally we found that:
140 : 1. We have to force inlining for this function, otherwise compilers tend to reuse
141 : the function, introducing overhead.
142 : 2. In GCC, we have to force loop unrolling optimization *in the outer fd_<field>_mul()*
143 : function, otherwise performance degrades significantly.
144 : For this we added the macro FD_UINT256_FP_MUL_IMPL. */
145 :
146 : INLINE fd_uint256_t *
147 : fd_uint256_mul_mod_p( fd_uint256_t * r,
148 : fd_uint256_t const * a,
149 : fd_uint256_t const * b,
150 : fd_uint256_t const * p,
151 0 : ulong const p_inv ) {
152 0 : ulong FD_ALIGNED t[4] = { 0 };
153 0 : ulong FD_ALIGNED u[4];
154 0 : ulong FD_ALIGNED h[4];
155 0 : ulong FD_ALIGNED l[4];
156 0 : uchar c0, c1;
157 0 : ulong tmp;
158 0 : ulong m;
159 :
160 0 : for( int i=0; i<4; i++ ) {
161 0 : fd_ulong_vec_mul( l, u, a->limbs, b->limbs[i] );
162 0 : fd_ulong_add_carry4( &t[0], &c0, t[0], l[0], 0, 0 );
163 0 : fd_ulong_add_carry4( &t[1], &c0, t[1], l[1], u[0], c0 );
164 0 : fd_ulong_add_carry4( &t[2], &c0, t[2], l[2], u[1], c0 );
165 0 : fd_ulong_add_carry4( &t[3], &c0, t[3], l[3], u[2], c0 );
166 :
167 0 : m = t[0] * p_inv;
168 :
169 0 : fd_ulong_vec_mul( l, h, p->limbs, m );
170 0 : fd_ulong_add_carry4( &tmp, &c1, t[0], l[0], 0, 0 );
171 0 : fd_ulong_add_carry4( &t[0], &c1, t[1], l[1], h[0], c1 );
172 0 : fd_ulong_add_carry4( &t[1], &c1, t[2], l[2], h[1], c1 );
173 0 : fd_ulong_add_carry4( &t[2], &c1, t[3], l[3], h[2], c1 );
174 0 : t[3] = u[3] + h[3] + c0 + c1;
175 0 : }
176 :
177 0 : r->limbs[0] = t[0];
178 0 : r->limbs[1] = t[1];
179 0 : r->limbs[2] = t[2];
180 0 : r->limbs[3] = t[3];
181 :
182 0 : if( fd_uint256_cmp( r, p ) >= 0 ) {
183 0 : int b = 0;
184 0 : fd_ulong_sub_borrow( &r->limbs[0], &b, r->limbs[0], p->limbs[0], b );
185 0 : fd_ulong_sub_borrow( &r->limbs[1], &b, r->limbs[1], p->limbs[1], b );
186 0 : fd_ulong_sub_borrow( &r->limbs[2], &b, r->limbs[2], p->limbs[2], b );
187 0 : fd_ulong_sub_borrow( &r->limbs[3], &b, r->limbs[3], p->limbs[3], b );
188 0 : }
189 0 : return r;
190 0 : }
191 :
/* FD_UINT256_FP_MUL_IMPL(fp, p, p_inv) emits the thin wrapper
       static inline fp##_t * fp##_mul( r, a, b )
   forwarding to fd_uint256_mul_mod_p() with the field's modulus p and
   the constant p_inv baked in (presumably p_inv = -p^-1 mod 2^64 per
   the CIOS algorithm cited above -- confirm against each field's
   constants).  OPTIMIZE forces loop unrolling under GCC: per the note
   above fd_uint256_mul_mod_p(), the unroll must happen in this outer
   wrapper or performance degrades significantly. */
#define FD_UINT256_FP_MUL_IMPL(fp, p, p_inv)  \
  static inline fp ## _t * OPTIMIZE           \
  fp ## _mul( fp ## _t * r,                   \
              fp ## _t const * a,             \
              fp ## _t const * b ) {          \
    return fd_uint256_mul_mod_p( r, a, b, p, p_inv ); \
  }
|