Line data Source code
1 : /* Included by fd_bn254_g1.c and fd_bn254_g2.c, should not be used elsewhere. */
2 :
3 : /* Rundown of the BN254 GLV implementation:
4 :
5 : Context: https://bitcointalk.org/index.php?topic=3238.msg45565#msg45565
6 :
7 : BN254 is a "pairing-friendly" curve E: y^2 = x^3 + 3 over a prime field
8 : Fp, where:
9 : p = 21888242871839275222246405745257275088696311157297823662689037894645226208583
10 :
11 : The group G1 = E(Fp) has prime order:
12 : r = 21888242871839275222246405745257275088548364400416034343698204186575808495617
13 :
14 : A naive approach to scalar multiplication, [s]P, requires ~256 doublings
15 : and ~128 additions (double-and-add) in the worst case. The GLV method
16 : takes advantage of an easy to compute endomorphism to cut this roughly
17 : in half.
18 :
19 : For BN254, there exists a cube root of unity beta in Fp such that:
20 : phi: (x, y) -> (beta * x, y)
21 : is an endomorphism of E.
22 : That is, if point P = (x, y) is on the curve, then phi(P) = (beta * x, y)
23 : is also on the curve, since (beta*x)^3 = beta^3 * x^3) = x^3,
24 : so the curve equation is preserved.
25 :
26 : phi satisfies phi(P) = [lambda]P where lambda is a cube root of unity
27 : modulo r (the group order we defined above^):
28 : lambda = 4407920970296243842393367215006156084916469457145843978461
29 : (a root of X^2 + X + 1 = 0 mod r)
30 :
31 : Computing phi(P) = (beta * x, y) costs just one FP multiplication, but
32 : is equivalent to a full scalar multiplication by the ~254-bit scalar lambda.
33 :
34 : We want to "decompose" the input scalar in a way where a majority of the
35 : effort is re-used by the lambda computation, leaving the remaining operations
36 : smaller and faster to perform. Given a scalar s, we want to find small
37 : k1, k2 (each ~128-bit) such that:
38 : s = k1 + k2 * lambda (mod r)
39 : Then: [s]P = [k1]P + [k2](lambda * P) = [k1]P + [k2]phi(P).
40 :
41 : Using the "Straus-Shamir trick", https://pmc.ncbi.nlm.nih.gov/articles/PMC9028562/#app1-sensors-22-03083,
42 : this requires only ~128 doublings + ~128 additions instead of ~256 doublings.
43 :
44 : To decompose the scalar, we have a 2-dimensional lattice
45 : L = { (a, b) in Z^2 : a + b*lambda = 0 (mod r) }
46 :
47 : A reduced basis of L is given by three magnitudes:
48 : N_A = 147946756881789319000765030803803410728 (~128 bits, 2 limbs)
49 : N_B = 9931322734385697763 (~64 bits, 1 limb)
50 : N_C = 147946756881789319010696353538189108491 (~128 bits, 2 limbs)
51 :
52 : These are arranged differently depending on the group.
53 : G1:
54 : | +N_A +N_B |
55 : | -N_B +N_C |
56 : G2:
57 : | -N_C -N_B |
58 : | +N_B -N_A |
59 :
60 : We avoid big integer divisions by r using Babai's algorithm with
61 : precomputed fixed-point inverses:
62 : b1 = (s * g1) >> 256, where for group:
63 : G1: g1 = round(2^256 * N_C / r)
64 : G2: g1 = round(2^256 * N_A / r)
65 : b2 = (s * g2) >> 256 g2 = round(2^256 * N_B / r), 66-bit, 2 limbs)
66 :
67 : Then:
68 : G1: k1 = s - b1*N_A - b2*N_B, k2 = b1*N_B - b2*N_C
69 : G2: k1 = s - b1*N_C - b2*N_B, k2 = b2*N_A - b1*N_B
70 :
71 : For G1, k1 >= 0 always, k2 may be negative.
72 : For G2, both k1 and k2 may be negative. */
73 :
74 : /* beta in Montgomery form.
75 : 0x30644e72e131a0295e6dd9e7e0acccb0c28f069fbb966e3de4bd44e5607cfd48 */
76 : const fd_bn254_fp_t fd_bn254_const_beta_mont[1] = {{{
77 : 0x3350c88e13e80b9cUL, 0x7dce557cdb5e56b9UL, 0x6001b4b8b615564aUL, 0x2682e617020217e0UL
78 : }}};
79 :
80 : /* Lattice constants, see glv.py */
81 : const ulong na[ 2 ] = { 0x8211bbeb7d4f1128UL, 0x6f4d8248eeb859fcUL };
82 : const ulong nb[ 1 ] = { 0x89d3256894d213e3UL };
83 : const ulong nc[ 2 ] = { 0x0be4e1541221250bUL, 0x6f4d8248eeb859fdUL };
84 :
85 : /* g2 = round(2^256 * N_B / r), 66-bit (2 limbs). Same for G1 and G2. */
86 : const ulong g2_const[ 2 ] = { 0xd91d232ec7e0b3d7UL, 0x0000000000000002UL };
87 :
88 : /* Multiply 4-limb scalar s by a 3-limb constant g.
89 : Returns top 3 limbs. */
90 : static inline void
91 : fd_bn254_glv_sxg3( ulong out[ 3 ],
92 : fd_bn254_scalar_t const * s,
93 31629 : ulong const g[ 3 ] ) {
94 31629 : uint128 s0 = s->limbs[0];
95 31629 : uint128 s1 = s->limbs[1];
96 31629 : uint128 s2 = s->limbs[2];
97 31629 : uint128 s3 = s->limbs[3];
98 31629 : uint128 acc;
99 31629 : acc = s0 * g[ 0 ];
100 31629 : acc = s1 * g[ 0 ] + s0 * g[ 1 ] + (ulong)(acc >> 64);
101 31629 : acc = s2 * g[ 0 ] + s1 * g[ 1 ] + s0 * g[ 2 ] + (ulong)(acc >> 64);
102 31629 : acc = s3 * g[ 0 ] + s2 * g[ 1 ] + s1 * g[ 2 ] + (ulong)(acc >> 64);
103 31629 : acc = s3 * g[ 1 ] + s2 * g[ 2 ] + (ulong)(acc >> 64); out[ 0 ] = (ulong)acc;
104 31629 : acc = s3 * g[ 2 ] + (ulong)(acc >> 64); out[ 1 ] = (ulong)acc;
105 31629 : acc = (ulong)(acc >> 64); out[ 2 ] = (ulong)acc;
106 31629 : }
107 :
108 : /* Same, but for a 2-limb constant g. */
109 : static inline void
110 : fd_bn254_glv_sxg2( ulong out[ 2 ],
111 : fd_bn254_scalar_t const * s,
112 31629 : ulong const g[ 2 ] ) {
113 31629 : uint128 s0 = s->limbs[0];
114 31629 : uint128 s1 = s->limbs[1];
115 31629 : uint128 s2 = s->limbs[2];
116 31629 : uint128 s3 = s->limbs[3];
117 31629 : uint128 acc;
118 31629 : acc = s0 * g[ 0 ];
119 31629 : acc = s1 * g[ 0 ] + s0 * g[ 1 ] + (ulong)(acc >> 64);
120 31629 : acc = s2 * g[ 0 ] + s1 * g[ 1 ] + (ulong)(acc >> 64);
121 31629 : acc = s3 * g[ 0 ] + s2 * g[ 1 ] + (ulong)(acc >> 64);
122 31629 : acc = s3 * g[ 1 ] + (ulong)(acc >> 64); out[ 0 ] = (ulong)acc;
123 31629 : acc = (ulong)(acc >> 64); out[ 1 ] = (ulong)acc;
124 31629 : }
125 :
126 : /* Multiply 3-limb a by 2-limb n, store low 4 limbs into out. */
127 : static inline void
128 : fd_bn254_glv_mul3x2( ulong out[ 4 ],
129 : ulong const a[ 3 ],
130 31629 : ulong const n[ 2 ] ) {
131 31629 : uint128 acc;
132 31629 : acc = (uint128)a[ 0 ] * n[ 0 ]; out[ 0 ] = (ulong)acc;
133 31629 : acc = (uint128)a[ 1 ] * n[ 0 ] + (uint128)a[ 0 ] * n[ 1 ] + (ulong)(acc >> 64); out[ 1 ] = (ulong)acc;
134 31629 : acc = (uint128)a[ 2 ] * n[ 0 ] + (uint128)a[ 1 ] * n[ 1 ] + (ulong)(acc >> 64); out[ 2 ] = (ulong)acc;
135 31629 : acc = (uint128)a[ 2 ] * n[ 1 ] + (ulong)(acc >> 64); out[ 3 ] = (ulong)acc;
136 31629 : }
137 :
138 : /* Multiply 3-limb by 1-limb, store into 4-limb. */
139 : static inline void
140 : fd_bn254_glv_mul3x1( ulong out[ 4 ],
141 : ulong const a[ 3 ],
142 31629 : ulong const n[ 1 ] ) {
143 31629 : uint128 acc;
144 31629 : acc = (uint128)a[ 0 ] * n[ 0 ]; out[ 0 ] = (ulong)acc;
145 31629 : acc = (uint128)a[ 1 ] * n[ 0 ] + (ulong)(acc >> 64); out[ 1 ] = (ulong)acc;
146 31629 : acc = (uint128)a[ 2 ] * n[ 0 ] + (ulong)(acc >> 64); out[ 2 ] = (ulong)acc;
147 31629 : acc = (ulong)(acc >> 64); out[ 3 ] = (ulong)acc;
148 31629 : }
149 :
150 : /* Multiply 2-limb by 2-limb, store into 4-limb. */
151 : static inline void
152 : fd_bn254_glv_mul2x2( ulong out[ 4 ],
153 : ulong const a[ 2 ],
154 31629 : ulong const n[ 2 ] ) {
155 31629 : uint128 acc;
156 31629 : acc = (uint128)a[ 0 ] * n[ 0 ]; out[ 0 ] = (ulong)acc;
157 31629 : acc = (uint128)a[ 1 ] * n[ 0 ] + (uint128)a[ 0 ] * n[ 1 ] + (ulong)(acc >> 64); out[ 1 ] = (ulong)acc;
158 31629 : acc = (uint128)a[ 1 ] * n[ 1 ] + (ulong)(acc >> 64); out[ 2 ] = (ulong)acc;
159 31629 : acc = (ulong)(acc >> 64); out[ 3 ] = (ulong)acc;
160 31629 : }
161 :
162 : /* Multiply 2-limb by 1-limb, stores 3-limbs. */
163 : static inline void
164 : fd_bn254_glv_mul2x1( ulong out[ 3 ],
165 : ulong const a[ 2 ],
166 31629 : ulong const n[ 1 ] ) {
167 31629 : uint128 acc;
168 31629 : acc = (uint128)a[ 0 ] * n[ 0 ]; out[ 0 ] = (ulong)acc;
169 31629 : acc = (uint128)a[ 1 ] * n[ 0 ] + (ulong)(acc >> 64); out[ 1 ] = (ulong)acc;
170 31629 : acc = (ulong)(acc >> 64); out[ 2 ] = (ulong)acc;
171 31629 : }
172 :
173 : /* 4-limb addition: out = a + b. Returns carry. */
174 : static inline ulong
175 : fd_bn254_glv_add4( ulong out[ 4 ],
176 : ulong const a[ 4 ],
177 31629 : ulong const b[ 4 ] ) {
178 31629 : ulong carry = 0;
179 158145 : for( int j = 0; j < 4; j++ ) {
180 126516 : uint128 acc = (uint128)a[ j ] + b[ j ] + carry;
181 126516 : out[ j ] = (ulong)acc;
182 126516 : carry = (ulong)(acc >> 64);
183 126516 : }
184 31629 : return carry;
185 31629 : }
186 :
187 : /* 4-limb subtraction: out = a - b. Returns borrow (1 if a < b). */
188 : static inline ulong
189 : fd_bn254_glv_sub4( ulong out[ 4 ],
190 : ulong const a[ 4 ],
191 63258 : ulong const b[ 4 ] ) {
192 63258 : ulong borrow = 0;
193 316290 : for( int j = 0; j < 4; j++ ) {
194 253032 : ulong av = a[ j ];
195 253032 : ulong bv = b[ j ];
196 253032 : ulong diff = av - bv - borrow;
197 253032 : borrow = ( av < bv || (borrow && av == bv) ) ? 1UL : 0UL;
198 253032 : out[ j ] = diff;
199 253032 : }
200 63258 : return borrow;
201 63258 : }
202 :
203 : /* Two's complement negation of a 4-limb value in-place. */
204 : static inline void
205 306 : fd_bn254_glv_negate4( ulong v[ 4 ] ) {
206 306 : ulong carry = 1;
207 1530 : for( int j = 0; j < 4; j++ ) {
208 1224 : uint128 sum = (uint128)(~v[ j ]) + carry;
209 1224 : v[ j ] = (ulong)sum;
210 1224 : carry = (ulong)(sum >> 64);
211 1224 : }
212 306 : }
|