Line data Source code
1 : #include "fd_x25519.h"
2 : #include "fd_f25519.h"
3 :
4 : /* FD_X25519_VECTORIZE calls mul4 instead of sqr2+mul2, and similar.
5 : Only useful if the underlying ops are actually vectorized and therefore
6 : the cost of 4 muls is <= the cost of 2 sqr + 2 mul. */
7 : #define FD_X25519_VECTORIZE FD_HAS_AVX512
8 :
9 : /* FD_X25519_ALIGN aligns variables. */
10 : #if FD_HAS_AVX
11 : #define FD_X25519_ALIGN __attribute__((aligned(32)))
12 : #else
13 : #define FD_X25519_ALIGN
14 : #endif
15 :
16 : /*
17 : * Constant time primitives
18 : */
19 :
20 : static inline int FD_FN_SENSITIVE
21 196953 : fd_x25519_is_zero_const_time( uchar const point[ 32 ] ) {
22 : //TODO: this is generally done by (x)or-ing the limbs, see also RFC 7748, page 13.
23 196953 : int is_zero = 1;
24 6499449 : for( ulong i=0UL; i<32UL; i++ ) {
25 6302496 : is_zero &= ( !point[ i ] );
26 6302496 : }
27 196953 : return is_zero;
28 196953 : }
29 :
30 : #if !FD_HAS_AVX512
31 :
32 : static inline void FD_FN_SENSITIVE
33 : fd_x25519_montgomery_ladder( fd_f25519_t * x2,
34 : fd_f25519_t * z2,
35 : fd_f25519_t const * x1,
36 131302 : uchar const * secret_scalar ) {
37 : /* memory areas that will contain (partial) secrets and will be cleared at the end */
38 131302 : fd_f25519_t secret_tmp_f[4];
39 131302 : int swap = 0;
40 131302 : int b = 0;
41 :
42 : /* human-readable variables */
43 131302 : fd_f25519_t * x3 = &secret_tmp_f[0];
44 131302 : fd_f25519_t * z3 = &secret_tmp_f[1];
45 131302 : fd_f25519_t * tmp0 = &secret_tmp_f[2];
46 131302 : fd_f25519_t * tmp1 = &secret_tmp_f[3];
47 :
48 131302 : fd_f25519_set( x2, fd_f25519_one );
49 131302 : fd_f25519_set( z2, fd_f25519_zero );
50 :
51 : /* use fd_f25519_add to reduce x1 mod p. it's prob unnecessary but not worth the risk. */
52 131302 : fd_f25519_add( x3, fd_f25519_zero, x1 );
53 131302 : fd_f25519_set( z3, fd_f25519_one );
54 :
55 33613312 : for( long pos=254UL; pos>=0; pos-- ) {
56 33482010 : b = (secret_scalar[ pos / 8L ] >> ( pos & 7L )) & 1;
57 33482010 : swap ^= b;
58 33482010 : fd_f25519_swap_if( x2, x3, swap );
59 33482010 : fd_f25519_swap_if( z2, z3, swap );
60 33482010 : swap = b;
61 :
62 33482010 : fd_f25519_sub_nr( tmp0, x3, z3 );
63 33482010 : fd_f25519_sub_nr( tmp1, x2, z2 );
64 33482010 : fd_f25519_add_nr( x2, x2, z2 );
65 33482010 : fd_f25519_add_nr( z2, x3, z3 );
66 :
67 : # if FD_X25519_VECTORIZE /* Note that okay to use less efficient squaring because we get it for free in unused vector lanes */
68 : fd_f25519_mul4( z3, tmp0, x2,
69 : z2, z2, tmp1,
70 : tmp0, tmp1, tmp1,
71 : tmp1, x2, x2 );
72 : # else /* Use more efficient squaring if scalar implementation */
73 33482010 : fd_f25519_mul2( z3, tmp0, x2,
74 33482010 : z2, z2, tmp1 );
75 33482010 : fd_f25519_sqr2( tmp0, tmp1,
76 33482010 : tmp1, x2 );
77 33482010 : # endif
78 33482010 : fd_f25519_add_nr( x3, z3, z2 );
79 33482010 : fd_f25519_sub_nr( z2, z3, z2 );
80 : # if FD_X25519_VECTORIZE /* See note above */
81 : fd_f25519_mul2( x2, tmp1, tmp0,
82 : z2, z2, z2 );
83 : # else
84 33482010 : fd_f25519_mul( x2, tmp1, tmp0 );
85 33482010 : fd_f25519_sqr( z2, z2 );
86 33482010 : # endif
87 33482010 : fd_f25519_sub_nr( tmp1, tmp1, tmp0 );
88 :
89 33482010 : fd_f25519_mul_121666( z3, tmp1 );
90 :
91 33482010 : fd_f25519_add_nr( tmp0, tmp0, z3 );
92 : # if FD_X25519_VECTORIZE /* See note above */
93 : fd_f25519_mul3( x3, x3, x3,
94 : z3, x1, z2,
95 : z2, tmp0, tmp1 );
96 : # else
97 33482010 : fd_f25519_sqr ( x3, x3 );
98 33482010 : fd_f25519_mul2( z3, x1, z2,
99 33482010 : z2, tmp1, tmp0 );
100 33482010 : # endif
101 33482010 : }
102 :
103 131302 : fd_f25519_swap_if( x2, x3, swap );
104 131302 : fd_f25519_swap_if( z2, z3, swap );
105 :
106 : /* Sanitize */
107 :
108 131302 : fd_memset_explicit( secret_tmp_f, 0, sizeof(secret_tmp_f) );
109 131302 : fd_memset_explicit( &b, 0, sizeof(int) );
110 131302 : fd_memset_explicit( &swap, 0, sizeof(int) );
111 131302 : }
112 : #else
113 :
114 : /* This is the "transposed" version of the Montgomery ladder above.
115 : Experimentally, this is 15-20% faster on AVX-512. */
116 : static inline void FD_FN_SENSITIVE
117 : fd_x25519_montgomery_ladder( fd_f25519_t * x2,
118 : fd_f25519_t * z2,
119 : fd_f25519_t const * x1,
120 65651 : uchar const * secret_scalar ) {
121 65651 : FD_R43X6_QUAD_DECL( U );
122 65651 : FD_R43X6_QUAD_DECL( Q );
123 65651 : FD_R43X6_QUAD_DECL( P );
124 65651 : FD_R43X6_QUAD_PACK( U, fd_r43x6_zero(),
125 65651 : fd_r43x6_zero(),
126 65651 : fd_r43x6_zero(),
127 65651 : x1->el ); // x_1 = u, in u44
128 65651 : FD_R43X6_QUAD_PACK( Q, fd_r43x6_one(), // x_2 = 1, in u44
129 65651 : fd_r43x6_zero(), // z_2 = 0, in u44
130 65651 : x1->el, // x_3 = u, in u44
131 65651 : fd_r43x6_one() ); // z_3 = 1, in u44
132 65651 : int swap = 0;
133 65651 : int k_t = 0;
134 65651 : wwl_t perm;
135 65651 : fd_r43x6_t AA, E, F, G, H, GG;
136 :
137 16806656 : for( int t=254UL; t>=0; t-- ) { // For t = bits-1 down to 0:
138 :
139 : /* At this point, Q and U in u44|u44|u44|u44 */
140 :
141 16741005 : k_t = (secret_scalar[ t / 8L ] >> ( t & 7L )) & 1; // k_t = (k >> t) & 1;
142 16741005 : swap ^= k_t; // swap ^= k_t
143 16741005 : perm = wwl_if( (-swap) & 0xff, wwl( 2L,3L,0L,1L, 6L,7L,4L,5L ), wwl( 0L,1L,2L,3L, 4L,5L,6L,7L ) );
144 16741005 : Q03 = wwl_permute( perm, Q03 ); // (x_2, x_3) = cswap(swap, x_2, x_3)
145 16741005 : Q14 = wwl_permute( perm, Q14 ); // (z_2, z_3) = cswap(swap, z_2, z_3)
146 16741005 : Q25 = wwl_permute( perm, Q25 );
147 16741005 : swap = k_t; // swap = k_t
148 :
149 : /* These operations are exactly from the RFC but have been reordered
150 : slightly to make it easier to extract ILP. */
151 :
152 16741005 : FD_R43X6_QUAD_PERMUTE ( P, 0,0,2,2, Q ); // A = x_2 + z_2, P = x_2|x_2|x_3 |x_3, in u44|u44|u44|u44
153 16741005 : FD_R43X6_QUAD_PERMUTE ( Q, 1,1,3,3, Q ); // B = x_2 - z_2, Q = z_2|z_2|z_3 |z_3, in u44|u44|u44|u44
154 16741005 : FD_R43X6_QUAD_LANE_ADD_FAST( P, P, 1,0,1,0, P, Q ); // C = x_3 + z_3, P = A |x_2|C |x_3, in u45|u44|u45|u44
155 16741005 : FD_R43X6_QUAD_LANE_SUB_FAST( P, P, 0,1,0,1, P, Q ); // D = x_3 - z_3, P = A |B |C |D, in u45|s44|u45|s44
156 16741005 : FD_R43X6_QUAD_PERMUTE ( Q, 0,1,1,0, P ); // BB = B^2, P = A |B |B |A, in u44|u44|u44|u44
157 16741005 : FD_R43X6_QUAD_MUL_FAST ( P, P, Q ); // DA = D * A, P = AA |BB |CB |DA, in u62|u62|u62|u62
158 16741005 : FD_R43X6_QUAD_FOLD_SIGNED ( P, P ); // DA = D * A, P = AA |BB |CB |DA, in u44|u44|u44|u44
159 16741005 : FD_R43X6_QUAD_PERMUTE ( Q, 1,0,3,2, P ); // CB = C * B, Q = BB |AA |DA |CB, in u62|u62|u62|u62
160 16741005 : FD_R43X6_QUAD_LANE_SUB_FAST( P, P, 0,1,0,1, Q, P ); // E = AA-BB, P = AA |E |CB |CB-DA, in u62|s62|u62|s62
161 16741005 : FD_R43X6_QUAD_LANE_ADD_FAST( P, P, 0,0,1,0, P, Q ); // P = AA |E |DA+CB|CB-DA, in u62|s62|u63|s62
162 16741005 : FD_R43X6_QUAD_LANE_IF ( Q, 0,1,1,0, P, Q ); // Q = BB |E |DA+CB|CB, in u62|u44|u44|u62
163 16741005 : FD_R43X6_QUAD_LANE_IF ( Q, 0,0,0,1, U, Q ); // x_3 = (DA + CB)^2, Q = BB |E |DA+CB|x_1, in u62|u44|u44|u44
164 16741005 : FD_R43X6_QUAD_UNPACK ( AA, E, F, G, P );
165 16741005 : H = fd_r43x6_add_fast( AA, fd_r43x6_scale_fast( 121665L, E ) ); // H = AA + a24 * E, in u60
166 16741005 : GG = fd_r43x6_sqr_fast( G ); // GG = (DA - CB)^2, in u61
167 16741005 : FD_R43X6_QUAD_PACK ( P, AA, H, F, GG ); // z_2 = E * (AA + a24 * E), P = AA |H |DA+CB|GG, in u44|u60|u44|u61
168 16741005 : FD_R43X6_QUAD_FOLD_UNSIGNED( P, P ); // P = AA |H |DA+CB|GG, in u44|u44|u44|u44
169 16741005 : FD_R43X6_QUAD_MUL_FAST ( P, P, Q ); // z_3 = x_1 * (DA - CB)^2, Q = x_2|z_2|x_3 |z_3, in u62|u62|u62|u62
170 16741005 : FD_R43X6_QUAD_FOLD_UNSIGNED( Q, P ); // Q = x_2|z_2|x_3 |z_3, in u44|u44|u44|u44
171 16741005 : }
172 :
173 : /* At this point, Q in u44|u44|u44|u44 */
174 65651 : perm = wwl_if( (-swap) & 0xff, wwl( 2L,3L,0L,1L, 6L,7L,4L,5L ), wwl( 0L,1L,2L,3L, 4L,5L,6L,7L ) );
175 65651 : Q03 = wwl_permute( perm, Q03 ); // (x_2, x_3) = cswap(swap, x_2, x_3)
176 65651 : Q14 = wwl_permute( perm, Q14 ); // (z_2, z_3) = cswap(swap, z_2, z_3)
177 65651 : Q25 = wwl_permute( perm, Q25 );
178 :
179 65651 : FD_R43X6_QUAD_UNPACK( x2->el, z2->el, E, F, Q );
180 :
181 : /* Sanitize */
182 :
183 65651 : fd_memset_explicit( &P03, 0, sizeof(wwl_t) );
184 65651 : fd_memset_explicit( &P14, 0, sizeof(wwl_t) );
185 65651 : fd_memset_explicit( &P25, 0, sizeof(wwl_t) );
186 65651 : fd_memset_explicit( &U03, 0, sizeof(wwl_t) );
187 65651 : fd_memset_explicit( &U14, 0, sizeof(wwl_t) );
188 65651 : fd_memset_explicit( &U25, 0, sizeof(wwl_t) );
189 65651 : fd_memset_explicit( &Q03, 0, sizeof(wwl_t) );
190 65651 : fd_memset_explicit( &Q14, 0, sizeof(wwl_t) );
191 65651 : fd_memset_explicit( &Q25, 0, sizeof(wwl_t) );
192 65651 : fd_memset_explicit( &AA, 0, sizeof(wwl_t) );
193 65651 : fd_memset_explicit( &E, 0, sizeof(wwl_t) );
194 65651 : fd_memset_explicit( &F, 0, sizeof(wwl_t) );
195 65651 : fd_memset_explicit( &G, 0, sizeof(wwl_t) );
196 65651 : fd_memset_explicit( &H, 0, sizeof(wwl_t) );
197 65651 : fd_memset_explicit( &GG, 0, sizeof(wwl_t) );
198 65651 : fd_memset_explicit( &perm, 0, sizeof(wwl_t) );
199 65651 : fd_memset_explicit( &swap, 0, sizeof(int) );
200 65651 : fd_memset_explicit( &k_t, 0, sizeof(int) );
201 :
202 65651 : }
203 : #endif
204 :
205 : /*
206 : * X25519 Protocol
207 : */
208 :
209 : static inline void FD_FN_SENSITIVE
210 : fd_x25519_scalar_mul_const_time( uchar out[ 32 ],
211 : uchar const * secret_scalar,
212 196953 : fd_f25519_t const * point_x ) {
213 196953 : fd_f25519_t x2[1], z2[1];
214 :
215 196953 : fd_x25519_montgomery_ladder( x2, z2, point_x, secret_scalar );
216 :
217 196953 : fd_f25519_inv( z2, z2 );
218 196953 : fd_f25519_mul( x2, x2, z2 );
219 :
220 196953 : fd_f25519_tobytes( out, x2 );
221 196953 : }
222 :
223 : static const uchar fd_x25519_basepoint[ 32 ] FD_X25519_ALIGN = { 9 };
224 :
225 : uchar * FD_FN_SENSITIVE
226 : fd_x25519_public( uchar self_public_key [ 32 ],
227 93354 : uchar const self_private_key[ 32 ] ) {
228 : /* IETF RFC 7748 Section 4.1 (page 3) */
229 93354 : return fd_x25519_exchange( self_public_key, self_private_key, fd_x25519_basepoint );
230 93354 : }
231 :
232 : uchar * FD_FN_SENSITIVE
233 : fd_x25519_exchange( uchar shared_secret [ 32 ],
234 : uchar const self_private_key[ 32 ],
235 196953 : uchar const peer_public_key [ 32 ] ) {
236 :
237 : /* Memory areas that will contain secrets */
238 196953 : uchar secret_scalar[ 32UL ] FD_X25519_ALIGN;
239 :
240 : /* Public local variables */
241 196953 : fd_f25519_t peer_public_key_point_u[1];
242 :
243 : // RFC 7748 - Elliptic Curves for Security
244 : //
245 : // 5. The X25519 and X448 Functions
246 : //
247 : // The "X25519" and "X448" functions perform scalar multiplication on
248 : // the Montgomery form of the above curves. (This is used when
249 : // implementing Diffie-Hellman.) The functions take a scalar and a
250 : // u-coordinate as inputs and produce a u-coordinate as output.
251 : // Although the functions work internally with integers, the inputs and
252 : // outputs are 32-byte strings (for X25519) or 56-byte strings (for
253 : // X448) and this specification defines their encoding.
254 :
255 : // The u-coordinates are elements of the underlying field GF(2^255 - 19)
256 : // or GF(2^448 - 2^224 - 1) and are encoded as an array of bytes, u, in
257 : // little-endian order such that u[0] + 256*u[1] + 256^2*u[2] + ... +
258 : // 256^(n-1)*u[n-1] is congruent to the value modulo p and u[n-1] is
259 : // minimal. When receiving such an array, implementations of X25519
260 : // (but not X448) MUST mask the most significant bit in the final byte.
261 : // This is done to preserve compatibility with point formats that
262 : // reserve the sign bit for use in other protocols and to increase
263 : // resistance to implementation fingerprinting.
264 :
265 : // Implementations MUST accept non-canonical values and process them as
266 : // if they had been reduced modulo the field prime. The non-canonical
267 : // values are 2^255 - 19 through 2^255 - 1 for X25519 and 2^448 - 2^224
268 : // - 1 through 2^448 - 1 for X448.
269 :
270 : /* From the text above:
271 : 1. When receiving such an array, implementations of X25519 [...]
272 : MUST mask the most significant bit in the final byte
273 : >> this is done by fd_f25519_frombytes
274 : 2. Implementations MUST accept non-canonical values
275 : >> no extra check needed */
276 196953 : fd_f25519_frombytes( peer_public_key_point_u, peer_public_key );
277 :
278 : // Scalars are assumed to be randomly generated bytes. For X25519, in
279 : // order to decode 32 random bytes as an integer scalar, set the three
280 : // least significant bits of the first byte and the most significant bit
281 : // of the last to zero, set the second most significant bit of the last
282 : // byte to 1 and, finally, decode as little-endian. This means that the
283 : // resulting integer is of the form 2^254 plus eight times a value
284 : // between 0 and 2^251 - 1 (inclusive). Likewise, for X448, set the two
285 : // least significant bits of the first byte to 0, and the most
286 : // significant bit of the last byte to 1. This means that the resulting
287 : // integer is of the form 2^447 plus four times a value between 0 and
288 : // 2^445 - 1 (inclusive).
289 :
290 : /* decodeScalar25519
291 : note: e need to copy the private key, because we need to sanitize it. */
292 196953 : memcpy( secret_scalar, self_private_key, 32UL );
293 196953 : secret_scalar[ 0] &= (uchar)0xF8;
294 196953 : secret_scalar[31] &= (uchar)0x7F;
295 196953 : secret_scalar[31] |= (uchar)0x40;
296 :
297 196953 : fd_x25519_scalar_mul_const_time( shared_secret, secret_scalar, peer_public_key_point_u );
298 :
299 : /* Sanitize */
300 196953 : fd_memset_explicit( secret_scalar, 0, 32UL );
301 :
302 : /* Reject low order points */
303 196953 : if( FD_UNLIKELY( fd_x25519_is_zero_const_time( shared_secret ) ) ) {
304 93 : return NULL;
305 93 : }
306 :
307 196860 : return shared_secret;
308 196953 : }
|