Line data Source code
1 : #ifndef HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_ge_h
2 : #define HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_ge_h
3 :
4 : /* This header provides APIs for manipulating group elements / curve
5 : points in ED25519. Direct quotes from RFC 8032 are indicated with
6 : '//' style comments. */
7 :
8 : /* A curve point will be represented by a FD_R43X6_QUAD (X,Y,Z,T) in
9 : extended homogeneous coordinates where X, Y, Z and T hold fd_r43x6_t
10 : u44 representations typically and X Y = T Z. */
11 :
12 : // Section 5.1.4 (page 11)
13 : //
14 : // A point (x,y) is represented in extended homogeneous coordinates
15 : // (X, Y, Z, T), with x = X/Z, y = Y/Z, x * y = T/Z.
16 :
17 : #include "fd_r43x6.h"
18 :
19 : FD_PROTOTYPES_BEGIN
20 :
21 : /* FD_R43X6_GE_ZERO(P) does P = the curve neutral point (0,1,1,0).
22 : (X,Y,Z,T) will be reduced representations. */
23 :
24 : // Section 5.1.4 (page 11):
25 : //
26 : // The neutral point is (0,1), or equivalently in extended homogeneous
27 : // coordinates (0, Z, Z, 0) for any non-zero Z.
28 :
29 855610 : #define FD_R43X6_GE_ZERO(P) do { P##03 = wwl( 0L,1L,1L,0L, 0L,0L,0L,0L ); P##14 = wwl_zero(); P##25 = wwl_zero(); } while(0)
30 :
31 : /* FD_R43X6_GE_ONE(P) does P = the curve "base" point. (X,Y,Z,T) are all
32 : reduced representations with Z==1. Section 5.1 (page 9):
33 :
34 : B = (15112221349535400772501151409588531511454012693041857206046113283949847762202,
35 : 46316835694926478169428394003475163141307993866256225615783033603165251855960)
36 :
37 : The below limbs for a reduced fd_r43x6_t representation were computed
38 : from the above using Python. */
39 :
40 : #define FD_R43X6_GE_ONE(P) do { \
41 : P##03 = wwl( 5912276620570L, 7036874417752L, 1L, 2970602692003L, 1206867684910L, 3518437208883L, 0L, 8002368565694L ); \
42 : P##14 = wwl( 5175273663173L, 5277655813324L, 0L, 2381000326097L, 7581689711182L, 7036874417766L, 0L, 787695955620L ); \
43 : P##25 = wwl( 1806891319892L, 1759218604441L, 0L, 4963950264797L, 286998243226L, 879609302220L, 0L, 889305571247L ); \
44 : } while(0)
45 :
46 : /* FD_R43X6_GE_IS_EQ(X,Y) returns 1 if X and Y represent the same curve
47 : point and 0 otherwise. X and Y should be FD_R43X6_QUAD holding u46
48 : representations. */
49 :
50 255759 : #define FD_R43X6_GE_IS_EQ( X, Y ) fd_r43x6_ge_is_eq( X##03,X##14,X##25, Y##03,Y##14,Y##25 )
51 :
52 : FD_FN_UNUSED static int /* let compiler decide if worth inlining */
53 : fd_r43x6_ge_is_eq( wwl_t X03, wwl_t X14, wwl_t X25,
54 255759 : wwl_t Y03, wwl_t Y14, wwl_t Y25 ) {
55 :
56 : /* We use the same method from the spec to test equality. It is worth
57 : noting that the standard itself specifies using a two point check
58 : to avoid doing an unnecessary inversion. E.g. It is somewhat odd
59 : that OpenSSL implementation chooses instead to encode R' to r'
60 : (slow) compare the encoded r and r' for its equality check in
61 : verify. */
62 :
63 : // Section 6 classEdwardsPoint (page 51)
64 : //
65 : // #Check that two points are equal.
66 : // def __eq__(self,y):
67 : // #Need to check x1/z1 == x2/z2 and similarly for y, so cross
68 : // #multiply to eliminate divisions.
69 : // xn1=self.x*y.z
70 : // xn2=y.x*self.z
71 : // yn1=self.y*y.z
72 : // yn2=y.y*self.z
73 : // return xn1==xn2 and yn1==yn2
74 :
75 255759 : fd_r43x6_t xn1, xn2, yn1, yn2;
76 255759 : FD_R43X6_QUAD_PERMUTE ( X, 2,0,2,1, X ); /* X = XZ |XX |XZ |XY, in u46|u46|u46|u46 */
77 255759 : FD_R43X6_QUAD_PERMUTE ( Y, 0,2,1,2, Y ); /* Y = YX |YZ |YY |YZ , in u46|u46|u46|u46 */
78 255759 : FD_R43X6_QUAD_MUL_FAST( X, X, Y ); /* X = xn2|xn1|yn2|yn1, in u62|u62|u62|u62 */
79 255759 : FD_R43X6_QUAD_UNPACK ( xn2, xn1, yn2, yn1, X );
80 255759 : return (int)(!fd_r43x6_is_nonzero( fd_r43x6_sub_fast( xn1, xn2 ) /* in s62 */ )) &
81 255759 : (int)(!fd_r43x6_is_nonzero( fd_r43x6_sub_fast( yn1, yn2 ) /* in s62 */ ));
82 255759 : }
83 :
84 : /* FD_R43X6_QUAD_1112d(Q) does Q = (1,1,2,2*d). (X,Y,Z,T) will be
85 : reduced fd_r43x6_t. */
86 :
87 4722415 : #define FD_R43X6_QUAD_1112d( Q ) do { \
88 4722415 : Q##03 = wwl( 1L, 1L, 1L, 3934839304537L, 0L, 0L, 0L, 521695520920L ); \
89 4722415 : Q##14 = wwl( 0L, 0L, 0L, 507525298899L, 0L, 0L, 0L, 6596238350568L ); \
90 4722415 : Q##25 = wwl( 0L, 0L, 0L, 15037786634L, 0L, 0L, 0L, 309467527341L ); \
91 4722415 : } while(0)
92 :
93 : /* FD_R43X6_GE_ADD(P3,P1,P2) computes P3 = P1 + P2 where P1, P2 and P3
94 : are FD_R43X6_QUAD. P1 and P2 should hold s61 representations. P3
95 : will hold u44 representations on return. In place operation fine. */
96 :
97 : // Section 5.1.4 (page 12):
98 : //
99 : // The following formulas for adding two points, (x3,y3) =
100 : // (x1,y1)+(x2,y2), on twisted Edwards curves with a=-1, square a, and
101 : // non-square d are described in Section 3.1 of [Edwards-revisited] and
102 : // in [EFD-TWISTED-ADD]. They are complete, i.e., they work for any
103 : // pair of valid input points.
104 : //
105 : // A = (Y1-X1)*(Y2-X2)
106 : // B = (Y1+X1)*(Y2+X2)
107 : // C = T1*2*d*T2
108 : // D = Z1*2*Z2
109 : // E = B-A
110 : // F = D-C
111 : // G = D+C
112 : // H = B+A
113 : // X3 = E*F
114 : // Y3 = G*H
115 : // T3 = E*H
116 : // Z3 = F*G
117 :
118 : #if 1 /* Seems very slightly faster than the below */
119 2012615 : #define FD_R43X6_GE_ADD( P3, P1, P2 ) do { \
120 2012615 : FD_R43X6_QUAD_DECL ( _1112d ); \
121 2012615 : FD_R43X6_QUAD_DECL ( _ta ); \
122 2012615 : FD_R43X6_QUAD_DECL ( _tb ); \
123 2012615 : FD_R43X6_QUAD_1112d ( _1112d ); /* (1, 1, 1, 2*d ), u43|u43|u43|u43 */ \
124 2012615 : FD_R43X6_QUAD_PERMUTE ( _ta, 1,0,2,3, P1 ); /* _ta = (Y1, X1, Z1, T1 ), s61|s61|s61|s61 */ \
125 2012615 : FD_R43X6_QUAD_PERMUTE ( _tb, 1,0,2,3, P2 ); /* _tb = (Y2, X2, Z2, T2 ), s61|s61|s61|s61 */ \
126 2012615 : FD_R43X6_QUAD_LANE_SUB_FAST( _ta, _ta, 1,0,0,0, _ta, P1 ); /* _ta = (Y1-X1,X1, Z1, T1 ), s62|s61|s61|s61 */ \
127 2012615 : FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,0, _tb, P2 ); /* _tb = (Y2-X2,X2, Z2, T2 ), s62|s61|s61|s61 */ \
128 2012615 : FD_R43X6_QUAD_LANE_ADD_FAST( _ta, _ta, 0,1,1,0, _ta, P1 ); /* _ta = (Y1-X1,Y1+X1,Z1*2, T1 ), s62|s62|s61|s61 */ \
129 2012615 : FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,0,0, _tb, P2 ); /* _tb = (Y2-X2,Y2+X2,Z2, T2 ), s62|s62|s61|s61 */ \
130 2012615 : FD_R43X6_QUAD_MUL_FAST ( _ta, _ta, _tb ); /* _ta = (A, B, D, C ), u62|u62|u62|u62 */ \
131 2012615 : FD_R43X6_QUAD_FOLD_UNSIGNED( _ta, _ta ); /* _ta = (Y1-X1,Y1+X1,Z1*2, T1*2d), u44|u44|u44|u44 */ \
132 2012615 : FD_R43X6_QUAD_MUL_FAST ( _ta, _ta, _1112d ); /* _ta = (Y1-X1,Y1+X1,Z1*2, T1*2d), u62|u62|u62|u62 */ \
133 2012615 : FD_R43X6_QUAD_FOLD_UNSIGNED( _ta, _ta ); /* _ta = (A, B, D, C ), u44|u44|u44|u44 */ \
134 2012615 : FD_R43X6_QUAD_PERMUTE ( _tb, 1,0,3,2, _ta ); /* _tb = (B, A, C, D ), u62|u62|u62|u62 */ \
135 2012615 : FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,1, _tb, _ta ); /* _tb = (E, A, C, F ), s62|u62|u62|s62 */ \
136 2012615 : FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,1,0, _tb, _ta ); /* _tb = (E, H, G, F ), s62|u63|u63|s62 */ \
137 2012615 : FD_R43X6_QUAD_PERMUTE ( _ta, 0,2,2,0, _tb ); /* _ta = (E, G, G, E ), u44|u44|u44|u44 */ \
138 2012615 : FD_R43X6_QUAD_PERMUTE ( _tb, 3,1,3,1, _tb ); /* _tb = (F, H, F, H ), u44|u44|u44|u44 */ \
139 2012615 : FD_R43X6_QUAD_MUL_FAST ( _ta, _ta, _tb ); /* _ta = (X3, Y3, Z3, T3 ), u62|u62|u62|u62 */ \
140 2012615 : FD_R43X6_QUAD_FOLD_UNSIGNED( P3, _ta ); /* P3 = (X3, Y3, Z3, T3 ), u44|u44|u44|u44 */ \
141 2012615 : } while(0)
142 : #else /* Seems very slightly slower than the above */
143 : #define FD_R43X6_GE_ADD( P3, P1, P2 ) do { \
144 : FD_R43X6_QUAD_DECL ( _ta ); \
145 : FD_R43X6_QUAD_DECL ( _tb ); \
146 : FD_R43X6_QUAD_PERMUTE ( _ta, 1,0,2,3, P1 ); /* _ta = (Y1, X1, Z1, T1), s61|s61|s61|s61 */ \
147 : FD_R43X6_QUAD_PERMUTE ( _tb, 1,0,2,3, P2 ); /* _tb = (Y2, X2, Z2, T2), s61|s61|s61|s61 */ \
148 : FD_R43X6_QUAD_LANE_SUB_FAST( _ta, _ta, 1,0,0,0, _ta, P1 ); /* _ta = (Y1-X1,X1, Z1, T1), s62|s61|s61|s61 */ \
149 : FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,0, _tb, P2 ); /* _tb = (Y2-X2,X2, Z2, T2), s62|s61|s61|s61 */ \
150 : FD_R43X6_QUAD_LANE_ADD_FAST( _ta, _ta, 0,1,1,0, _ta, P1 ); /* _ta = (Y1-X1,Y1+X1,2*Z1, T1), s62|s62|s62|s61 */ \
151 : FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,0,0, _tb, P2 ); /* _tb = (Y2-X2,Y2+X2,Z2, T2), s62|s62|s61|s61 */ \
152 : FD_R43X6_QUAD_FOLD_SIGNED ( _ta, _ta ); /* _ta = (Y1-X1,Y1+X1,2*Z1, T1), u44|u44|u44|u44 */ \
153 : FD_R43X6_QUAD_FOLD_SIGNED ( _tb, _tb ); /* _tb = (Y2-X2,Y2+X2,Z2, T2), u44|u44|u44|u44 */ \
154 : fd_r43x6_t _YmX1, _YpX1, _2Z1, _T1; \
155 : FD_R43X6_QUAD_UNPACK( _YmX1, _YpX1, _2Z1, _T1, _ta ); \
156 : FD_R43X6_QUAD_PACK( _ta, _YmX1, _YpX1, _2Z1, fd_r43x6_mul( _T1, fd_r43x6_2d() ) ); \
157 : FD_R43X6_QUAD_MUL_FAST ( _ta, _ta, _tb ); /* _ta = (A, B, D, C ), u62|u62|u62|u62 */ \
158 : FD_R43X6_QUAD_PERMUTE ( _tb, 1,0,3,2, _ta ); /* _tb = (B, A, C, D ), u62|u62|u62|u62 */ \
159 : FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,1, _tb, _ta ); /* _tb = (E, A, C, F ), s62|u62|u62|s62 */ \
160 : FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,1,0, _tb, _ta ); /* _tb = (E, H, G, F ), s62|u63|u63|s62 */ \
161 : FD_R43X6_QUAD_FOLD_SIGNED ( _tb, _tb ); /* _tb = (E, H, G, F ), u44|u44|u44|u44 */ \
162 : FD_R43X6_QUAD_PERMUTE ( _ta, 0,2,2,0, _tb ); /* _ta = (E, G, G, E ), u44|u44|u44|u44 */ \
163 : FD_R43X6_QUAD_PERMUTE ( _tb, 3,1,3,1, _tb ); /* _tb = (F, H, F, H ), u44|u44|u44|u44 */ \
164 : FD_R43X6_QUAD_MUL_FAST ( _ta, _ta, _tb ); /* _ta = (X3, Y3, Z3, T3), u62|u62|u62|u62 */ \
165 : FD_R43X6_QUAD_FOLD_UNSIGNED( P3, _ta ); /* P3 = (X3, Y3, Z3, T3), u44|u44|u44|u44 */ \
166 : } while(0)
167 : #endif
168 :
169 : /* FD_R43X6_GE_ADD_TABLE does the same thing as FD_R43X6_GE_ADD where T1
170 : holds (Y1-X1,Y1+X1,Z1*2,T1*2d). T1 and P2 should be in s61
171 : representations. P3 will hold u44 representations. */
172 :
173 : #define FD_R43X6_GE_ADD_TABLE( P3, T1, P2 ) do { \
174 : FD_R43X6_QUAD_DECL ( _ta ); \
175 : FD_R43X6_QUAD_DECL ( _tb ); \
176 : FD_R43X6_QUAD_MOV ( _ta, T1 ); /* _ta = (Y1-X1,Y1+X1,Z1*2,T1*2d), s61|s61|s61|s61 */ \
177 : FD_R43X6_QUAD_PERMUTE ( _tb, 1,0,2,3, P2 ); /* _tb = (Y2, X2, Z2, T2 ), s61|s61|s61|s61 */ \
178 : FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,0, _tb, P2 ); /* _tb = (Y2-X2,X2, Z2, T2 ), s62|s61|s61|s61 */ \
179 : FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,0,0, _tb, P2 ); /* _tb = (Y2-X2,Y2+X2,Z2, T2 ), s62|s62|s61|s61 */ \
180 : FD_R43X6_QUAD_MUL_FAST ( _ta, _ta, _tb ); /* _ta = (A, B, D, C ), u62|u62|u62|u62 */ \
181 : FD_R43X6_QUAD_FOLD_UNSIGNED( _ta, _ta ); /* _ta = (A, B, D, C ), u44|u44|u44|u44 */ \
182 : FD_R43X6_QUAD_PERMUTE ( _tb, 1,0,3,2, _ta ); /* _tb = (B, A, C, D ), u62|u62|u62|u62 */ \
183 : FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,1, _tb, _ta ); /* _tb = (E, A, C, F ), s62|u62|u62|s62 */ \
184 : FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,1,0, _tb, _ta ); /* _tb = (E, H, G, F ), s62|u63|u63|s62 */ \
185 : FD_R43X6_QUAD_PERMUTE ( _ta, 0,2,2,0, _tb ); /* _ta = (E, G, G, E ), u44|u44|u44|u44 */ \
186 : FD_R43X6_QUAD_PERMUTE ( _tb, 3,1,3,1, _tb ); /* _tb = (F, H, F, H ), u44|u44|u44|u44 */ \
187 : FD_R43X6_QUAD_MUL_FAST ( _ta, _ta, _tb ); /* _ta = (X3, Y3, Z3, T3 ), u62|u62|u62|u62 */ \
188 : FD_R43X6_QUAD_FOLD_UNSIGNED( P3, _ta ); /* P3 = (X3, Y3, Z3, T3 ), u44|u44|u44|u44 */ \
189 : } while(0)
190 :
191 : /* FD_R43X6_GE_DBL(P3,P1) computes P3 = 2*P1 where P1 and P3 are
192 : FD_R43X6_GE. P1 should hold u44 representations. P3 will hold u44
193 : representations on return. In place operation fine. */
194 :
195 : // Section 5.1.4 (page 12):
196 : //
197 : // For point doubling, (x3,y3) = (x1,y1)+(x1,y1), one could just
198 : // substitute equal points in the above (because of completeness, such
199 : // substitution is valid) and observe that four multiplications turn
200 : // into squares. However, using the formulas described in Section 3.2
201 : // of [Edwards-revisited] and in [EFD-TWISTED-DBL] saves a few smaller
202 : // operations.
203 : //
204 : // A = X1^2
205 : // B = Y1^2
206 : // C = 2*Z1^2
207 : // H = A+B
208 : // E = H-(X1+Y1)^2
209 : // G = A-B
210 : // F = C+G
211 : // X3 = E*F
212 : // Y3 = G*H
213 : // T3 = E*H
214 : // Z3 = F*G
215 :
216 : /* TODO: CONSIDER MUL INSTEAD OF SQR TO GET THE 2* AT THE SAME TIME? */
217 67752671 : #define FD_R43X6_GE_DBL( P3, P1 ) do { \
218 67752671 : FD_R43X6_QUAD_DECL ( _ta ); \
219 67752671 : FD_R43X6_QUAD_DECL ( _tb ); \
220 67752671 : FD_R43X6_QUAD_DECL ( _BB ); \
221 67752671 : FD_R43X6_QUAD_PERMUTE ( _ta, 1,1,2,0, P1 ); /* _ta = (Y1, Y1,Z1, X1), u44/u44/u44/u44 */ \
222 67752671 : FD_R43X6_QUAD_LANE_ADD_FAST( _ta, _ta, 1,0,0,0, _ta, P1 ); /* _ta = (X1+Y1, Y1,Z1, X1), u45/u44/u44/u44 */ \
223 67752671 : FD_R43X6_QUAD_SQR_FAST ( _ta, _ta ); /* _ta = ((X1+Y1)^2,B, Z1^2,A ), u61/u61/u61/u61 */ \
224 67752671 : FD_R43X6_QUAD_FOLD_UNSIGNED( _ta, _ta ); /* _ta = ((X1+Y1)^2,B, Z1^2,A ), u44/u44/u44/u44 */ \
225 67752671 : FD_R43X6_QUAD_LANE_ADD_FAST( _ta, _ta, 0,0,1,0, _ta, _ta ); /* _ta = ((X1+Y1)^2,B, C, A ), u44/u44/u45/u44 */ \
226 67752671 : FD_R43X6_QUAD_PERMUTE ( _tb, 3,3,3,3, _ta ); /* _tb = (A, A, A, A ), u44/u44/u44/u44 */ \
227 67752671 : FD_R43X6_QUAD_PERMUTE ( _BB, 1,1,1,1, _ta ); /* _BB = (B, B, B, B ), u44/u44/u44/u44 */ \
228 67752671 : FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 1,0,0,1, _tb, _BB ); /* _tb = (H, A, A, H ), u45/u44/u44/u45 */ \
229 67752671 : FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 0,1,1,0, _tb, _BB ); /* _tb = (H, G, G, H ), u45/u45/u45/u45 */ \
230 67752671 : FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,0,1,0, _tb, _ta ); /* _tb = (H, G, F, H ), u45/u45/u46/u45 */ \
231 67752671 : FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,0, _tb, _ta ); /* _tb = (E, G, F, H ), u46/u45/u46/u45 */ \
232 67752671 : FD_R43X6_QUAD_PERMUTE ( _ta, 0,1,1,0, _tb ); /* _tb = (E, G, G, E ), u46/u45/u45/u46 */ \
233 67752671 : FD_R43X6_QUAD_PERMUTE ( _tb, 2,3,2,3, _tb ); /* _tb = (F, H, F, H ), u46/u45/u46/u45 */ \
234 67752671 : FD_R43X6_QUAD_MUL_FAST ( _ta, _ta, _tb ); /* _ta = (X3, Y3,Z3, T3), u62/u62/u62/u62 */ \
235 67752671 : FD_R43X6_QUAD_FOLD_UNSIGNED( P3, _ta ); /* P3 = (X3, Y3,Z3, T3), u44/u44/u44/u44 */ \
236 67752671 : } while(0)
237 :
238 : /* FD_R43X6_GE_IS_SMALL_ORDER(P) returns 1 if [8]P is the curve neutral
239 : point and 0 otherwise. P should be a FD_R43X6_QUAD holding u44
240 : representations of a valid curve point. */
241 :
242 : #define FD_R43X6_GE_IS_SMALL_ORDER( P ) fd_r43x6_ge_is_small_order( P##03,P##14,P##25 )
243 :
244 : FD_FN_UNUSED static int /* let compiler decide if worth inlining */
245 0 : fd_r43x6_ge_is_small_order( wwl_t P03, wwl_t P14, wwl_t P25 ) {
246 0 : for( int i=0; i<3; i++ ) FD_R43X6_GE_DBL( P, P ); /* P = [8]P, in u44|u44|u44|u44 */
247 0 :
248 0 : /* We do a faster check than is_eq above by propagating the 0 and 1
249 0 : values of the curve neutral point into the multiplication and
250 0 : simplifying it. This is equivalent to checking that the result has
251 0 : the form (0|Z|Z|0). Note that if x is a representation of field
252 0 : element 0, t is also a representation 0 as t = x*y. */
253 0 :
254 0 : fd_r43x6_t x, y, z, t;
255 0 : FD_R43X6_QUAD_UNPACK( x, y, z, t, P ); (void)t;
256 0 : return (int)(!fd_r43x6_is_nonzero( x )) & (int)(!fd_r43x6_is_nonzero( fd_r43x6_sub_fast( y, z ) /* in s44 */ ));
257 0 : }
258 :
259 : /* FD_R43X6_GE_ENCODE(h,P) encodes a curve point P stored in the
260 : FD_R43X6_QUAD P into a unique compressed representation and writes it
261 : to the 32-byte memory region whose first byte in the callers address
262 : space is h. P should hold u47 representations. */
263 :
264 : #define FD_R43X6_GE_ENCODE(h,P) wv_stu( (h), fd_r43x6_ge_encode( P##03, P##14, P##25 ) )
265 :
266 : FD_FN_CONST wv_t
267 : fd_r43x6_ge_encode( wwl_t P03, wwl_t P14, wwl_t P25 );
268 :
269 : /* FD_R43X6_GE_DECODE(P,s) decodes a encoded curve point stored at the
270 : 32-byte region whose first byte in the caller's address space is
271 : pointed to by s into the curve point P. Returns 0 on success (P will
272 : hold the decoded curve point on return in u44 representations) and a
273 : negative error code on failure (P will hold reduced 0 for X,Y,Z,T).
274 : The below implementation avoids pointer escapes to help the
275 : optimizer. */
276 :
277 : #define FD_R43X6_GE_DECODE( P,s ) (__extension__({ \
278 : FD_R43X6_QUAD_DECL( _P ); \
279 : int _err = fd_r43x6_ge_decode( &_P03, &_P14, &_P25, (s) ); \
280 : FD_R43X6_QUAD_MOV( P, _P ); \
281 : _err; \
282 : }))
283 :
284 : int
285 : fd_r43x6_ge_decode( wwl_t * _P03, wwl_t * _P14, wwl_t * _P25,
286 : void const * _vs );
287 :
288 : /* FD_R43X6_GE_DECODE2( Pa,sa, Pb,sb ) does:
289 :
290 : if( GE_DECODE( Pa,sa ) ) { (PbX,PbY,PbZ,PbT) = 0; return -1; }
291 : else if( GE_DECODE( Pb,sb ) ) { (PaX,PaY,PaZ,PaT) = 0; return -2; }
292 : return 0;
293 :
294 : but faster. */
295 :
296 300755 : #define FD_R43X6_GE_DECODE2( Pa,sa, Pb,sb ) (__extension__({ \
297 300755 : FD_R43X6_QUAD_DECL( _Pa ); FD_R43X6_QUAD_DECL( _Pb ); \
298 300755 : int _err = fd_r43x6_ge_decode2( &_Pa03, &_Pa14, &_Pa25, (sa), &_Pb03, &_Pb14, &_Pb25, (sb) ); \
299 300755 : FD_R43X6_QUAD_MOV( Pa, _Pa ); FD_R43X6_QUAD_MOV( Pb, _Pb ); \
300 300755 : _err; \
301 300755 : }))
302 :
303 : int
304 : fd_r43x6_ge_decode2( wwl_t * _Pa03, wwl_t * _Pa14, wwl_t * _Pa25,
305 : void const * _vsa,
306 : wwl_t * _Pb03, wwl_t * _Pb14, wwl_t * _Pb25,
307 : void const * _vsb );
308 :
309 : /* FD_R43X6_GE_SMUL_BASE(R,s) computes R = [s]B where B is the base
310 : curve point. s points to a 32-byte memory region holding a little
311 : endian uint256 scalar in [0,2^255). In-place operation fine. The
312 : implementation has OpenSSL style timing attack mitigations. The
313 : returned R will hold u44 representations.
314 :
315 : FD_R43X6_GE_SMUL_BASE_VARTIME does the same thing but uses a faster
316 : variable time algorithm.
317 :
318 : Written this funny way to prevent pointer escapes from interfering
319 : the optimizer and allow for easy testing of different implementations
320 : as this one of this most performance critical operations in the code
321 : base.
322 :
323 : Performance of fd_ed25519_public_from_private (this is almost just a
324 : pure smul_base so it is a good indicator of practical end-to-end
325 : performance of smul_base ... sign for small messages will show
326 : similar results) on a single 2.3 GHz icelake-server core under gcc-12
327 : circa 2023 Sep:
328 :
329 : ref: ~37.0 us ("vartime" style)
330 : large: ~9.2 us ("vartime" style)
331 : small: ~11.3 us (w/timing attack mitigations)
332 :
333 : For reference:
334 :
335 : scalar: ~46.0 us ("small" style w/timing attack mitigations)
336 : AVX-2: ~24.9 us ("small" style w/timing attack mitigations)
337 :
338 : In the large implementation, if table symmetry is not exploited, it
339 : gets slightly faster (~9.0 us) but the table footprint roughly
340 : doubles (to ~765KiB) such that it has double the cache pressure. If
341 : table symmetry and GE_ADD precomputation is omitted (i.e. GE_ADD is
342 : used instead of GE_ADD_TABLE), it runs at ~10.0 us.
343 :
344 : If the large implementation is modified to use OpenSSL-style timing
345 : attack mitigations, it runs at ~11.5 us because the mitigations are
346 : so expensive (these scan the whole table every time such that the
347 : table lookup timing should be independent of the input s, which is a
348 : blunt and portable if naive way of doing it).
349 :
350 : In the small implementation, by using a 4-bit at-a-time
351 : implementation, the table footprint can be reduced to 48KiB.
352 : OpenSSL-style timing attack mitigations are then much less expensive
353 : but more computation is required. The result has virtually identical
354 : performance but much less cache pressure than the large
355 : implementation with timing attack mitigations. If timing attack
356 : mitigations are removed from small, it runs at ~11.1 us.
357 :
358 : TL;DR This is ~4-5x faster than the original scalar implementation
359 : and ~2.2-2.7x faster than the original AVX-2 accelerated
360 : implementation. Performance should roughly scale with core clock
361 : speed for these operations. */
362 :
363 : #define FD_R43X6_GE_SMUL_BASE(R,s) do { \
364 : FD_R43X6_QUAD_DECL( _R ); fd_r43x6_ge_smul_base_small( &_R03, &_R14, &_R25, (s) ); \
365 : FD_R43X6_QUAD_MOV( R, _R ); \
366 : } while(0)
367 :
368 : #define FD_R43X6_GE_SMUL_BASE_VARTIME(R,s) do { \
369 : FD_R43X6_QUAD_DECL( _R ); fd_r43x6_ge_smul_base_large( &_R03, &_R14, &_R25, (s) ); \
370 : FD_R43X6_QUAD_MOV( R, _R ); \
371 : } while(0)
372 :
373 : void
374 : fd_r43x6_ge_smul_base_ref( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
375 : void const * _vs ); /* vartime */
376 :
377 : void
378 : fd_r43x6_ge_smul_base_large( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
379 : void const * _vs ); /* vartime */
380 :
381 : void
382 : fd_r43x6_ge_smul_base_small( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
383 : void const * _vs ); /* has timing attack mitigations */
384 :
385 : /* FD_R43X6_GE_FMA_VARTIME computes R = [s]P + Q where s points to a
386 : 32-bit memory region holding a little endian uint256 scalar in
387 : [0,2^255). P and Q are FD_R43X6_QUADs holding curve points in a u44
388 : representation. R is a FD_R43X6_QUAD that will hold the result in a
389 : u44 representation on return. Uses a variable time algorithm.
390 : In-place operation fine. */
391 :
392 : #define FD_R43X6_GE_FMA_VARTIME(R,s,P,Q) do { \
393 : FD_R43X6_QUAD_DECL( _R ); fd_r43x6_ge_fma_sparse( &_R03,&_R14,&_R25, (s), P##03,P##14,P##25, Q##03,Q##14,Q##25 ); \
394 : FD_R43X6_QUAD_MOV( R, _R ); \
395 : } while(0)
396 :
397 : void
398 : fd_r43x6_ge_fma_ref( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
399 : void const * _vs,
400 : wwl_t P03, wwl_t P14, wwl_t P25,
401 : wwl_t Q03, wwl_t Q14, wwl_t Q25 ); /* vartime */
402 :
403 : void
404 : fd_r43x6_ge_fma_sparse( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
405 : void const * _vs,
406 : wwl_t P03, wwl_t P14, wwl_t P25,
407 : wwl_t Q03, wwl_t Q14, wwl_t Q25 ); /* vartime */
408 :
409 : /* FD_R43X6_GE_DMUL_VARTIME(R,s,k,A) computes R = [s]B + [k]A where s
410 : and k point to 32-byte memory regions holding little endian uint256
411 : scalars in [0,2^255) and A is a FD_R43X6_QUAD holding a curve point
412 : in a u44 representation. B is the base curve point. R is a
413 : FD_R43X6_QUAD that will hold the result in a u44 representation on
414 : return. Uses a variable time algorithm. In-place operation fine. */
415 :
416 : #define FD_R43X6_GE_DMUL_VARTIME(R,s,k,A) do { \
417 : FD_R43X6_QUAD_DECL( _R ); fd_r43x6_ge_dmul_sparse( &_R03,&_R14,&_R25, (s), (k), A##03,A##14,A##25 ); \
418 : FD_R43X6_QUAD_MOV( R, _R ); \
419 : } while(0)
420 :
421 : void
422 : fd_r43x6_ge_dmul_ref( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
423 : void const * _vs,
424 : void const * _vk,
425 : wwl_t A03, wwl_t A14, wwl_t A25 ); /* vartime */
426 :
427 : void
428 : fd_r43x6_ge_dmul_sparse( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
429 : void const * _vs,
430 : void const * _vk,
431 : wwl_t A03, wwl_t A14, wwl_t A25 ); /* vartime */
432 :
433 : /* fd_r43x6_ge_sparse_table computes a table of odd scalar multiples of
434 : P stores them in table. Given a w in [-max,max], the 3 wwl_t's
435 : holding the FD_R43X6_QUAD for [w]P will start at table index:
436 : 3*((max+w)/2). This quad will hold Y-X|Y+X|2*Z|T*2*d in a u44
437 : representation. max should be positive and odd and table should have
438 : space for 3*(max+1) entries. This is mostly for internal use. */
439 :
440 : void
441 : fd_r43x6_ge_sparse_table( wwl_t * table,
442 : wwl_t P03, wwl_t P14, wwl_t P25,
443 : int max );
444 :
445 : /* TODO: Consider pure for multi-return functions? */
446 :
447 : FD_PROTOTYPES_END
448 :
449 : #endif /* HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_ge_h */
|