Line data Source code
1 : #include "fd_r43x6.h"
2 :
3 : /* fd_r43x6_repsqr_mul(x,y,n) returns z = [x^(2^n)] y of an unreduced
4 : fd_r43x6_t (in u44) with lanes 6 and 7 zero where x is an unreduced
5 : fd_r43x6_t (in u47). Assumes lanes 6 and 7 of x are zero. Computed
6 : via n repeated squarings, yielding a cost of n fd_r43x6_sqr. */
7 :
8 : FD_FN_CONST static fd_r43x6_t
9 : fd_r43x6_repsqr_mul( fd_r43x6_t x,
10 : fd_r43x6_t y,
11 7212326 : ulong n ) {
12 :
13 : /* The below is R43X6_SQR1_INL wrapped in a loop to encourage inlining
14 : of the loop body and encourage the compiler to hoist various compile
15 : time constants out of the loop. REPSQR is almost always paired with
16 : a MUL and this is almost always iterated. So we incorporate the mul
17 : into this operation too to try to get the whole operation inlined
18 : but without too much instruction footprint for the below use cases. */
19 :
20 171915567 : for( ; n; n-- ) FD_R43X6_SQR1_INL( x, x );
21 7212326 : FD_R43X6_MUL1_INL( x, x, y );
22 7212326 : return x;
23 7212326 : }
24 :
25 : /* fd_r43x6_repsqr_mul2 does:
26 : *_za = fd_r43x6_repsqr_mul( xa, ya, n );
27 : *_zb = fd_r43x6_repsqr_mul( xb, yb, n );
28 : but faster. */
29 :
30 : static void
31 : fd_r43x6_repsqr_mul2( fd_r43x6_t * _za, fd_r43x6_t xa, fd_r43x6_t ya,
32 : fd_r43x6_t * _zb, fd_r43x6_t xb, fd_r43x6_t yb,
33 6192989 : ulong n ) {
34 :
35 : /* Similar considerations as repsqr_mul */
36 :
37 146942739 : for( ; n; n-- ) FD_R43X6_SQR2_INL( xa, xa, xb, xb );
38 6192989 : FD_R43X6_MUL2_INL( xa, xa, ya, xb, xb, yb );
39 6192989 : *_za = xa; *_zb = xb;
40 6192989 : }
41 :
42 : fd_r43x6_t
43 262247 : fd_r43x6_invert( fd_r43x6_t z ) {
44 :
45 : /* Theory:
46 :
47 : z^p = z in GF(p)
48 : -> z^(p-1) z = z
49 : -> z^(p-1) = 1
50 : -> z^(p-2) z = 1
51 : -> z^(p-2) is the multiplicative inverse of z in GF(p).
52 :
53 : Since p-2 is impractically large, we have to do this indirectly.
54 : This technique is adapted from the OpenSSL implementation.
55 :
56 : z^(p-2) = z^(2^255-21)
57 : = z^[(2^255)-(2^5)+11]
58 : = z^[(2^250)(2^5)-(2^5)+11]
59 : = z^[(2^250-1)(2^5) + 11]
60 : = [z^(2^250-1)]^(2^5) z^11
61 :
62 : z^11 is straightforward to compute directly. [...]^(2^5) is
63 : straightforward to compute by repeated squaring. z^(2^n-1) can be
64 : computed by a combination of repeated squaring and factorizations
65 : like:
66 :
67 : z^(2^n-1) = z^[(2^(n/2)+1)(2^(n/2)-1)]
68 : = z^[2^(n/2) (2^(n/2)-1) + (2^(n/2)-1)]
69 : = [z^(2^(n/2)-1)]^(2^(n/2)) z^(2^(n/2)-1)
70 :
71 : where the first term is the n/2 repeated squaring of z^(2^(n/2)-1)
72 : and the second term is the factor that initialized the repeated
73 : squaring. */
74 :
75 : /* Compute z^11 (and z^9 along the way) */
76 :
77 262247 : fd_r43x6_t z2 = fd_r43x6_sqr ( z ); /* TODO: consider repsqr_mul(z,one,1) for more reuse? */
78 262247 : fd_r43x6_t z9 = fd_r43x6_repsqr_mul( z2, z, 2UL );
79 262247 : fd_r43x6_t z11 = fd_r43x6_repsqr_mul( z9, z2, 0UL );
80 :
81 : /* Compute z^(2^250-1) */
82 :
83 262247 : fd_r43x6_t z2e5m1 = fd_r43x6_repsqr_mul( z11, z9, 1UL );
84 262247 : fd_r43x6_t z2e10m1 = fd_r43x6_repsqr_mul( z2e5m1, z2e5m1, 5UL );
85 262247 : fd_r43x6_t z2e20m1 = fd_r43x6_repsqr_mul( z2e10m1, z2e10m1, 10UL );
86 262247 : fd_r43x6_t z2e40m1 = fd_r43x6_repsqr_mul( z2e20m1, z2e20m1, 20UL );
87 262247 : fd_r43x6_t z2e50m1 = fd_r43x6_repsqr_mul( z2e40m1, z2e10m1, 10UL );
88 262247 : fd_r43x6_t z2e100m1 = fd_r43x6_repsqr_mul( z2e50m1, z2e50m1, 50UL );
89 262247 : fd_r43x6_t z2e200m1 = fd_r43x6_repsqr_mul( z2e100m1, z2e100m1, 100UL );
90 262247 : fd_r43x6_t z2e250m1 = fd_r43x6_repsqr_mul( z2e200m1, z2e50m1, 50UL );
91 :
92 : /* Combine z^(2^250-1) and z^11 */
93 :
94 262247 : return fd_r43x6_repsqr_mul( z2e250m1, z11, 5UL );
95 262247 : }
96 :
97 : fd_r43x6_t
98 393419 : fd_r43x6_pow22523( fd_r43x6_t z ) {
99 :
100 : /* This works nearly identical to invert. The factorization is:
101 :
102 : z^(2^252-3) = z^[(2^252)-(2^2)+1]
103 : = z^[(2^250-1)(2^2)+1]
104 : = [z^(2^250-1)]^(2^2) z
105 :
106 : We compute z^(2^250-1) the same way as invert and then do a
107 : slightly different final combination. */
108 :
109 : /* Compute z^(2^250-1) */
110 :
111 393419 : fd_r43x6_t z2 = fd_r43x6_sqr ( z ); /* TODO: consider repsqr_mul(z,one,1) for more reuse? */
112 393419 : fd_r43x6_t z9 = fd_r43x6_repsqr_mul( z2, z, 2UL );
113 393419 : fd_r43x6_t z11 = fd_r43x6_repsqr_mul( z9, z2, 0UL );
114 393419 : fd_r43x6_t z2e5m1 = fd_r43x6_repsqr_mul( z11, z9, 1UL );
115 393419 : fd_r43x6_t z2e10m1 = fd_r43x6_repsqr_mul( z2e5m1, z2e5m1, 5UL );
116 393419 : fd_r43x6_t z2e20m1 = fd_r43x6_repsqr_mul( z2e10m1, z2e10m1, 10UL );
117 393419 : fd_r43x6_t z2e40m1 = fd_r43x6_repsqr_mul( z2e20m1, z2e20m1, 20UL );
118 393419 : fd_r43x6_t z2e50m1 = fd_r43x6_repsqr_mul( z2e40m1, z2e10m1, 10UL );
119 393419 : fd_r43x6_t z2e100m1 = fd_r43x6_repsqr_mul( z2e50m1, z2e50m1, 50UL );
120 393419 : fd_r43x6_t z2e200m1 = fd_r43x6_repsqr_mul( z2e100m1,z2e100m1, 100UL );
121 393419 : fd_r43x6_t z2e250m1 = fd_r43x6_repsqr_mul( z2e200m1,z2e50m1, 50UL );
122 :
123 : /* Combine z^(2^250-1) and z */
124 :
125 393419 : return fd_r43x6_repsqr_mul( z2e250m1, z, 2UL );
126 393419 : }
127 :
128 : void
129 : fd_r43x6_pow22523_2( fd_r43x6_t * _za, fd_r43x6_t za,
130 562999 : fd_r43x6_t * _zb, fd_r43x6_t zb ) {
131 :
132 : /* This is identical to the above but runs two calculations at the
133 : same time for lots of ILP. */
134 :
135 562999 : fd_r43x6_t z2a, z2b; FD_R43X6_SQR2_INL ( z2a, za, z2b, zb );
136 562999 : fd_r43x6_t z9a, z9b; fd_r43x6_repsqr_mul2( &z9a, z2a, za, &z9b, z2b, zb, 2UL );
137 562999 : fd_r43x6_t z11a, z11b; fd_r43x6_repsqr_mul2( &z11a, z9a, z2a, &z11b, z9b, z2b, 0UL );
138 562999 : fd_r43x6_t z2e5m1a, z2e5m1b; fd_r43x6_repsqr_mul2( &z2e5m1a, z11a, z9a, &z2e5m1b, z11b, z9b, 1UL );
139 562999 : fd_r43x6_t z2e10m1a, z2e10m1b; fd_r43x6_repsqr_mul2( &z2e10m1a, z2e5m1a, z2e5m1a, &z2e10m1b, z2e5m1b, z2e5m1b, 5UL );
140 562999 : fd_r43x6_t z2e20m1a, z2e20m1b; fd_r43x6_repsqr_mul2( &z2e20m1a, z2e10m1a, z2e10m1a, &z2e20m1b, z2e10m1b, z2e10m1b, 10UL );
141 562999 : fd_r43x6_t z2e40m1a, z2e40m1b; fd_r43x6_repsqr_mul2( &z2e40m1a, z2e20m1a, z2e20m1a, &z2e40m1b, z2e20m1b, z2e20m1b, 20UL );
142 562999 : fd_r43x6_t z2e50m1a, z2e50m1b; fd_r43x6_repsqr_mul2( &z2e50m1a, z2e40m1a, z2e10m1a, &z2e50m1b, z2e40m1b, z2e10m1b, 10UL );
143 562999 : fd_r43x6_t z2e100m1a, z2e100m1b; fd_r43x6_repsqr_mul2( &z2e100m1a,z2e50m1a, z2e50m1a, &z2e100m1b,z2e50m1b, z2e50m1b, 50UL );
144 562999 : fd_r43x6_t z2e200m1a, z2e200m1b; fd_r43x6_repsqr_mul2( &z2e200m1a,z2e100m1a,z2e100m1a, &z2e200m1b,z2e100m1b,z2e100m1b, 100UL );
145 562999 : fd_r43x6_t z2e250m1a, z2e250m1b; fd_r43x6_repsqr_mul2( &z2e250m1a,z2e200m1a,z2e50m1a, &z2e250m1b,z2e200m1b,z2e50m1b, 50UL );
146 :
147 : /**/ fd_r43x6_repsqr_mul2( _za, z2e250m1a,za, _zb, z2e250m1b,zb, 2UL );
148 562999 : }
|