Line data Source code
1 : #ifndef HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_h
2 : #define HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_h
3 :
4 : #if FD_HAS_AVX512
5 :
6 : #include "../../../util/simd/fd_avx.h"
7 : #include "../../../util/simd/fd_avx512.h"
8 :
9 : /* A fd_r43x6_t represents a GF(p) element, where p = 2^255-19, in a
10 : little endian 6 long radix 2^43 limb representation. The 6 limbs are
11 : held in lanes 0 through 5 of an AVX-512 vector. That is, given a
12 : fd_r43x6_t x, the field element represented by x is:
13 :
14 : ( x0 + x1 2^43 + x2 2^86 + x3 2^129 + x4 2^172 + x5 2^215 ) mod p
15 :
16 : where xn is the n-th 64-bit vector lane treated as a long. Lanes 6
17 : and 7 are ignored. The below will often use the shorthand:
18 :
19 : <x0,x1,x2,x3,x4,x5>
20 :
21 : for the above expression.
22 :
23 : This representation is redundant: multiple fd_r43x6_t can represent
24 : the same element. Most functions have restrictions on the which
25 : representations can be used for inputs and which representations they
26 : produce to support high performance implementation and composability.
27 :
28 : Frequently used representations are:
29 :
30 : - arbitrary: limbs are in [-2^63,2^63)
31 : - signed: limbs are in [-2^62,2^62)
32 : - unsigned: limbs are in [0,2^62)
33 : - unreduced: limbs are in [0,2^47)
34 : - unpacked: limbs 0-4 are in [0,2^43), limb 5 is in [0,2^41)
35 : - nearly reduced: limbs 0-4 are in [0,2^43), limb 5 is in [0,2^41), the packed uint256 value is in [0,2*p)
36 : - reduced: limbs 0-4 are in [0,2^43), limb 5 is in [0,2^40), the packed uint256 value is in [0,p)
37 :
38 : As a frequently used shorthand when analyzing range of limbs, unn
39 : indicates limbs are in [0,2^nn) and snn indicates limbs are in
40 : (-2^nn,2^nn).
41 :
42 : Note:
43 :
44 : - There is only one reduced fd_r43x6_t for each element.
45 :
46 : - There are two nearly reduced fd_r43x6_t for each element.
47 :
48 : - reduced is a subset of nearly reduced is a subset of unpacked is a
49 : subset of unreduced is a subset of unsigned is a subset of signed
50 : is a subset arbitrary.
51 :
52 : - unpacked, nearly reduced and reduced fd_r43x6_t be quickly
53 : converted into a packed uint256 value used by various cryptographic
54 : protocols and vice versa.
55 :
56 : - Cheat sheet
57 :
58 : unpack maps uint256 to unpacked
59 : pack maps unpacked to uint256
60 :
61 : * These are used to interface with external protocols.
62 :
63 : fold_unsigned maps unsigned to unreduced
64 : fold_signed maps signed to unreduced
65 :
66 : * fold_* are fast and typically used to keep the ranges of limbs
67 : reasonable in long running calculations without needing to do
68 : more expensive approx_mod_* or mod_* operations.
69 :
70 : approx_mod maps arbitrary to nearly reduced
71 : approx_mod_signed maps signed to nearly reduced
72 : approx_mod_unsigned maps unsigned to nearly reduced
73 : approx_mod_unreduced maps unreduced to nearly reduced
74 : approx_mod_unpacked maps unpacked to nearly reduced
75 :
76 : * approx_mod_* are typically used to get inputs to a long running
77 : calculation into a suitable form or by mod_* below.
78 :
79 : mod maps arbitrary to reduced (equiv to approx_mod / mod_nearly_reduced)
80 : mod_signed maps signed to reduced (equiv to approx_mod_signed / mod_nearly_reduced)
81 : mod_unsigned maps unsigned to reduced (equiv to approx_mod_unsigned / mod_nearly_reduced)
82 : mod_unreduced maps unreduced to reduced (equiv to approx_mod_unreduced / mod_nearly_reduced)
83 : mod_unpacked maps unpacked to reduced (equiv to approx_mod_unpacked / mod_nearly_reduced)
84 : mod_nearly_reduced maps nearly reduced to reduced
85 :
86 : * mod_* are typically used for produce a final unique result at
87 : the end of a long running calculation.
88 :
89 : add_fast maps unreduced x unreduced to unsigned (among others), see fold_unsigned above
90 : sub_fast maps unreduced x unreduced to signed (among others), see fold_signed above
91 : or, reduced x reduced to unsigned (among others), see fold_unsigned above
92 : mul_fast maps unreduced x unreduced to unsigned, see fold_unsigned above
93 : sqr_fast maps unreduced to unsigned, see fold_unsigned above
94 : neg maps unreduced to unreduced
95 : or, reduced to reduced
96 : add maps unreduced x unreduced to unreduced
97 : sub maps unreduced x unreduced to unreduced
98 : mul maps unreduced x unreduced to unreduced
99 : sqr maps unreduced to unreduced
100 : scale maps [0,2^47) x unreduced to unreduced
101 : scaleadd maps unreduced x [0,2^47) x unreduced to unreduced
102 : invert maps unreduced to unreduced
103 : is_nonzero maps signed to [0,1]
104 : diagnose maps signed to [-1,0,1]
105 : pow22523 maps unreduced to unreduced
106 :
107 : * These are used to implement HPC calculations on GF(p) elements. */
108 :
109 242936403 : #define fd_r43x6_t wwl_t
110 :
111 : FD_PROTOTYPES_BEGIN
112 :
113 : /* fd_r43x6(x0,x1,x2,x3,x4,x5) constructs an arbitrary r43x6_t from the
114 : given limbs. Lanes 6 and 7 will be zero. This macro is robust.
115 : Note: implementing via setr was benchmarked as slightly faster than
116 : loading from a stack tmp (probably due to better compiler code gen). */
117 :
118 293473743 : #define fd_r43x6(x0,x1,x2,x3,x4,x5) wwl( (x0),(x1),(x2),(x3),(x4),(x5), 0L,0L )
119 :
120 : /* fd_r43x6_extract_limbs(x,y) extracts the limbs of an arbitrary
121 : fd_r43x6_t x into the longs y0-y5. This is primarily for use in
122 : operations that are not vectorized. This macro is robust. Note:
123 : implementing via extract was benchmarked as slightly faster than
124 : storing to a stack tmp and reloading (probably due to better compiler
125 : code gen). */
126 :
127 295670180 : #define fd_r43x6_extract_limbs(x,y) do { \
128 295670180 : wwl_t _x = (x); \
129 295670180 : __m256i _xl = _mm512_extracti64x4_epi64( _x, 0 ); \
130 295670180 : __m256i _xh = _mm512_extracti64x4_epi64( _x, 1 ); \
131 295670180 : y##0 = _mm256_extract_epi64( _xl, 0 ); \
132 295670180 : y##1 = _mm256_extract_epi64( _xl, 1 ); \
133 295670180 : y##2 = _mm256_extract_epi64( _xl, 2 ); \
134 295670180 : y##3 = _mm256_extract_epi64( _xl, 3 ); \
135 295670180 : y##4 = _mm256_extract_epi64( _xh, 0 ); \
136 295670180 : y##5 = _mm256_extract_epi64( _xh, 1 ); \
137 295670180 : } while(0)
138 :
139 : /* fd_r43x6_zero(), fd_r43x6_one(), fd_r43x6_p(), fd_r43x6_d(),
140 : fd_r43x6_2d(), fd_r43x6_imag() returns the reduced fd_r43x6_t for
141 : zero (reduced), one (reduced), 2^255-19 (the non-trivial nearly
142 : reduced representation), d (reduced), 2*d (reduced) and sqrt(-1)
143 : (reduced) respectively. d is defined as per IETF RFC 8032 Section
144 : 5.1 (page 9) as -121665/121666. imag^2 = -1 mod p = p-1. These
145 : macros are robust. Lanes 6 and 7 will be zero. */
146 :
147 1 : #define fd_r43x6_zero() wwl_zero()
148 300756 : #define fd_r43x6_one() wwl( 1L, 0L, 0L, 0L, 0L, 0L, 0L,0L )
149 20000001 : #define fd_r43x6_p() wwl( 8796093022189L,8796093022207L,8796093022207L,8796093022207L,8796093022207L,1099511627775L, 0L,0L )
150 300755 : #define fd_r43x6_d() wwl( 6365466163363L, 253762649449L, 7518893317L, 260847760460L,7696165686388L, 704489577558L, 0L,0L )
151 : #define fd_r43x6_2d() wwl( 3934839304537L, 507525298899L, 15037786634L, 521695520920L,6596238350568L, 309467527341L, 0L,0L )
152 300755 : #define fd_r43x6_imag() wwl( 3467281080496L,6582290652611L,5210002954932L, 329084955603L,4526638806224L, 373767602335L, 0L,0L )
153 :
154 : /* fd_r43x6_unpack(u) returns an unpacked r43x6_t corresponding to an
155 : arbitrary uint256 stored little endian 4 ulong radix 2^64 limb
156 : representation held in an AVX-2 vector:
157 :
158 : u = u0 + u1 2^64 + u2 2^128 + u3 2^192
159 :
160 : where un is the n-th 64-bit vector lane treated as a ulong. Returned
161 : lanes 6 and 7 will be zero. If u is in [0,2*p), the return will be a
162 : nearly reduced fd_r43x6_t. If u is in [0,p), the return will be a
163 : reduced fd_r43x6_t. */
164 :
165 : FD_FN_CONST static inline fd_r43x6_t
166 32062720 : fd_r43x6_unpack( wv_t u ) {
167 32062720 : wwl_t const zero = wwl_zero();
168 32062720 : wwl_t const perm = wwl( 0x3f3f050403020100L, // r0 = bits 0: 42 (43 bits, zero extend to 64 bits)
169 32062720 : 0x3f3f0a0908070605L, // r1 = bits 43: 85 (43 bits, zero extend to 64 bits)
170 32062720 : 0x3f100f0e0d0c0b0aL, // r2 = bits 86:128 (43 bits, zero extend to 64 bits)
171 32062720 : 0x3f3f151413121110L, // r3 = bits 129:171 (43 bits, zero extend to 64 bits)
172 32062720 : 0x3f3f1a1918171615L, // r4 = bits 172:214 (43 bits, zero extend to 64 bits)
173 32062720 : 0x3f3f1f1e1d1c1b1aL, // r5 = bits 215:255 (41 bits, zero extend to 64 bits)
174 32062720 : 0x3f3f3f3f3f3f3f3fL, // r6 = zero
175 32062720 : 0x3f3f3f3f3f3f3f3fL ); // r7 = zero
176 32062720 : wwl_t const rshift = wwl( 0L, 3L, 6L, 1L, 4L, 7L, 0L, 0L ); // r0/r1/r2/r3/r4/r5 bit 0 is u bit 0/43/86/129/172
177 32062720 : wwl_t const mask = wwl_bcast( (1L<<43)-1L ); // Keep 43 least significant bits for each lane
178 32062720 : return wwl_and( wwl_shru_vector( _mm512_permutexvar_epi8( perm, _mm512_inserti64x4( zero, u, 0 ) ), rshift ), mask );
179 32062720 : }
180 :
181 : /* fd_r43x6_pack(r) is the inverse of fd_r43x6_unpack. r should be an
182 : unpacked fd_r43x6_t. If r is also nearly reduced, the return will be
183 : in [0,2*p). If r is also reduced fd_r43x6_t, the return will be in
184 : [0,p). Ignores lanes 6 and 7. */
185 :
186 : FD_FN_CONST static inline wv_t
187 33210693 : fd_r43x6_pack( fd_r43x6_t r ) {
188 :
189 : /* 43 21
190 : 0 42 43 63
191 : u0 = r0_0 ... r0_42 r1_0 ... r1_20
192 :
193 : 22 42
194 : 0 21 22 63
195 : u1 = r1_21 ... r1_42 r2_0 ... r2_41
196 :
197 : 1 43 20
198 : 0 1 43 44 63
199 : u2 = r2_42 r3_0 ... r3_42 r4_0 ... r4_19
200 :
201 : 23 41
202 : 0 22 23 63
203 : u3 = r4_20 ... r3_42 r5_0 ... r5_40
204 :
205 : t0 t1 t2
206 : u0 = (r0>> 0) | (r1<<43) | (r1<<43); ... Last term redundant to keep vectorized
207 : u1 = (r1>>21) | (r2<<22) | (r2<<22); ... "
208 : u2 = (r2>>42) | (r3<< 1) | (r4<<44);
209 : u3 = (r4>>20) | (r5<<23) | (r5<<23); ... " */
210 :
211 33210693 : wwl_t t0 = wwl_shru_vector( wwl_permute( wwl( 0L, 1L, 2L, 4L, 0L,0L,0L,0L ), r ), wwl( 0L,21L,42L,20L, 0L,0L,0L,0L ) );
212 33210693 : wwl_t t1 = wwl_shl_vector ( wwl_permute( wwl( 1L, 2L, 3L, 5L, 0L,0L,0L,0L ), r ), wwl( 43L,22L, 1L,23L, 0L,0L,0L,0L ) );
213 33210693 : wwl_t t2 = wwl_shl_vector ( wwl_permute( wwl( 1L, 2L, 4L, 5L, 0L,0L,0L,0L ), r ), wwl( 43L,22L,44L,23L, 0L,0L,0L,0L ) );
214 :
215 33210693 : return _mm512_extracti64x4_epi64( wwl_or( wwl_or( t0, t1 ), t2 ), 0 );
216 33210693 : }
217 :
218 : /* fd_r43x6_approx_carry_propagate_limbs(x,y) computes a signed
219 : fd_r43x6_t equivalent to an arbitrary fd_r43x6_t that has been
220 : extracted into the longs x0-x5 and stores the result into the longs
221 : y0-y5. On return:
222 :
223 : y0 in [-19*2^23,2^43+19*(2^23-1))
224 : y1-y4 in [ -2^20,2^43+ (2^20-1))
225 : y5 in [ -2^20,2^40+ (2^20-1))
226 :
227 : In-place operation fine. This macro is robust.
228 :
229 : If x is unsigned or more generally x0-x5 in [0,2^63), the result will
230 : be an unreduced fd_r43x6_t with:
231 :
232 : y0 in [0,2^43+19*(2^23-1))
233 : y1-y4 in [0,2^43+ (2^20-1))
234 : y5 in [0,2^40+ (2^20-1))
235 :
236 : Theory:
237 :
238 : x = <x0,x1,x2,x3,x4,x5>
239 : = <x0l,x1l,x2l,x3l,x4l,x5l> + <2^43*x0h,2^43*x1h,2^43*x2h,2^43*x3h,2^43*x4h,2^40*x5h>
240 : = <x0l,x1l,x2l,x3l,x4l,x5l> + <19*x5h,x0h,x1h,x2h,x3h,x4h>
241 :
242 : where x0h=floor(x0/2^43) and x0l = x0-2^43*x0h and similarly for
243 : x1-x4 while x5h = floor(x5/2^40) and x5l=x5-2^40*x5h.
244 :
245 : Above we used:
246 :
247 : 2^215*2^40*x5h = 2^255*x5h = (p+19)*x5h mod p = 19*x5h mod p.
248 :
249 : Equivalently, x0l-x5l are the least significant {43,43,43,43,43,40}
250 : bits of x0-x5 and x0h-x5l are the sign extending right shifts of
251 : x0-x5 by the same.
252 :
253 : For arbitrary x we have:
254 :
255 : x0l-x4l in [0,2^43), x0h-x4h in [-2^20,2^20)
256 : x5l in [0,2^40), x5h in [-2^23,2^23)
257 :
258 : while for x0-x5 in [0,2^63) (which includes unsigned) we have:
259 :
260 : x0l-x4l in [0,2^43), x0h-x4h in [0,2^20)
261 : x5l in [0,2^40), x5h in [0,2^23)
262 :
263 : This yields the above ranges for y0-y5. There are no intermediate
264 : overflows in the computation.
265 :
266 : This is a building block for more complex mappings where x's limbs
267 : have already been extracted in order to minimize the number of limb
268 : extracts and fd_r43x6 constructs.
269 :
270 : Note that if we used ulongs (and thus zero padding right shifts)
271 : below, this same style calculation could be used on an arbitrary
272 : _ulong_ limbed x. The result would still be an unreduced fd_r43x6_t
273 : with:
274 :
275 : y0-y4 in [0,2^43+19*(2^24-1))
276 : y5 in [0,2^40+ 2^21-1 ) */
277 :
278 103472838 : #define fd_r43x6_approx_carry_propagate_limbs(x,y) do { \
279 103472838 : long const _m43 = (1L<<43)-1L; \
280 103472838 : long const _m40 = (1L<<40)-1L; \
281 103472838 : long _x0 = (x##0); \
282 103472838 : long _x1 = (x##1); \
283 103472838 : long _x2 = (x##2); \
284 103472838 : long _x3 = (x##3); \
285 103472838 : long _x4 = (x##4); \
286 103472838 : long _x5 = (x##5); \
287 103472838 : (y##0) = (_x0 & _m43) + 19L*(_x5>>40); \
288 103472838 : (y##1) = (_x1 & _m43) + (_x0>>43); \
289 103472838 : (y##2) = (_x2 & _m43) + (_x1>>43); \
290 103472838 : (y##3) = (_x3 & _m43) + (_x2>>43); \
291 103472838 : (y##4) = (_x4 & _m43) + (_x3>>43); \
292 103472838 : (y##5) = (_x5 & _m40) + (_x4>>43); \
293 103472838 : } while(0)
294 :
295 : /* fd_r43x6_approx_carry_propagate is a vectorized version of the above.
296 : Returned lanes 6 and 7 are zero. If we've already extracted x's
297 : limbs and/or need the resulting limbs after, it is usually faster to
298 : do the extract and then use the scalar implementation.
299 :
300 : The fold_unsigned variant is the same but assumes x is unsigned or
301 : more generally x0-x5 in [0,2^63). This allows a faster madd52lo to
302 : be used instead of a slower mullo. (Hat tip to Philip Taffet for
303 : pointing this out.) It will also work for an arbitrary limbed ulong
304 : x.
305 :
306 : The fold_signed variant assumes x is signed or more generally:
307 :
308 : x0 in [-2^63+19*2^23,2^63)
309 : x1-x5 in [-2^63 +2^20,2^63)
310 :
311 : and subtracts x by <19*2^23,2^20,2^20,2^20,2^20,2^20> (which will not
312 : overflow) before the approx carry propagate and add it back after.
313 : This will not overflow and will not change the element represented.
314 : But it does yield an unreduced result with limbs:
315 :
316 : y0 in [0,2^43+19*(2^24-1))
317 : y1-y4 in [0,2^43+ (2^21-1))
318 : y5 in [0,2^40+ (2^21-1))
319 :
320 : These variants are particularly useful for mapping results of a
321 : additions and subtractions into unreduced results for subsequent
322 : operations. */
323 :
324 : #if 0 /* A mullo based implementation is slightly slower ... */
325 : #define fd_r43x6_approx_carry_propagate( x ) (__extension__({ \
326 : long const _m43 = (1L<<43)-1L; \
327 : long const _m40 = (1L<<40)-1L; \
328 : wwl_t _x = (x); \
329 : wwl_add( wwl_and( _x, wwl( _m43,_m43,_m43,_m43,_m43,_m40, 0L,0L ) ), \
330 : wwl_mul( wwl( 19L,1L,1L,1L,1L,1L, 0L,0L ), \
331 : wwl_permute( wwl( 5L,0L,1L,2L,3L,4L, 6L,7L ), \
332 : wwl_shr_vector( _x, wwl( 43L,43L,43L,43L,43L,40L, 0L,0L ) ) ) ) ); \
333 : }))
334 : #else /* ... than a more obtuse shift-and-add based implementation */
335 : #define fd_r43x6_approx_carry_propagate( x ) (__extension__({ \
336 : long const _m43 = (1L<<43)-1L; \
337 : long const _m40 = (1L<<40)-1L; \
338 : wwl_t _x = (x); \
339 : wwl_t _xl = wwl_and( _x, wwl( _m43,_m43,_m43,_m43,_m43,_m40, 0L,0L ) ); \
340 : wwl_t _xh = wwl_shr_vector( _x, wwl( 43L,43L,43L,43L,43L,40L, 0L,0L ) ); \
341 : wwl_t _c = wwl_select( wwl( 5L,0L,1L,2L,3L,4L, 8L,8L ), _xh, wwl_zero() ); \
342 : wwl_t _d = wwl_and( wwl_add( wwl_shl( _c, 1 ), wwl_shl( _c, 4 ) ), wwl( -1L,0L,0L,0L,0L,0L, 0L,0L ) ); \
343 : /* _xl = < x0l,x1l,x2l,x3l,x4l,x5l, 0,0> */ \
344 : /* _c = < x5h,x0h,x1h,x2h,x3h,x4h, 0,0> */ \
345 : /* _d = <18*x5h, 0, 0, 0, 0, 0, 0,0> */ \
346 : wwl_add( wwl_add( _xl, _c ), _d ); \
347 : }))
348 : #endif
349 :
350 916922305 : #define fd_r43x6_fold_unsigned( x ) (__extension__({ \
351 916922305 : long const _m43 = (1L<<43)-1L; \
352 916922305 : long const _m40 = (1L<<40)-1L; \
353 916922305 : wwl_t _x = (x); \
354 916922305 : wwl_madd52lo( wwl_and( _x, wwl( _m43,_m43,_m43,_m43,_m43,_m40, 0L,0L ) ), \
355 916922305 : wwl( 19L,1L,1L,1L,1L,1L, 0L,0L ), \
356 916922305 : wwl_permute( wwl( 5L,0L,1L,2L,3L,4L, 6L,7L ), \
357 916922305 : wwl_shru_vector( _x, wwl( 43L,43L,43L,43L,43L,40L, 0L,0L ) ) ) ); \
358 916922305 : }))
359 :
360 3855097 : #define fd_r43x6_fold_signed( x ) (__extension__({ \
361 3855097 : wwl_t const _b = wwl( 19L<<23,1L<<20,1L<<20,1L<<20,1L<<20,1L<<20, 0L,0L ); \
362 3855097 : wwl_add( fd_r43x6_approx_carry_propagate( wwl_sub( (x), _b ) ), _b ); \
363 3855097 : }))
364 :
365 : /* fd_r43x6_biased_carry_propagate_limbs computes an equivalent
366 : fd_r43x6_t to a signed fd_r43x6_t that has been extracted into the
367 : longs x0-x5 and stores the result into the longs y0-y5. x5 is
368 : subtracted by a small bias in [0,2^20] before that is added back
369 : after. This has no impact on the element represented but can impact
370 : the range needed for limb 5. In-place operation fine. This macro is
371 : robust.
372 :
373 : IMPORTANT! THIS SHOULD NOT BE APPLIED TO ARBITRARY X.
374 :
375 : If x is signed or more generally:
376 :
377 : x0 in [-2^63+19*2^23,2^63-19*(2^23-1))
378 : x1-x4 in [-2^63+ 2^20,2^63- (2^20-1))
379 : x5 in [-2^63+b, 2^63 )
380 :
381 : On return y will be signed but only in one limb such that it is
382 : almost nearly reduced:
383 :
384 : y0-y4 in [0, 2^43 )
385 : y5 in [-2^20+b,2^40+2^20-1+b)
386 :
387 : Thus, with b=2^20, if x is signed or more generally
388 :
389 : x0 in [-2^63+19*2^23,2^63-19*(2^23-1))
390 : x1-x4 in [-2^63+ 2^20,2^63- (2^20-1))
391 : x5 in [-2^63+ 2^20,2^63 )
392 :
393 : On return y will be nearly reduced with y5 in [0,2^40+2^21-1).
394 :
395 : And, with b=0, if x is unsigned or more generally:
396 :
397 : x0 in [0,2^63-19*(2^23-1))
398 : x1-x4 in [0,2^63- (2^20-1))
399 : x5 in [0,2^63 )
400 :
401 : On return y will be nearly reduced with y5 in [0,2^40+2^20-1).
402 :
403 : With b=0, the more restricted x is, the tighter the y5 range. If x
404 : is {unreduced,unpacked,nearly reduced}, with b=0, on return, y will
405 : be nearly reduced with y5 in {[0,2^40+2^5-1),[0,2^40+1),[0,2^40+1)}.
406 : If x is reduced, on return, y will be reduced.
407 :
408 : Under the hood, this does a serial carry propagate on the limbs.
409 :
410 : Theory:
411 :
412 : Break an arbitrary x5 into its lower and upper bits as was done for
413 : approx_carry_propagate. Then:
414 :
415 : x = <x0,x1,x2,x3,x4,x5l> + 2^40 <0,0,0,0,0,x5h>
416 : = <x0,x1,x2,x3,x4,x5l> + 19 <x5h,0,0,0,0,0>
417 : = <x0l+19*x5h,x1,x2,x3,x4,x5l>
418 :
419 : As x5h will be in [-2^23,2^23), if the initial x0 is in
420 : [-2^63+19*2^23,2^63-19*(2^23-1)), this new representation can be
421 : computed without intermediate overflow with limb 4 in [0,2^40) and
422 : limb 0 in [-2^63,2^63).
423 :
424 : Now break an arbitrary x0 into its lower and upper bits. Then:
425 :
426 : x = <x0,x1,x2,x3,x4,x5>
427 : = <x0l,x1,x2,x3,x4,x5> + 2^43 <x0h,0,0,0,0,0>
428 : = <x0l,x1,x2,x3,x4,x5> + <0,x0h,0,0,0,0>
429 : = <x0l,x1+x0h,x2,x3,x5>
430 :
431 : As x0h will be in [-2^20,2^20), if the initial x1 is in
432 : [-2^63+2^20,2^63-(2^20-1)), this new representation can be computed
433 : without intermediate overflow with limb 0 in [0,2^43) and limb 1 in
434 : [-2^63,2^63).
435 :
436 : We can similarly serially propagate limb 1's carries to 2, 2 to 3, 3
437 : to 4 and 4 to 5. As x4h will be in [-2^20,2^20), the limb 5's final
438 : value will be in [-2^20,2^40+2^20-1).
439 :
440 : If x is in unsigned or in the unsigned range described above, limb 4's
441 : carry will be in [0,2^20) such that the result will be nearly reduced
442 : with limb 5 in [0,2^40+2^20-1).
443 :
444 : If x is {unreduced,unpacked,nearly reduced}, limb 4's carry will be
445 : in {[0,2^5),[0,1],[0,1]} such that limb 5 will be in
446 : {[0,2^40+2^5-1),[0,2^40+1),[0,2^40+1)}.
447 :
448 : If x was reduced, all carries will be zero such that y will have the
449 : same limbs as x.
450 :
451 : This yields the above.
452 :
453 : Like approx_carry_propagate, this is a building block for more
454 : complex mappings where x's limbs have already been extracted in order
455 : to minimize the number of limb extracts and fd_r43x6 constructs. */
456 :
457 265670080 : #define fd_r43x6_biased_carry_propagate_limbs(x,y,b) do { \
458 265670080 : long const _m43 = (1L<<43)-1L; \
459 265670080 : long const _m40 = (1L<<40)-1L; \
460 265670080 : long _y0 = (x##0); \
461 265670080 : long _y1 = (x##1); \
462 265670080 : long _y2 = (x##2); \
463 265670080 : long _y3 = (x##3); \
464 265670080 : long _y4 = (x##4); \
465 265670080 : long _y5 = (x##5); \
466 265670080 : long _b = (b); \
467 265670080 : long _c; \
468 265670080 : _y5 -= _b; \
469 265670080 : _c = _y5>>40; _y5 &= _m40; _y0 += 19L*_c; \
470 265670080 : _c = _y0>>43; _y0 &= _m43; _y1 += _c; \
471 265670080 : _c = _y1>>43; _y1 &= _m43; _y2 += _c; \
472 265670080 : _c = _y2>>43; _y2 &= _m43; _y3 += _c; \
473 265670080 : _c = _y3>>43; _y3 &= _m43; _y4 += _c; \
474 265670080 : _c = _y4>>43; _y4 &= _m43; _y5 += _c; \
475 265670080 : _y5 += _b; \
476 265670080 : (y##0) = _y0; \
477 265670080 : (y##1) = _y1; \
478 265670080 : (y##2) = _y2; \
479 265670080 : (y##3) = _y3; \
480 265670080 : (y##4) = _y4; \
481 265670080 : (y##5) = _y5; \
482 265670080 : } while(0)
483 :
484 : /* Note: fd_r43x6_biased_carry_propagate_limbs does not have a good
485 : AVX-512 implementation (it is highly sequential). */
486 :
487 : /* fd_r43x6_mod_nearly_reduced_limbs computes the reduced fd_r43x6_t for
488 : a nearly reduced fd_r43x6_t that has been extracted into the longs
489 : x0-x5 and stores the limbs in the longs y0-y5. In-place operation
490 : fine. This macro is robust. Theory:
491 :
492 : Let x = q p + r where q is an integer and r is in [0,p). Since x is
493 : in [0,2*p), q is in [0,1]. If x<p, q=0 and r=x, otherwise, q=1 and
494 : r=x-p. Using p = 2^255 - 19, x<p implies x+19<2^255. Thus, if x+19
495 : has bit 255 (limb 5 bit 40) clear, r=x, otherwise, r=x-p.
496 :
497 : In pseudo code:
498 : y = x + 19 ... < 2*p+19 = 2^256-19 < 2^256
499 : q = y>>255 ... in [0,1]
500 : if !q, r = x ... x in [0,p ) -> r in [0,p)
501 : else r = x - (2^255-19) ... x in [p,2*p) -> r in [0,p)
502 :
503 : Simplifying:
504 : y = x + 19
505 : q = y>>255
506 : if !q, r = y - 19
507 : else r = y - 2^255
508 :
509 : Or:
510 : y = x + 19
511 : q = y>>255
512 : y -= q<<255 ... clear bit 255
513 : if !q, r = y - 19
514 : else r = y
515 :
516 : Or, branchless for deterministic performance:
517 : y = x + 19
518 : q = y>>255
519 : y -= q<<255
520 : r = y - if(!q,19,0) */
521 :
522 125669680 : #define fd_r43x6_mod_nearly_reduced_limbs(x,y) do { \
523 125669680 : long const _m43 = (1L<<43)-1L; \
524 125669680 : long const _m40 = (1L<<40)-1L; \
525 125669680 : long _y0 = (x##0); \
526 125669680 : long _y1 = (x##1); \
527 125669680 : long _y2 = (x##2); \
528 125669680 : long _y3 = (x##3); \
529 125669680 : long _y4 = (x##4); \
530 125669680 : long _y5 = (x##5); \
531 125669680 : long _c; \
532 125669680 : \
533 125669680 : /* y = x + 19, q = y>>255, y -= q<<255 */ \
534 125669680 : _y0 += 19L; \
535 125669680 : _c = _y0 >> 43; _y0 &= _m43; _y1 += _c; \
536 125669680 : _c = _y1 >> 43; _y1 &= _m43; _y2 += _c; \
537 125669680 : _c = _y2 >> 43; _y2 &= _m43; _y3 += _c; \
538 125669680 : _c = _y3 >> 43; _y3 &= _m43; _y4 += _c; \
539 125669680 : _c = _y4 >> 43; _y4 &= _m43; _y5 += _c; \
540 125669680 : _c = _y5 >> 40; _y5 &= _m40; \
541 125669680 : \
542 125669680 : /* r = y - if(!q,19,0) */ \
543 125669680 : _y0 -= fd_long_if( !_c, 19L, 0L ); \
544 125669680 : _c = _y0 >> 43; _y0 &= _m43; _y1 += _c; \
545 125669680 : _c = _y1 >> 43; _y1 &= _m43; _y2 += _c; \
546 125669680 : _c = _y2 >> 43; _y2 &= _m43; _y3 += _c; \
547 125669680 : _c = _y3 >> 43; _y3 &= _m43; _y4 += _c; \
548 125669680 : _c = _y4 >> 43; _y4 &= _m43; _y5 += _c; \
549 125669680 : \
550 125669680 : (y##0) = _y0; \
551 125669680 : (y##1) = _y1; \
552 125669680 : (y##2) = _y2; \
553 125669680 : (y##3) = _y3; \
554 125669680 : (y##4) = _y4; \
555 125669680 : (y##5) = _y5; \
556 125669680 : } while(0)
557 :
558 : /* Note: fd_r43x6_mod_nearly_reduced_limbs does not have a good AVX-512
559 : implementation (it is highly sequential). */
560 :
561 : /* fd_r43x6_approx_mod(x) returns a nearly reduced fd_r43x6_t equivalent
562 : to an arbitrary fd_r43x6_t x. On return y5 will be in [0,2^40+2).
563 :
564 : fd_r43x6_approx_mod_signed(x) does the same for signed x or, more
565 : generally:
566 :
567 : x0 in [-2^63+19*2^23,2^63-19*(2^23-1))
568 : x1-x4 in [-2^63+ 2^20,2^63- (2^20-1))
569 : x5 in [-2^63+ 2^20,2^63 )
570 :
571 : fd_r43x6_approx_mod_unsigned(x) does the same for unsigned x or, more
572 : generally:
573 :
574 : x0 in [0,2^63-19*(2^23-1))
575 : x1-x4 in [0,2^63- (2^20-1))
576 : x5 in [0,2^63 )
577 :
578 : On return y5 will be in [0,2^40+2^20-1).
579 :
580 : fd_r43x6_approx_mod_unreduced(x) does the same for unreduced x. On
581 : return y5 will be in [0,2^40+2^5-1).
582 :
583 : fd_r43x6_approx_mod_unpacked(x) does the same for unpacked x. On
584 : return y5 will be in [0,2^40+1). */
585 :
586 : FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
587 80000100 : fd_r43x6_approx_mod( fd_r43x6_t x ) {
588 80000100 : long y0, y1, y2, y3, y4, y5;
589 80000100 : fd_r43x6_extract_limbs( x, y );
590 :
591 : /* At this point y is arbitrary. We do an approx carry propagate to
592 : reduce the range of limbs suitable for a biased carry propagate.
593 : (Note: it is faster here to do extract then approx-cp than
594 : vector-approx-cp then extract.) */
595 :
596 80000100 : fd_r43x6_approx_carry_propagate_limbs( y, y );
597 :
598 : /* At this point y has:
599 :
600 : y0 in [-19*2^23,2^43+19*(2^23-1))
601 : y1-y4 in [ -2^20,2^43+ (2^20-1))
602 : y5 in [ -2^20,2^40+ (2^20-1))
603 :
604 : An unbiased carry propagate could produce y5=-1. So, we use a
605 : biased c-p with a b=1. TODO: CONSIDER JUST USING 2^20 BIAS ALWAYS? */
606 :
607 80000100 : fd_r43x6_biased_carry_propagate_limbs( y, y, 1L );
608 :
609 80000100 : return fd_r43x6( y0, y1, y2, y3, y4, y5 );
610 80000100 : }
611 :
612 : FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
613 20000100 : fd_r43x6_approx_mod_signed( fd_r43x6_t x ) {
614 20000100 : long y0, y1, y2, y3, y4, y5;
615 20000100 : fd_r43x6_extract_limbs( x, y );
616 :
617 : /* At this point y has:
618 :
619 : x0 in [-2^63+19*2^23,2^63-19*(2^23-1))
620 : x1-x4 in [-2^63+ 2^20,2^63- (2^20-1))
621 : x5 in [-2^63+ 2^20,2^63 )
622 :
623 : so we can do a biased carry propagate y with b=2^20 as described
624 : above. */
625 :
626 20000100 : fd_r43x6_biased_carry_propagate_limbs( y, y, 1L<<20 );
627 :
628 20000100 : return fd_r43x6( y0, y1, y2, y3, y4, y5 );
629 20000100 : }
630 :
631 : FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
632 60000300 : fd_r43x6_approx_mod_unsigned( fd_r43x6_t x ) {
633 60000300 : long y0, y1, y2, y3, y4, y5;
634 60000300 : fd_r43x6_extract_limbs( x, y );
635 :
636 : /* At this point y has:
637 :
638 : x0 in [0,2^63-19*(2^23-1))
639 : x1-x4 in [0,2^63- (2^20-1))
640 : x5 in [0,2^63 )
641 :
642 : so we can do an unbiased carry propagate as described above. */
643 :
644 60000300 : fd_r43x6_biased_carry_propagate_limbs( y, y, 0L );
645 :
646 60000300 : return fd_r43x6( y0, y1, y2, y3, y4, y5 );
647 60000300 : }
648 :
649 : #define fd_r43x6_approx_mod_unreduced fd_r43x6_approx_mod_unsigned /* no difference in impl, tighter y5 result described above */
650 : #define fd_r43x6_approx_mod_unpacked fd_r43x6_approx_mod_unsigned /* no difference in impl, tighter y5 result described above */
651 :
652 : /* fd_r43x6_mod(x) returns the reduced fd_r43x6_t equivalent to an
653 : arbitrary fd_r43x6_t x.
654 :
655 : fd_r43x6_approx_mod_signed(x) does the same for signed x or more
656 : generally:
657 :
658 : x0 in [-2^63+19*2^23,2^63-19*(2^23-1))
659 : x1-x4 in [-2^63+ 2^20,2^63- (2^20-1))
660 : x5 in [-2^63+ 2^20,2^63 )
661 :
662 : fd_r43x6_mod_unreduced(x) does the same for unreduced x, or, more
663 : generally:
664 :
665 : x0 in [0,2^63-19*(2^23-1))
666 : x1-x4 in [0,2^63- (2^20-1))
667 : x5 in [0,2^63 )
668 :
669 : fd_r43x6_mod_unpacked(x) does the same for unpacked x.
670 :
671 : fd_r43x6_mod_nearly_reduced(x) does the same for nearly reduced x. */
672 :
673 : FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
674 23472738 : fd_r43x6_mod( fd_r43x6_t x ) {
675 23472738 : long y0, y1, y2, y3, y4, y5;
676 23472738 : fd_r43x6_extract_limbs( x, y );
677 23472738 : fd_r43x6_approx_carry_propagate_limbs( y, y );
678 23472738 : fd_r43x6_biased_carry_propagate_limbs( y, y, 1L );
679 : /* At this point, x is nearly reduced */
680 23472738 : fd_r43x6_mod_nearly_reduced_limbs( y, y );
681 23472738 : return fd_r43x6( y0, y1, y2, y3, y4, y5 );
682 23472738 : }
683 :
684 : FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
685 20000101 : fd_r43x6_mod_signed( fd_r43x6_t x ) {
686 20000101 : long y0, y1, y2, y3, y4, y5;
687 20000101 : fd_r43x6_extract_limbs( x, y );
688 20000101 : fd_r43x6_biased_carry_propagate_limbs( y, y, 1L<<20 );
689 : /* At this point, x is nearly reduced */
690 20000101 : fd_r43x6_mod_nearly_reduced_limbs( y, y );
691 20000101 : return fd_r43x6( y0, y1, y2, y3, y4, y5 );
692 20000101 : }
693 :
694 : FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
695 60000303 : fd_r43x6_mod_unsigned( fd_r43x6_t x ) {
696 60000303 : long y0, y1, y2, y3, y4, y5;
697 60000303 : fd_r43x6_extract_limbs( x, y );
698 60000303 : fd_r43x6_biased_carry_propagate_limbs( y, y, 0L );
699 : /* At this point, x is nearly reduced */
700 60000303 : fd_r43x6_mod_nearly_reduced_limbs( y, y );
701 60000303 : return fd_r43x6( y0, y1, y2, y3, y4, y5 );
702 60000303 : }
703 :
704 : #define fd_r43x6_mod_unreduced fd_r43x6_mod_unsigned /* no difference in impl */
705 : #define fd_r43x6_mod_unpacked fd_r43x6_mod_unsigned /* no difference in impl */
706 :
707 : FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
708 20000100 : fd_r43x6_mod_nearly_reduced( fd_r43x6_t x ) {
709 20000100 : long y0, y1, y2, y3, y4, y5;
710 20000100 : fd_r43x6_extract_limbs( x, y );
711 : /* At this point, x is already nearly reduced */
712 20000100 : fd_r43x6_mod_nearly_reduced_limbs( y, y );
713 20000100 : return fd_r43x6( y0, y1, y2, y3, y4, y5 );
714 20000100 : }
715 :
716 : /* fd_r43x6_neg_fast(x) returns z = -x, computed as z = p - x
717 : fd_r43x6_add_fast(x,y) returns z = x + y
718 : fd_r43x6_sub_fast(x,y) returns z = x - y, computed as z = x + (p - y)
719 :
720 : These will be applied to lanes 6 and 7 and these assume that the
721 : corresponding limbs of x and y when added / subtracted will produce a
722 : result that doesn't overflow the range of a long.
723 :
724 : For example, it is safe to add a large (but bounded) number of
725 : unpacked x to produce an unreduced sum and then do a single mod at
726 : the end to produce the reduced result. With 0 < ll <= mm < 63 and
727 : letting nn = mm+1, given the input ranges, the below give
728 : conservative output ranges.
729 :
730 : -ull -> sll
731 : -sll -> sll
732 :
733 : ull + umm -> unn, umm + ull -> unn
734 : ull + smm -> snn, umm + sll -> snn
735 : sll + umm -> snn, smm + ull -> snn
736 : sll + smm -> snn, smm + sll -> snn
737 :
738 : ull - umm -> smm, umm - ull -> smm
739 : ull - smm -> snn, umm - sll -> snn
740 : sll - umm -> snn, smm - ull -> snn
741 : sll - smm -> snn, smm - sll -> snn */
742 :
743 1001823 : #define fd_r43x6_neg_fast( x ) wwl_sub( fd_r43x6_p(), (x) )
744 17947099 : #define fd_r43x6_add_fast( x, y ) wwl_add( (x), (y) )
745 1113028 : #define fd_r43x6_sub_fast( x, y ) wwl_add( (x), wwl_sub( (fd_r43x6_p()), (y) ) )
746 :
747 : /* fd_r43x6_mul_fast(x,y) returns z = x*y as an unsigned fd_r43x6_t
748 : with lanes 6 and 7 zero where x and y are unreduced fd_r43x6_t's
749 : (i.e. in u47). Ignores lanes 6 and 7 of x and assumes lanes 6 and 7
750 : of y are zero. More specifically, u44/u45/u46/u47 inputs produce a
751 : u62/u62/u62/u63 output. */
752 :
753 : FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
754 : fd_r43x6_mul_fast( fd_r43x6_t x,
755 152051764 : fd_r43x6_t y ) {
756 :
757 : /* 5x5 grade school-ish multiplication accelerated with AVX512 integer
758 : madd52 instructions. The basic algorithm is:
759 :
760 : x*y = (sum_i xi 2^(43*i))*(sum_j yj 2^(43*j))
761 : = sum_i sum_j xi*xj 2^(43*(i+j))
762 : = sum_i sum_j (pijl + 2^43 pijh) 2^(43*(i+j))
763 : = sum_i sum_j ( pijl 2^(43*(i+j)) + pijh 2^(43*(i+j+1)) )
764 : = sum_k zk 2^43 k
765 :
766 : where the product of xi*xj has been split such that:
767 :
768 : pijl + 2^43 pijh = xi*xj
769 :
770 : and zk has grouped all terms with the same scale factor:
771 :
772 : zk = sum_i sum_j ( pijl ((i+j)==k) + pijh ((i+j+1)==k) )
773 :
774 : Or graphically:
775 :
776 : x5 x4 x3 x2 x1 x0
777 : x y5 y4 y3 y2 y1 y0
778 : --------------------------------
779 : p50l p40l p30l p20l p10l p00l -> t0
780 : p50h p40h p30h p20h p10h p00h \
781 : p51l p41l p31l p21l p11l p01l / t1
782 : p51h p41h p31h p21h p11h p01h \
783 : p52l p42l p32l p22l p12l p02l / t2
784 : p52h p42h p32h p22h p12h p02h \
785 : p53l p43l p33l p23l p13l p03l / t3
786 : p53h p43h p33h p23h p13h p03h \
787 : p54l p44l p34l p24l p14l p04l / t4
788 : p54h p44h p34h p24h p14h p04h \
789 : p55l p45l p35l p25l p15l p05l / t5
790 : p55h p45h p35h p25h p15h p05h -> t6
791 : -----------------------------------------------------------
792 : z11 z10 z9 z8 z7 z6 z5 z4 z3 z2 z1 z0
793 : \----------------/ \-----------------------------------/
794 : zh zl
795 :
796 : The conventional split would require xi and xj to be in [0,2^43) and
797 : yield pijl and pijh in [0,2^43). But we need to use a different
798 : split to exploit the madd52 instructions:
799 :
800 : ul = madd52lo(al,x,y) = LO64( al + LO52( LO52(x)*LO52(y) ) )
801 : uh = madd52hi(ah,x,y) = LO64( ah + HI52( LO52(x)*LO52(y) ) )
802 :
803 : Consider when al and ah are zero. Since x and y here are unreduced
804 : and thus in [0,2^47), we have:
805 :
806 : ul = LO52( x*y )
807 : uh = HI52( x*y )
808 : --> ul + 2^52 uh = x*y
809 : --> ul + 2^43 (2^9 uh) = x*y
810 :
811 : Thus, we can use pl=ul and ph=2^9 uh from these instructions as
812 : a split for the above. With this we have:
813 :
814 : pl in [0,2^52)
815 : ph in [0,2^51)
816 :
817 : so:
818 :
819 : z{0,1,2,3, 4, 5} < { 2, 5, 8,11,14,17} 2^51
820 : z{6,7,8,9,10,11} < {16,13,10, 7, 4, 1} 2^51
821 :
822 : Note: the [0,2^47) range for unreduced fd_r43x6_t was picked to
823 : yield pl and ph with similar ranges while allowing for fast
824 : reduction below.
825 :
826 : Note: It might seem even better to use a 5 long radix 52 limb
827 : representation such that madd naturally produces the desired
828 : splitting. If the only consideration is the above, this is correct.
829 :
830 : But this calculation is frequently used in long sequential chains
831 : (e.g. the repeated squarings done to compute the multiplicative
832 : inverse). In the 52x5 case, the zk reduction required to produce an
833 : output representation that can be fed into the next multiplication
834 : has no "headroom". All carries from limbs 0-3 must be fully
835 : propagated to limb 4 such that all limbs are in [0,2^52) to be able
836 : to use madd52 subsequent multiply operations. This requires then
837 : extracting all the limbs and doing a slow sequential calculation as
838 : part of the zk reduction. This throws away most of the advantage of
839 : using instructions like madd52 in the first place.
840 :
841 : Using 43x6 is nearly the same cost as 52x5 for the above because we
842 : have unused AVX512 lanes and the scaling needed to tweak the
843 : madd52hi result is a fast shift operation. Critically, the needed
844 : zk reduction (described below) can be done with a fast approximate
845 : carry propagate to get a result that can be immediately fed into
846 : subsequent multiplications.
847 :
848 : The overall impact is that this is typically ~2-3x faster than, for
849 : example, the fd_ed25519_fe_t scalar multiplier on platforms with
850 : AVX512 support.
851 :
852 : This implementation is not the "textbook" style found in the
853 : literature. The textbook implementations accumulate zh and zl by
854 : interleaving alignr with the madds. The putative benefit of such is
855 : that it can make use of the madd52hi adder to save some adds over
856 : the below. Unfortunately, such requires more alignr / more
857 : instruction footprint / more sequential dependencies and has less
858 : ILP. In practice, adds are much cheaper than alignr (remember, data
859 : motion has been more expensive than computation for decades). So
860 : spending some adds to buy fewer alignr, a smaller instruction
861 : footprint and more ILP is a great trade and yields the faster than
862 : textbook implementation below. */
863 :
864 152051764 : wwl_t const zero = wwl_zero();
865 :
866 152051764 : wwl_t x0 = wwl_permute( zero, x );
867 152051764 : wwl_t x1 = wwl_permute( wwl_one(), x );
868 152051764 : wwl_t x2 = wwl_permute( wwl_bcast( 2L ), x );
869 152051764 : wwl_t x3 = wwl_permute( wwl_bcast( 3L ), x );
870 152051764 : wwl_t x4 = wwl_permute( wwl_bcast( 4L ), x );
871 152051764 : wwl_t x5 = wwl_permute( wwl_bcast( 5L ), x );
872 :
873 152051764 : wwl_t t0 = wwl_madd52lo( zero, x0, y );
874 152051764 : wwl_t t1 = wwl_madd52lo( wwl_shl( wwl_madd52hi( zero, x0, y ), 9 ), x1, y );
875 152051764 : wwl_t t2 = wwl_madd52lo( wwl_shl( wwl_madd52hi( zero, x1, y ), 9 ), x2, y );
876 152051764 : wwl_t t3 = wwl_madd52lo( wwl_shl( wwl_madd52hi( zero, x2, y ), 9 ), x3, y );
877 152051764 : wwl_t t4 = wwl_madd52lo( wwl_shl( wwl_madd52hi( zero, x3, y ), 9 ), x4, y );
878 152051764 : wwl_t t5 = wwl_madd52lo( wwl_shl( wwl_madd52hi( zero, x4, y ), 9 ), x5, y );
879 152051764 : wwl_t t6 = wwl_shl( wwl_madd52hi( zero, x5, y ), 9 );
880 :
881 152051764 : wwl_t p0j = t0; /* note: q0j = 0 */
882 152051764 : wwl_t p1j = wwl_slide( zero, t1, 7 ); /* note: q1j = 0 */
883 152051764 : wwl_t p2j = wwl_slide( zero, t2, 6 ); /* note: q2j = 0 */
884 152051764 : wwl_t p3j = wwl_slide( zero, t3, 5 ); wwl_t q3j = wwl_slide( t3, zero, 5 );
885 152051764 : wwl_t p4j = wwl_slide( zero, t4, 4 ); wwl_t q4j = wwl_slide( t4, zero, 4 );
886 152051764 : wwl_t p5j = wwl_slide( zero, t5, 3 ); wwl_t q5j = wwl_slide( t5, zero, 3 );
887 152051764 : wwl_t p6j = wwl_slide( zero, t6, 2 ); wwl_t q6j = wwl_slide( t6, zero, 2 );
888 :
889 152051764 : wwl_t zl = wwl_add( wwl_add( wwl_add( p0j, p1j ), wwl_add( p2j, p3j ) ), wwl_add( wwl_add( p4j, p5j ), p6j ) );
890 152051764 : wwl_t zh = wwl_add( wwl_add( q3j, q4j ), wwl_add( q5j, q6j ) );
891 :
892 : /* At this point:
893 : z = <zl0,zl1,zl2,zl3,zl4,zl5,zl6,zl7> + 2^344 <zh0,zh1,zh2,zh3,0,0,0,0> */
894 :
895 152051764 : wwl_t za = wwl_and( zl, wwl( -1L,-1L,-1L,-1L,-1L,-1L, 0L,0L ) );
896 152051764 : wwl_t zb = wwl_slide( zl, zh, 6 );
897 :
898 : /* At this point:
899 :
900 : z = <za0,za1,za2,za3,za4,za5> + 2^258 <zb0,zb1,zb2,zb3,zb4,zb5>
901 :
902 : where (as shown above):
903 :
904 : za{0,1,2,3,4,5} < 2^51 { 2, 5, 8,11,14,17}
905 : zb{0,1,2,3,4,5) < 2^51 {16,13,10, 7, 4, 1}
906 :
907 : Using:
908 :
909 : 2^258 mod p = (p+19) 2^3 mod p = 19*2^3 = 152
910 :
911 : we can reduce this to 6 limbs via:
912 :
913 : z = <za0,za1,za2,za3,za4,za5> + 152 <zb0,zb1,zb2,zb3,zb4,zb5>
914 :
915 : We can do the sum directly because:
916 :
917 : z{0,1,2,3,4,5} < 2^51 {2434,1981,1528,1075,622,169} < 2^63
918 :
919 : (These limbs are too large to use in a subsequent multiply but they
920 : are in the range where we can do a fd_r43x6_fold_unsigned,
921 : yielding:
922 :
923 : z0 in [0,2^43+19*(2^23-1))
924 : z1-z4 in [0,2^43+ (2^20-1))
925 : z5 in [0,2^40+ (2^20-1))
926 :
927 : This is an unreduced fd_r43x6_t and thus suitable direct use in
928 : subsequent multiply operations. See fd_r43x6_mul.)
929 :
930 : Note that mullo is slow. Since 152 = 2^7 + 2^4 + 2^3 and is
931 : applied to all lanes, we can compute za+152*zb slightly faster via
932 : shift and add techniques. */
933 :
934 152051764 : return wwl_add( wwl_add( za, wwl_shl( zb, 7 ) ), wwl_add( wwl_shl( zb, 4 ), wwl_shl( zb, 3 ) ) );
935 152051764 : }
936 :
937 : /* fd_r43x6_sqr_fast(x) returns z = x^2 as an unsigned fd_r43x6_t with
938 : lanes 6 and 7 zero where x is an unreduced fd_r43x6_t (i.e. in u47).
939 : Assumes lanes 6 and 7 of x are zero. More specifically,
940 : u44/u45/u46/u47 inputs produce a u61/u61/u62/u62 output.
941 :
942 : IMPORTANT! z may not be the same representation returned by
943 : fd_r43x6_mul_fast(x,x). This is because doubling of the off-diagonal
944 : partial products is done _before_ the multiplications while it is
945 : done _after_ the multiplications in fd_r43x6_mul. This is faster for
946 : sqr and reduces the range of the outputs slightly (which can then be
947 : used to further optimize code using sqr). */
948 :
949 : FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
950 851153335 : fd_r43x6_sqr_fast( fd_r43x6_t x ) {
951 851153335 : wwl_t const zero = wwl_zero();
952 :
953 : /* The goal of this implementation is to compute each product once, but
954 : to make sure each product gets generated in the right AVX lane so
955 : that we don't have to do a ton of shuffling at the end. In
956 : exchange, we do a lot of permutevars upfront to get everything
957 : setup. It's possible trading permutevars for madd52s might improve
958 : performance.
959 :
960 : We'll compute
961 : p0 = x{0,0,0,0,0,0,3,3} * x{0,1,2,3,4,5,3,4}
962 : p1 = x{4,4,1,1,1,1,1,-} * x{4,5,1,2,3,4,5,-}
963 : p2 = x{3,-,5,-,2,2,2,2} * x{5,-,5,-,2,3,4,5}
964 : with the scaling of non-square terms omitted above. A dash
965 : indicates a don't care value, since we have to do 24
966 : multiplications but there are only 21 unique terms.
967 :
968 : All the terms of p0 belong in the low final result, but the first
969 : two terms of p1, and the first and third terms of p2 belong in the
970 : high final result. The other gotcha is that each multiplication
971 : has a high and a low component, so we confusingly have two
972 : different notions of high/low. */
973 :
974 851153335 : wwl_t x0 = wwl_permute( wwl( 0L, 0L, 0L, 0L, 0L, 0L, 3L, 3L ), x );
975 851153335 : wwl_t x1 = wwl_permute( wwl( 0L, 1L, 2L, 3L, 4L, 5L, 3L, 4L ), x );
976 851153335 : wwl_t x2 = wwl_permute( wwl( 4L, 4L, 1L, 1L, 1L, 1L, 1L, 7L ), x );
977 851153335 : wwl_t x3 = wwl_permute( wwl( 4L, 5L, 1L, 2L, 3L, 4L, 5L, 7L ), x );
978 851153335 : wwl_t x4 = wwl_permute( wwl( 3L, 7L, 5L, 7L, 2L, 2L, 2L, 2L ), x );
979 851153335 : wwl_t x5 = wwl_permute( wwl( 5L, 7L, 5L, 7L, 2L, 3L, 4L, 5L ), x );
980 :
981 : /* Double the non-square terms. */
982 :
983 851153335 : x0 = wwl_shl_vector( x0, wwl( 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L ) );
984 851153335 : x2 = wwl_shl_vector( x2, wwl( 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L ) );
985 851153335 : x4 = wwl_shl_vector( x4, wwl( 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L ) );
986 :
987 851153335 : wwl_t p0l = wwl_madd52lo( zero, x0, x1 );
988 851153335 : wwl_t p1l = wwl_madd52lo( zero, x2, x3 );
989 851153335 : wwl_t p2l = wwl_madd52lo( zero, x4, x5 );
990 :
991 : /* Use the same approach as in the multiply to generate the high bits
992 : of each individual product. */
993 :
994 851153335 : wwl_t p0h = wwl_shl( wwl_madd52hi( zero, x0, x1 ), 9 );
995 851153335 : wwl_t p1h = wwl_shl( wwl_madd52hi( zero, x2, x3 ), 9 );
996 851153335 : wwl_t p2h = wwl_shl( wwl_madd52hi( zero, x4, x5 ), 9 );
997 :
998 : /* Generate masks to split p_i into the terms that belong in the high
999 : word and low word. */
1000 :
1001 851153335 : wwl_t const mask1 = wwl( -1L,-1L, 0L, 0L, 0L, 0L, 0L,0L );
1002 851153335 : wwl_t const mask2 = wwl( -1L, 0L,-1L, 0L, 0L, 0L, 0L,0L );
1003 851153335 : wwl_t zll = wwl_add( p0l, wwl_add( wwl_andnot( mask1, p1l ), wwl_andnot( mask2, p2l ) ) );
1004 851153335 : wwl_t zlh = wwl_add( p0h, wwl_add( wwl_andnot( mask1, p1h ), wwl_andnot( mask2, p2h ) ) );
1005 851153335 : wwl_t zhl = wwl_add( wwl_and ( mask1, p1l ), wwl_and ( mask2, p2l ) );
1006 851153335 : wwl_t zhh = wwl_add( wwl_and ( mask1, p1h ), wwl_and ( mask2, p2h ) );
1007 :
1008 : /* Generate zl and zh as in fd_r43x6_mul_fast */
1009 :
1010 851153335 : wwl_t zl = wwl_add( zll, wwl_slide( zero, zlh, 7 ) );
1011 851153335 : wwl_t zh = wwl_add( zhl, wwl_add( wwl_slide( zero, zhh, 7 ), wwl_slide( zlh, zero, 7 ) ) );
1012 :
1013 851153335 : wwl_t za = wwl_and( zl, wwl( -1L,-1L,-1L,-1L,-1L,-1L, 0L,0L ) );
1014 851153335 : wwl_t zb = wwl_slide( zl, zh, 6 );
1015 :
1016 : /* By the same type of analysis above, we can still do the sum
1017 : directly as:
1018 :
1019 : z{0,1,2,3,4,5} < 2^51 {1826,1371,1222,767,618,163} < 2^62
1020 :
1021 : (Note that the result is fits into u62 instead of u63 like for
1022 : mul_fast.) */
1023 :
1024 851153335 : return wwl_add( wwl_add( za, wwl_shl( zb, 7 ) ), wwl_add( wwl_shl( zb, 4 ), wwl_shl( zb, 3 ) ) );
1025 851153335 : }
1026 :
1027 : /* fd_r43x6_scale_fast(x0,y) returns z = <x0,0,0,0,0>*y as an unsigned
1028 : fd_r43x6_t with lanes 6 and 7 zero where x is in [0,2^47) and y is an
1029 : unreduced fd_r43x6_t (i.e. in u47). Assumes lanes 6 and 7 of y are
1030 : zero. More specifically, u47 inputs produce a u59 output. */
1031 :
1032 : FD_FN_CONST static inline fd_r43x6_t
1033 : fd_r43x6_scale_fast( long _x0,
1034 46741205 : fd_r43x6_t y ) {
1035 :
1036 : /* This is fd_r43x6_mul with x = <_x0,0,0,0,0> and zeros values
1037 : optimized out. See fd_r43x6_mul for detailed explanation how this
1038 : works. */
1039 :
1040 46741205 : wwl_t const zero = wwl_zero();
1041 :
1042 46741205 : wwl_t x0 = wwl_bcast( _x0 );
1043 :
1044 46741205 : wwl_t t0 = wwl_madd52lo( zero, x0, y );
1045 46741205 : wwl_t t1 = wwl_shl( wwl_madd52hi( zero, x0, y ), 9 );
1046 :
1047 46741205 : wwl_t p0j = t0;
1048 46741205 : wwl_t p1j = wwl_slide( zero, t1, 7 );
1049 :
1050 46741205 : wwl_t zl = wwl_add( p0j, p1j );
1051 :
1052 46741205 : wwl_t za = wwl_and( zl, wwl( -1L,-1L,-1L,-1L,-1L,-1L, 0L,0L ) );
1053 46741205 : wwl_t zb = wwl_slide( zl, zero, 6 );
1054 :
1055 : /* At this point:
1056 :
1057 : za{0,1,2,3,4,5} < 2^51 {2,3,3,3,3,3}
1058 : zb0 < 2^51
1059 :
1060 : such that:
1061 :
1062 : z{0,1,2,3,4,5} < 2^51 {154,3,3,3,3,3} < 2^63 */
1063 :
1064 46741205 : return wwl_add( wwl_add( za, wwl_shl( zb, 7 ) ), wwl_add( wwl_shl( zb, 4 ), wwl_shl( zb, 3 ) ) );
1065 46741205 : }
1066 :
1067 : /* fd_r43x6_neg/add/sub/mul/scale fold the results of their fast
1068 : counterparts above. Given unreduced r43x6_t's (i.e. in u47) with
1069 : lanes 6 and 7 zero and/or x0 in [0,2^47), these return unreduced
1070 : results (in u44) with lanes 6 and 7 zero. */
1071 :
1072 : #define fd_r43x6_neg( x ) fd_r43x6_fold_signed ( fd_r43x6_neg_fast ( (x) ) )
1073 1323168 : #define fd_r43x6_add( x, y ) fd_r43x6_fold_unsigned( fd_r43x6_add_fast ( (x), (y) ) )
1074 601510 : #define fd_r43x6_sub( x, y ) fd_r43x6_fold_signed ( fd_r43x6_sub_fast ( (x), (y) ) )
1075 71187608 : #define fd_r43x6_mul( x, y ) fd_r43x6_fold_unsigned( fd_r43x6_mul_fast ( (x), (y) ) )
1076 764411529 : #define fd_r43x6_sqr( x ) fd_r43x6_fold_unsigned( fd_r43x6_sqr_fast ( (x) ) )
1077 : #define fd_r43x6_scale( x0, y ) fd_r43x6_fold_unsigned( fd_r43x6_scale_fast( (x0), (y) ) )
1078 :
1079 : /* fd_r43x6_invert(z) returns the multiplicative inverse of z in GF(p)
1080 : as an unreduced fd_r43x6_t (in u44) with lanes 6 and 7 zero where z
1081 : is an unreduced fd_r43x6_t (i.e. in u47) with lanes 6 and 7 zero. */
1082 :
1083 : FD_FN_CONST fd_r43x6_t
1084 : fd_r43x6_invert( fd_r43x6_t z );
1085 :
1086 : /* Miscellaneous APIs *************************************************/
1087 :
1088 : /* fd_r43x6_if(c,x,y) returns x if c is non-zero and y if not.
1089 : Branchless for deterministic timing. This macro is robust. */
1090 :
1091 2289794 : #define fd_r43x6_if(c,x,y) wwl_if( (-!(c)) & 0xff, (y), (x) )
1092 :
1093 : /* fd_r43x6_swap_if(c,x,y) will swap the contents of x and y if c is
1094 : non-zero and leave the contents of x and y unchanged otherwise.
1095 : Branchless for deterministic timing. This macro is robust. */
1096 :
1097 10000000 : #define fd_r43x6_swap_if(c,x,y) do { \
1098 10000000 : wwl_t _x = (x); \
1099 10000000 : wwl_t _y = (y); \
1100 10000000 : int _m = (-!(c)) & 0xff; \
1101 10000000 : (x) = wwl_if( _m, _x, _y ); \
1102 10000000 : (y) = wwl_if( _m, _y, _x ); \
1103 10000000 : } while(0)
1104 :
1105 : /* fd_r43x6_is_nonzero(x) reduces a signed fd_r43x6_t x (i.e. in s62)
1106 : and returns 0 if the result is zero and 1 if the result is non-zero. */
1107 :
1108 : FD_FN_UNUSED FD_FN_CONST static int /* Work around -Winline */
1109 1677416 : fd_r43x6_is_nonzero( fd_r43x6_t x ) {
1110 1677416 : long l0, l1, l2, l3, l4, l5;
1111 1677416 : fd_r43x6_extract_limbs( x, l ); /* l is signed */
1112 1677416 : fd_r43x6_biased_carry_propagate_limbs( l, l, 1L<<20 ); /* l is nearly reduced */
1113 1677416 : fd_r43x6_mod_nearly_reduced_limbs( l, l ); /* l is reduced */
1114 1677416 : return !!(l0|l1|l2|l3|l4|l5);
1115 1677416 : }
1116 :
1117 : /* fd_r43x6_diagnose(x) reduces a signed r43x6_t x (i.e. in s62) and
1118 : returns -1 if the result is zero and the least significant bit of the
1119 : reduced result otherwise. */
1120 :
1121 : FD_FN_UNUSED FD_FN_CONST static int /* Work around -Winline */
1122 519022 : fd_r43x6_diagnose( fd_r43x6_t x ) {
1123 519022 : long l0, l1, l2, l3, l4, l5;
1124 519022 : fd_r43x6_extract_limbs( x, l ); /* l is signed */
1125 519022 : fd_r43x6_biased_carry_propagate_limbs( l, l, 1L<<20 ); /* l is nearly reduced */
1126 519022 : fd_r43x6_mod_nearly_reduced_limbs( l, l ); /* l is reduced */
1127 519022 : return fd_int_if( !(l0|l1|l2|l3|l4|l5), -1, (int)(l0 & 1L) ); /* cmov */
1128 519022 : }
1129 :
1130 : /* fd_r43x6_pow22523(z) returns z^((p-5)/8) = z^(2^252 - 3) as an unreduced
1131 : fd_r43x6_t (in u44) with lanes 6 and 7 zero where z is an unreduced
1132 : r43x6_t (i.e. in u47) with lanes 6 and 7 of zero. */
1133 :
1134 : FD_FN_CONST fd_r43x6_t
1135 : fd_r43x6_pow22523( fd_r43x6_t z );
1136 :
1137 : FD_PROTOTYPES_END
1138 :
1139 : #include "fd_r43x6_inl.h"
1140 :
1141 : #endif /* FD_HAS_AVX512 */
1142 :
1143 : #endif /* HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_h */
|