Line data Source code
1 : #ifndef HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_inl_h
2 : #define HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_inl_h
3 :
4 : #ifndef HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_h
5 : #error "Do not include this directly; use fd_r43x6.h"
6 : #endif
7 :
8 : /* Protocols like ED25519 do many GF(p) operations that can be run
9 : in parallel in principle. But, because of the complexity of the
10 : individual operations, optimizers struggle with extracting the ILP
11 : (e.g. to get at the ILP in, for example, 3 independent fd_r43x6_mul,
12 : it has to decide to inline all 3 when its heuristics usually indicate
13 : that each mul is too expensive in code footprint to justify inlining
14 : even one and then do a very long range reorganization of the assembly
15 : instructions when its heuristics usually indicate to avoid such to
16 : keep compile time computational complexity reasonable).
17 :
18 : Further, when there are enough operations that can be run in
19 : parallel, it is often a net win to swizzle / deswizzle the data
20 : layout to make use of otherwise unused vector lanes. The optimizer's
21 : ability to do such radical code transformations is limited at best
22 : and practically impossible for transformations that could generate a
23 : different but mathematically equivalent representation of the result,
24 : akin to fd_r43x6_mul(x,x) vs fd_r43x6_sqr(x).
25 :
26 : It is also useful to annotate such parallelism in the protocol
27 : implementations such that they can be upgraded with no change to take
28 : advantage of newer hardware, better compilers, etc by updating these
29 : implementations as appropriate.
30 :
31 : The below makes a low to mid tens of percent performance improvement
32 : for things like ED25519 verify on gcc-12 and icelake-server. */
33 :
34 : FD_PROTOTYPES_BEGIN
35 :
36 : /* FD_R43X6_QUAD_DECL(Q) declares the wwl_t's Q03, Q14 and Q25 in the
37 : local scope to represent fd_r43x6_t X, Y, Z and T, but in a more
38 : efficient way for data parallel GF(p) operations under the hood.
39 : Organization:
40 :
41 : Q03 = [ X0 Y0 Z0 T0 | X3 Y3 Z3 T3 ]
42 : Q14 = [ X1 Y1 Z1 T1 | X4 Y4 Z4 T4 ]
43 : Q25 = [ X2 Y2 Z2 T2 | X5 Y5 Z5 T5 ]
44 :
45 : where Xi is the i-th limb of X (likewise for Y, Z and T) and the | separates lanes 0-3 from lanes 4-7. */
46 :
47 1052217379 : #define FD_R43X6_QUAD_DECL( Q ) wwl_t Q##03, Q##14, Q##25
48 :
49 : /* FD_R43X6_QUAD_MOV( D, S ) does D = S. D and S are FD_R43X6_QUAD
50 : declarations in the local scope. Copies all three vectors (03, 14, 25) and behaves as a single statement. */
51 :
52 829078333 : #define FD_R43X6_QUAD_MOV( D, S ) do { D##03 = S##03; D##14 = S##14; D##25 = S##25; } while(0)
53 :
54 : /* FD_R43X6_QUAD_PACK(Q,x,y,z,t) does Q = (x,y,z,t) where Q is a
55 : FD_R43X6_QUAD declared in the local scope, x, y, z and t are
56 : arbitrary fd_r43x6_t (each is evaluated exactly once). */
57 :
58 87579845 : #define FD_R43X6_QUAD_PACK( Q, x,y,z,t ) do { \
59 87579845 : wwl_t _r0 = (x); \
60 87579845 : wwl_t _r1 = (y); \
61 87579845 : wwl_t _r2 = (z); \
62 87579845 : wwl_t _r3 = (t); \
63 87579845 : /* At this point _r0 = x0 x1 x2 x3 x4 x5 -- -- */ \
64 87579845 : /* _r1 = y0 y1 y2 y3 y4 y5 -- -- */ \
65 87579845 : /* _r2 = z0 z1 z2 z3 z4 z5 -- -- */ \
66 87579845 : /* _r3 = t0 t1 t2 t3 t4 t5 -- -- */ \
67 87579845 : /* Transpose 2x2 blocks */ \
68 87579845 : /* No _mm256_permute2f128_si256 equivalent? Sigh ... */ \
69 87579845 : wwl_t _t0 = wwl_select( wwl( 0, 1, 8, 9, 4, 5,12,13 ), _r0, _r2 ); \
70 87579845 : wwl_t _t1 = wwl_select( wwl( 0, 1, 8, 9, 4, 5,12,13 ), _r1, _r3 ); \
71 87579845 : wwl_t _t2 = wwl_select( wwl( 2, 3,10,11, 6, 7,12,13 ), _r0, _r2 ); \
72 87579845 : wwl_t _t3 = wwl_select( wwl( 2, 3,10,11, 6, 7,12,13 ), _r1, _r3 ); \
73 87579845 : /* At this point _t0 = x0 x1 z0 z1 x4 x5 z4 z5 */ \
74 87579845 : /* _t1 = y0 y1 t0 t1 y4 y5 t4 t5 */ \
75 87579845 : /* _t2 = x2 x3 z2 z3 -- -- -- -- */ \
76 87579845 : /* _t3 = y2 y3 t2 t3 -- -- -- -- */ \
77 87579845 : /* Transpose 1x1 blocks */ \
78 87579845 : wwl_t _c04 = _mm512_unpacklo_epi64( _t0, _t1 ); \
79 87579845 : wwl_t _c15 = _mm512_unpackhi_epi64( _t0, _t1 ); \
80 87579845 : wwl_t _c26 = _mm512_unpacklo_epi64( _t2, _t3 ); \
81 87579845 : wwl_t _c37 = _mm512_unpackhi_epi64( _t2, _t3 ); \
82 87579845 : /* At this point _c04 = x0 y0 z0 t0 x4 y4 z4 t4 */ \
83 87579845 : /* _c15 = x1 y1 z1 t1 x5 y5 z5 t5 */ \
84 87579845 : /* _c26 = x2 y2 z2 t2 -- -- -- -- */ \
85 87579845 : /* _c37 = x3 y3 z3 t3 -- -- -- -- */ \
86 87579845 : Q##03 = wwl_pack_halves( _c04,0, _c37,0 ); \
87 87579845 : Q##14 = wwl_pack_h0_h1 ( _c15, _c04 ); \
88 87579845 : Q##25 = wwl_pack_h0_h1 ( _c26, _c15 ); \
89 87579845 : } while(0)
90 :
91 : /* FD_R43X6_QUAD_UNPACK(x,y,z,t,Q) does (x,y,z,t) = Q where x, y, z and
92 : t are arbitrary fd_r43x6_t and Q is a FD_R43X6_QUAD declared in the
93 : local scope. Per the lane annotations below, lanes 6 and 7 of x, y, z and t are filled from the zero vector _r3. */
94 :
95 138293668 : #define FD_R43X6_QUAD_UNPACK( x,y,z,t, Q ) do { \
96 138293668 : wwl_t _r0 = Q##03; \
97 138293668 : wwl_t _r1 = Q##14; \
98 138293668 : wwl_t _r2 = Q##25; \
99 138293668 : wwl_t _r3 = wwl_zero(); \
100 138293668 : /* At this point _r0 = x0 y0 z0 t0 x3 y3 z3 t3 */ \
101 138293668 : /* _r1 = x1 y1 z1 t1 x4 y4 z4 t4 */ \
102 138293668 : /* _r2 = x2 y2 z2 t2 x5 y5 z5 t5 */ \
103 138293668 : /* _r3 = 0 0 0 0 0 0 0 0 */ \
104 138293668 : /* Transpose 1x1 blocks */ \
105 138293668 : wwl_t _c0 = _mm512_unpacklo_epi64( _r0, _r1 ); \
106 138293668 : wwl_t _c1 = _mm512_unpackhi_epi64( _r0, _r1 ); \
107 138293668 : wwl_t _c2 = _mm512_unpacklo_epi64( _r2, _r3 ); \
108 138293668 : wwl_t _c3 = _mm512_unpackhi_epi64( _r2, _r3 ); \
109 138293668 : /* At this point _c0 = x0 x1 z0 z1 x3 x4 z3 z4 */ \
110 138293668 : /* _c1 = y0 y1 t0 t1 y3 y4 t3 t4 */ \
111 138293668 : /* _c2 = x2 0 z2 0 x5 0 z5 0 */ \
112 138293668 : /* _c3 = y2 0 t2 0 y5 0 t5 0 */ \
113 138293668 : (x) = wwl_select( wwl( 0,1, 8, 4,5,12, 9,9 ), _c0,_c2 ); \
114 138293668 : (y) = wwl_select( wwl( 0,1, 8, 4,5,12, 9,9 ), _c1,_c3 ); \
115 138293668 : (z) = wwl_select( wwl( 2,3,10, 6,7,14, 9,9 ), _c0,_c2 ); \
116 138293668 : (t) = wwl_select( wwl( 2,3,10, 6,7,14, 9,9 ), _c1,_c3 ); \
117 138293668 : } while(0)
118 :
119 : /* FD_R43X6_QUAD_PERMUTE(D,imm0,imm1,imm2,imm3,S) does:
120 : D = [ S(imm0) S(imm1) S(imm2) S(imm3) ]
121 : where imm* are in [0,3] (0/1/2/3->X/Y/Z/T). The same lane permutation is applied to both halves of each vector. */
122 :
123 730844719 : #define FD_R43X6_QUAD_PERMUTE( D, imm0,imm1,imm2,imm3, S ) do { \
124 730844719 : wwl_t const _perm = wwl( (imm0),(imm1),(imm2),(imm3), 4+(imm0),4+(imm1),4+(imm2),4+(imm3) ); \
125 730844719 : D##03 = wwl_permute( _perm, S##03 ); \
126 730844719 : D##14 = wwl_permute( _perm, S##14 ); \
127 730844719 : D##25 = wwl_permute( _perm, S##25 ); \
128 730844719 : } while(0)
129 :
130 : /* FD_R43X6_QUAD_LANE_IF(D,imm0,imm1,imm2,imm3,S,T) does:
131 : D = [ imm0 ? SX : TX, imm1 ? SY : TY, imm2 ? SZ : TZ, imm3 ? ST : TT ]
132 : imm* should be in [0,1]. */
133 :
134 43482010 : #define FD_R43X6_QUAD_LANE_IF( D, imm0,imm1,imm2,imm3, S, T ) do { \
135 43482010 : int _mask = 17*(imm0) + 34*(imm1) + 68*(imm2) + 136*(imm3); /* 17=0x11, 34=0x22, 68=0x44, 136=0x88: immk sets mask bits k and k+4 */ \
136 43482010 : D##03 = wwl_if( _mask, S##03, T##03 ); \
137 43482010 : D##14 = wwl_if( _mask, S##14, T##14 ); \
138 43482010 : D##25 = wwl_if( _mask, S##25, T##25 ); \
139 43482010 : } while(0)
140 :
141 : /* FD_R43X6_QUAD_LANE_ADD_FAST(D,S,imm0,imm1,imm2,imm3,P,Q) does:
142 : D = [ (imm0 ? (PX+QX) : SX) (imm1 ? (PY+QY) : SY) (imm2 ? (PZ+QZ) : SZ) (imm3 ? (PT+QT) : ST) ]
143 : imm* should be in [0,1]. */
144 :
145 453846121 : #define FD_R43X6_QUAD_LANE_ADD_FAST( D, S, imm0,imm1,imm2,imm3, P, Q ) do { \
146 453846121 : int _mask = 17*(imm0) + 34*(imm1) + 68*(imm2) + 136*(imm3); /* immk sets mask bits k and k+4 (same element in both vector halves) */ \
147 453846121 : D##03 = wwv_add_if( _mask, P##03, Q##03, S##03 ); \
148 453846121 : D##14 = wwv_add_if( _mask, P##14, Q##14, S##14 ); \
149 453846121 : D##25 = wwv_add_if( _mask, P##25, Q##25, S##25 ); \
150 453846121 : } while(0)
151 :
152 : /* FD_R43X6_QUAD_LANE_SUB_FAST(D,S,imm0,imm1,imm2,imm3,P,Q) does:
153 : D = [ (imm0 ? (PX-QX) : SX) (imm1 ? (PY-QY) : SY) (imm2 ? (PZ-QZ) : SZ) (imm3 ? (PT-QT) : ST) ]
154 : imm* should be in [0,1]. The difference is formed limb-wise as P+(M-Q) where M has limbs <2^43-19,2^43-1,2^43-1,2^43-1,2^43-1,2^40-1> (these sum to 2^255-19 when limb i is weighted by 2^(43 i)). The scratch quad is named _pmq (underscore prefixed like the other macro temporaries in this file) so that a caller quad named M is not shadowed. */
155 323763046 : #define FD_R43X6_QUAD_LANE_SUB_FAST( D, S, imm0,imm1,imm2,imm3, P, Q ) do { \
156 323763046 : int _mask = 17*(imm0) + 34*(imm1) + 68*(imm2) + 136*(imm3); /* immk sets mask bits k and k+4 (same element in both vector halves) */ \
157 323763046 : FD_R43X6_QUAD_DECL( _pmq ); \
158 323763046 : _pmq##03 = wwl( 8796093022189L, 8796093022189L, 8796093022189L, 8796093022189L, 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L ); \
159 323763046 : _pmq##14 = wwl( 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L ); \
160 323763046 : _pmq##25 = wwl( 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L, 1099511627775L, 1099511627775L, 1099511627775L, 1099511627775L ); \
161 323763046 : _pmq##03 = wwv_sub( _pmq##03, Q##03 ); /* M - Q, limb-wise */ \
162 323763046 : _pmq##14 = wwv_sub( _pmq##14, Q##14 ); \
163 323763046 : _pmq##25 = wwv_sub( _pmq##25, Q##25 ); \
164 323763046 : D##03 = wwv_add_if( _mask, P##03, _pmq##03, S##03 ); /* selected lanes: P + (M - Q); others: S */ \
165 323763046 : D##14 = wwv_add_if( _mask, P##14, _pmq##14, S##14 ); \
166 323763046 : D##25 = wwv_add_if( _mask, P##25, _pmq##25, S##25 ); \
167 323763046 : } while(0)
168 :
169 : /* FD_R43X6_QUAD_FOLD_UNSIGNED(R,P) does:
170 : R = [ fd_r43x6_fold_unsigned(PX) fd_r43x6_fold_unsigned(PY) fd_r43x6_fold_unsigned(PZ) fd_r43x6_fold_unsigned(PT) ] */
171 :
172 346902033 : #define FD_R43X6_QUAD_FOLD_UNSIGNED( R, P ) do { \
173 346902033 : long const _m43 = (1L<<43) - 1L; \
174 346902033 : long const _m40 = (1L<<40) - 1L; \
175 346902033 : \
176 346902033 : wwl_t const _m43_m43 = wwl_bcast( _m43 ); \
177 346902033 : wwl_t const _m43_m40 = wwl( _m43,_m43,_m43,_m43, _m40,_m40,_m40,_m40 ); \
178 346902033 : wwl_t const _s43_s40 = wwl( 43L, 43L, 43L, 43L, 40L, 40L, 40L, 40L ); \
179 346902033 : \
180 346902033 : wwl_t _Ph03 = wwl_shru ( P##03, 43 ); /* bits above the low 43 of limbs 0 and 3 */ \
181 346902033 : wwl_t _Ph14 = wwl_shru ( P##14, 43 ); \
182 346902033 : wwl_t _Ph25 = wwl_shru_vector( P##25, _s43_s40 ); /* limb 2 shifted by 43, limb 5 by 40 */ \
183 346902033 : wwl_t _19_Ph25 = wwl_add( _Ph25, wwl_add( wwl_shl( _Ph25, 1 ), wwl_shl( _Ph25, 4 ) ) ); /* 19*h = h + 2*h + 16*h */ \
184 346902033 : \
185 346902033 : R##03 = wwl_add( wwl_and( P##03, _m43_m43 ), wwl_pack_halves( _19_Ph25,1, _Ph25,0 ) ); /* limb 0 += 19*(high of limb 5), limb 3 += high of limb 2 */ \
186 346902033 : R##14 = wwl_add( wwl_and( P##14, _m43_m43 ), _Ph03 ); \
187 346902033 : R##25 = wwl_add( wwl_and( P##25, _m43_m40 ), _Ph14 ); \
188 346902033 : } while(0)
189 :
190 : /* FD_R43X6_QUAD_FOLD_SIGNED(R,P) does:
191 : R = [ fd_r43x6_fold_signed(PX) fd_r43x6_fold_signed(PY) fd_r43x6_fold_signed(PZ) fd_r43x6_fold_signed(PT) ] */
192 26741005 : #define FD_R43X6_QUAD_FOLD_SIGNED( R, P ) do { \
193 26741005 : long const _b0 = 19L<<23; \
194 26741005 : long const _bb = 1L<<20; \
195 26741005 : long const _m43 = (1L<<43) - 1L; \
196 26741005 : long const _m40 = (1L<<40) - 1L; \
197 26741005 : \
198 26741005 : wwl_t const _bias03 = wwl( _b0, _b0, _b0, _b0, _bb, _bb, _bb, _bb ); /* limb 0 uses _b0, limb 3 uses _bb */ \
199 26741005 : wwl_t const _bias = wwl_bcast( _bb ); \
200 26741005 : wwl_t const _m43_m43 = wwl_bcast( _m43 ); \
201 26741005 : wwl_t const _m43_m40 = wwl( _m43,_m43,_m43,_m43, _m40,_m40,_m40,_m40 ); \
202 26741005 : wwl_t const _s43_s40 = wwl( 43L, 43L, 43L, 43L, 40L, 40L, 40L, 40L ); \
203 26741005 : \
204 26741005 : wwl_t _P03 = wwl_sub( P##03, _bias03 ); /* bias is subtracted here and added back when forming R below */ \
205 26741005 : wwl_t _P14 = wwl_sub( P##14, _bias ); \
206 26741005 : wwl_t _P25 = wwl_sub( P##25, _bias ); \
207 26741005 : \
208 26741005 : wwl_t _Ph03 = wwl_shr ( _P03, 43 ); \
209 26741005 : wwl_t _Ph14 = wwl_shr ( _P14, 43 ); \
210 26741005 : wwl_t _Ph25 = wwl_shr_vector( _P25, _s43_s40 ); /* limb 2 shifted by 43, limb 5 by 40 */ \
211 26741005 : wwl_t _19_Ph25 = wwl_add( _Ph25, wwl_add( wwl_shl( _Ph25, 1 ), wwl_shl( _Ph25, 4 ) ) ); /* 19*h = h + 2*h + 16*h */ \
212 26741005 : \
213 26741005 : R##03 = wwl_add( wwl_and( _P03, _m43_m43 ), wwl_add( wwl_pack_halves( _19_Ph25,1, _Ph25,0 ), _bias03 ) ); \
214 26741005 : R##14 = wwl_add( wwl_and( _P14, _m43_m43 ), wwl_add( _Ph03, _bias ) ); \
215 26741005 : R##25 = wwl_add( wwl_and( _P25, _m43_m40 ), wwl_add( _Ph14, _bias ) ); \
216 26741005 : } while(0)
217 :
218 : /* FD_R43X6_QUAD_MUL_FAST(R,P,Q) does:
219 : R = [ fd_r43x6_mul_fast(PX,QX) fd_r43x6_mul_fast(PY,QY) fd_r43x6_mul_fast(PZ,QZ) fd_r43x6_mul_fast(PT,QT) ]
220 : Written this way so that pointer escapes don't inhibit optimizations. */
221 :
222 263834045 : #define FD_R43X6_QUAD_MUL_FAST( R, P, Q ) do { \
223 263834045 : FD_R43X6_QUAD_DECL( _R ); fd_r43x6_quad_mul_fast( &_R03,&_R14,&_R25, P##03,P##14,P##25, Q##03,Q##14,Q##25 ); /* only the local _R has its address taken */ \
224 263834045 : FD_R43X6_QUAD_MOV( R, _R ); \
225 263834045 : } while(0)
226 :
227 : FD_FN_UNUSED static void /* let compiler decide if worth inlining */
228 : fd_r43x6_quad_mul_fast( fd_r43x6_t * _z03, fd_r43x6_t * _z14, fd_r43x6_t * _z25,
229 : fd_r43x6_t x03, fd_r43x6_t x14, fd_r43x6_t x25,
230 293834345 : fd_r43x6_t y03, fd_r43x6_t y14, fd_r43x6_t y25 ) {
231 :
232 : /* Grade school-ish from the original mul:
233 :
234 : x5 x4 x3 x2 x1 x0
235 : x y5 y4 y3 y2 y1 y0
236 : --------------------------------
237 : p50l p40l p30l p20l p10l p00l
238 : p50h p40h p30h p20h p10h p00h
239 : p51l p41l p31l p21l p11l p01l
240 : p51h p41h p31h p21h p11h p01h
241 : p52l p42l p32l p22l p12l p02l
242 : p52h p42h p32h p22h p12h p02h
243 : p53l p43l p33l p23l p13l p03l
244 : p53h p43h p33h p23h p13h p03h
245 : p54l p44l p34l p24l p14l p04l
246 : p54h p44h p34h p24h p14h p04h
247 : p55l p45l p35l p25l p15l p05l
248 : p55h p45h p35h p25h p15h p05h
249 : -----------------------------------------------------------
250 : zb5 zb4 zb3 zb2 zb1 zb0 za5 za4 za3 za2 za1 za0
251 :
252 : Reorganize the partials into low and high parts:
253 :
254 : p50l p40l p30l p20l p10l p00l
255 : p51l p41l p31l p21l p11l p01l
256 : p52l p42l p32l p22l p12l p02l
257 : p53l p43l p33l p23l p13l p03l
258 : p54l p44l p34l p24l p14l p04l
259 : p55l p45l p35l p25l p15l p05l
260 :
261 : p50h p40h p30h p20h p10h p00h
262 : p51h p41h p31h p21h p11h p01h
263 : p52h p42h p32h p22h p12h p02h
264 : p53h p43h p33h p23h p13h p03h
265 : p54h p44h p34h p24h p14h p04h
266 : p55h p45h p35h p25h p15h p05h
267 :
268 : We start with 3 8-lane vectors per input. These hold 4 fd_r43x6_t
269 : organized as:
270 :
271 : x03 = [ X0 X3 ], y03 = [ Y0 Y3 ],
272 : x14 = [ X1 X4 ], y14 = [ Y1 Y4 ],
273 : x25 = [ X2 X5 ], y25 = [ Y2 Y5 ]
274 :
275 : Above, Xi indicates limb i for the 4 inputs. We can quickly form
276 : "xii = [ Xi Xi ]" by packing halves of the x inputs. And then
277 : doing madd52lo of this on a similarly packed yjk we get:
278 :
279 : LO( xii * yjk ) = [ pijl pikl ]
280 :
281 : Doing x00, x11, x22, x33, x44, x55 against y03, y14, y25 yields all
282 : the low partials, organized:
283 :
284 : [ p00l p03l ], [ p01l p04l ], [ p02l p05l ],
285 : [ p10l p13l ], [ p11l p14l ], [ p12l p15l ],
286 : [ p20l p23l ], [ p21l p24l ], [ p22l p25l ],
287 : [ p30l p33l ], [ p31l p34l ], [ p32l p35l ],
288 : [ p40l p43l ], [ p41l p44l ], [ p42l p45l ],
289 : [ p50l p53l ], [ p51l p54l ], [ p52l p55l ]
290 :
291 : If we use the lower half of these results to accumulate the
292 : partials for the first 3 rows, we have:
293 :
294 : p0_q3 = [ p00l p03l ]
295 : p1_q4 = [ p10l p13l ] + [ p01l p04l ]
296 : p2_q5 = [ p20l p23l ] + [ p11l p14l ] + [ p02l p05l ]
297 : p3_q6 = [ p30l p33l ] + [ p21l p24l ] + [ p12l p15l ]
298 : p4_q7 = [ p40l p43l ] + [ p31l p34l ] + [ p22l p25l ]
299 : p5_q8 = [ p50l p53l ] + [ p41l p44l ] + [ p32l p35l ]
300 : p6_q9 = [ p51l p54l ] + [ p42l p45l ]
301 : p7_qa = [ p52l p55l ]
302 :
303 : We also see that doing this implicitly accumulates the last 3 rows
304 : of partials at the same time. Note also that we can use the
305 : accumulate features of MADD to do these accumulations and we have
306 : lots of independent MADD chains.
307 :
308 : The exact same applies for the HI partials. When we sum the LO and
309 : HI partials, we need to shift the HI parts left by 9 for the
310 : reasons described in the scalar version. When we sum the lower and
311 : upper halves to finish the partial accumulation, we repack them
312 : into two FD_R43X6_QUAD representations at the same time.
313 :
314 : This yields the below. This has massive ILP with utilization of
315 : all lanes with no wasted or redundant multiplications and very
316 : minimal fast shuffling. */
317 :
318 293834345 : wwl_t const _zz = wwl_zero();
319 :
320 293834345 : wwl_t x00 = wwl_pack_halves( x03,0, x03,0 );
321 293834345 : wwl_t x11 = wwl_pack_halves( x14,0, x14,0 );
322 293834345 : wwl_t x22 = wwl_pack_halves( x25,0, x25,0 );
323 293834345 : wwl_t x33 = wwl_pack_halves( x03,1, x03,1 );
324 293834345 : wwl_t x44 = wwl_pack_halves( x14,1, x14,1 );
325 293834345 : wwl_t x55 = wwl_pack_halves( x25,1, x25,1 );
326 :
327 293834345 : # if 1 /* This version is faster even though it has more adds due to higher ILP */
328 293834345 : wwl_t p0_q3 = wwl_madd52lo( _zz, x00, y03 );
329 293834345 : wwl_t p1_q4 = wwl_madd52lo( wwl_madd52lo( _zz, x11, y03 ), x00, y14 );
330 293834345 : wwl_t p2_q5 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( _zz, x22, y03 ), x11, y14 ), x00, y25 );
331 293834345 : wwl_t p3_q6 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( _zz, x33, y03 ), x22, y14 ), x11, y25 );
332 293834345 : wwl_t p4_q7 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( _zz, x44, y03 ), x33, y14 ), x22, y25 );
333 293834345 : wwl_t p5_q8 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( _zz, x55, y03 ), x44, y14 ), x33, y25 );
334 293834345 : wwl_t p6_q9 = wwl_madd52lo( wwl_madd52lo( _zz, x55, y14 ), x44, y25 );
335 293834345 : wwl_t p7_qa = wwl_madd52lo( _zz, x55, y25 );
336 :
337 293834345 : /**/ p1_q4 = wwl_add( p1_q4, wwl_shl( wwl_madd52hi( _zz, x00, y03 ), 9 ) );
338 293834345 : /**/ p2_q5 = wwl_add( p2_q5, wwl_shl( wwl_madd52hi( wwl_madd52hi( _zz, x11, y03 ), x00, y14 ), 9 ) );
339 293834345 : /**/ p3_q6 = wwl_add( p3_q6, wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x22, y03 ), x11, y14 ), x00, y25 ), 9 ) );
340 293834345 : /**/ p4_q7 = wwl_add( p4_q7, wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x33, y03 ), x22, y14 ), x11, y25 ), 9 ) );
341 293834345 : /**/ p5_q8 = wwl_add( p5_q8, wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x44, y03 ), x33, y14 ), x22, y25 ), 9 ) );
342 293834345 : /**/ p6_q9 = wwl_add( p6_q9, wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x55, y03 ), x44, y14 ), x33, y25 ), 9 ) );
343 293834345 : /**/ p7_qa = wwl_add( p7_qa, wwl_shl( wwl_madd52hi( wwl_madd52hi( _zz, x55, y14 ), x44, y25 ), 9 ) );
344 293834345 : wwl_t p8_qb = wwl_shl( wwl_madd52hi( _zz, x55, y25 ), 9 );
345 : # else
346 : wwl_t p1_q4 = wwl_shl( wwl_madd52hi( _zz, x00, y03 ), 9 );
347 : wwl_t p2_q5 = wwl_shl( wwl_madd52hi( wwl_madd52hi( _zz, x11, y03 ), x00, y14 ), 9 );
348 : wwl_t p3_q6 = wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x22, y03 ), x11, y14 ), x00, y25 ), 9 );
349 : wwl_t p4_q7 = wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x33, y03 ), x22, y14 ), x11, y25 ), 9 );
350 : wwl_t p5_q8 = wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x44, y03 ), x33, y14 ), x22, y25 ), 9 );
351 : wwl_t p6_q9 = wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x55, y03 ), x44, y14 ), x33, y25 ), 9 );
352 : wwl_t p7_qa = wwl_shl( wwl_madd52hi( wwl_madd52hi( _zz, x55, y14 ), x44, y25 ), 9 );
353 : wwl_t p8_qb = wwl_shl( wwl_madd52hi( _zz, x55, y25 ), 9 );
354 :
355 : wwl_t p0_q3 = wwl_madd52lo( _zz, x00, y03 );
356 : /**/ p1_q4 = wwl_madd52lo( wwl_madd52lo( p1_q4, x11, y03 ), x00, y14 );
357 : /**/ p2_q5 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( p2_q5, x22, y03 ), x11, y14 ), x00, y25 );
358 : /**/ p3_q6 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( p3_q6, x33, y03 ), x22, y14 ), x11, y25 );
359 : /**/ p4_q7 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( p4_q7, x44, y03 ), x33, y14 ), x22, y25 );
360 : /**/ p5_q8 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( p5_q8, x55, y03 ), x44, y14 ), x33, y25 );
361 : /**/ p6_q9 = wwl_madd52lo( wwl_madd52lo( p6_q9, x55, y14 ), x44, y25 );
362 : /**/ p7_qa = wwl_madd52lo( p7_qa, x55, y25 );
363 : # endif
364 :
365 293834345 : wwl_t q6_p3 = wwl_pack_halves( p3_q6,1, p3_q6,0 ); /* swap halves so the two partial sums of each column line up */
366 293834345 : wwl_t q7_p4 = wwl_pack_halves( p4_q7,1, p4_q7,0 );
367 293834345 : wwl_t q8_p5 = wwl_pack_halves( p5_q8,1, p5_q8,0 );
368 :
369 293834345 : wwl_t za03 = wwv_add_if( 0xF0, p0_q3, q6_p3, p0_q3 ); /* lanes 0-3: p0 as is; lanes 4-7: q3+p3 */
370 293834345 : wwl_t za14 = wwv_add_if( 0xF0, p1_q4, q7_p4, p1_q4 );
371 293834345 : wwl_t za25 = wwv_add_if( 0xF0, p2_q5, q8_p5, p2_q5 );
372 :
373 293834345 : wwl_t zb03 = wwv_add_if( 0x0F, p6_q9, q6_p3, p6_q9 ); /* lanes 0-3: p6+q6; lanes 4-7: q9 as is */
374 293834345 : wwl_t zb14 = wwv_add_if( 0x0F, p7_qa, q7_p4, p7_qa );
375 293834345 : wwl_t zb25 = wwv_add_if( 0x0F, p8_qb, q8_p5, p8_qb );
376 :
377 : /* At this point:
378 :
379 : z = <za0,za1,za2,za3,za4,za5> + 2^258 <zb0,zb1,zb2,zb3,zb4,zb5>
380 : = <za0,za1,za2,za3,za4,za5> + 152 <zb0,zb1,zb2,zb3,zb4,zb5>
381 :
382 : and we can sum this directly (see scalar version for proof). Like
383 : the scalar version, we do the multiplication via shift-and-add
384 : techniques because mullo is slow (152 = 128 + 16 + 8, i.e. shifts by 7, 4 and 3). */
385 :
386 293834345 : wwl_t z03 = wwl_add( wwl_add( za03, wwl_shl( zb03, 7 ) ), wwl_add( wwl_shl( zb03, 4 ), wwl_shl( zb03, 3 ) ) );
387 293834345 : wwl_t z14 = wwl_add( wwl_add( za14, wwl_shl( zb14, 7 ) ), wwl_add( wwl_shl( zb14, 4 ), wwl_shl( zb14, 3 ) ) );
388 293834345 : wwl_t z25 = wwl_add( wwl_add( za25, wwl_shl( zb25, 7 ) ), wwl_add( wwl_shl( zb25, 4 ), wwl_shl( zb25, 3 ) ) );
389 :
390 293834345 : FD_R43X6_QUAD_MOV( *_z, z ); /* expands to *_z03 = z03; *_z14 = z14; *_z25 = z25; */
391 293834345 : }
392 :
393 : /* FD_R43X6_QUAD_SQR_FAST(R,P) does:
394 : R = [ fd_r43x6_sqr_fast(PX) fd_r43x6_sqr_fast(PY) fd_r43x6_sqr_fast(PZ) fd_r43x6_sqr_fast(PT) ]
395 : Written this way so that pointer escapes don't inhibit optimizations. */
396 :
397 90099279 : #define FD_R43X6_QUAD_SQR_FAST( R, P ) do { \
398 90099279 : FD_R43X6_QUAD_DECL( _R ); fd_r43x6_quad_sqr_fast( &_R03,&_R14,&_R25, P##03,P##14,P##25 ); /* only the local _R has its address taken */ \
399 90099279 : FD_R43X6_QUAD_MOV( R, _R ); \
400 90099279 : } while(0)
401 :
402 : FD_FN_UNUSED static void /* let compiler decide if worth inlining */
403 : fd_r43x6_quad_sqr_fast( fd_r43x6_t * _z03, fd_r43x6_t * _z14, fd_r43x6_t * _z25,
404 110099479 : fd_r43x6_t x03, fd_r43x6_t x14, fd_r43x6_t x25 ) {
405 :
406 : /* Grade school-ish from the original mul:
407 :
408 : x5 x4 x3 x2 x1 x0
409 : x x5 x4 x3 x2 x1 x0
410 : --------------------------------
411 : p50l p40l p30l p20l p10l p00l
412 : p50h p40h p30h p20h p10h p00h
413 : p51l p41l p31l p21l p11l p01l
414 : p51h p41h p31h p21h p11h p01h
415 : p52l p42l p32l p22l p12l p02l
416 : p52h p42h p32h p22h p12h p02h
417 : p53l p43l p33l p23l p13l p03l
418 : p53h p43h p33h p23h p13h p03h
419 : p54l p44l p34l p24l p14l p04l
420 : p54h p44h p34h p24h p14h p04h
421 : p55l p45l p35l p25l p15l p05l
422 : p55h p45h p35h p25h p15h p05h
423 : -----------------------------------------------------------
424 : zb za z9 z8 z7 z6 z5 z4 z3 z2 z1 z0
425 :
426 : Consider only the low partial rows and note that pijl=pjil here.
427 : This portion of the reduction can be simplified:
428 :
429 : 2*p50l 2*p40l 2*p30l 2*p20l 2*p10l p00l
430 : 2*p51l 2*p41l 2*p31l 2*p21l p11l
431 : 2*p52l 2*p42l 2*p32l p22l
432 : 2*p53l 2*p43l p33l
433 : 2*p54l p44l
434 : p55l
435 : ----------------------------------------------------------------------------
436 : pa p9 p8 p7 p6 p5 p4 p3 p2 p1 p0
437 :
438 : The number of adds and the partials that need to be doubled have a
439 : mirror symmetry about p5. Exploiting this yields:
440 :
441 : 2*p50l|2*p32l 2*p40l|2*p51l 2*p30l|2*p52l 2*p20l|2*p53l 2*p10l|2*p54l p00l|p55l
442 : 2*p41l|2*zero 2*p31l|2*p42l 2*p21l|2*p43l p11l| p44l
443 : p22l| p33l
444 : --------------------------------------------------------------------------------
445 : p55 p46 p37 p28 p19 p0a
446 :
447 : Above a|b means make an 8-lane vector by concatenating the 4 a's
448 : (one for each square in progress) and the 4 b's. Above we have
449 : split the reduction of p5 to get some extra vector multiplier
450 : utilization. Other splits are possible and maybe could usefully
451 : trade some extra computation for less swizzling.
452 :
453 : Similar holds for the high partials:
454 :
455 : 2*p50h|2*p32h 2*p40h|2*p51h 2*p30h|2*p52h 2*p20h|2*p53h 2*p10h|2*p54h p00h|p55h
456 : 2*p41h|2*zero 2*p31h|2*p42h 2*p21h|2*p43h p11h| p44h
457 : p22h| p33h
458 : --------------------------------------------------------------------------------
459 : q66 q57 q48 q39 q2a q1b
460 :
461 : For the reasons described in the scalar implementation, we need to
462 : shift the high partials left by 9 before we can reduce them into
463 : the low partials. As we do this reduction, we repack them into the
464 : FD_R43X6_QUAD's za and zb.
465 :
466 : In doing these reductions, we exploit i<>j symmetry and pair terms
467 : on the left and right halves to minimize input shuffling. For
468 : example, for q1b, we need to form x05=x0|x5 and then compute
469 : q1b from x05*x05. Instead of forming x15 and x04 to compute
470 : q2a from 2*x15*x04, we can do q2a=2*p01h|2*p54h and use the x14 we were
471 : passed directly and reuse the x05 formed for q1b.
472 :
473 : This yields the below. Theoretical minimum number of multiplies,
474 : tons of ILP, low swizzling overhead. */
475 :
476 110099479 : wwl_t _zz = wwl_zero();
477 :
478 110099479 : wwl_t x05 = wwl_pack_h0_h1 ( x03, x25 );
479 110099479 : wwl_t x12 = wwl_pack_halves( x14,0, x25,0 );
480 110099479 : wwl_t x34 = wwl_pack_halves( x03,1, x14,1 );
481 110099479 : wwl_t x41 = wwl_pack_halves( x14,1, x14,0 );
482 110099479 : wwl_t x23 = wwl_pack_h0_h1 ( x25, x03 );
483 :
484 110099479 : wwl_t x52 = wwl_pack_halves( x25,1, x25,0 );
485 110099479 : wwl_t x4z = wwl_pack_halves( x14,1, _zz,0 ); /* x4|zero: supplies the 2*p41|2*zero term of p55 / q66 */
486 :
487 110099479 : wwl_t two_x03 = wwl_shl( x03, 1 );
488 110099479 : wwl_t two_x14 = wwl_shl( x14, 1 );
489 110099479 : wwl_t two_x05 = wwl_shl( x05, 1 );
490 110099479 : wwl_t two_x12 = wwl_shl( x12, 1 );
491 :
492 110099479 : # if 1 /* This version is faster even though it has more adds due to better ILP */
493 110099479 : wwl_t p0a = wwl_madd52lo( _zz, x05, x05 );
494 110099479 : wwl_t p19 = wwl_madd52lo( _zz, two_x05, x14 );
495 110099479 : wwl_t p28 = wwl_madd52lo( wwl_madd52lo( _zz, x14, x14 ), two_x03, x25 );
496 110099479 : wwl_t p37 = wwl_madd52lo( wwl_madd52lo( _zz, two_x03, x34 ), two_x12, x25 );
497 110099479 : wwl_t p46 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( _zz, x23, x23 ), two_x05, x41 ), two_x12, x34 );
498 110099479 : wwl_t p55 = wwl_madd52lo( wwl_madd52lo( _zz, two_x03, x52 ), two_x14, x4z );
499 :
500 110099479 : wwl_t q1b = wwl_shl( wwl_madd52hi( _zz, x05, x05 ), 9 );
501 110099479 : wwl_t q2a = wwl_shl( wwl_madd52hi( _zz, two_x05, x14 ), 9 );
502 110099479 : wwl_t q39 = wwl_shl( wwl_madd52hi( wwl_madd52hi( _zz, x14, x14 ), two_x03, x25 ), 9 );
503 110099479 : wwl_t q48 = wwl_shl( wwl_madd52hi( wwl_madd52hi( _zz, two_x03, x34 ), two_x12, x25 ), 9 );
504 110099479 : wwl_t q57 = wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x23, x23 ), two_x05, x41 ), two_x12, x34 ), 9 );
505 110099479 : wwl_t q66 = wwl_shl( wwl_madd52hi( wwl_madd52hi( _zz, two_x03, x52 ), two_x14, x4z ), 9 );
506 :
507 110099479 : wwl_t za03 = wwl_add( wwl_pack_halves( p0a,0, p37,0 ), wwl_pack_halves( _zz,0, q39,0 ) );
508 110099479 : wwl_t za14 = wwl_add( wwl_pack_halves( p19,0, p46,0 ), wwl_pack_halves( q1b,0, q48,0 ) );
509 110099479 : wwl_t za25 = wwl_add( wwl_add( wwl_pack_halves( p28,0, p55,0 ), wwl_pack_halves( q2a,0, q57,0 ) ), wwl_pack_h0_h1( _zz, p55 ) ); /* both halves of the split p55 land in za5 */
510 :
511 110099479 : wwl_t zb03 = wwl_add( wwl_add( wwl_pack_halves( p46,1, p19,1 ), wwl_pack_halves( q66,1, q39,1 ) ), wwl_pack_h0_h1( q66, _zz ) ); /* both halves of the split q66 land in zb0 */
512 110099479 : wwl_t zb14 = wwl_add( wwl_pack_halves( p37,1, p0a,1 ), wwl_pack_halves( q57,1, q2a,1 ) );
513 110099479 : wwl_t zb25 = wwl_add( wwl_pack_halves( p28,1, _zz,1 ), wwl_pack_halves( q48,1, q1b,1 ) );
514 : # else
515 : wwl_t q1b = wwl_shl( wwl_madd52hi( _zz, x05, x05 ), 9 );
516 : wwl_t q2a = wwl_shl( wwl_madd52hi( _zz, two_x05, x14 ), 9 );
517 : wwl_t q39 = wwl_shl( wwl_madd52hi( wwl_madd52hi( _zz, x14, x14 ), two_x03, x25 ), 9 );
518 : wwl_t q48 = wwl_shl( wwl_madd52hi( wwl_madd52hi( _zz, two_x03, x34 ), two_x12, x25 ), 9 );
519 : wwl_t q57 = wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x23, x23 ), two_x05, x41 ), two_x12, x34 ), 9 );
520 : wwl_t q66 = wwl_shl( wwl_madd52hi( wwl_madd52hi( _zz, two_x03, x52 ), two_x14, x4z ), 9 );
521 :
522 : wwl_t p0a = wwl_madd52lo( wwl_pack_h0_h1( _zz, q2a ), x05, x05 );
523 : wwl_t p19 = wwl_madd52lo( wwl_pack_h0_h1( q1b, q39 ), two_x05, x14 );
524 : wwl_t p28 = wwl_madd52lo( wwl_madd52lo( wwl_pack_h0_h1( q2a, q48 ), x14, x14 ), two_x03, x25 );
525 : wwl_t p37 = wwl_madd52lo( wwl_madd52lo( wwl_pack_h0_h1( q39, q57 ), two_x03, x34 ), two_x12, x25 );
526 : wwl_t p46 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( wwl_pack_h0_h1( q48, q66 ), x23, x23 ), two_x05, x41 ), two_x12, x34 );
527 : wwl_t p55 = wwl_madd52lo( wwl_madd52lo( wwl_pack_h0_h1( q57, _zz ), two_x03, x52 ), two_x14, x4z );
528 :
529 : wwl_t za03 = wwl_pack_halves( p0a,0, p37,0 );
530 : wwl_t za14 = wwl_pack_halves( p19,0, p46,0 );
531 : wwl_t za25 = wwl_add( wwl_pack_halves( p28,0, p55,0 ), wwl_pack_h0_h1( _zz, p55 ) );
532 :
533 : wwl_t zb03 = wwl_add( wwl_pack_halves( p46,1, p19,1 ), wwl_pack_h0_h1( q66, _zz ) );
534 : wwl_t zb14 = wwl_pack_halves( p37,1, p0a,1 );
535 : wwl_t zb25 = wwl_pack_halves( p28,1, q1b,1 );
536 : # endif
537 :
538 : /* At this point:
539 :
540 : z = <za0,za1,za2,za3,za4,za5> + 2^258 <zb0,zb1,zb2,zb3,zb4,zb5>
541 :
542 : We complete the calc exactly like fd_r43x6_quad_mul_fast above (za + 152 zb via shifts by 7, 4 and 3). */
543 :
544 110099479 : wwl_t z03 = wwl_add( wwl_add( za03, wwl_shl( zb03, 7 ) ), wwl_add( wwl_shl( zb03, 4 ), wwl_shl( zb03, 3 ) ) );
545 110099479 : wwl_t z14 = wwl_add( wwl_add( za14, wwl_shl( zb14, 7 ) ), wwl_add( wwl_shl( zb14, 4 ), wwl_shl( zb14, 3 ) ) );
546 110099479 : wwl_t z25 = wwl_add( wwl_add( za25, wwl_shl( zb25, 7 ) ), wwl_add( wwl_shl( zb25, 4 ), wwl_shl( zb25, 3 ) ) );
547 :
548 110099479 : FD_R43X6_QUAD_MOV( *_z, z ); /* expands to *_z03 = z03; *_z14 = z14; *_z25 = z25; */
549 110099479 : }
550 :
551 : /* Below, FD_R43X6_MUL4_INL( za,xa,ya, zb,xb,yb, zc,xc,yc, zd,xd,yd )
552 : exactly does:
553 :
554 : za = fd_r43x6_mul( xa, ya );
555 : zb = fd_r43x6_mul( xb, yb );
556 : zc = fd_r43x6_mul( xc, yc );
557 : zd = fd_r43x6_mul( xd, yd );
558 :
559 : Likewise, FD_R43X6_SQR4_INL( za,xa, zb,xb, zc,xc, zd,xd ) exactly does:
560 :
561 : za = fd_r43x6_sqr( xa );
562 : zb = fd_r43x6_sqr( xb );
563 : zc = fd_r43x6_sqr( xc );
564 : zd = fd_r43x6_sqr( xd );
565 :
566 : And, FD_R43X6_POW22523_2_INL( za,xa, zb,xb ) exactly does:
567 :
568 : za = fd_r43x6_pow22523( xa );
569 : zb = fd_r43x6_pow22523( xb );
570 :
571 : Similarly for FD_R43X6_MUL{1,2,3}_INL, FD_R43X6_SQR{1,2,3}_INL and
572 : FD_R43X6_POW22523_1_INL( za,xa ).
573 :
574 : These macros are robust (e.g. these evaluate their arguments once and
575 : they linguistically behave as a single statement) and have the
576 : resulting ILP very exposed to the optimizer and CPU. In-place
577 : operation okay.
578 :
579 : Future implementations might allow these to produce different
580 : mathematically equivalent representations of the result if such
581 : allows higher performance akin to what was done for fd_r43x6_sqr.
582 :
583 : TODO: SUB2_INL to accelerate the folds there?
584 :
585 : TODO: Consider pure for various multi-return function prototypes? */
586 :
587 : #if 0 /* Reference implementation (compiled out): one independent fd_r43x6_{mul,sqr,pow22523} call per output */
588 :
589 : #define FD_R43X6_MUL1_INL( za,xa,ya ) do { \
590 : (za) = fd_r43x6_mul( (xa), (ya) ); \
591 : } while(0)
592 :
593 : #define FD_R43X6_MUL2_INL( za,xa,ya, zb,xb,yb ) do { \
594 : (za) = fd_r43x6_mul( (xa), (ya) ); \
595 : (zb) = fd_r43x6_mul( (xb), (yb) ); \
596 : } while(0)
597 :
598 : #define FD_R43X6_MUL3_INL( za,xa,ya, zb,xb,yb, zc,xc,yc ) do { \
599 : (za) = fd_r43x6_mul( (xa), (ya) ); \
600 : (zb) = fd_r43x6_mul( (xb), (yb) ); \
601 : (zc) = fd_r43x6_mul( (xc), (yc) ); \
602 : } while(0)
603 :
604 : #define FD_R43X6_MUL4_INL( za,xa,ya, zb,xb,yb, zc,xc,yc, zd,xd,yd ) do { \
605 : (za) = fd_r43x6_mul( (xa), (ya) ); \
606 : (zb) = fd_r43x6_mul( (xb), (yb) ); \
607 : (zc) = fd_r43x6_mul( (xc), (yc) ); \
608 : (zd) = fd_r43x6_mul( (xd), (yd) ); \
609 : } while(0)
610 :
611 : #define FD_R43X6_SQR1_INL( za,xa ) do { \
612 : (za) = fd_r43x6_sqr( (xa) ); \
613 : } while(0)
614 :
615 : #define FD_R43X6_SQR2_INL( za,xa, zb,xb ) do { \
616 : (za) = fd_r43x6_sqr( (xa) ); \
617 : (zb) = fd_r43x6_sqr( (xb) ); \
618 : } while(0)
619 :
620 : #define FD_R43X6_SQR3_INL( za,xa, zb,xb, zc,xc ) do { \
621 : (za) = fd_r43x6_sqr( (xa) ); \
622 : (zb) = fd_r43x6_sqr( (xb) ); \
623 : (zc) = fd_r43x6_sqr( (xc) ); \
624 : } while(0)
625 :
626 : #define FD_R43X6_SQR4_INL( za,xa, zb,xb, zc,xc, zd,xd ) do { \
627 : (za) = fd_r43x6_sqr( (xa) ); \
628 : (zb) = fd_r43x6_sqr( (xb) ); \
629 : (zc) = fd_r43x6_sqr( (xc) ); \
630 : (zd) = fd_r43x6_sqr( (xd) ); \
631 : } while(0)
632 :
633 : #define FD_R43X6_POW22523_1_INL( za,xa ) do { \
634 : (za) = fd_r43x6_pow22523( (xa) ); \
635 : } while(0)
636 :
637 : #define FD_R43X6_POW22523_2_INL( za,xa, zb,xb ) do { \
638 : (za) = fd_r43x6_pow22523( (xa) ); \
639 : (zb) = fd_r43x6_pow22523( (xb) ); \
640 : } while(0)
641 :
642 : #else /* HPC implementation */
643 :
644 : /* z = fd_r43x6_mul( x, y ). A single multiply has nothing to interleave with, so just forward the call and let the compiler decide */
645 :
646 33530118 : #define FD_R43X6_MUL1_INL( z,x,y ) do { \
647 33530118 : (z) = fd_r43x6_mul( (x), (y) ); \
648 33530118 : } while(0)
649 :
650 : /* Two independent fd_r43x6_mul calls (za=xa*ya, zb=xb*yb), left as plain calls: seems to be slightly faster to let the compiler decide */
651 :
652 18828745 : #define FD_R43X6_MUL2_INL( za,xa,ya, zb,xb,yb ) do { \
653 18828745 : (za) = fd_r43x6_mul( (xa), (ya) ); \
654 18828745 : (zb) = fd_r43x6_mul( (xb), (yb) ); \
655 18828745 : } while(0)
656 :
657 : /* Slightly faster to pack the x's / pack the y's / mul / fold / unpack. Only three products are wanted, so the fourth slot of each pack is filled with fd_r43x6_zero() and the fourth unpacked result (_zd) is discarded. */
658 :
659 10000000 : #define FD_R43X6_MUL3_INL( za,xa,ya, zb,xb,yb, zc,xc,yc ) do { \
660 10000000 : FD_R43X6_QUAD_DECL( _X ); FD_R43X6_QUAD_PACK ( _X, (xa),(xb),(xc),fd_r43x6_zero() ); \
661 10000000 : FD_R43X6_QUAD_DECL( _Y ); FD_R43X6_QUAD_PACK ( _Y, (ya),(yb),(yc),fd_r43x6_zero() ); \
662 10000000 : FD_R43X6_QUAD_DECL( _Z ); FD_R43X6_QUAD_MUL_FAST ( _Z, _X, _Y ); \
663 10000000 : /**/ FD_R43X6_QUAD_FOLD_UNSIGNED( _Z, _Z ); \
664 10000000 : fd_r43x6_t _zd; FD_R43X6_QUAD_UNPACK ( (za),(zb),(zc),_zd, _Z ); \
665 10000000 : (void)_zd; \
666 10000000 : } while(0)
667 :
668 : /* Substantially faster to pack the four x's / pack the four y's / FD_R43X6_QUAD_MUL_FAST / FD_R43X6_QUAD_FOLD_UNSIGNED / unpack into za,zb,zc,zd */
669 :
670 10030002 : #define FD_R43X6_MUL4_INL( za,xa,ya, zb,xb,yb, zc,xc,yc, zd,xd,yd ) do { \
671 10030002 : FD_R43X6_QUAD_DECL( _X ); FD_R43X6_QUAD_PACK ( _X, (xa),(xb),(xc),(xd) ); \
672 10030002 : FD_R43X6_QUAD_DECL( _Y ); FD_R43X6_QUAD_PACK ( _Y, (ya),(yb),(yc),(yd) ); \
673 10030002 : FD_R43X6_QUAD_DECL( _Z ); FD_R43X6_QUAD_MUL_FAST ( _Z, _X, _Y ); \
674 10030002 : /**/ FD_R43X6_QUAD_FOLD_UNSIGNED( _Z, _Z ); \
675 10030002 : /**/ FD_R43X6_QUAD_UNPACK ( (za),(zb),(zc),(zd), _Z ); \
676 10030002 : } while(0)
677 :
678 : /* z = fd_r43x6_sqr( x ). A single square has nothing to interleave with, so just forward the call and let the compiler decide */
679 :
680 428724325 : #define FD_R43X6_SQR1_INL( z,x ) do { (z) = fd_r43x6_sqr( (x) ); } while(0)
681 :
682 : /* Two independent fd_r43x6_sqr calls (za=xa^2, zb=xb^2), left as plain calls: seems to be slightly faster to let the compiler decide */
683 :
684 152515769 : #define FD_R43X6_SQR2_INL( za,xa, zb,xb ) do { \
685 152515769 : (za) = fd_r43x6_sqr( (xa) ); \
686 152515769 : (zb) = fd_r43x6_sqr( (xb) ); \
687 152515769 : } while(0)
688 :
689 : /* Three independent fd_r43x6_sqr calls, left as plain calls (no packing here, unlike FD_R43X6_MUL3_INL): seems to be slightly faster to let the compiler decide */
690 :
691 10000000 : #define FD_R43X6_SQR3_INL( za,xa, zb,xb, zc,xc ) do { \
692 10000000 : (za) = fd_r43x6_sqr( (xa) ); \
693 10000000 : (zb) = fd_r43x6_sqr( (xb) ); \
694 10000000 : (zc) = fd_r43x6_sqr( (xc) ); \
695 10000000 : } while(0)
696 :
697 : /* Substantially faster to pack the four x's (one pack only) / FD_R43X6_QUAD_SQR_FAST / FD_R43X6_QUAD_FOLD_UNSIGNED / unpack into za,zb,zc,zd */
698 :
699 10000000 : #define FD_R43X6_SQR4_INL( za,xa, zb,xb, zc,xc, zd,xd ) do { \
700 10000000 : FD_R43X6_QUAD_DECL( _X ); FD_R43X6_QUAD_PACK ( _X, (xa),(xb),(xc),(xd) ); \
701 10000000 : FD_R43X6_QUAD_DECL( _Z ); FD_R43X6_QUAD_SQR_FAST ( _Z, _X ); \
702 10000000 : /**/ FD_R43X6_QUAD_FOLD_UNSIGNED( _Z, _Z ); \
703 10000000 : /**/ FD_R43X6_QUAD_UNPACK ( (za),(zb),(zc),(zd), _Z ); \
704 10000000 : } while(0)
705 :
706 : /* za = fd_r43x6_pow22523( xa ). A single call has nothing to interleave with, so just forward it and let the compiler decide */
707 :
708 131072 : #define FD_R43X6_POW22523_1_INL( za,xa ) do { \
709 131072 : (za) = fd_r43x6_pow22523( (xa) ); \
710 131072 : } while(0)
711 :
712 : /* This is very expensive with a huge instruction footprint, so the pair is handed to the separately compiled fd_r43x6_pow22523_2.
713 : The wrapper receives the results in the block-local temporaries _za / _zb and then copies them into za / zb, so only the
714 : temporaries have their address taken (this avoids pointer escapes of the caller's variables from inhibiting optimization). */
715 :
716 431827 : #define FD_R43X6_POW22523_2_INL( za,xa, zb,xb ) do { \
717 431827 : fd_r43x6_t _za; fd_r43x6_t _zb; \
718 431827 : fd_r43x6_pow22523_2( &_za,(xa), &_zb,(xb) ); \
719 431827 : (za) = _za; (zb) = _zb; \
720 431827 : } while(0)
721 :
722 : void /* Out-of-line backend of FD_R43X6_POW22523_2_INL: inputs are passed by value, results are returned through the two pointers */
723 : fd_r43x6_pow22523_2( fd_r43x6_t * _za, fd_r43x6_t xa, /* *_za receives the result for input xa */
724 : fd_r43x6_t * _zb, fd_r43x6_t xb ); /* *_zb receives the result for input xb */
725 :
726 : #endif /* HPC implementation */
727 :
728 : FD_PROTOTYPES_END
729 :
730 : #endif /* HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_inl_h */
|