LCOV - code coverage report
Current view: top level - ballet/ed25519/avx512 - fd_r43x6_inl.h (source / functions)
Test: cov.lcov
Date: 2024-11-13 11:58:15
                 Hit   Total   Coverage
Lines:           260     260    100.0 %
Functions:         8      60     13.3 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_inl_h
       2             : #define HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_inl_h
       3             : 
       4             : #ifndef HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_h
       5             : #error "Do not include this directly; use fd_r43x6.h"
       6             : #endif
       7             : 
        8             : /* Protocols like ED25519 do many GF(p) operations that can, in
        9             :    principle, be run in parallel.  But, because of the complexity of
       10             :    the individual operations, optimizers struggle to extract the ILP.
       11             :    (E.g. to get at the ILP in 3 independent fd_r43x6_mul calls, the
       12             :    optimizer has to decide to inline all 3, when its heuristics usually
       13             :    indicate that each mul is too expensive in code footprint to justify
       14             :    inlining even one, and then do a very long range reorganization of
       15             :    the assembly instructions, when its heuristics usually indicate
       16             :    avoiding such to keep compile time computational complexity reasonable.)
      17             : 
      18             :    Further, when there are enough operations that can be run in
      19             :    parallel, it is often a net win to swizzle / deswizzle the data
      20             :    layout to make use of otherwise unused vector lanes.  The optimizer's
       21             :    ability to do such radical code transformations is limited at best
       22             :    and practically impossible for transformations that could generate a
      23             :    different but mathematically equivalent representation of the result,
      24             :    akin to fd_r43x6_mul(x,x) vs fd_r43x6_sqr(x).
      25             : 
       26             :    It is also useful to annotate such parallelism in the protocol
       27             :    implementations so that, with no change to the protocol code, they
       28             :    can take advantage of newer hardware, better compilers, etc. by
       29             :    updating these implementations as appropriate.
      30             : 
      31             :    The below makes a low to mid tens of percent performance improvement
      32             :    for things like ED25519 verify on gcc-12 and icelake-server. */
      33             : 
      34             : FD_PROTOTYPES_BEGIN
      35             : 
      36             : /* FD_R43X6_QUAD_DECL(Q) declares the wwl_t's Q03, Q14 and Q25 in the
      37             :    local scope to represent fd_r43x6_t X, Y, Z and T, but in a more
      38             :    efficient way for data parallel GF(p) operations under the hood.
      39             :    Organization:
      40             : 
      41             :      Q03 = [ X0 Y0 Z0 T0 | X3 Y3 Z3 T3 ]
      42             :      Q14 = [ X1 Y1 Z1 T1 | X4 Y4 Z4 T4 ]
      43             :      Q25 = [ X2 Y2 Z2 T2 | X5 Y5 Z5 T5 ]
      44             : 
      45             :    where Xi is the i-th limb of X. */
      46             : 
      47  1052211611 : #define FD_R43X6_QUAD_DECL( Q ) wwl_t Q##03, Q##14, Q##25
      48             : 
      49             : /* FD_R43X6_QUAD_MOV( D, S ) does D = S.  D and S are FD_R43X6_QUAD
      50             :    declarations in the local scope. */
      51             : 
      52   829073124 : #define FD_R43X6_QUAD_MOV( D, S ) do { D##03 = S##03; D##14 = S##14; D##25 = S##25; } while(0)
      53             : 
      54             : /* FD_R43X6_QUAD_PACK(Q,x,y,z,t) does Q = (x,y,z,t) where Q is a
       55             :    FD_R43X6_QUAD declared in the local scope and x, y, z and t are
      56             :    arbitrary fd_r43x6_t. */
      57             : 
      58    87578758 : #define FD_R43X6_QUAD_PACK( Q, x,y,z,t ) do {                           \
      59    87578758 :     wwl_t _r0 = (x);                                                    \
      60    87578758 :     wwl_t _r1 = (y);                                                    \
      61    87578758 :     wwl_t _r2 = (z);                                                    \
      62    87578758 :     wwl_t _r3 = (t);                                                    \
      63    87578758 :     /* At this point _r0 = x0 x1 x2 x3 x4 x5 -- -- */                   \
      64    87578758 :     /*               _r1 = y0 y1 y2 y3 y4 y5 -- -- */                   \
      65    87578758 :     /*               _r2 = z0 z1 z2 z3 z4 z5 -- -- */                   \
      66    87578758 :     /*               _r3 = t0 t1 t2 t3 t4 t5 -- -- */                   \
      67    87578758 :     /* Transpose 2x2 blocks                        */                   \
      68    87578758 :     /* No _mm256_permute2f128_si256 equivalent? Sigh ... */             \
      69    87578758 :     wwl_t _t0 = wwl_select( wwl(  0, 1, 8, 9, 4, 5,12,13 ), _r0, _r2 ); \
      70    87578758 :     wwl_t _t1 = wwl_select( wwl(  0, 1, 8, 9, 4, 5,12,13 ), _r1, _r3 ); \
      71    87578758 :     wwl_t _t2 = wwl_select( wwl(  2, 3,10,11, 6, 7,12,13 ), _r0, _r2 ); \
      72    87578758 :     wwl_t _t3 = wwl_select( wwl(  2, 3,10,11, 6, 7,12,13 ), _r1, _r3 ); \
      73    87578758 :     /* At this point _t0 = x0 x1 z0 z1 x4 x5 z4 z5 */                   \
      74    87578758 :     /*               _t1 = y0 y1 t0 t1 y4 y5 t4 t5 */                   \
      75    87578758 :     /*               _t2 = x2 x3 z2 z3 -- -- -- -- */                   \
      76    87578758 :     /*               _t3 = y2 y3 t2 t3 -- -- -- -- */                   \
      77    87578758 :     /* Transpose 1x1 blocks                        */                   \
      78    87578758 :     wwl_t _c04 = _mm512_unpacklo_epi64( _t0, _t1 );                     \
      79    87578758 :     wwl_t _c15 = _mm512_unpackhi_epi64( _t0, _t1 );                     \
      80    87578758 :     wwl_t _c26 = _mm512_unpacklo_epi64( _t2, _t3 );                     \
      81    87578758 :     wwl_t _c37 = _mm512_unpackhi_epi64( _t2, _t3 );                     \
      82    87578758 :     /* At this point _c04 = x0 y0 z0 t0 x4 y4 z4 t4 */                  \
       83    87578758 :     /*               _c15 = x1 y1 z1 t1 x5 y5 z5 t5 */                  \
      84    87578758 :     /*               _c26 = x2 y2 z2 t2 -- -- -- -- */                  \
      85    87578758 :     /*               _c37 = x3 y3 z3 t3 -- -- -- -- */                  \
      86    87578758 :     Q##03 = wwl_pack_halves( _c04,0, _c37,0 );                          \
      87    87578758 :     Q##14 = wwl_pack_h0_h1 ( _c15,   _c04   );                          \
      88    87578758 :     Q##25 = wwl_pack_h0_h1 ( _c26,   _c15   );                          \
      89    87578758 :   } while(0)
      90             : 
      91             : /* FD_R43X6_QUAD_UNPACK(x,y,z,t,Q) does (x,y,z,t) = Q where x, y, z and
      92             :    t are arbitrary fd_r43x6_t and Q is a FD_R43X6_QUAD declared in the
      93             :    local scope. */
      94             : 
      95   138292640 : #define FD_R43X6_QUAD_UNPACK( x,y,z,t, Q ) do {               \
      96   138292640 :     wwl_t _r0 = Q##03;                                        \
      97   138292640 :     wwl_t _r1 = Q##14;                                        \
      98   138292640 :     wwl_t _r2 = Q##25;                                        \
      99   138292640 :     wwl_t _r3 = wwl_zero();                                   \
     100   138292640 :     /* At this point _r0 = x0 y0 z0 t0 x3 y3 z3 t3 */         \
     101   138292640 :     /*               _r1 = x1 y1 z1 t1 x4 y4 z4 t4 */         \
     102   138292640 :     /*               _r2 = x2 y2 z2 t2 x5 y5 z5 t5 */         \
     103   138292640 :     /*               _r3 =  0  0  0  0  0  0  0  0 */         \
     104   138292640 :     /* Transpose 1x1 blocks */                                \
     105   138292640 :     wwl_t _c0 = _mm512_unpacklo_epi64( _r0, _r1 );            \
     106   138292640 :     wwl_t _c1 = _mm512_unpackhi_epi64( _r0, _r1 );            \
     107   138292640 :     wwl_t _c2 = _mm512_unpacklo_epi64( _r2, _r3 );            \
     108   138292640 :     wwl_t _c3 = _mm512_unpackhi_epi64( _r2, _r3 );            \
     109   138292640 :     /* At this point _c0 = x0 x1 z0 z1 x3 x4 z3 z4 */         \
     110   138292640 :     /*               _c1 = y0 y1 t0 t1 y3 y4 t3 t4 */         \
     111   138292640 :     /*               _c2 = x2  0 z2  0 x5  0 z5  0 */         \
     112   138292640 :     /*               _c3 = y2  0 t2  0 y5  0 t5  0 */         \
     113   138292640 :     (x) = wwl_select( wwl(  0,1, 8, 4,5,12, 9,9 ), _c0,_c2 ); \
     114   138292640 :     (y) = wwl_select( wwl(  0,1, 8, 4,5,12, 9,9 ), _c1,_c3 ); \
     115   138292640 :     (z) = wwl_select( wwl(  2,3,10, 6,7,14, 9,9 ), _c0,_c2 ); \
     116   138292640 :     (t) = wwl_select( wwl(  2,3,10, 6,7,14, 9,9 ), _c1,_c3 ); \
     117   138292640 :   } while(0)
     118             : 
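/* For example, an illustrative sketch (mirroring the FD_R43X6_MUL4_INL
   macro further below) of the pack / operate / fold / unpack pattern
   these macros are designed for:

     FD_R43X6_QUAD_DECL( X ); FD_R43X6_QUAD_PACK( X, xa,xb,xc,xd );
     FD_R43X6_QUAD_DECL( Y ); FD_R43X6_QUAD_PACK( Y, ya,yb,yc,yd );
     FD_R43X6_QUAD_DECL( Z ); FD_R43X6_QUAD_MUL_FAST( Z, X, Y );
     FD_R43X6_QUAD_FOLD_UNSIGNED( Z, Z );
     FD_R43X6_QUAD_UNPACK( za,zb,zc,zd, Z );

   where xa ... yd are hypothetical fd_r43x6_t inputs in the range
   expected by fd_r43x6_mul and za,zb,zc,zd end up holding the four
   folded products. */
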
     119             : /* FD_R43X6_QUAD_PERMUTE(D,S) does:
     120             :      D = [ S(imm0) S(imm1) S(imm2) S(imm3) ]
     121             :    where imm* are in [0,3] (0/1/2/3->X/Y/Z/T) */
     122             : 
     123   730839145 : #define FD_R43X6_QUAD_PERMUTE( D, imm0,imm1,imm2,imm3, S ) do {                                  \
     124   730839145 :     wwl_t const _perm = wwl( (imm0),(imm1),(imm2),(imm3), 4+(imm0),4+(imm1),4+(imm2),4+(imm3) ); \
     125   730839145 :     D##03 = wwl_permute( _perm, S##03 );                                                         \
     126   730839145 :     D##14 = wwl_permute( _perm, S##14 );                                                         \
     127   730839145 :     D##25 = wwl_permute( _perm, S##25 );                                                         \
     128   730839145 :   } while(0)
     129             : 
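/* For example (illustrative only), per the semantics above,

     FD_R43X6_QUAD_PERMUTE( D, 0,0,2,2, S );

   yields D = [ SX SX SZ SZ ].  Since each of S03 / S14 / S25 holds the
   X,Y,Z,T slots of two limbs in its two 256-bit halves, the same
   4-entry pattern is applied to both halves via the 4+imm* indices. */
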
     130             : /* FD_R43X6_QUAD_LANE_IF does:
     131             :      D = [ imm0 ? SX : TX, imm1 ? SY : TY, imm2 ? SZ : TZ, imm3 ? ST : TT ]
     132             :    imm* should be in [0,1]. */
     133             : 
     134    43479970 : #define FD_R43X6_QUAD_LANE_IF( D, imm0,imm1,imm2,imm3, S, T ) do { \
     135    43479970 :     int _mask = 17*(imm0) + 34*(imm1) + 68*(imm2) + 136*(imm3);    \
     136    43479970 :     D##03 = wwl_if( _mask, S##03, T##03 );                         \
     137    43479970 :     D##14 = wwl_if( _mask, S##14, T##14 );                         \
     138    43479970 :     D##25 = wwl_if( _mask, S##25, T##25 );                         \
     139    43479970 :   } while(0)
     140             : 
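/* Note that 17 = 0x11, 34 = 0x22, 68 = 0x44 and 136 = 0x88, so each
   imm* sets the bit for its lane in both 256-bit halves of the 8-lane
   mask.  For example (illustrative only),

     FD_R43X6_QUAD_LANE_IF( D, 1,0,1,0, S, T );

   uses _mask = 17+68 = 85 = 0x55, selecting the S values in the X and
   Z lanes and the T values in the Y and T lanes. */
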
     141             : /* FD_R43X6_QUAD_LANE_ADD_FAST does:
     142             :      D = [ (imm0 ? (PX+QX) : SX) (imm1 ? (PY+QY) : SY) (imm2 ? (PZ+QZ) : SZ) (imm3 ? (PT+QT) : ST) ]
     143             :    imm* should be in [0,1]. */
     144             : 
     145   453843042 : #define FD_R43X6_QUAD_LANE_ADD_FAST( D, S, imm0,imm1,imm2,imm3, P, Q ) do { \
     146   453843042 :     int _mask = 17*(imm0) + 34*(imm1) + 68*(imm2) + 136*(imm3);             \
     147   453843042 :     D##03 = wwv_add_if( _mask, P##03, Q##03, S##03 );                       \
     148   453843042 :     D##14 = wwv_add_if( _mask, P##14, Q##14, S##14 );                       \
     149   453843042 :     D##25 = wwv_add_if( _mask, P##25, Q##25, S##25 );                       \
     150   453843042 :   } while(0)
     151             : 
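/* For example (illustrative only),

     FD_R43X6_QUAD_LANE_ADD_FAST( D, S, 1,1,1,0, P, Q );

   puts PX+QX, PY+QY and PZ+QZ in the X, Y and Z lanes of D and passes
   ST through unchanged into the T lane.  No folding is performed. */
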
     152             : /* FD_R43X6_QUAD_LANE_SUB_FAST does:
     153             :      D = [ (imm0 ? (PX-QX) : SX) (imm1 ? (PY-QY) : SY) (imm2 ? (PZ-QZ) : SZ) (imm3 ? (PT-QT) : ST) ]
     154             :    imm* should be in [0,1]. */
     155   323760559 : #define FD_R43X6_QUAD_LANE_SUB_FAST( D, S, imm0,imm1,imm2,imm3, P, Q ) do { \
     156   323760559 :     int _mask = 17*(imm0) + 34*(imm1) + 68*(imm2) + 136*(imm3); \
     157   323760559 :     FD_R43X6_QUAD_DECL( M );                                    \
     158   323760559 :     M##03 = wwl( 8796093022189L, 8796093022189L, 8796093022189L, 8796093022189L, 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L ); \
     159   323760559 :     M##14 = wwl( 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L ); \
     160   323760559 :     M##25 = wwl( 8796093022207L, 8796093022207L, 8796093022207L, 8796093022207L, 1099511627775L, 1099511627775L, 1099511627775L, 1099511627775L ); \
     161   323760559 :     M##03 = wwv_sub( M##03, Q##03 );                            \
     162   323760559 :     M##14 = wwv_sub( M##14, Q##14 );                            \
     163   323760559 :     M##25 = wwv_sub( M##25, Q##25 );                            \
     164   323760559 :     D##03 = wwv_add_if( _mask, P##03, M##03, S##03 );           \
     165   323760559 :     D##14 = wwv_add_if( _mask, P##14, M##14, S##14 );           \
     166   323760559 :     D##25 = wwv_add_if( _mask, P##25, M##25, S##25 );           \
     167   323760559 :   } while(0)
     168             : 
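/* Note on the constants above: M is the limb representation of
   p = 2^255 - 19 itself, i.e. ( 2^43-19, 2^43-1, 2^43-1, 2^43-1,
   2^43-1, 2^40-1 ), which telescopes to 2^255 - 19.  The selected
   lanes thus compute P + (p - Q), which is congruent to P - Q mod p
   and, assuming Q's limbs do not exceed the corresponding limbs of p,
   keeps all per-limb values non-negative. */
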
     169             : /* FD_R43X6_QUAD_FOLD_UNSIGNED(R,P) does:
     170             :      R = [ fd_r43x6_fold_unsigned(PX) fd_r43x6_fold_unsigned(PY) fd_r43x6_fold_unsigned(PZ) fd_r43x6_fold_unsigned(PT) ] */
     171             : 
     172   346899518 : #define FD_R43X6_QUAD_FOLD_UNSIGNED( R, P ) do {                                            \
     173   346899518 :     long const _m43 = (1L<<43) - 1L;                                                        \
     174   346899518 :     long const _m40 = (1L<<40) - 1L;                                                        \
     175   346899518 :                                                                                             \
     176   346899518 :     wwl_t const _m43_m43 = wwl_bcast( _m43 );                                               \
     177   346899518 :     wwl_t const _m43_m40 = wwl( _m43,_m43,_m43,_m43, _m40,_m40,_m40,_m40 );                 \
     178   346899518 :     wwl_t const _s43_s40 = wwl(  43L, 43L, 43L, 43L,  40L, 40L, 40L, 40L );                 \
     179   346899518 :                                                                                             \
     180   346899518 :     wwl_t _Ph03    = wwl_shru       ( P##03, 43      );                                     \
     181   346899518 :     wwl_t _Ph14    = wwl_shru       ( P##14, 43      );                                     \
     182   346899518 :     wwl_t _Ph25    = wwl_shru_vector( P##25, _s43_s40 );                                    \
     183   346899518 :     wwl_t _19_Ph25 = wwl_add( _Ph25, wwl_add( wwl_shl( _Ph25, 1 ), wwl_shl( _Ph25, 4 ) ) ); \
     184   346899518 :                                                                                             \
     185   346899518 :     R##03 = wwl_add( wwl_and( P##03, _m43_m43 ), wwl_pack_halves( _19_Ph25,1, _Ph25,0 ) );  \
     186   346899518 :     R##14 = wwl_add( wwl_and( P##14, _m43_m43 ), _Ph03 );                                   \
     187   346899518 :     R##25 = wwl_add( wwl_and( P##25, _m43_m40 ), _Ph14 );                                   \
     188   346899518 :   } while(0)
     189             : 
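/* Note on the fold above: _Ph03 / _Ph14 / _Ph25 are the carries out of
   the 43-bit (40-bit for limb 5) limbs.  Carries out of limbs 0-4
   propagate into limbs 1-5, while the carry out of limb 5 has weight
   2^255 and so wraps around into limb 0 scaled by 19 (2^255 = 19 mod
   p).  The scaling is done with shifts: 19*c = c + (c<<1) + (c<<4),
   which is exactly _19_Ph25. */
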
     190             : /* FD_R43X6_QUAD_FOLD_SIGNED(R,P) does:
     191             :      R = [ fd_r43x6_fold_signed(PX) fd_r43x6_fold_signed(PY) fd_r43x6_fold_signed(PZ) fd_r43x6_fold_signed(PT) ] */
     192    26739985 : #define FD_R43X6_QUAD_FOLD_SIGNED( R, P ) do {                                                                \
     193    26739985 :     long const _b0  = 19L<<23;                                                                                \
     194    26739985 :     long const _bb  =  1L<<20;                                                                                \
     195    26739985 :     long const _m43 = (1L<<43) - 1L;                                                                          \
     196    26739985 :     long const _m40 = (1L<<40) - 1L;                                                                          \
     197    26739985 :                                                                                                               \
     198    26739985 :     wwl_t const _bias03  = wwl(  _b0, _b0, _b0, _b0,  _bb, _bb, _bb, _bb );                                   \
     199    26739985 :     wwl_t const _bias    = wwl_bcast( _bb );                                                                  \
     200    26739985 :     wwl_t const _m43_m43 = wwl_bcast( _m43 );                                                                 \
     201    26739985 :     wwl_t const _m43_m40 = wwl( _m43,_m43,_m43,_m43, _m40,_m40,_m40,_m40 );                                   \
     202    26739985 :     wwl_t const _s43_s40 = wwl(  43L, 43L, 43L, 43L,  40L, 40L, 40L, 40L );                                   \
     203    26739985 :                                                                                                               \
     204    26739985 :     wwl_t _P03 = wwl_sub( P##03, _bias03 );                                                                   \
     205    26739985 :     wwl_t _P14 = wwl_sub( P##14, _bias   );                                                                   \
     206    26739985 :     wwl_t _P25 = wwl_sub( P##25, _bias   );                                                                   \
     207    26739985 :                                                                                                               \
     208    26739985 :     wwl_t _Ph03    = wwl_shr       ( _P03, 43       );                                                        \
     209    26739985 :     wwl_t _Ph14    = wwl_shr       ( _P14, 43       );                                                        \
     210    26739985 :     wwl_t _Ph25    = wwl_shr_vector( _P25, _s43_s40 );                                                        \
     211    26739985 :     wwl_t _19_Ph25 = wwl_add( _Ph25, wwl_add( wwl_shl( _Ph25, 1 ), wwl_shl( _Ph25, 4 ) ) );                   \
     212    26739985 :                                                                                                               \
     213    26739985 :     R##03 = wwl_add( wwl_and( _P03, _m43_m43 ), wwl_add( wwl_pack_halves( _19_Ph25,1, _Ph25,0 ), _bias03 ) ); \
     214    26739985 :     R##14 = wwl_add( wwl_and( _P14, _m43_m43 ), wwl_add( _Ph03,                                  _bias   ) ); \
     215    26739985 :     R##25 = wwl_add( wwl_and( _P25, _m43_m40 ), wwl_add( _Ph14,                                  _bias   ) ); \
     216    26739985 :   } while(0)
     217             : 
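/* Note on the fold above: this is the same carry propagation as
   FD_R43X6_QUAD_FOLD_UNSIGNED (including the 19*c = c + (c<<1) + (c<<4)
   wrap of the limb 5 carry) but with arithmetic right shifts so that
   negative limbs propagate correctly.  The small per-lane biases are
   subtracted before the shifts and added back afterwards, so the
   represented value is unchanged mod p. */
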
      218             : /* FD_R43X6_QUAD_MUL_FAST(R,P,Q) does:
      219             :      R = [ fd_r43x6_mul_fast(PX,QX) fd_r43x6_mul_fast(PY,QY) fd_r43x6_mul_fast(PZ,QZ) fd_r43x6_mul_fast(PT,QT) ]
     220             :    Written this way so that pointer escapes don't inhibit optimizations. */
     221             : 
     222   263831718 : #define FD_R43X6_QUAD_MUL_FAST( R, P, Q ) do {                                                                   \
     223   263831718 :     FD_R43X6_QUAD_DECL( _R ); fd_r43x6_quad_mul_fast( &_R03,&_R14,&_R25, P##03,P##14,P##25, Q##03,Q##14,Q##25 ); \
     224   263831718 :     FD_R43X6_QUAD_MOV( R, _R );                                                                                  \
     225   263831718 :   } while(0)
     226             : 
     227             : FD_FN_UNUSED static void /* let compiler decide if worth inlining */
     228             : fd_r43x6_quad_mul_fast( fd_r43x6_t * _z03, fd_r43x6_t * _z14, fd_r43x6_t * _z25,
     229             :                         fd_r43x6_t    x03, fd_r43x6_t    x14, fd_r43x6_t    x25,
     230   293832018 :                         fd_r43x6_t    y03, fd_r43x6_t    y14, fd_r43x6_t    y25 ) {
     231             : 
     232             :   /* Grade school-ish from the original mul:
     233             : 
     234             :                                        x5   x4   x3   x2   x1   x0
     235             :                                   x    y5   y4   y3   y2   y1   y0
     236             :                                   --------------------------------
     237             :                                      p50l p40l p30l p20l p10l p00l
     238             :                                 p50h p40h p30h p20h p10h p00h
     239             :                                 p51l p41l p31l p21l p11l p01l
     240             :                            p51h p41h p31h p21h p11h p01h
     241             :                            p52l p42l p32l p22l p12l p02l
     242             :                       p52h p42h p32h p22h p12h p02h
     243             :                       p53l p43l p33l p23l p13l p03l
     244             :                  p53h p43h p33h p23h p13h p03h
     245             :                  p54l p44l p34l p24l p14l p04l
     246             :             p54h p44h p34h p24h p14h p04h
     247             :             p55l p45l p35l p25l p15l p05l
     248             :        p55h p45h p35h p25h p15h p05h
     249             :        -----------------------------------------------------------
     250             :         zb5  zb4  zb3  zb2  zb1  zb0  za5  za4  za3  za2  za1  za0
     251             : 
     252             :      Reorganize the partials into low and high parts:
     253             : 
     254             :                                      p50l p40l p30l p20l p10l p00l
     255             :                                 p51l p41l p31l p21l p11l p01l
     256             :                            p52l p42l p32l p22l p12l p02l
     257             :                       p53l p43l p33l p23l p13l p03l
     258             :                  p54l p44l p34l p24l p14l p04l
     259             :             p55l p45l p35l p25l p15l p05l
     260             : 
     261             :                                 p50h p40h p30h p20h p10h p00h
     262             :                            p51h p41h p31h p21h p11h p01h
     263             :                       p52h p42h p32h p22h p12h p02h
     264             :                  p53h p43h p33h p23h p13h p03h
     265             :             p54h p44h p34h p24h p14h p04h
     266             :        p55h p45h p35h p25h p15h p05h
     267             : 
     268             :      We start with 3 8-lane vectors per input.  These hold 4 fd_r43x6_t
     269             :      organized as:
     270             : 
     271             :        x03 = [ X0 X3 ], y03 = [ Y0 Y3 ],
     272             :        x14 = [ X1 X4 ], y14 = [ Y1 Y4 ],
     273             :        x25 = [ X2 X5 ], y25 = [ Y2 Y5 ]
     274             : 
      275             :      Above, Xi indicates limb i for the 4 inputs.  We can quickly form
     276             :      "xii = [ Xi Xi ]" by packing halves of the x inputs.  And then
     277             :      doing madd52lo of this on a similarly packed yjk we get:
     278             : 
     279             :        LO( xii * yjk ) = [ pijl pikl ]
     280             : 
     281             :      Doing x00, x11, x22, x33, x44, x55 against y03, y14, y25 yields all
     282             :      the low partials, organized:
     283             : 
     284             :        [ p00l p03l ], [ p01l p04l ], [ p02l p05l ],
     285             :        [ p10l p13l ], [ p11l p14l ], [ p12l p15l ],
     286             :        [ p20l p23l ], [ p21l p24l ], [ p22l p25l ],
     287             :        [ p30l p33l ], [ p31l p34l ], [ p32l p35l ],
     288             :        [ p40l p43l ], [ p41l p44l ], [ p42l p45l ],
     289             :        [ p50l p53l ], [ p51l p54l ], [ p52l p55l ]
     290             : 
     291             :      If we use the lower half of these results to accumulate the
     292             :      partials for the first 3 rows, we have:
     293             : 
     294             :        p0_q3 = [ p00l p03l ]
     295             :        p1_q4 = [ p10l p13l ] + [ p01l p04l ]
     296             :        p2_q5 = [ p20l p23l ] + [ p11l p14l ] + [ p02l p05l ]
     297             :        p3_q6 = [ p30l p33l ] + [ p21l p24l ] + [ p12l p15l ]
     298             :        p4_q7 = [ p40l p43l ] + [ p31l p34l ] + [ p22l p25l ]
     299             :        p5_q8 = [ p50l p53l ] + [ p41l p44l ] + [ p32l p35l ]
     300             :        p6_q9 =                 [ p51l p54l ] + [ p42l p45l ]
     301             :        p7_qa =                                 [ p52l p55l ]
     302             : 
     303             :      We also see that doing this implicitly accumulates the last 3 rows
     304             :      of partials at the same time.  Note also that we can use the
     305             :      accumulate features of MADD to do these accumulations and we have
     306             :      lots of independent MADD chains.
     307             : 
     308             :      The exact same applies for the HI partials.  When we sum the LO and
     309             :      HI partials, we need to shift the HI parts left by 9 for the
     310             :      reasons described in the scalar version.  When we sum the lower and
     311             :      upper halves to finish the partial accumulation, we repack them
     312             :      into two FD_R43X6_QUAD representations at the same time.
     313             : 
      314             :      This yields the below.  It has massive ILP, utilizes all lanes
      315             :      with no wasted or redundant multiplications, and requires only
      316             :      minimal, fast shuffling. */
     317             : 
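  /* Note: the madd52hi terms below hold bits [52,104) of each limb
     product.  Relative to the next limb's column (whose weight is only
     2^43 higher), that is an extra factor of 2^(52-43) = 2^9, which is
     why every high partial sum is shifted left by 9 before being
     combined with the low partial sums. */
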
     318   293832018 :   wwl_t const _zz = wwl_zero();
     319             : 
     320   293832018 :   wwl_t x00   = wwl_pack_halves( x03,0, x03,0 );
     321   293832018 :   wwl_t x11   = wwl_pack_halves( x14,0, x14,0 );
     322   293832018 :   wwl_t x22   = wwl_pack_halves( x25,0, x25,0 );
     323   293832018 :   wwl_t x33   = wwl_pack_halves( x03,1, x03,1 );
     324   293832018 :   wwl_t x44   = wwl_pack_halves( x14,1, x14,1 );
     325   293832018 :   wwl_t x55   = wwl_pack_halves( x25,1, x25,1 );
     326             : 
     327   293832018 : # if 1 /* This version is faster even though it has more adds due to higher ILP */
     328   293832018 :   wwl_t p0_q3 = wwl_madd52lo(                             _zz, x00, y03 );
     329   293832018 :   wwl_t p1_q4 = wwl_madd52lo( wwl_madd52lo(               _zz, x11, y03 ), x00, y14 );
     330   293832018 :   wwl_t p2_q5 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( _zz, x22, y03 ), x11, y14 ), x00, y25 );
     331   293832018 :   wwl_t p3_q6 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( _zz, x33, y03 ), x22, y14 ), x11, y25 );
     332   293832018 :   wwl_t p4_q7 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( _zz, x44, y03 ), x33, y14 ), x22, y25 );
     333   293832018 :   wwl_t p5_q8 = wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( _zz, x55, y03 ), x44, y14 ), x33, y25 );
     334   293832018 :   wwl_t p6_q9 =               wwl_madd52lo( wwl_madd52lo( _zz,             x55, y14 ), x44, y25 );
     335   293832018 :   wwl_t p7_qa =                             wwl_madd52lo( _zz,                         x55, y25 );
     336             : 
     337   293832018 :   /**/  p1_q4 = wwl_add( p1_q4, wwl_shl( wwl_madd52hi(                             _zz, x00, y03 ),                         9 ) );
     338   293832018 :   /**/  p2_q5 = wwl_add( p2_q5, wwl_shl( wwl_madd52hi( wwl_madd52hi(               _zz, x11, y03 ), x00, y14 ),             9 ) );
     339   293832018 :   /**/  p3_q6 = wwl_add( p3_q6, wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x22, y03 ), x11, y14 ), x00, y25 ), 9 ) );
     340   293832018 :   /**/  p4_q7 = wwl_add( p4_q7, wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x33, y03 ), x22, y14 ), x11, y25 ), 9 ) );
     341   293832018 :   /**/  p5_q8 = wwl_add( p5_q8, wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x44, y03 ), x33, y14 ), x22, y25 ), 9 ) );
     342   293832018 :   /**/  p6_q9 = wwl_add( p6_q9, wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz, x55, y03 ), x44, y14 ), x33, y25 ), 9 ) );
     343   293832018 :   /**/  p7_qa = wwl_add( p7_qa, wwl_shl(               wwl_madd52hi( wwl_madd52hi( _zz,             x55, y14 ), x44, y25 ), 9 ) );
     344   293832018 :   wwl_t p8_qb =                 wwl_shl(                             wwl_madd52hi( _zz,                         x55, y25 ), 9 );
     345             : # else
     346             :   wwl_t p1_q4 = wwl_shl( wwl_madd52hi(                             _zz,   x00, y03 ),                         9 );
     347             :   wwl_t p2_q5 = wwl_shl( wwl_madd52hi( wwl_madd52hi(               _zz,   x11, y03 ), x00, y14 ),             9 );
     348             :   wwl_t p3_q6 = wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz,   x22, y03 ), x11, y14 ), x00, y25 ), 9 );
     349             :   wwl_t p4_q7 = wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz,   x33, y03 ), x22, y14 ), x11, y25 ), 9 );
     350             :   wwl_t p5_q8 = wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz,   x44, y03 ), x33, y14 ), x22, y25 ), 9 );
     351             :   wwl_t p6_q9 = wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz,   x55, y03 ), x44, y14 ), x33, y25 ), 9 );
     352             :   wwl_t p7_qa = wwl_shl(               wwl_madd52hi( wwl_madd52hi( _zz,               x55, y14 ), x44, y25 ), 9 );
     353             :   wwl_t p8_qb = wwl_shl(                             wwl_madd52hi( _zz,                           x55, y25 ), 9 );
     354             : 
     355             :   wwl_t p0_q3 =          wwl_madd52lo(                             _zz,   x00, y03 );
     356             :   /**/  p1_q4 =          wwl_madd52lo( wwl_madd52lo(               p1_q4, x11, y03 ), x00, y14 );
     357             :   /**/  p2_q5 =          wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( p2_q5, x22, y03 ), x11, y14 ), x00, y25 );
     358             :   /**/  p3_q6 =          wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( p3_q6, x33, y03 ), x22, y14 ), x11, y25 );
     359             :   /**/  p4_q7 =          wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( p4_q7, x44, y03 ), x33, y14 ), x22, y25 );
     360             :   /**/  p5_q8 =          wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( p5_q8, x55, y03 ), x44, y14 ), x33, y25 );
     361             :   /**/  p6_q9 =                        wwl_madd52lo( wwl_madd52lo( p6_q9,             x55, y14 ), x44, y25 );
     362             :   /**/  p7_qa =                                      wwl_madd52lo( p7_qa,                         x55, y25 );
     363             : # endif
     364             : 
     365   293832018 :   wwl_t q6_p3 = wwl_pack_halves( p3_q6,1, p3_q6,0 );
     366   293832018 :   wwl_t q7_p4 = wwl_pack_halves( p4_q7,1, p4_q7,0 );
     367   293832018 :   wwl_t q8_p5 = wwl_pack_halves( p5_q8,1, p5_q8,0 );
     368             : 
     369   293832018 :   wwl_t za03  = wwv_add_if( 0xF0, p0_q3, q6_p3, p0_q3 );
     370   293832018 :   wwl_t za14  = wwv_add_if( 0xF0, p1_q4, q7_p4, p1_q4 );
     371   293832018 :   wwl_t za25  = wwv_add_if( 0xF0, p2_q5, q8_p5, p2_q5 );
     372             : 
     373   293832018 :   wwl_t zb03  = wwv_add_if( 0x0F, p6_q9, q6_p3, p6_q9 );
     374   293832018 :   wwl_t zb14  = wwv_add_if( 0x0F, p7_qa, q7_p4, p7_qa );
     375   293832018 :   wwl_t zb25  = wwv_add_if( 0x0F, p8_qb, q8_p5, p8_qb );
     376             : 
      377             :   /* At this point:
      378             : 
      379             :        z = <za0,za1,za2,za3,za4,za5> + 2^258 <zb0,zb1,zb2,zb3,zb4,zb5>
      380             :          = <za0,za1,za2,za3,za4,za5> +   152 <zb0,zb1,zb2,zb3,zb4,zb5>
      381             : 
      382             :      (as 2^258 = 2^3 2^255 = 8*19 = 152 mod p) and we can sum this
      383             :      directly (see scalar version for proof).  Like the scalar version,
      384             :      we multiply by 152 = 128+16+8 via shift-and-add because mullo is slow. */
     385             : 
     386   293832018 :   wwl_t z03 = wwl_add( wwl_add( za03, wwl_shl( zb03, 7 ) ), wwl_add( wwl_shl( zb03, 4 ), wwl_shl( zb03, 3 ) ) );
     387   293832018 :   wwl_t z14 = wwl_add( wwl_add( za14, wwl_shl( zb14, 7 ) ), wwl_add( wwl_shl( zb14, 4 ), wwl_shl( zb14, 3 ) ) );
     388   293832018 :   wwl_t z25 = wwl_add( wwl_add( za25, wwl_shl( zb25, 7 ) ), wwl_add( wwl_shl( zb25, 4 ), wwl_shl( zb25, 3 ) ) );
     389             : 
     390   293832018 :   FD_R43X6_QUAD_MOV( *_z, z );
     391   293832018 : }
     392             : 
     393             : /* FD_R43X6_QUAD_SQR_FAST(R,P) does:
      394             :      R = [ fd_r43x6_sqr_fast(PX) fd_r43x6_sqr_fast(PY) fd_r43x6_sqr_fast(PZ) fd_r43x6_sqr_fast(PT) ]
     395             :    Written this way so that pointer escapes don't inhibit optimizations. */
     396             : 
     397    90099028 : #define FD_R43X6_QUAD_SQR_FAST( R, P ) do {                                                   \
     398    90099028 :     FD_R43X6_QUAD_DECL( _R ); fd_r43x6_quad_sqr_fast( &_R03,&_R14,&_R25, P##03,P##14,P##25 ); \
     399    90099028 :     FD_R43X6_QUAD_MOV( R, _R );                                                               \
     400    90099028 :   } while(0)
     401             : 
     402             : FD_FN_UNUSED static void /* let compiler decide if worth inlining */
     403             : fd_r43x6_quad_sqr_fast( fd_r43x6_t * _z03, fd_r43x6_t * _z14, fd_r43x6_t * _z25,
     404   110099228 :                         fd_r43x6_t    x03, fd_r43x6_t    x14, fd_r43x6_t    x25 ) {
     405             : 
     406             :   /* Grade school-ish from the original mul:
     407             : 
     408             :                                        x5   x4   x3   x2   x1   x0
     409             :                                   x    x5   x4   x3   x2   x1   x0
     410             :                                   --------------------------------
     411             :                                      p50l p40l p30l p20l p10l p00l
     412             :                                 p50h p40h p30h p20h p10h p00h
     413             :                                 p51l p41l p31l p21l p11l p01l
     414             :                            p51h p41h p31h p21h p11h p01h
     415             :                            p52l p42l p32l p22l p12l p02l
     416             :                       p52h p42h p32h p22h p12h p02h
     417             :                       p53l p43l p33l p23l p13l p03l
     418             :                  p53h p43h p33h p23h p13h p03h
     419             :                  p54l p44l p34l p24l p14l p04l
     420             :             p54h p44h p34h p24h p14h p04h
     421             :             p55l p45l p35l p25l p15l p05l
     422             :        p55h p45h p35h p25h p15h p05h
     423             :        -----------------------------------------------------------
     424             :          zb   za   z9   z8   z7   z6   z5   z4   z3   z2   z1   z0
     425             : 
     426             :      Consider only the low partial rows and note that pijl=pjil here.
     427             :      This portion of the reduction can be simplified:
     428             : 
     429             :                                           2*p50l 2*p40l 2*p30l 2*p20l 2*p10l   p00l
     430             :                                    2*p51l 2*p41l 2*p31l 2*p21l   p11l
     431             :                             2*p52l 2*p42l 2*p32l   p22l
     432             :                      2*p53l 2*p43l   p33l
     433             :               2*p54l   p44l
     434             :          p55l
     435             :        ----------------------------------------------------------------------------
     436             :            pa     p9     p8     p7     p6     p5     p4     p3     p2     p1     p0
     437             : 
     438             :      The number of adds and the partials that need to be doubled have a
     439             :      mirror symmetry about p5.  Exploiting this yields:
     440             : 
     441             :        2*p50l|2*p32l 2*p40l|2*p51l 2*p30l|2*p52l 2*p20l|2*p53l 2*p10l|2*p54l  p00l|p55l
     442             :        2*p41l|2*zero 2*p31l|2*p42l 2*p21l|2*p43l   p11l|  p44l
     443             :                        p22l|  p33l
     444             :        --------------------------------------------------------------------------------
     445             :             p55           p46           p37           p28           p19          p0a
     446             : 
     447             :      Above a|b means make an 8-lane vector by concatenating the 4 a's
     448             :      (one for each square in progress) and the 4 b's.  Above we have
     449             :      split the reduction of p5 to get some extra vector multiplier
     450             :      utilization.  Other splits are possible and maybe could usefully
     451             :      trade some extra computation for less swizzling.
     452             : 
     453             :      Similar holds for the high partials:
     454             : 
     455             :        2*p50h|2*p32h 2*p40h|2*p51h 2*p30h|2*p52h 2*p20h|2*p53h 2*p10h|2*p54h  p00h|p55h
     456             :        2*p41h|2*zero 2*p31h|2*p42h 2*p21h|2*p43h   p11h|  p44h
     457             :                        p22h|  p33h
     458             :        --------------------------------------------------------------------------------
     459             :             q66           q57           q48           q39           q2a          q1b
     460             : 
     461             :      For the reasons described in the scalar implementation, we need to
     462             :      shift the high partials left by 9 before we can reduce them into
     463             :      the low partials.  As we do this reduction, we repack them into the
     464             :      FD_R43X6_QUAD's za and zb.
     465             : 
     466             :      In doing these reductions, we exploit i<>j symmetry and pair terms
     467             :      on the left and right halves to minimize input shuffling.  For
     468             :      example, for p1b, we need to form x05=x0|x5 and then compute
     469             :      p1b=x05*x05.  Instead of forming x15 and x04 to compute
     470             :      p2a=2*x15*x04, we can do p2a=2*p01h|2*p54h and use the x14 we were
     471             :      passed directly and reuse the x05 formed for p1b.
     472             : 
     473             :      This yields the below.  Theoretical minimum number of multiplies,
     474             :      tons of ILP, low swizzling overhead. */
     475             : 
     476   110099228 :   wwl_t _zz     = wwl_zero();
     477             : 
     478   110099228 :   wwl_t x05     = wwl_pack_h0_h1 ( x03,   x25   );
     479   110099228 :   wwl_t x12     = wwl_pack_halves( x14,0, x25,0 );
     480   110099228 :   wwl_t x34     = wwl_pack_halves( x03,1, x14,1 );
     481   110099228 :   wwl_t x41     = wwl_pack_halves( x14,1, x14,0 );
     482   110099228 :   wwl_t x23     = wwl_pack_h0_h1 ( x25,   x03   );
     483             : 
     484   110099228 :   wwl_t x52     = wwl_pack_halves( x25,1, x25,0 );
     485   110099228 :   wwl_t x4z     = wwl_pack_halves( x14,1, _zz,0 );
     486             : 
     487   110099228 :   wwl_t two_x03 = wwl_shl( x03, 1 );
     488   110099228 :   wwl_t two_x14 = wwl_shl( x14, 1 );
     489   110099228 :   wwl_t two_x05 = wwl_shl( x05, 1 );
     490   110099228 :   wwl_t two_x12 = wwl_shl( x12, 1 );
     491             : 
     492   110099228 : # if 1 /* This version is faster even though it has more adds due to better ILP */
     493   110099228 :   wwl_t p0a     =          wwl_madd52lo(                             _zz,     x05, x05 );
     494   110099228 :   wwl_t p19     =          wwl_madd52lo(                             _zz, two_x05, x14 );
     495   110099228 :   wwl_t p28     =          wwl_madd52lo( wwl_madd52lo(               _zz,     x14, x14 ), two_x03, x25 );
     496   110099228 :   wwl_t p37     =          wwl_madd52lo( wwl_madd52lo(               _zz, two_x03, x34 ), two_x12, x25 );
     497   110099228 :   wwl_t p46     =          wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( _zz,     x23, x23 ), two_x05, x41 ), two_x12, x34 );
     498   110099228 :   wwl_t p55     =          wwl_madd52lo( wwl_madd52lo(               _zz, two_x03, x52 ), two_x14, x4z );
     499             : 
     500   110099228 :   wwl_t q1b     = wwl_shl( wwl_madd52hi(                             _zz,     x05, x05 ),                                 9 );
     501   110099228 :   wwl_t q2a     = wwl_shl( wwl_madd52hi(                             _zz, two_x05, x14 ),                                 9 );
     502   110099228 :   wwl_t q39     = wwl_shl( wwl_madd52hi( wwl_madd52hi(               _zz,     x14, x14 ), two_x03, x25 ),                 9 );
     503   110099228 :   wwl_t q48     = wwl_shl( wwl_madd52hi( wwl_madd52hi(               _zz, two_x03, x34 ), two_x12, x25 ),                 9 );
     504   110099228 :   wwl_t q57     = wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz,     x23, x23 ), two_x05, x41 ), two_x12, x34 ), 9 );
     505   110099228 :   wwl_t q66     = wwl_shl( wwl_madd52hi( wwl_madd52hi(               _zz, two_x03, x52 ), two_x14, x4z ),                 9 );
     506             : 
     507   110099228 :   wwl_t za03    =          wwl_add( wwl_pack_halves( p0a,0, p37,0 ), wwl_pack_halves( _zz,0, q39,0 ) );
     508   110099228 :   wwl_t za14    =          wwl_add( wwl_pack_halves( p19,0, p46,0 ), wwl_pack_halves( q1b,0, q48,0 ) );
     509   110099228 :   wwl_t za25    = wwl_add( wwl_add( wwl_pack_halves( p28,0, p55,0 ), wwl_pack_halves( q2a,0, q57,0 ) ), wwl_pack_h0_h1( _zz, p55 ) );
     510             : 
     511   110099228 :   wwl_t zb03    = wwl_add( wwl_add( wwl_pack_halves( p46,1, p19,1 ), wwl_pack_halves( q66,1, q39,1 ) ), wwl_pack_h0_h1( q66, _zz ) );
     512   110099228 :   wwl_t zb14    =          wwl_add( wwl_pack_halves( p37,1, p0a,1 ), wwl_pack_halves( q57,1, q2a,1 ) );
     513   110099228 :   wwl_t zb25    =          wwl_add( wwl_pack_halves( p28,1, _zz,1 ), wwl_pack_halves( q48,1, q1b,1 ) );
     514             : # else
     515             :   wwl_t q1b     = wwl_shl( wwl_madd52hi(                             _zz,     x05, x05 ),                                 9 );
     516             :   wwl_t q2a     = wwl_shl( wwl_madd52hi(                             _zz, two_x05, x14 ),                                 9 );
     517             :   wwl_t q39     = wwl_shl( wwl_madd52hi( wwl_madd52hi(               _zz,     x14, x14 ), two_x03, x25 ),                 9 );
     518             :   wwl_t q48     = wwl_shl( wwl_madd52hi( wwl_madd52hi(               _zz, two_x03, x34 ), two_x12, x25 ),                 9 );
     519             :   wwl_t q57     = wwl_shl( wwl_madd52hi( wwl_madd52hi( wwl_madd52hi( _zz,     x23, x23 ), two_x05, x41 ), two_x12, x34 ), 9 );
     520             :   wwl_t q66     = wwl_shl( wwl_madd52hi( wwl_madd52hi(               _zz, two_x03, x52 ), two_x14, x4z ),                 9 );
     521             : 
     522             :   wwl_t p0a     =          wwl_madd52lo(                             wwl_pack_h0_h1( _zz, q2a ),     x05, x05 );
     523             :   wwl_t p19     =          wwl_madd52lo(                             wwl_pack_h0_h1( q1b, q39 ), two_x05, x14 );
     524             :   wwl_t p28     =          wwl_madd52lo( wwl_madd52lo(               wwl_pack_h0_h1( q2a, q48 ),     x14, x14 ), two_x03, x25 );
     525             :   wwl_t p37     =          wwl_madd52lo( wwl_madd52lo(               wwl_pack_h0_h1( q39, q57 ), two_x03, x34 ), two_x12, x25 );
     526             :   wwl_t p46     =          wwl_madd52lo( wwl_madd52lo( wwl_madd52lo( wwl_pack_h0_h1( q48, q66 ),     x23, x23 ), two_x05, x41 ), two_x12, x34 );
     527             :   wwl_t p55     =          wwl_madd52lo( wwl_madd52lo(               wwl_pack_h0_h1( q57, _zz ), two_x03, x52 ), two_x14, x4z );
     528             : 
     529             :   wwl_t za03    =          wwl_pack_halves( p0a,0, p37,0 );
     530             :   wwl_t za14    =          wwl_pack_halves( p19,0, p46,0 );
     531             :   wwl_t za25    = wwl_add( wwl_pack_halves( p28,0, p55,0 ), wwl_pack_h0_h1( _zz, p55 ) );
     532             : 
     533             :   wwl_t zb03    = wwl_add( wwl_pack_halves( p46,1, p19,1 ), wwl_pack_h0_h1( q66, _zz ) );
     534             :   wwl_t zb14    =          wwl_pack_halves( p37,1, p0a,1 );
     535             :   wwl_t zb25    =          wwl_pack_halves( p28,1, q1b,1 );
     536             : # endif
     537             : 
     538             :   /* At this point:
     539             : 
     540             :        z = <za0,za1,za2,za3,za4,za5> + 2^258 <zb0,zb1,zb2,zb3,zb4,zb5>
     541             : 
      542             :      We complete the calc exactly like FD_R43X6_QUAD_MUL_FAST above. */
     543             : 
     544   110099228 :   wwl_t z03 = wwl_add( wwl_add( za03, wwl_shl( zb03, 7 ) ), wwl_add( wwl_shl( zb03, 4 ), wwl_shl( zb03, 3 ) ) );
     545   110099228 :   wwl_t z14 = wwl_add( wwl_add( za14, wwl_shl( zb14, 7 ) ), wwl_add( wwl_shl( zb14, 4 ), wwl_shl( zb14, 3 ) ) );
     546   110099228 :   wwl_t z25 = wwl_add( wwl_add( za25, wwl_shl( zb25, 7 ) ), wwl_add( wwl_shl( zb25, 4 ), wwl_shl( zb25, 3 ) ) );
     547             : 
     548   110099228 :   FD_R43X6_QUAD_MOV( *_z, z );
     549   110099228 : }
     550             : 
     551             : /* Below, FD_R43X6_MUL4_INL( za,xa,ya, zb,xb,yb, zc,xc,yc, zd,xd,yd )
     552             :    exactly does:
     553             : 
     554             :      za = fd_r43x6_mul( xa, ya );
     555             :      zb = fd_r43x6_mul( xb, yb );
     556             :      zc = fd_r43x6_mul( xc, yc );
     557             :      zd = fd_r43x6_mul( xd, yd );
     558             : 
     559             :    Likewise, FD_R43X6_SQR4_INL( za,xa, zb,xb, zc,xc, zd,xd ) exactly does:
     560             : 
     561             :      za = fd_r43x6_sqr( xa );
     562             :      zb = fd_r43x6_sqr( xb );
     563             :      zc = fd_r43x6_sqr( xc );
     564             :      zd = fd_r43x6_sqr( xd );
     565             : 
      566             :    And, FD_R43X6_POW22523_2_INL( za,xa, zb,xb ) exactly does:
      567             : 
      568             :      za = fd_r43x6_pow22523( xa );
      569             :      zb = fd_r43x6_pow22523( xb );
      570             : 
      571             :    Similarly for FD_R43X6_MUL{1,2,3}_INL, FD_R43X6_SQR{1,2,3}_INL and
      572             :    FD_R43X6_POW22523_1_INL( za,xa ).
     573             : 
     574             :    These macros are robust (e.g. these evaluate their arguments once and
     575             :    they linguistically behave as a single statement) and have the
     576             :    resulting ILP very exposed to the optimizer and CPU.  In-place
     577             :    operation okay.
     578             : 
     579             :    Future implementations might allow these to produce different
     580             :    mathematically equivalent representations of the result if such
     581             :    allows higher performance akin to what was done for fd_r43x6_sqr.
     582             : 
     583             :    TODO: SUB2_INL to accelerate the folds there?
     584             : 
     585             :    TODO: Consider pure for various multi-return function prototypes? */
     586             : 
     587             : #if 0 /* Reference implementation */
     588             : 
     589             : #define FD_R43X6_MUL1_INL( za,xa,ya ) do { \
     590             :     (za) = fd_r43x6_mul( (xa), (ya) );     \
     591             :   } while(0)
     592             : 
     593             : #define FD_R43X6_MUL2_INL( za,xa,ya, zb,xb,yb ) do { \
     594             :     (za) = fd_r43x6_mul( (xa), (ya) );               \
     595             :     (zb) = fd_r43x6_mul( (xb), (yb) );               \
     596             :   } while(0)
     597             : 
     598             : #define FD_R43X6_MUL3_INL( za,xa,ya, zb,xb,yb, zc,xc,yc ) do { \
     599             :     (za) = fd_r43x6_mul( (xa), (ya) );                         \
     600             :     (zb) = fd_r43x6_mul( (xb), (yb) );                         \
     601             :     (zc) = fd_r43x6_mul( (xc), (yc) );                         \
     602             :   } while(0)
     603             : 
     604             : #define FD_R43X6_MUL4_INL( za,xa,ya, zb,xb,yb, zc,xc,yc, zd,xd,yd ) do { \
     605             :     (za) = fd_r43x6_mul( (xa), (ya) );                                   \
     606             :     (zb) = fd_r43x6_mul( (xb), (yb) );                                   \
     607             :     (zc) = fd_r43x6_mul( (xc), (yc) );                                   \
     608             :     (zd) = fd_r43x6_mul( (xd), (yd) );                                   \
     609             :   } while(0)
     610             : 
     611             : #define FD_R43X6_SQR1_INL( za,xa ) do { \
     612             :     (za) = fd_r43x6_sqr( (xa) );        \
     613             :   } while(0)
     614             : 
     615             : #define FD_R43X6_SQR2_INL( za,xa, zb,xb ) do { \
     616             :     (za) = fd_r43x6_sqr( (xa) );               \
     617             :     (zb) = fd_r43x6_sqr( (xb) );               \
     618             :   } while(0)
     619             : 
     620             : #define FD_R43X6_SQR3_INL( za,xa, zb,xb, zc,xc ) do { \
     621             :     (za) = fd_r43x6_sqr( (xa) );                      \
     622             :     (zb) = fd_r43x6_sqr( (xb) );                      \
     623             :     (zc) = fd_r43x6_sqr( (xc) );                      \
     624             :   } while(0)
     625             : 
     626             : #define FD_R43X6_SQR4_INL( za,xa, zb,xb, zc,xc, zd,xd ) do { \
     627             :     (za) = fd_r43x6_sqr( (xa) );                             \
     628             :     (zb) = fd_r43x6_sqr( (xb) );                             \
     629             :     (zc) = fd_r43x6_sqr( (xc) );                             \
     630             :     (zd) = fd_r43x6_sqr( (xd) );                             \
     631             :   } while(0)
     632             : 
     633             : #define FD_R43X6_POW22523_1_INL( za,xa ) do { \
     634             :     (za) = fd_r43x6_pow22523( (xa) );         \
     635             :   } while(0)
     636             : 
     637             : #define FD_R43X6_POW22523_2_INL( za,xa, zb,xb ) do { \
     638             :     (za) = fd_r43x6_pow22523( (xa) );                \
     639             :     (zb) = fd_r43x6_pow22523( (xb) );                \
     640             :   } while(0)
     641             : 
     642             : #else /* HPC implementation */
     643             : 
     644             : /* Nothing to interleave so let compiler decide */
     645             : 
     646    33529460 : #define FD_R43X6_MUL1_INL( z,x,y ) do { \
     647    33529460 :     (z) = fd_r43x6_mul( (x), (y) );     \
     648    33529460 :   } while(0)
     649             : 
     650             : /* Seems to be slightly faster to let compiler decide */
     651             : 
     652    18828724 : #define FD_R43X6_MUL2_INL( za,xa,ya, zb,xb,yb ) do { \
     653    18828724 :     (za) = fd_r43x6_mul( (xa), (ya) );               \
     654    18828724 :     (zb) = fd_r43x6_mul( (xb), (yb) );               \
     655    18828724 :   } while(0)
     656             : 
     657             : /* Slightly faster to pack / pack / mul / fold / unpack */
     658             : 
     659    10000000 : #define FD_R43X6_MUL3_INL( za,xa,ya, zb,xb,yb, zc,xc,yc ) do {                                   \
     660    10000000 :     FD_R43X6_QUAD_DECL( _X ); FD_R43X6_QUAD_PACK         ( _X, (xa),(xb),(xc),fd_r43x6_zero() ); \
     661    10000000 :     FD_R43X6_QUAD_DECL( _Y ); FD_R43X6_QUAD_PACK         ( _Y, (ya),(yb),(yc),fd_r43x6_zero() ); \
     662    10000000 :     FD_R43X6_QUAD_DECL( _Z ); FD_R43X6_QUAD_MUL_FAST     (  _Z, _X, _Y );                        \
     663    10000000 :     /**/                      FD_R43X6_QUAD_FOLD_UNSIGNED( _Z, _Z );                             \
     664    10000000 :     fd_r43x6_t _zd;           FD_R43X6_QUAD_UNPACK       ( (za),(zb),(zc),_zd, _Z );             \
     665    10000000 :     (void)_zd;                                                                                   \
     666    10000000 :   } while(0)
     667             : 
     668             : /* Substantially faster to pack / pack / mul / fold / unpack */
     669             : 
     670    10030002 : #define FD_R43X6_MUL4_INL( za,xa,ya, zb,xb,yb, zc,xc,yc, zd,xd,yd ) do {              \
     671    10030002 :     FD_R43X6_QUAD_DECL( _X ); FD_R43X6_QUAD_PACK         ( _X, (xa),(xb),(xc),(xd) ); \
     672    10030002 :     FD_R43X6_QUAD_DECL( _Y ); FD_R43X6_QUAD_PACK         ( _Y, (ya),(yb),(yc),(yd) ); \
     673    10030002 :     FD_R43X6_QUAD_DECL( _Z ); FD_R43X6_QUAD_MUL_FAST     (  _Z, _X, _Y );             \
     674    10030002 :     /**/                      FD_R43X6_QUAD_FOLD_UNSIGNED( _Z, _Z );                  \
     675    10030002 :     /**/                      FD_R43X6_QUAD_UNPACK       ( (za),(zb),(zc),(zd), _Z ); \
     676    10030002 :   } while(0)
     677             : 
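/* For example (illustrative only), since in-place operation is okay,

     FD_R43X6_MUL4_INL( ra,ra,s, rb,rb,s, rc,rc,s, rd,rd,s );

   scales four hypothetical accumulators ra,rb,rc,rd by a common
   fd_r43x6_t s in a single data parallel pass. */
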
     678             : /* Nothing to interleave so let compiler decide */
     679             : 
     680   428715168 : #define FD_R43X6_SQR1_INL( z,x ) do { (z) = fd_r43x6_sqr( (x) ); } while(0)
     681             : 
     682             : /* Seems to be slightly faster to let compiler decide */
     683             : 
     684   152515514 : #define FD_R43X6_SQR2_INL( za,xa, zb,xb ) do { \
     685   152515514 :     (za) = fd_r43x6_sqr( (xa) );               \
     686   152515514 :     (zb) = fd_r43x6_sqr( (xb) );               \
     687   152515514 :   } while(0)
     688             : 
     689             : /* Seems to be slightly faster to let compiler decide */
     690             : 
     691    10000000 : #define FD_R43X6_SQR3_INL( za,xa, zb,xb, zc,xc ) do { \
     692    10000000 :     (za) = fd_r43x6_sqr( (xa) );                      \
     693    10000000 :     (zb) = fd_r43x6_sqr( (xb) );                      \
     694    10000000 :     (zc) = fd_r43x6_sqr( (xc) );                      \
     695    10000000 :   } while(0)
     696             : 
     697             : /* Substantially faster to pack / pack / sqr / fold / unpack */
     698             : 
     699    10000000 : #define FD_R43X6_SQR4_INL( za,xa, zb,xb, zc,xc, zd,xd ) do {                          \
     700    10000000 :     FD_R43X6_QUAD_DECL( _X ); FD_R43X6_QUAD_PACK         ( _X, (xa),(xb),(xc),(xd) ); \
     701    10000000 :     FD_R43X6_QUAD_DECL( _Z ); FD_R43X6_QUAD_SQR_FAST     ( _Z, _X );                  \
     702    10000000 :     /**/                      FD_R43X6_QUAD_FOLD_UNSIGNED( _Z, _Z );                  \
     703    10000000 :     /**/                      FD_R43X6_QUAD_UNPACK       ( (za),(zb),(zc),(zd), _Z ); \
     704    10000000 :   } while(0)
     705             : 
     706             : /* Nothing to interleave so let compiler decide */
     707             : 
     708      131072 : #define FD_R43X6_POW22523_1_INL( za,xa ) do { \
     709      131072 :     (za) = fd_r43x6_pow22523( (xa) );         \
     710      131072 :   } while(0)
     711             : 
     712             : /* This is very expensive with a huge instruction footprint.  So we just
     713             :    wrap to avoid pointer escapes from inhibiting optimization and call a
     714             :    separately compiled version. */
     715             : 
     716      431826 : #define FD_R43X6_POW22523_2_INL( za,xa, zb,xb ) do { \
     717      431826 :     fd_r43x6_t _za; fd_r43x6_t _zb;                  \
     718      431826 :     fd_r43x6_pow22523_2( &_za,(xa), &_zb,(xb) );     \
     719      431826 :     (za) = _za; (zb) = _zb;                          \
     720      431826 :   } while(0)
     721             : 
     722             : void
     723             : fd_r43x6_pow22523_2( fd_r43x6_t * _za, fd_r43x6_t za,
     724             :                      fd_r43x6_t * _zb, fd_r43x6_t zb );
     725             : 
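/* Per the usual "pow22523" naming (and the scalar fd_r43x6_pow22523
   these wrap), the POW22523 macros compute z = x^(2^252 - 3), i.e.
   x^((p-5)/8), the exponentiation used e.g. when decompressing ED25519
   points. */
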
     726             : #endif /* HPC implementation */
     727             : 
     728             : FD_PROTOTYPES_END
     729             : 
     730             : #endif /* HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_inl_h */

Generated by: LCOV version 1.14