LCOV - code coverage report
Current view: top level - ballet/ed25519/avx512 - fd_r43x6_ge.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 65 78 83.3 %
Date: 2025-01-08 12:08:44 Functions: 2 52 3.8 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_ge_h
       2             : #define HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_ge_h
       3             : 
       4             : /* This header provides APIs for manipulating group elements / curve
       5             :    points in ED25519.  Direct quotes from RFC 8032 are indicated with
       6             :    '//' style comments. */
       7             : 
       8             : /* A curve point will be represented by a FD_R43X6_QUAD (X,Y,Z,T) in
       9             :    extended homogeneous coordinates where X, Y, Z and T hold fd_r43x6_t
      10             :    u44 representations typically and X Y = T Z. */
      11             : 
      12             : // Section 5.1.4 (page 11)
      13             : //
      14             : // A point (x,y) is represented in extended homogeneous coordinates
      15             : // (X, Y, Z, T), with x = X/Z, y = Y/Z, x * y = T/Z.
      16             : 
      17             : #include "fd_r43x6.h"
      18             : 
      19             : FD_PROTOTYPES_BEGIN
      20             : 
      21             : /* FD_R43X6_GE_ZERO(P) does P = the curve neutral point (0,1,1,0).
      22             :    (X,Y,Z,T) will be reduced representations. */
      23             : 
      24             : // Section 5.1.4 (page 11):
      25             : //
      26             : // The neutral point is (0,1), or equivalently in extended homogeneous
      27             : // coordinates (0, Z, Z, 0) for any non-zero Z.
      28             : 
      29      855610 : #define FD_R43X6_GE_ZERO(P) do { P##03 = wwl( 0L,1L,1L,0L, 0L,0L,0L,0L ); P##14 = wwl_zero(); P##25 = wwl_zero(); } while(0)
      30             : 
      31             : /* FD_R43X6_GE_ONE(P) does P = the curve "base" point.  (X,Y,Z,T) are all
      32             :    reduced representations with Z==1.  Section 5.1 (page 9):
      33             : 
      34             :      B = (15112221349535400772501151409588531511454012693041857206046113283949847762202,
      35             :           46316835694926478169428394003475163141307993866256225615783033603165251855960)
      36             : 
      37             :    The below limbs for a reduced fd_r43x6_t representation were computed
      38             :    from the above using Python. */
      39             : 
      40             : #define FD_R43X6_GE_ONE(P) do {                                                                                            \
      41             :     P##03 = wwl( 5912276620570L, 7036874417752L, 1L, 2970602692003L, 1206867684910L, 3518437208883L, 0L, 8002368565694L ); \
      42             :     P##14 = wwl( 5175273663173L, 5277655813324L, 0L, 2381000326097L, 7581689711182L, 7036874417766L, 0L,  787695955620L ); \
      43             :     P##25 = wwl( 1806891319892L, 1759218604441L, 0L, 4963950264797L,  286998243226L,  879609302220L, 0L,  889305571247L ); \
      44             :   } while(0)
      45             : 
      46             : /* FD_R43X6_GE_IS_EQ(X,Y) returns 1 if X and Y represent the same curve
      47             :    point and 0 otherwise.  X and Y should be FD_R43X6_QUAD holding u46
      48             :    representations. */
      49             : 
      50      255759 : #define FD_R43X6_GE_IS_EQ( X, Y ) fd_r43x6_ge_is_eq( X##03,X##14,X##25, Y##03,Y##14,Y##25 )
      51             : 
      52             : FD_FN_UNUSED static int /* let compiler decide if worth inlining */
      53             : fd_r43x6_ge_is_eq( wwl_t X03, wwl_t X14, wwl_t X25,
      54      255759 :                    wwl_t Y03, wwl_t Y14, wwl_t Y25 ) {
      55             : 
      56             :   /* We use the same method from the spec to test equality.  It is worth
      57             :      noting that the standard itself specifies using a two point check
      58             :      to avoid doing an unnecessary inversion.  E.g. It is somewhat odd
      59             :      that OpenSSL implementation chooses instead to encode R' to r'
      60             :      (slow) compare the encoded r and r' for its equality check in
      61             :      verify. */
      62             : 
      63             :   // Section 6 classEdwardsPoint (page 51)
      64             :   //
      65             :   // #Check that two points are equal.
      66             :   // def __eq__(self,y):
      67             :   //     #Need to check x1/z1 == x2/z2 and similarly for y, so cross
      68             :   //     #multiply to eliminate divisions.
      69             :   //     xn1=self.x*y.z
      70             :   //     xn2=y.x*self.z
      71             :   //     yn1=self.y*y.z
      72             :   //     yn2=y.y*self.z
      73             :   //     return xn1==xn2 and yn1==yn2
      74             : 
      75      255759 :   fd_r43x6_t xn1, xn2, yn1, yn2;
      76      255759 :   FD_R43X6_QUAD_PERMUTE ( X, 2,0,2,1, X );         /* X = XZ |XX |XZ |XY,  in u46|u46|u46|u46 */
      77      255759 :   FD_R43X6_QUAD_PERMUTE ( Y, 0,2,1,2, Y );         /* Y = YX |YZ |YY |YZ , in u46|u46|u46|u46 */
      78      255759 :   FD_R43X6_QUAD_MUL_FAST( X, X, Y );               /* X = xn2|xn1|yn2|yn1, in u62|u62|u62|u62 */
      79      255759 :   FD_R43X6_QUAD_UNPACK  ( xn2, xn1, yn2, yn1, X );
      80      255759 :   return (int)(!fd_r43x6_is_nonzero( fd_r43x6_sub_fast( xn1, xn2 ) /* in s62 */ )) &
      81      255759 :          (int)(!fd_r43x6_is_nonzero( fd_r43x6_sub_fast( yn1, yn2 ) /* in s62 */ ));
      82      255759 : }
      83             : 
      84             : /* FD_R43X6_QUAD_1112d(Q) does Q = (1,1,2,2*d).  (X,Y,Z,T) will be
      85             :    reduced fd_r43x6_t. */
      86             : 
      87     4722415 : #define FD_R43X6_QUAD_1112d( Q ) do {                                      \
      88     4722415 :     Q##03 = wwl( 1L, 1L, 1L, 3934839304537L, 0L, 0L, 0L,  521695520920L ); \
      89     4722415 :     Q##14 = wwl( 0L, 0L, 0L,  507525298899L, 0L, 0L, 0L, 6596238350568L ); \
      90     4722415 :     Q##25 = wwl( 0L, 0L, 0L,   15037786634L, 0L, 0L, 0L,  309467527341L ); \
      91     4722415 :   } while(0)
      92             : 
      93             : /* FD_R43X6_GE_ADD(P3,P1,P2) computes P3 = P1 + P2 where P1, P2 and P3
      94             :    are FD_R43X6_QUAD.  P1 and P2 should hold s61 representations.  P3
      95             :    will hold u44 representations on return.  In place operation fine. */
      96             : 
      97             : // Section 5.1.4 (page 12):
      98             : //
      99             : // The following formulas for adding two points, (x3,y3) =
     100             : // (x1,y1)+(x2,y2), on twisted Edwards curves with a=-1, square a, and
     101             : // non-square d are described in Section 3.1 of [Edwards-revisited] and
     102             : // in [EFD-TWISTED-ADD].  They are complete, i.e., they work for any
     103             : // pair of valid input points.
     104             : //
     105             : //   A = (Y1-X1)*(Y2-X2)
     106             : //   B = (Y1+X1)*(Y2+X2)
     107             : //   C = T1*2*d*T2
     108             : //   D = Z1*2*Z2
     109             : //   E = B-A
     110             : //   F = D-C
     111             : //   G = D+C
     112             : //   H = B+A
     113             : //   X3 = E*F
     114             : //   Y3 = G*H
     115             : //   T3 = E*H
     116             : //   Z3 = F*G
     117             : 
     118             : #if 1 /* Seems very slightly faster than the below */
     119     2012615 : #define FD_R43X6_GE_ADD( P3, P1, P2 ) do {                                                                             \
     120     2012615 :     FD_R43X6_QUAD_DECL         ( _1112d );                                                                             \
     121     2012615 :     FD_R43X6_QUAD_DECL         ( _ta );                                                                                \
     122     2012615 :     FD_R43X6_QUAD_DECL         ( _tb );                                                                                \
     123     2012615 :     FD_R43X6_QUAD_1112d        ( _1112d );                      /*       (1,    1,    1,    2*d  ), u43|u43|u43|u43 */ \
     124     2012615 :     FD_R43X6_QUAD_PERMUTE      ( _ta, 1,0,2,3, P1 );            /* _ta = (Y1,   X1,   Z1,   T1   ), s61|s61|s61|s61 */ \
     125     2012615 :     FD_R43X6_QUAD_PERMUTE      ( _tb, 1,0,2,3, P2 );            /* _tb = (Y2,   X2,   Z2,   T2   ), s61|s61|s61|s61 */ \
     126     2012615 :     FD_R43X6_QUAD_LANE_SUB_FAST( _ta, _ta, 1,0,0,0, _ta, P1 );  /* _ta = (Y1-X1,X1,   Z1,   T1   ), s62|s61|s61|s61 */ \
     127     2012615 :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,0, _tb, P2 );  /* _tb = (Y2-X2,X2,   Z2,   T2   ), s62|s61|s61|s61 */ \
     128     2012615 :     FD_R43X6_QUAD_LANE_ADD_FAST( _ta, _ta, 0,1,1,0, _ta, P1 );  /* _ta = (Y1-X1,Y1+X1,Z1*2, T1   ), s62|s62|s61|s61 */ \
     129     2012615 :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,0,0, _tb, P2 );  /* _tb = (Y2-X2,Y2+X2,Z2,   T2   ), s62|s62|s61|s61 */ \
     130     2012615 :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (A,    B,    D,    C    ), u62|u62|u62|u62 */ \
     131     2012615 :     FD_R43X6_QUAD_FOLD_UNSIGNED( _ta, _ta );                    /* _ta = (Y1-X1,Y1+X1,Z1*2, T1*2d), u44|u44|u44|u44 */ \
     132     2012615 :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _1112d );            /* _ta = (Y1-X1,Y1+X1,Z1*2, T1*2d), u62|u62|u62|u62 */ \
     133     2012615 :     FD_R43X6_QUAD_FOLD_UNSIGNED( _ta, _ta );                    /* _ta = (A,    B,    D,    C    ), u44|u44|u44|u44 */ \
     134     2012615 :     FD_R43X6_QUAD_PERMUTE      ( _tb, 1,0,3,2, _ta );           /* _tb = (B,    A,    C,    D    ), u62|u62|u62|u62 */ \
     135     2012615 :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,1, _tb, _ta ); /* _tb = (E,    A,    C,    F    ), s62|u62|u62|s62 */ \
     136     2012615 :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,1,0, _tb, _ta ); /* _tb = (E,    H,    G,    F    ), s62|u63|u63|s62 */ \
     137     2012615 :     FD_R43X6_QUAD_PERMUTE      ( _ta, 0,2,2,0, _tb );           /* _ta = (E,    G,    G,    E    ), u44|u44|u44|u44 */ \
     138     2012615 :     FD_R43X6_QUAD_PERMUTE      ( _tb, 3,1,3,1, _tb );           /* _tb = (F,    H,    F,    H    ), u44|u44|u44|u44 */ \
     139     2012615 :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (X3,   Y3,   Z3,   T3   ), u62|u62|u62|u62 */ \
     140     2012615 :     FD_R43X6_QUAD_FOLD_UNSIGNED( P3, _ta );                     /* P3  = (X3,   Y3,   Z3,   T3   ), u44|u44|u44|u44 */ \
     141     2012615 :   } while(0)
     142             : #else /* Seems very slightly slower than the above */
     143             : #define FD_R43X6_GE_ADD( P3, P1, P2 ) do {                                                                          \
     144             :     FD_R43X6_QUAD_DECL         ( _ta );                                                                             \
     145             :     FD_R43X6_QUAD_DECL         ( _tb );                                                                             \
     146             :     FD_R43X6_QUAD_PERMUTE      ( _ta, 1,0,2,3, P1 );            /* _ta = (Y1,   X1,   Z1,   T1), s61|s61|s61|s61 */ \
     147             :     FD_R43X6_QUAD_PERMUTE      ( _tb, 1,0,2,3, P2 );            /* _tb = (Y2,   X2,   Z2,   T2), s61|s61|s61|s61 */ \
     148             :     FD_R43X6_QUAD_LANE_SUB_FAST( _ta, _ta, 1,0,0,0, _ta, P1 );  /* _ta = (Y1-X1,X1,   Z1,   T1), s62|s61|s61|s61 */ \
     149             :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,0, _tb, P2 );  /* _tb = (Y2-X2,X2,   Z2,   T2), s62|s61|s61|s61 */ \
     150             :     FD_R43X6_QUAD_LANE_ADD_FAST( _ta, _ta, 0,1,1,0, _ta, P1 );  /* _ta = (Y1-X1,Y1+X1,2*Z1, T1), s62|s62|s62|s61 */ \
     151             :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,0,0, _tb, P2 );  /* _tb = (Y2-X2,Y2+X2,Z2,   T2), s62|s62|s61|s61 */ \
     152             :     FD_R43X6_QUAD_FOLD_SIGNED  ( _ta, _ta );                    /* _ta = (Y1-X1,Y1+X1,2*Z1, T1), u44|u44|u44|u44 */ \
     153             :     FD_R43X6_QUAD_FOLD_SIGNED  ( _tb, _tb );                    /* _tb = (Y2-X2,Y2+X2,Z2,   T2), u44|u44|u44|u44 */ \
     154             :     fd_r43x6_t _YmX1, _YpX1, _2Z1, _T1;                                                                             \
     155             :     FD_R43X6_QUAD_UNPACK( _YmX1, _YpX1, _2Z1, _T1, _ta );                                                           \
     156             :     FD_R43X6_QUAD_PACK( _ta, _YmX1, _YpX1, _2Z1, fd_r43x6_mul( _T1, fd_r43x6_2d() ) );                              \
     157             :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (A,    B,    D,    C ), u62|u62|u62|u62 */ \
     158             :     FD_R43X6_QUAD_PERMUTE      ( _tb, 1,0,3,2, _ta );           /* _tb = (B,    A,    C,    D ), u62|u62|u62|u62 */ \
     159             :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,1, _tb, _ta ); /* _tb = (E,    A,    C,    F ), s62|u62|u62|s62 */ \
     160             :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,1,0, _tb, _ta ); /* _tb = (E,    H,    G,    F ), s62|u63|u63|s62 */ \
     161             :     FD_R43X6_QUAD_FOLD_SIGNED  ( _tb, _tb );                    /* _tb = (E,    H,    G,    F ), u44|u44|u44|u44 */ \
     162             :     FD_R43X6_QUAD_PERMUTE      ( _ta, 0,2,2,0, _tb );           /* _ta = (E,    G,    G,    E ), u44|u44|u44|u44 */ \
     163             :     FD_R43X6_QUAD_PERMUTE      ( _tb, 3,1,3,1, _tb );           /* _tb = (F,    H,    F,    H ), u44|u44|u44|u44 */ \
     164             :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (X3,   Y3,   Z3,   T3), u62|u62|u62|u62 */ \
     165             :     FD_R43X6_QUAD_FOLD_UNSIGNED( P3, _ta );                     /* P3  = (X3,   Y3,   Z3,   T3), u44|u44|u44|u44 */ \
     166             :   } while(0)
     167             : #endif
     168             : 
     169             : /* FD_R43X6_GE_ADD_TABLE does the same thing as FD_R43X6_GE_ADD where T1
     170             :    holds (Y1-X1,Y1+X1,Z1*2,T1*2d).  T1 and P2 should be in s61
     171             :    representations.  P3 will hold u44 representations. */
     172             : 
     173             : #define FD_R43X6_GE_ADD_TABLE( P3, T1, P2 ) do {                                                                      \
     174             :     FD_R43X6_QUAD_DECL         ( _ta );                                                                               \
     175             :     FD_R43X6_QUAD_DECL         ( _tb );                                                                               \
     176             :     FD_R43X6_QUAD_MOV          ( _ta, T1 );                     /* _ta = (Y1-X1,Y1+X1,Z1*2,T1*2d), s61|s61|s61|s61 */ \
     177             :     FD_R43X6_QUAD_PERMUTE      ( _tb, 1,0,2,3, P2 );            /* _tb = (Y2,   X2,   Z2,  T2   ), s61|s61|s61|s61 */ \
     178             :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,0, _tb, P2 );  /* _tb = (Y2-X2,X2,   Z2,  T2   ), s62|s61|s61|s61 */ \
     179             :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,0,0, _tb, P2 );  /* _tb = (Y2-X2,Y2+X2,Z2,  T2   ), s62|s62|s61|s61 */ \
     180             :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (A,    B,    D,   C    ), u62|u62|u62|u62 */ \
     181             :     FD_R43X6_QUAD_FOLD_UNSIGNED( _ta, _ta );                    /* _ta = (A,    B,    D,   C    ), u44|u44|u44|u44 */ \
     182             :     FD_R43X6_QUAD_PERMUTE      ( _tb, 1,0,3,2, _ta );           /* _tb = (B,    A,    C,   D    ), u62|u62|u62|u62 */ \
     183             :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,1, _tb, _ta ); /* _tb = (E,    A,    C,   F    ), s62|u62|u62|s62 */ \
     184             :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,1,0, _tb, _ta ); /* _tb = (E,    H,    G,   F    ), s62|u63|u63|s62 */ \
     185             :     FD_R43X6_QUAD_PERMUTE      ( _ta, 0,2,2,0, _tb );           /* _ta = (E,    G,    G,   E    ), u44|u44|u44|u44 */ \
     186             :     FD_R43X6_QUAD_PERMUTE      ( _tb, 3,1,3,1, _tb );           /* _tb = (F,    H,    F,   H    ), u44|u44|u44|u44 */ \
     187             :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (X3,   Y3,   Z3,  T3   ), u62|u62|u62|u62 */ \
     188             :     FD_R43X6_QUAD_FOLD_UNSIGNED( P3, _ta );                     /* P3  = (X3,   Y3,   Z3,  T3   ), u44|u44|u44|u44 */ \
     189             :   } while(0)
     190             : 
     191             : /* FD_R43X6_GE_DBL(P3,P1) computes P3 = 2*P1 where P1 and P3 are
     192             :    FD_R43X6_GE.  P1 should hold u44 representations.  P3 will hold u44
     193             :    representations on return.  In place operation fine. */
     194             : 
     195             : // Section 5.1.4 (page 12):
     196             : //
     197             : // For point doubling, (x3,y3) = (x1,y1)+(x1,y1), one could just
     198             : // substitute equal points in the above (because of completeness, such
     199             : // substitution is valid) and observe that four multiplications turn
     200             : // into squares.  However, using the formulas described in Section 3.2
     201             : // of [Edwards-revisited] and in [EFD-TWISTED-DBL] saves a few smaller
     202             : // operations.
     203             : //
     204             : //   A = X1^2
     205             : //   B = Y1^2
     206             : //   C = 2*Z1^2
     207             : //   H = A+B
     208             : //   E = H-(X1+Y1)^2
     209             : //   G = A-B
     210             : //   F = C+G
     211             : //   X3 = E*F
     212             : //   Y3 = G*H
     213             : //   T3 = E*H
     214             : //   Z3 = F*G
     215             : 
     216             : /* TODO: CONSIDER MUL INSTEAD OF SQR TO GET THE 2* AT THE SAME TIME? */
     217    67752671 : #define FD_R43X6_GE_DBL( P3, P1 ) do {                                                                              \
     218    67752671 :     FD_R43X6_QUAD_DECL         ( _ta );                                                                             \
     219    67752671 :     FD_R43X6_QUAD_DECL         ( _tb );                                                                             \
     220    67752671 :     FD_R43X6_QUAD_DECL         ( _BB );                                                                             \
     221    67752671 :     FD_R43X6_QUAD_PERMUTE      ( _ta, 1,1,2,0, P1 );            /* _ta = (Y1,       Y1,Z1,  X1), u44/u44/u44/u44 */ \
     222    67752671 :     FD_R43X6_QUAD_LANE_ADD_FAST( _ta, _ta, 1,0,0,0, _ta, P1 );  /* _ta = (X1+Y1,    Y1,Z1,  X1), u45/u44/u44/u44 */ \
     223    67752671 :     FD_R43X6_QUAD_SQR_FAST     ( _ta, _ta );                    /* _ta = ((X1+Y1)^2,B, Z1^2,A ), u61/u61/u61/u61 */ \
     224    67752671 :     FD_R43X6_QUAD_FOLD_UNSIGNED( _ta, _ta );                    /* _ta = ((X1+Y1)^2,B, Z1^2,A ), u44/u44/u44/u44 */ \
     225    67752671 :     FD_R43X6_QUAD_LANE_ADD_FAST( _ta, _ta, 0,0,1,0, _ta, _ta ); /* _ta = ((X1+Y1)^2,B, C,   A ), u44/u44/u45/u44 */ \
     226    67752671 :     FD_R43X6_QUAD_PERMUTE      ( _tb, 3,3,3,3, _ta );           /* _tb = (A,        A, A,   A ), u44/u44/u44/u44 */ \
     227    67752671 :     FD_R43X6_QUAD_PERMUTE      ( _BB, 1,1,1,1, _ta );           /* _BB = (B,        B, B,   B ), u44/u44/u44/u44 */ \
     228    67752671 :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 1,0,0,1, _tb, _BB ); /* _tb = (H,        A, A,   H ), u45/u44/u44/u45 */ \
     229    67752671 :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 0,1,1,0, _tb, _BB ); /* _tb = (H,        G, G,   H ), u45/u45/u45/u45 */ \
     230    67752671 :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,0,1,0, _tb, _ta ); /* _tb = (H,        G, F,   H ), u45/u45/u46/u45 */ \
     231    67752671 :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,0, _tb, _ta ); /* _tb = (E,        G, F,   H ), u46/u45/u46/u45 */ \
     232    67752671 :     FD_R43X6_QUAD_PERMUTE      ( _ta, 0,1,1,0, _tb );           /* _tb = (E,        G, G,   E ), u46/u45/u45/u46 */ \
     233    67752671 :     FD_R43X6_QUAD_PERMUTE      ( _tb, 2,3,2,3, _tb );           /* _tb = (F,        H, F,   H ), u46/u45/u46/u45 */ \
     234    67752671 :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (X3,       Y3,Z3,  T3), u62/u62/u62/u62 */ \
     235    67752671 :     FD_R43X6_QUAD_FOLD_UNSIGNED( P3, _ta );                     /* P3  = (X3,       Y3,Z3,  T3), u44/u44/u44/u44 */ \
     236    67752671 :   } while(0)
     237             : 
     238             : /* FD_R43X6_GE_IS_SMALL_ORDER(P) returns 1 if [8]P is the curve neutral
     239             :    point and 0 otherwise.  P should be a FD_R43X6_QUAD holding u44
     240             :    representations of a valid curve point. */
     241             : 
     242             : #define FD_R43X6_GE_IS_SMALL_ORDER( P ) fd_r43x6_ge_is_small_order( P##03,P##14,P##25 )
     243             : 
     244             : FD_FN_UNUSED static int /* let compiler decide if worth inlining */
     245           0 : fd_r43x6_ge_is_small_order( wwl_t P03, wwl_t P14, wwl_t P25 ) {
     246           0 :   for( int i=0; i<3; i++ ) FD_R43X6_GE_DBL( P, P ); /* P = [8]P, in u44|u44|u44|u44 */
     247           0 : 
     248           0 :   /* We do a faster check than is_eq above by propagating the 0 and 1
     249           0 :      values of the curve neutral point into the multiplication and
     250           0 :      simplifying it.  This is equivalent to checking that the result has
     251           0 :      the form (0|Z|Z|0).  Note that if x is a representation of field
     252           0 :      element 0, t is also a representation 0 as t = x*y. */
     253           0 : 
     254           0 :   fd_r43x6_t x, y, z, t;
     255           0 :   FD_R43X6_QUAD_UNPACK( x, y, z, t, P ); (void)t;
     256           0 :   return (int)(!fd_r43x6_is_nonzero( x )) & (int)(!fd_r43x6_is_nonzero( fd_r43x6_sub_fast( y, z ) /* in s44 */ ));
     257           0 : }
     258             : 
     259             : /* FD_R43X6_GE_ENCODE(h,P) encodes a curve point P stored in the
     260             :    FD_R43X6_QUAD P into a unique compressed representation and writes it
     261             :    to the 32-byte memory region whose first byte in the callers address
     262             :    space is h.  P should hold u47 representations. */
     263             : 
     264             : #define FD_R43X6_GE_ENCODE(h,P) wv_stu( (h), fd_r43x6_ge_encode( P##03, P##14, P##25 ) )
     265             : 
     266             : FD_FN_CONST wv_t
     267             : fd_r43x6_ge_encode( wwl_t P03, wwl_t P14, wwl_t P25 );
     268             : 
     269             : /* FD_R43X6_GE_DECODE(P,s) decodes a encoded curve point stored at the
     270             :    32-byte region whose first byte in the caller's address space is
     271             :    pointed to by s into the curve point P.  Returns 0 on success (P will
     272             :    hold the decoded curve point on return in u44 representations) and a
     273             :    negative error code on failure (P will hold reduced 0 for X,Y,Z,T).
     274             :    The below implementation avoids pointer escapes to help the
     275             :    optimizer. */
     276             : 
     277             : #define FD_R43X6_GE_DECODE( P,s ) (__extension__({             \
     278             :     FD_R43X6_QUAD_DECL( _P );                                  \
     279             :     int _err = fd_r43x6_ge_decode( &_P03, &_P14, &_P25, (s) ); \
     280             :     FD_R43X6_QUAD_MOV( P, _P );                                \
     281             :     _err;                                                      \
     282             :   }))
     283             : 
     284             : int
     285             : fd_r43x6_ge_decode( wwl_t * _P03, wwl_t * _P14, wwl_t * _P25,
     286             :                     void const * _vs );
     287             : 
     288             : /* FD_R43X6_GE_DECODE2( Pa,sa, Pb,sb ) does:
     289             : 
     290             :      if(      GE_DECODE( Pa,sa ) ) { (PbX,PbY,PbZ,PbT) = 0; return -1; }
     291             :      else if( GE_DECODE( Pb,sb ) ) { (PaX,PaY,PaZ,PaT) = 0; return -2; }
     292             :      return 0;
     293             : 
     294             :    but faster. */
     295             : 
     296      300755 : #define FD_R43X6_GE_DECODE2( Pa,sa, Pb,sb ) (__extension__({                                      \
     297      300755 :     FD_R43X6_QUAD_DECL( _Pa );    FD_R43X6_QUAD_DECL( _Pb );                                      \
     298      300755 :     int _err = fd_r43x6_ge_decode2( &_Pa03, &_Pa14, &_Pa25, (sa), &_Pb03, &_Pb14, &_Pb25, (sb) ); \
     299      300755 :     FD_R43X6_QUAD_MOV( Pa, _Pa ); FD_R43X6_QUAD_MOV( Pb, _Pb );                                   \
     300      300755 :     _err;                                                                                         \
     301      300755 :   }))
     302             : 
     303             : int
     304             : fd_r43x6_ge_decode2( wwl_t * _Pa03, wwl_t * _Pa14, wwl_t * _Pa25,
     305             :                      void const * _vsa,
     306             :                      wwl_t * _Pb03, wwl_t * _Pb14, wwl_t * _Pb25,
     307             :                      void const * _vsb );
     308             : 
     309             : /* FD_R43X6_GE_SMUL_BASE(R,s) computes R = [s]B where B is the base
     310             :    curve point.  s points to a 32-byte memory region holding a little
     311             :    endian uint256 scalar in [0,2^255).  In-place operation fine.  The
     312             :    implementation has OpenSSL style timing attack mitigations.  The
     313             :    returned R will hold u44 representations.
     314             : 
     315             :    FD_R43X6_GE_SMUL_BASE_VARTIME does the same thing but uses a faster
     316             :    variable time algorithm.
     317             : 
     318             :    Written this funny way to prevent pointer escapes from interfering
     319             :    the optimizer and allow for easy testing of different implementations
     320             :    as this one of this most performance critical operations in the code
     321             :    base.
     322             : 
     323             :    Performance of fd_ed25519_public_from_private (this is almost just a
     324             :    pure smul_base so it is a good indicator of practical end-to-end
     325             :    performance of smul_base ... sign for small messages will show
     326             :    similar results) on a single 2.3 GHz icelake-server core under gcc-12
     327             :    circa 2023 Sep:
     328             : 
     329             :      ref:   ~37.0 us ("vartime" style)
     330             :      large:  ~9.2 us ("vartime" style)
     331             :      small: ~11.3 us (w/timing attack mitigations)
     332             : 
     333             :    For reference:
     334             : 
     335             :      scalar: ~46.0 us ("small" style w/timing attack mitigations)
     336             :      AVX-2:  ~24.9 us ("small" style w/timing attack mitigations)
     337             : 
     338             :    In the large implementation, if table symmetry is not exploited, it
     339             :    gets slightly faster (~9.0 us) but the table footprint roughly
     340             :    doubles (to ~765KiB) such that it has double the cache pressure.  If
     341             :    table symmetry and GE_ADD precomputation is omitted (i.e. GE_ADD is
     342             :    used instead of GE_ADD_TABLE), it runs at ~10.0 us.
     343             : 
     344             :    If the large implementation is modified to use OpenSSL-style timing
     345             :    attack mitigations, it runs at ~11.5 us because the mitigations are
     346             :    so expensive (these scan the whole table every time such that the
     347             :    table lookup timing should be independent of the input s, which is a
     348             :    blunt and portable if naive way of doing it).
     349             : 
     350             :    In the small implementation, by using a 4-bit at-a-time
     351             :    implementation, the table footprint can be reduced to 48KiB.
     352             :    OpenSSL-style timing attack mitigations are then much less expensive
     353             :    but more computation is required.  The result has virtually identical
     354             :    performance but much less cache pressure than the large
     355             :    implementation with timing attack mitigations.  If timing attack
     356             :    mitigations are removed from small, it runs at ~11.1 us.
     357             : 
     358             :    TL;DR This is ~4-5x faster than the original scalar implementation
     359             :    and ~2.2-2.7x faster than the original AVX-2 accelerated
     360             :    implementation.  Performance should roughly scale with core clock
     361             :    speed for these operations. */
     362             : 
     363             : #define FD_R43X6_GE_SMUL_BASE(R,s) do {                                                \
     364             :     FD_R43X6_QUAD_DECL( _R ); fd_r43x6_ge_smul_base_small( &_R03, &_R14, &_R25, (s) ); \
     365             :     FD_R43X6_QUAD_MOV( R, _R );                                                        \
     366             :   } while(0)
     367             : 
     368             : #define FD_R43X6_GE_SMUL_BASE_VARTIME(R,s) do {                                        \
     369             :     FD_R43X6_QUAD_DECL( _R ); fd_r43x6_ge_smul_base_large( &_R03, &_R14, &_R25, (s) ); \
     370             :     FD_R43X6_QUAD_MOV( R, _R );                                                        \
     371             :   } while(0)
     372             : 
     373             : void
     374             : fd_r43x6_ge_smul_base_ref( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     375             :                            void const * _vs ); /* vartime */
     376             : 
     377             : void
     378             : fd_r43x6_ge_smul_base_large( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     379             :                              void const * _vs ); /* vartime */
     380             : 
     381             : void
     382             : fd_r43x6_ge_smul_base_small( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     383             :                              void const * _vs ); /* has timing attack mitigations */
     384             : 
     385             : /* FD_R43X6_GE_FMA_VARTIME computes R = [s]P + Q where s points to a
     386             :    32-bit memory region holding a little endian uint256 scalar in
     387             :    [0,2^255).  P and Q are FD_R43X6_QUADs holding curve points in a u44
     388             :    representation.  R is a FD_R43X6_QUAD that will hold the result in a
     389             :    u44 representation on return.  Uses a variable time algorithm.
     390             :    In-place operation fine. */
     391             : 
     392             : #define FD_R43X6_GE_FMA_VARTIME(R,s,P,Q) do {                                                                         \
     393             :     FD_R43X6_QUAD_DECL( _R ); fd_r43x6_ge_fma_sparse( &_R03,&_R14,&_R25, (s), P##03,P##14,P##25, Q##03,Q##14,Q##25 ); \
     394             :     FD_R43X6_QUAD_MOV( R, _R );                                                                                       \
     395             :   } while(0)
     396             : 
     397             : void
     398             : fd_r43x6_ge_fma_ref( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     399             :                      void const * _vs,
     400             :                      wwl_t    P03, wwl_t    P14, wwl_t    P25,
     401             :                      wwl_t    Q03, wwl_t    Q14, wwl_t    Q25 ); /* vartime */
     402             : 
     403             : void
     404             : fd_r43x6_ge_fma_sparse( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     405             :                         void const * _vs,
     406             :                         wwl_t    P03, wwl_t    P14, wwl_t    P25,
     407             :                         wwl_t    Q03, wwl_t    Q14, wwl_t    Q25 ); /* vartime */
     408             : 
     409             : /* FD_R43X6_GE_DMUL_VARTIME(R,s,k,A) computes R = [s]B + [k]A where s
     410             :    and k point to 32-byte memory regions holding little endian uint256
     411             :    scalars in [0,2^255) and A is a FD_R43X6_QUAD holding a curve point
     412             :    in a u44 representation.  B is the base curve point.  R is a
     413             :    FD_R43X6_QUAD that will hold the result in a u44 representation on
     414             :    return.  Uses a variable time algorithm.  In-place operation fine. */
     415             : 
     416             : #define FD_R43X6_GE_DMUL_VARTIME(R,s,k,A) do {                                                           \
     417             :     FD_R43X6_QUAD_DECL( _R ); fd_r43x6_ge_dmul_sparse( &_R03,&_R14,&_R25, (s), (k), A##03,A##14,A##25 ); \
     418             :     FD_R43X6_QUAD_MOV( R, _R );                                                                          \
     419             :   } while(0)
     420             : 
     421             : void
     422             : fd_r43x6_ge_dmul_ref( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     423             :                       void const * _vs,
     424             :                       void const * _vk,
     425             :                       wwl_t    A03, wwl_t    A14, wwl_t    A25 ); /* vartime */
     426             : 
     427             : void
     428             : fd_r43x6_ge_dmul_sparse( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     429             :                          void const * _vs,
     430             :                          void const * _vk,
     431             :                          wwl_t    A03, wwl_t    A14, wwl_t    A25 ); /* vartime */
     432             : 
     433             : /* fd_r43x6_ge_sparse_table computes a table of odd scalar multiples of
     434             :    P stores them in table.  Given a w in [-max,max], the 3 wwl_t's
     435             :    holding the FD_R43X6_QUAD for [w]P will start at table index:
     436             :    3*((max+w)/2).  This quad will hold Y-X|Y+X|2*Z|T*2*d in a u44
     437             :    representation.  max should be positive and odd and table should have
     438             :    space for 3*(max+1) entries.  This is mostly for internal use. */
     439             : 
     440             : void
     441             : fd_r43x6_ge_sparse_table( wwl_t *    table,
     442             :                           wwl_t P03, wwl_t P14, wwl_t P25,
     443             :                           int        max );
     444             : 
     445             : /* TODO: Consider pure for multi-return functions? */
     446             : 
     447             : FD_PROTOTYPES_END
     448             : 
     449             : #endif /* HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_ge_h */

Generated by: LCOV version 1.14