LCOV - code coverage report
Current view: top level - ballet/ed25519/avx512 - fd_r43x6_ge.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 69 82 84.1 %
Date: 2025-11-15 04:42:28 Functions: 2 48 4.2 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_ge_h
       2             : #define HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_ge_h
       3             : 
       4             : /* This header provides APIs for manipulating group elements / curve
       5             :    points in ED25519.  Direct quotes from RFC 8032 are indicated with
       6             :    '//' style comments. */
       7             : 
       8             : /* A curve point will be represented by a FD_R43X6_QUAD (X,Y,Z,T) in
       9             :    extended homogeneous coordinates where X, Y, Z and T hold fd_r43x6_t
      10             :    u44 representations typically and X Y = T Z. */
      11             : 
      12             : // Section 5.1.4 (page 11)
      13             : //
      14             : // A point (x,y) is represented in extended homogeneous coordinates
      15             : // (X, Y, Z, T), with x = X/Z, y = Y/Z, x * y = T/Z.
      16             : 
      17             : #include "fd_r43x6.h"
      18             : 
      19             : FD_PROTOTYPES_BEGIN
      20             : 
      21             : /* FD_R43X6_GE_ZERO(P) does P = the curve neutral point (0,1,1,0).
      22             :    (X,Y,Z,T) will be reduced representations. */
      23             : 
      24             : // Section 5.1.4 (page 11):
      25             : //
      26             : // The neutral point is (0,1), or equivalently in extended homogeneous
      27             : // coordinates (0, Z, Z, 0) for any non-zero Z.
      28             : 
      29      837340 : #define FD_R43X6_GE_ZERO(P) do { P##03 = wwl( 0L,1L,1L,0L, 0L,0L,0L,0L ); P##14 = wwl_zero(); P##25 = wwl_zero(); } while(0)
      30             : 
      31             : /* FD_R43X6_GE_ONE(P) does P = the curve "base" point.  (X,Y,Z,T) are all
      32             :    reduced representations with Z==1.  Section 5.1 (page 9):
      33             : 
      34             :      B = (15112221349535400772501151409588531511454012693041857206046113283949847762202,
      35             :           46316835694926478169428394003475163141307993866256225615783033603165251855960)
      36             : 
      37             :    The below limbs for a reduced fd_r43x6_t representation were computed
      38             :    from the above using Python. */
      39             : 
      40             : #define FD_R43X6_GE_ONE(P) do {                                                                                            \
      41             :     P##03 = wwl( 5912276620570L, 7036874417752L, 1L, 2970602692003L, 1206867684910L, 3518437208883L, 0L, 8002368565694L ); \
      42             :     P##14 = wwl( 5175273663173L, 5277655813324L, 0L, 2381000326097L, 7581689711182L, 7036874417766L, 0L,  787695955620L ); \
      43             :     P##25 = wwl( 1806891319892L, 1759218604441L, 0L, 4963950264797L,  286998243226L,  879609302220L, 0L,  889305571247L ); \
      44             :   } while(0)
      45             : 
      46             : /* FD_R43X6_GE_IS_EQ(X,Y) returns 1 if X and Y represent the same curve
      47             :    point and 0 otherwise.  X and Y should be FD_R43X6_QUAD holding u46
      48             :    representations. */
      49             : 
      50      255071 : #define FD_R43X6_GE_IS_EQ( X, Y ) fd_r43x6_ge_is_eq( X##03,X##14,X##25, Y##03,Y##14,Y##25 )
      51             : 
      52             : FD_FN_UNUSED static int /* let compiler decide if worth inlining */
      53             : fd_r43x6_ge_is_eq( wwl_t X03, wwl_t X14, wwl_t X25,
      54      255071 :                    wwl_t Y03, wwl_t Y14, wwl_t Y25 ) {
      55             : 
      56             :   /* We use the same method from the spec to test equality.  It is worth
      57             :      noting that the standard itself specifies using a two point check
      58             :      to avoid doing an unnecessary inversion.  E.g. It is somewhat odd
      59             :      that OpenSSL implementation chooses instead to encode R' to r'
      60             :      (slow) compare the encoded r and r' for its equality check in
      61             :      verify. */
      62             : 
      63             :   // Section 6 classEdwardsPoint (page 51)
      64             :   //
      65             :   // #Check that two points are equal.
      66             :   // def __eq__(self,y):
      67             :   //     #Need to check x1/z1 == x2/z2 and similarly for y, so cross
      68             :   //     #multiply to eliminate divisions.
      69             :   //     xn1=self.x*y.z
      70             :   //     xn2=y.x*self.z
      71             :   //     yn1=self.y*y.z
      72             :   //     yn2=y.y*self.z
      73             :   //     return xn1==xn2 and yn1==yn2
      74             : 
      75      255071 :   fd_r43x6_t xn1, xn2, yn1, yn2;
      76      255071 :   FD_R43X6_QUAD_PERMUTE ( X, 2,0,2,1, X );         /* X = XZ |XX |XZ |XY,  in u46|u46|u46|u46 */
      77      255071 :   FD_R43X6_QUAD_PERMUTE ( Y, 0,2,1,2, Y );         /* Y = YX |YZ |YY |YZ , in u46|u46|u46|u46 */
      78      255071 :   FD_R43X6_QUAD_MUL_FAST( X, X, Y );               /* X = xn2|xn1|yn2|yn1, in u62|u62|u62|u62 */
      79      255071 :   FD_R43X6_QUAD_UNPACK  ( xn2, xn1, yn2, yn1, X );
      80      255071 :   return (int)(!fd_r43x6_is_nonzero( fd_r43x6_sub_fast( xn1, xn2 ) /* in s62 */ )) &
      81      255071 :          (int)(!fd_r43x6_is_nonzero( fd_r43x6_sub_fast( yn1, yn2 ) /* in s62 */ ));
      82      255071 : }
      83             : 
      84             : /* FD_R43X6_QUAD_1112d(Q) does Q = (1,1,2,2*d).  (X,Y,Z,T) will be
      85             :    reduced fd_r43x6_t. */
      86             : 
      87     4604881 : #define FD_R43X6_QUAD_1112d( Q ) do {                                      \
      88     4604881 :     Q##03 = wwl( 1L, 1L, 1L, 3934839304537L, 0L, 0L, 0L,  521695520920L ); \
      89     4604881 :     Q##14 = wwl( 0L, 0L, 0L,  507525298899L, 0L, 0L, 0L, 6596238350568L ); \
      90     4604881 :     Q##25 = wwl( 0L, 0L, 0L,   15037786634L, 0L, 0L, 0L,  309467527341L ); \
      91     4604881 :   } while(0)
      92             : 
      93             : /* FD_R43X6_GE_ADD(P3,P1,P2) computes P3 = P1 + P2 where P1, P2 and P3
      94             :    are FD_R43X6_QUAD.  P1 and P2 should hold s61 representations.  P3
      95             :    will hold u44 representations on return.  In place operation fine. */
      96             : 
      97             : // Section 5.1.4 (page 12):
      98             : //
      99             : // The following formulas for adding two points, (x3,y3) =
     100             : // (x1,y1)+(x2,y2), on twisted Edwards curves with a=-1, square a, and
     101             : // non-square d are described in Section 3.1 of [Edwards-revisited] and
     102             : // in [EFD-TWISTED-ADD].  They are complete, i.e., they work for any
     103             : // pair of valid input points.
     104             : //
     105             : //   A = (Y1-X1)*(Y2-X2)
     106             : //   B = (Y1+X1)*(Y2+X2)
     107             : //   C = T1*2*d*T2
     108             : //   D = Z1*2*Z2
     109             : //   E = B-A
     110             : //   F = D-C
     111             : //   G = D+C
     112             : //   H = B+A
     113             : //   X3 = E*F
     114             : //   Y3 = G*H
     115             : //   T3 = E*H
     116             : //   Z3 = F*G
     117             : 
     118             : #if 1 /* Seems very slightly faster than the below */
     119     2012081 : #define FD_R43X6_GE_ADD( P3, P1, P2 ) do {                                                                             \
     120     2012081 :     FD_R43X6_QUAD_DECL         ( _1112d );                                                                             \
     121     2012081 :     FD_R43X6_QUAD_DECL         ( _ta );                                                                                \
     122     2012081 :     FD_R43X6_QUAD_DECL         ( _tb );                                                                                \
     123     2012081 :     FD_R43X6_QUAD_1112d        ( _1112d );                      /*       (1,    1,    1,    2*d  ), u43|u43|u43|u43 */ \
     124     2012081 :     FD_R43X6_QUAD_PERMUTE      ( _ta, 1,0,2,3, P1 );            /* _ta = (Y1,   X1,   Z1,   T1   ), s61|s61|s61|s61 */ \
     125     2012081 :     FD_R43X6_QUAD_PERMUTE      ( _tb, 1,0,2,3, P2 );            /* _tb = (Y2,   X2,   Z2,   T2   ), s61|s61|s61|s61 */ \
     126     2012081 :     FD_R43X6_QUAD_LANE_SUB_FAST( _ta, _ta, 1,0,0,0, _ta, P1 );  /* _ta = (Y1-X1,X1,   Z1,   T1   ), s62|s61|s61|s61 */ \
     127     2012081 :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,0, _tb, P2 );  /* _tb = (Y2-X2,X2,   Z2,   T2   ), s62|s61|s61|s61 */ \
     128     2012081 :     FD_R43X6_QUAD_LANE_ADD_FAST( _ta, _ta, 0,1,1,0, _ta, P1 );  /* _ta = (Y1-X1,Y1+X1,Z1*2, T1   ), s62|s62|s62|s61 */ \
     129     2012081 :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,0,0, _tb, P2 );  /* _tb = (Y2-X2,Y2+X2,Z2,   T2   ), s62|s62|s61|s61 */ \
     130     2012081 :     FD_R43X6_QUAD_FOLD_SIGNED  ( _ta, _ta );                    /* _ta = (Y1-X1,Y1+X1,Z1*2, T1   ), u44|u44|u44|u44 */ \
     131     2012081 :     FD_R43X6_QUAD_FOLD_SIGNED  ( _tb, _tb );                    /* _tb = (Y2-X2,Y2+X2,Z2,   T2   ), u44|u44|u44|u44 */ \
     132     2012081 :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _1112d );            /* _ta = (Y1-X1,Y1+X1,Z1*2, T1*2d), u62|u62|u62|u62 */ \
     133     2012081 :     FD_R43X6_QUAD_FOLD_UNSIGNED( _ta, _ta );                    /* _ta = (Y1-X1,Y1+X1,Z1*2, T1*2d), u44|u44|u44|u44 */ \
     134     2012081 :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (A,    B,    D,    C    ), u62|u62|u62|u62 */ \
     135     2012081 :     /* the next line can't be removed because in 3 lines we'd get s62|u63|u63|s62 and that's not ok for fold_signed */ \
     136     2012081 :     FD_R43X6_QUAD_FOLD_UNSIGNED( _ta, _ta );                    /* _ta = (A,    B,    D,    C    ), u44|u44|u44|u44 */ \
     137     2012081 :     FD_R43X6_QUAD_PERMUTE      ( _tb, 1,0,3,2, _ta );           /* _tb = (B,    A,    C,    D    ), u44|u44|u44|u44 */ \
     138     2012081 :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,1, _tb, _ta ); /* _tb = (E,    A,    C,    F    ), s44|u44|u44|s44 */ \
     139     2012081 :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,1,0, _tb, _ta ); /* _tb = (E,    H,    G,    F    ), s44|u45|u45|s44 */ \
     140     2012081 :     FD_R43X6_QUAD_FOLD_SIGNED  ( _tb, _tb );                    /* _tb = (E,    H,    G,    F    ), u44|u44|u44|u44 */ \
     141     2012081 :     FD_R43X6_QUAD_PERMUTE      ( _ta, 0,2,2,0, _tb );           /* _ta = (E,    G,    G,    E    ), u44|u44|u44|u44 */ \
     142     2012081 :     FD_R43X6_QUAD_PERMUTE      ( _tb, 3,1,3,1, _tb );           /* _tb = (F,    H,    F,    H    ), u44|u44|u44|u44 */ \
     143     2012081 :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (X3,   Y3,   Z3,   T3   ), u62|u62|u62|u62 */ \
     144     2012081 :     FD_R43X6_QUAD_FOLD_UNSIGNED( P3, _ta );                     /* P3  = (X3,   Y3,   Z3,   T3   ), u44|u44|u44|u44 */ \
     145     2012081 :   } while(0)
     146             : #else /* Seems very slightly slower than the above */
     147             : #define FD_R43X6_GE_ADD( P3, P1, P2 ) do {                                                                          \
     148             :     FD_R43X6_QUAD_DECL         ( _ta );                                                                             \
     149             :     FD_R43X6_QUAD_DECL         ( _tb );                                                                             \
     150             :     FD_R43X6_QUAD_PERMUTE      ( _ta, 1,0,2,3, P1 );            /* _ta = (Y1,   X1,   Z1,   T1), s61|s61|s61|s61 */ \
     151             :     FD_R43X6_QUAD_PERMUTE      ( _tb, 1,0,2,3, P2 );            /* _tb = (Y2,   X2,   Z2,   T2), s61|s61|s61|s61 */ \
     152             :     FD_R43X6_QUAD_LANE_SUB_FAST( _ta, _ta, 1,0,0,0, _ta, P1 );  /* _ta = (Y1-X1,X1,   Z1,   T1), s62|s61|s61|s61 */ \
     153             :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,0, _tb, P2 );  /* _tb = (Y2-X2,X2,   Z2,   T2), s62|s61|s61|s61 */ \
     154             :     FD_R43X6_QUAD_LANE_ADD_FAST( _ta, _ta, 0,1,1,0, _ta, P1 );  /* _ta = (Y1-X1,Y1+X1,2*Z1, T1), s62|s62|s62|s61 */ \
     155             :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,0,0, _tb, P2 );  /* _tb = (Y2-X2,Y2+X2,Z2,   T2), s62|s62|s61|s61 */ \
     156             :     FD_R43X6_QUAD_FOLD_SIGNED  ( _ta, _ta );                    /* _ta = (Y1-X1,Y1+X1,2*Z1, T1), u44|u44|u44|u44 */ \
     157             :     FD_R43X6_QUAD_FOLD_SIGNED  ( _tb, _tb );                    /* _tb = (Y2-X2,Y2+X2,Z2,   T2), u44|u44|u44|u44 */ \
     158             :     fd_r43x6_t _YmX1, _YpX1, _2Z1, _T1;                                                                             \
     159             :     FD_R43X6_QUAD_UNPACK( _YmX1, _YpX1, _2Z1, _T1, _ta );                                                           \
     160             :     FD_R43X6_QUAD_PACK( _ta, _YmX1, _YpX1, _2Z1, fd_r43x6_mul( _T1, fd_r43x6_2d() ) );                              \
     161             :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (A,    B,    D,    C ), u62|u62|u62|u62 */ \
     162             :     FD_R43X6_QUAD_PERMUTE      ( _tb, 1,0,3,2, _ta );           /* _tb = (B,    A,    C,    D ), u62|u62|u62|u62 */ \
     163             :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,1, _tb, _ta ); /* _tb = (E,    A,    C,    F ), s62|u62|u62|s62 */ \
     164             :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,1,0, _tb, _ta ); /* _tb = (E,    H,    G,    F ), s62|u63|u63|s62 */ \
     165             :     FD_R43X6_QUAD_FOLD_SIGNED  ( _tb, _tb );                    /* _tb = (E,    H,    G,    F ), u44|u44|u44|u44 */ \
     166             :     FD_R43X6_QUAD_PERMUTE      ( _ta, 0,2,2,0, _tb );           /* _ta = (E,    G,    G,    E ), u44|u44|u44|u44 */ \
     167             :     FD_R43X6_QUAD_PERMUTE      ( _tb, 3,1,3,1, _tb );           /* _tb = (F,    H,    F,    H ), u44|u44|u44|u44 */ \
     168             :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (X3,   Y3,   Z3,   T3), u62|u62|u62|u62 */ \
     169             :     FD_R43X6_QUAD_FOLD_UNSIGNED( P3, _ta );                     /* P3  = (X3,   Y3,   Z3,   T3), u44|u44|u44|u44 */ \
     170             :   } while(0)
     171             : #endif
     172             : 
     173             : /* FD_R43X6_GE_ADD_TABLE does the same thing as FD_R43X6_GE_ADD where T1
     174             :    holds (Y1-X1,Y1+X1,Z1*2,T1*2d).  T1 and P2 should be in s61
     175             :    representations.  P3 will hold u44 representations. */
     176             : 
     177             : #define FD_R43X6_GE_ADD_TABLE( P3, T1, P2 ) do {                                                                      \
     178             :     FD_R43X6_QUAD_DECL         ( _ta );                                                                               \
     179             :     FD_R43X6_QUAD_DECL         ( _tb );                                                                               \
     180             :     FD_R43X6_QUAD_MOV          ( _ta, T1 );                     /* _ta = (Y1-X1,Y1+X1,Z1*2,T1*2d), s61|s61|s61|s61 */ \
     181             :     FD_R43X6_QUAD_PERMUTE      ( _tb, 1,0,2,3, P2 );            /* _tb = (Y2,   X2,   Z2,  T2   ), s61|s61|s61|s61 */ \
     182             :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,0, _tb, P2 );  /* _tb = (Y2-X2,X2,   Z2,  T2   ), s62|s61|s61|s61 */ \
     183             :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,0,0, _tb, P2 );  /* _tb = (Y2-X2,Y2+X2,Z2,  T2   ), s62|s62|s61|s61 */ \
     184             :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (A,    B,    D,   C    ), u62|u62|u62|u62 */ \
     185             :     FD_R43X6_QUAD_FOLD_UNSIGNED( _ta, _ta );                    /* _ta = (A,    B,    D,   C    ), u44|u44|u44|u44 */ \
     186             :     FD_R43X6_QUAD_PERMUTE      ( _tb, 1,0,3,2, _ta );           /* _tb = (B,    A,    C,   D    ), u62|u62|u62|u62 */ \
     187             :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,1, _tb, _ta ); /* _tb = (E,    A,    C,   F    ), s62|u62|u62|s62 */ \
     188             :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,1,1,0, _tb, _ta ); /* _tb = (E,    H,    G,   F    ), s62|u63|u63|s62 */ \
     189             :     FD_R43X6_QUAD_PERMUTE      ( _ta, 0,2,2,0, _tb );           /* _ta = (E,    G,    G,   E    ), u44|u44|u44|u44 */ \
     190             :     FD_R43X6_QUAD_PERMUTE      ( _tb, 3,1,3,1, _tb );           /* _tb = (F,    H,    F,   H    ), u44|u44|u44|u44 */ \
     191             :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (X3,   Y3,   Z3,  T3   ), u62|u62|u62|u62 */ \
     192             :     FD_R43X6_QUAD_FOLD_UNSIGNED( P3, _ta );                     /* P3  = (X3,   Y3,   Z3,  T3   ), u44|u44|u44|u44 */ \
     193             :   } while(0)
     194             : 
     195             : /* FD_R43X6_GE_DBL(P3,P1) computes P3 = 2*P1 where P1 and P3 are
     196             :    FD_R43X6_GE.  P1 should hold u44 representations.  P3 will hold u44
     197             :    representations on return.  In place operation fine. */
     198             : 
     199             : // Section 5.1.4 (page 12):
     200             : //
     201             : // For point doubling, (x3,y3) = (x1,y1)+(x1,y1), one could just
     202             : // substitute equal points in the above (because of completeness, such
     203             : // substitution is valid) and observe that four multiplications turn
     204             : // into squares.  However, using the formulas described in Section 3.2
     205             : // of [Edwards-revisited] and in [EFD-TWISTED-DBL] saves a few smaller
     206             : // operations.
     207             : //
     208             : //   A = X1^2
     209             : //   B = Y1^2
     210             : //   C = 2*Z1^2
     211             : //   H = A+B
     212             : //   E = H-(X1+Y1)^2
     213             : //   G = A-B
     214             : //   F = C+G
     215             : //   X3 = E*F
     216             : //   Y3 = G*H
     217             : //   T3 = E*H
     218             : //   Z3 = F*G
     219             : 
     220             : /* TODO: CONSIDER MUL INSTEAD OF SQR TO GET THE 2* AT THE SAME TIME? */
     221    67424944 : #define FD_R43X6_GE_DBL( P3, P1 ) do {                                                                              \
     222    67424944 :     FD_R43X6_QUAD_DECL         ( _ta );                                                                             \
     223    67424944 :     FD_R43X6_QUAD_DECL         ( _tb );                                                                             \
     224    67424944 :     FD_R43X6_QUAD_DECL         ( _BB );                                                                             \
     225    67424944 :     FD_R43X6_QUAD_PERMUTE      ( _ta, 1,1,2,0, P1 );            /* _ta = (Y1,       Y1,Z1,  X1), u44/u44/u44/u44 */ \
     226    67424944 :     FD_R43X6_QUAD_LANE_ADD_FAST( _ta, _ta, 1,0,0,0, _ta, P1 );  /* _ta = (X1+Y1,    Y1,Z1,  X1), u45/u44/u44/u44 */ \
     227    67424944 :     FD_R43X6_QUAD_SQR_FAST     ( _ta, _ta );                    /* _ta = ((X1+Y1)^2,B, Z1^2,A ), u61/u61/u61/u61 */ \
     228    67424944 :     FD_R43X6_QUAD_FOLD_UNSIGNED( _ta, _ta );                    /* _ta = ((X1+Y1)^2,B, Z1^2,A ), u44/u44/u44/u44 */ \
     229    67424944 :     FD_R43X6_QUAD_LANE_ADD_FAST( _ta, _ta, 0,0,1,0, _ta, _ta ); /* _ta = ((X1+Y1)^2,B, C,   A ), u44/u44/u45/u44 */ \
     230    67424944 :     FD_R43X6_QUAD_PERMUTE      ( _tb, 3,3,3,3, _ta );           /* _tb = (A,        A, A,   A ), u44/u44/u44/u44 */ \
     231    67424944 :     FD_R43X6_QUAD_PERMUTE      ( _BB, 1,1,1,1, _ta );           /* _BB = (B,        B, B,   B ), u44/u44/u44/u44 */ \
     232    67424944 :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 1,0,0,1, _tb, _BB ); /* _tb = (H,        A, A,   H ), u45/u44/u44/u45 */ \
     233    67424944 :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 0,1,1,0, _tb, _BB ); /* _tb = (H,        G, G,   H ), u45/u45/u45/u45 */ \
     234    67424944 :     FD_R43X6_QUAD_LANE_ADD_FAST( _tb, _tb, 0,0,1,0, _tb, _ta ); /* _tb = (H,        G, F,   H ), u45/u45/u46/u45 */ \
     235    67424944 :     FD_R43X6_QUAD_LANE_SUB_FAST( _tb, _tb, 1,0,0,0, _tb, _ta ); /* _tb = (E,        G, F,   H ), u46/u45/u46/u45 */ \
     236    67424944 :     FD_R43X6_QUAD_PERMUTE      ( _ta, 0,1,1,0, _tb );           /* _tb = (E,        G, G,   E ), u46/u45/u45/u46 */ \
     237    67424944 :     FD_R43X6_QUAD_PERMUTE      ( _tb, 2,3,2,3, _tb );           /* _tb = (F,        H, F,   H ), u46/u45/u46/u45 */ \
     238    67424944 :     FD_R43X6_QUAD_MUL_FAST     ( _ta, _ta, _tb );               /* _ta = (X3,       Y3,Z3,  T3), u62/u62/u62/u62 */ \
     239    67424944 :     FD_R43X6_QUAD_FOLD_UNSIGNED( P3, _ta );                     /* P3  = (X3,       Y3,Z3,  T3), u44/u44/u44/u44 */ \
     240    67424944 :   } while(0)
     241             : 
     242             : /* FD_R43X6_GE_IS_SMALL_ORDER(P) returns 1 if [8]P is the curve neutral
     243             :    point and 0 otherwise.  P should be a FD_R43X6_QUAD holding u44
     244             :    representations of a valid curve point. */
     245             : 
     246             : #define FD_R43X6_GE_IS_SMALL_ORDER( P ) fd_r43x6_ge_is_small_order( P##03,P##14,P##25 )
     247             : 
     248             : FD_FN_UNUSED static int /* let compiler decide if worth inlining */
     249           0 : fd_r43x6_ge_is_small_order( wwl_t P03, wwl_t P14, wwl_t P25 ) {
     250           0 :   for( int i=0; i<3; i++ ) FD_R43X6_GE_DBL( P, P ); /* P = [8]P, in u44|u44|u44|u44 */
     251           0 : 
     252           0 :   /* We do a faster check than is_eq above by propagating the 0 and 1
     253           0 :      values of the curve neutral point into the multiplication and
     254           0 :      simplifying it.  This is equivalent to checking that the result has
     255           0 :      the form (0|Z|Z|0).  Note that if x is a representation of field
     256           0 :      element 0, t is also a representation 0 as t = x*y. */
     257           0 : 
     258           0 :   fd_r43x6_t x, y, z, t;
     259           0 :   FD_R43X6_QUAD_UNPACK( x, y, z, t, P ); (void)t;
     260           0 :   return (int)(!fd_r43x6_is_nonzero( x )) & (int)(!fd_r43x6_is_nonzero( fd_r43x6_sub_fast( y, z ) /* in s44 */ ));
     261           0 : }
     262             : 
     263             : /* FD_R43X6_GE_ENCODE(h,P) encodes a curve point P stored in the
     264             :    FD_R43X6_QUAD P into a unique compressed representation and writes it
     265             :    to the 32-byte memory region whose first byte in the callers address
     266             :    space is h.  P should hold u47 representations. */
     267             : 
     268             : #define FD_R43X6_GE_ENCODE(h,P) wv_stu( (h), fd_r43x6_ge_encode( P##03, P##14, P##25 ) )
     269             : 
     270             : FD_FN_CONST wv_t
     271             : fd_r43x6_ge_encode( wwl_t P03, wwl_t P14, wwl_t P25 );
     272             : 
     273             : /* FD_R43X6_GE_DECODE(P,s) decodes a encoded curve point stored at the
     274             :    32-byte region whose first byte in the caller's address space is
     275             :    pointed to by s into the curve point P.  Returns 0 on success (P will
     276             :    hold the decoded curve point on return in u44 representations) and a
     277             :    negative error code on failure (P will hold reduced 0 for X,Y,Z,T).
     278             :    The below implementation avoids pointer escapes to help the
     279             :    optimizer. */
     280             : 
     281             : #define FD_R43X6_GE_DECODE( P,s ) (__extension__({             \
     282             :     FD_R43X6_QUAD_DECL( _P );                                  \
     283             :     int _err = fd_r43x6_ge_decode( &_P03, &_P14, &_P25, (s) ); \
     284             :     FD_R43X6_QUAD_MOV( P, _P );                                \
     285             :     _err;                                                      \
     286             :   }))
     287             : 
     288             : int
     289             : fd_r43x6_ge_decode( wwl_t * _P03, wwl_t * _P14, wwl_t * _P25,
     290             :                     void const * _vs );
     291             : 
     292             : /* FD_R43X6_GE_DECODE2( Pa,sa, Pb,sb ) does:
     293             : 
     294             :      if(      GE_DECODE( Pa,sa ) ) { (PbX,PbY,PbZ,PbT) = 0; return -1; }
     295             :      else if( GE_DECODE( Pb,sb ) ) { (PaX,PaY,PaZ,PaT) = 0; return -2; }
     296             :      return 0;
     297             : 
     298             :    but faster. */
     299             : 
     300      299233 : #define FD_R43X6_GE_DECODE2( Pa,sa, Pb,sb ) (__extension__({                                      \
     301      299233 :     FD_R43X6_QUAD_DECL( _Pa );    FD_R43X6_QUAD_DECL( _Pb );                                      \
     302      299233 :     int _err = fd_r43x6_ge_decode2( &_Pa03, &_Pa14, &_Pa25, (sa), &_Pb03, &_Pb14, &_Pb25, (sb) ); \
     303      299233 :     FD_R43X6_QUAD_MOV( Pa, _Pa ); FD_R43X6_QUAD_MOV( Pb, _Pb );                                   \
     304      299233 :     _err;                                                                                         \
     305      299233 :   }))
     306             : 
     307             : int
     308             : fd_r43x6_ge_decode2( wwl_t * _Pa03, wwl_t * _Pa14, wwl_t * _Pa25,
     309             :                      void const * _vsa,
     310             :                      wwl_t * _Pb03, wwl_t * _Pb14, wwl_t * _Pb25,
     311             :                      void const * _vsb );
     312             : 
     313             : /* FD_R43X6_GE_SMUL_BASE(R,s) computes R = [s]B where B is the base
     314             :    curve point.  s points to a 32-byte memory region holding a little
     315             :    endian uint256 scalar in [0,2^255).  In-place operation fine.  The
     316             :    implementation has OpenSSL style timing attack mitigations.  The
     317             :    returned R will hold u44 representations.
     318             : 
     319             :    FD_R43X6_GE_SMUL_BASE_VARTIME does the same thing but uses a faster
     320             :    variable time algorithm.
     321             : 
     322             :    Written this funny way to prevent pointer escapes from interfering
     323             :    the optimizer and allow for easy testing of different implementations
     324             :    as this one of this most performance critical operations in the code
     325             :    base.
     326             : 
     327             :    Performance of fd_ed25519_public_from_private (this is almost just a
     328             :    pure smul_base so it is a good indicator of practical end-to-end
     329             :    performance of smul_base ... sign for small messages will show
     330             :    similar results) on a single 2.3 GHz icelake-server core under gcc-12
     331             :    circa 2023 Sep:
     332             : 
     333             :      ref:   ~37.0 us ("vartime" style)
     334             :      large:  ~9.2 us ("vartime" style)
     335             :      small: ~11.3 us (w/timing attack mitigations)
     336             : 
     337             :    For reference:
     338             : 
     339             :      scalar: ~46.0 us ("small" style w/timing attack mitigations)
     340             :      AVX-2:  ~24.9 us ("small" style w/timing attack mitigations)
     341             : 
     342             :    In the large implementation, if table symmetry is not exploited, it
     343             :    gets slightly faster (~9.0 us) but the table footprint roughly
     344             :    doubles (to ~765KiB) such that it has double the cache pressure.  If
     345             :    table symmetry and GE_ADD precomputation is omitted (i.e. GE_ADD is
     346             :    used instead of GE_ADD_TABLE), it runs at ~10.0 us.
     347             : 
     348             :    If the large implementation is modified to use OpenSSL-style timing
     349             :    attack mitigations, it runs at ~11.5 us because the mitigations are
     350             :    so expensive (these scan the whole table every time such that the
     351             :    table lookup timing should be independent of the input s, which is a
     352             :    blunt and portable if naive way of doing it).
     353             : 
     354             :    In the small implementation, by using a 4-bit at-a-time
     355             :    implementation, the table footprint can be reduced to 48KiB.
     356             :    OpenSSL-style timing attack mitigations are then much less expensive
     357             :    but more computation is required.  The result has virtually identical
     358             :    performance but much less cache pressure than the large
     359             :    implementation with timing attack mitigations.  If timing attack
     360             :    mitigations are removed from small, it runs at ~11.1 us.
     361             : 
     362             :    TL;DR This is ~4-5x faster than the original scalar implementation
     363             :    and ~2.2-2.7x faster than the original AVX-2 accelerated
     364             :    implementation.  Performance should roughly scale with core clock
     365             :    speed for these operations. */
     366             : 
     367             : #define FD_R43X6_GE_SMUL_BASE(R,s) do {                                                \
     368             :     FD_R43X6_QUAD_DECL( _R ); fd_r43x6_ge_smul_base_small( &_R03, &_R14, &_R25, (s) ); \
     369             :     FD_R43X6_QUAD_MOV( R, _R );                                                        \
     370             :   } while(0)
     371             : 
     372             : #define FD_R43X6_GE_SMUL_BASE_VARTIME(R,s) do {                                        \
     373             :     FD_R43X6_QUAD_DECL( _R ); fd_r43x6_ge_smul_base_large( &_R03, &_R14, &_R25, (s) ); \
     374             :     FD_R43X6_QUAD_MOV( R, _R );                                                        \
     375             :   } while(0)
     376             : 
     377             : void
     378             : fd_r43x6_ge_smul_base_ref( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     379             :                            void const * _vs ); /* vartime */
     380             : 
     381             : void
     382             : fd_r43x6_ge_smul_base_large( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     383             :                              void const * _vs ); /* vartime */
     384             : 
     385             : void
     386             : fd_r43x6_ge_smul_base_small( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     387             :                              void const * _vs ); /* has timing attack mitigations */
     388             : 
     389             : /* FD_R43X6_GE_FMA_VARTIME computes R = [s]P + Q where s points to a
     390             :    32-bit memory region holding a little endian uint256 scalar in
     391             :    [0,2^255).  P and Q are FD_R43X6_QUADs holding curve points in a u44
     392             :    representation.  R is a FD_R43X6_QUAD that will hold the result in a
     393             :    u44 representation on return.  Uses a variable time algorithm.
     394             :    In-place operation fine. */
     395             : 
     396             : #define FD_R43X6_GE_FMA_VARTIME(R,s,P,Q) do {                                                                         \
     397             :     FD_R43X6_QUAD_DECL( _R ); fd_r43x6_ge_fma_sparse( &_R03,&_R14,&_R25, (s), P##03,P##14,P##25, Q##03,Q##14,Q##25 ); \
     398             :     FD_R43X6_QUAD_MOV( R, _R );                                                                                       \
     399             :   } while(0)
     400             : 
     401             : void
     402             : fd_r43x6_ge_fma_ref( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     403             :                      void const * _vs,
     404             :                      wwl_t    P03, wwl_t    P14, wwl_t    P25,
     405             :                      wwl_t    Q03, wwl_t    Q14, wwl_t    Q25 ); /* vartime */
     406             : 
     407             : void
     408             : fd_r43x6_ge_fma_sparse( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     409             :                         void const * _vs,
     410             :                         wwl_t    P03, wwl_t    P14, wwl_t    P25,
     411             :                         wwl_t    Q03, wwl_t    Q14, wwl_t    Q25 ); /* vartime */
     412             : 
     413             : /* FD_R43X6_GE_DMUL_VARTIME(R,s,k,A) computes R = [s]B + [k]A where s
     414             :    and k point to 32-byte memory regions holding little endian uint256
     415             :    scalars in [0,2^255) and A is a FD_R43X6_QUAD holding a curve point
     416             :    in a u44 representation.  B is the base curve point.  R is a
     417             :    FD_R43X6_QUAD that will hold the result in a u44 representation on
     418             :    return.  Uses a variable time algorithm.  In-place operation fine. */
     419             : 
     420             : #define FD_R43X6_GE_DMUL_VARTIME(R,s,k,A) do {                                                           \
     421             :     FD_R43X6_QUAD_DECL( _R ); fd_r43x6_ge_dmul_sparse( &_R03,&_R14,&_R25, (s), (k), A##03,A##14,A##25 ); \
     422             :     FD_R43X6_QUAD_MOV( R, _R );                                                                          \
     423             :   } while(0)
     424             : 
     425             : void
     426             : fd_r43x6_ge_dmul_ref( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     427             :                       void const * _vs,
     428             :                       void const * _vk,
     429             :                       wwl_t    A03, wwl_t    A14, wwl_t    A25 ); /* vartime */
     430             : 
     431             : void
     432             : fd_r43x6_ge_dmul_sparse( wwl_t * _R03, wwl_t * _R14, wwl_t * _R25,
     433             :                          void const * _vs,
     434             :                          void const * _vk,
     435             :                          wwl_t    A03, wwl_t    A14, wwl_t    A25 ); /* vartime */
     436             : 
     437             : /* fd_r43x6_ge_sparse_table computes a table of odd scalar multiples of
     438             :    P stores them in table.  Given a w in [-max,max], the 3 wwl_t's
     439             :    holding the FD_R43X6_QUAD for [w]P will start at table index:
     440             :    3*((max+w)/2).  This quad will hold Y-X|Y+X|2*Z|T*2*d in a u44
     441             :    representation.  max should be positive and odd and table should have
     442             :    space for 3*(max+1) entries.  This is mostly for internal use. */
     443             : 
     444             : void
     445             : fd_r43x6_ge_sparse_table( wwl_t *    table,
     446             :                           wwl_t P03, wwl_t P14, wwl_t P25,
     447             :                           int        max );
     448             : 
     449             : /* TODO: Consider pure for multi-return functions? */
     450             : 
     451             : FD_PROTOTYPES_END
     452             : 
     453             : #endif /* HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_ge_h */

Generated by: LCOV version 1.14