#ifndef HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_h
#define HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_h

#if FD_HAS_AVX512

#include "../../../util/simd/fd_avx.h"
#include "../../../util/simd/fd_avx512.h"

/* A fd_r43x6_t represents a GF(p) element, where p = 2^255-19, in a
   little endian 6 long radix 2^43 limb representation. The 6 limbs are
   held in lanes 0 through 5 of an AVX-512 vector. That is, given a
   fd_r43x6_t x, the field element represented by x is:

     ( x0 + x1 2^43 + x2 2^86 + x3 2^129 + x4 2^172 + x5 2^215 ) mod p

   where xn is the n-th 64-bit vector lane treated as a long. Lanes 6
   and 7 are ignored. The below will often use the shorthand:

     <x0,x1,x2,x3,x4,x5>

   for the above expression.

   This representation is redundant: multiple fd_r43x6_t can represent
   the same element. Most functions restrict which representations can
   be used for inputs and which representations they produce in order
   to support high performance implementation and composability.

   Frequently used representations are:

   - arbitrary:      limbs are in [-2^63,2^63)
   - signed:         limbs are in [-2^62,2^62)
   - unsigned:       limbs are in [0,2^62)
   - unreduced:      limbs are in [0,2^47)
   - unpacked:       limbs 0-4 are in [0,2^43), limb 5 is in [0,2^41)
   - nearly reduced: limbs 0-4 are in [0,2^43), limb 5 is in [0,2^41), the packed uint256 value is in [0,2*p)
   - reduced:        limbs 0-4 are in [0,2^43), limb 5 is in [0,2^40), the packed uint256 value is in [0,p)

   As a frequently used shorthand when analyzing the ranges of limbs,
   unn indicates limbs are in [0,2^nn) and snn indicates limbs are in
   (-2^nn,2^nn).

   Note:

   - There is only one reduced fd_r43x6_t for each element.

   - There are two nearly reduced fd_r43x6_t for each element.

   - reduced is a subset of nearly reduced is a subset of unpacked is a
     subset of unreduced is a subset of unsigned is a subset of signed
     is a subset of arbitrary.

   - unpacked, nearly reduced and reduced fd_r43x6_t can be quickly
     converted into a packed uint256 value used by various
     cryptographic protocols and vice versa.

   - Cheat sheet

       unpack               maps uint256 to unpacked
       pack                 maps unpacked to uint256

       * These are used to interface with external protocols.

       fold_unsigned        maps unsigned to unreduced
       fold_signed          maps signed to unreduced

       * fold_* are fast and typically used to keep the ranges of limbs
         reasonable in long running calculations without needing to do
         more expensive approx_mod_* or mod_* operations.

       approx_mod           maps arbitrary to nearly reduced
       approx_mod_signed    maps signed to nearly reduced
       approx_mod_unsigned  maps unsigned to nearly reduced
       approx_mod_unreduced maps unreduced to nearly reduced
       approx_mod_unpacked  maps unpacked to nearly reduced

       * approx_mod_* are typically used to get inputs to a long
         running calculation into a suitable form, or are used by the
         mod_* operations below.

       mod                  maps arbitrary to reduced      (equiv to approx_mod           / mod_nearly_reduced)
       mod_signed           maps signed to reduced         (equiv to approx_mod_signed    / mod_nearly_reduced)
       mod_unsigned         maps unsigned to reduced       (equiv to approx_mod_unsigned  / mod_nearly_reduced)
       mod_unreduced        maps unreduced to reduced      (equiv to approx_mod_unreduced / mod_nearly_reduced)
       mod_unpacked         maps unpacked to reduced       (equiv to approx_mod_unpacked  / mod_nearly_reduced)
       mod_nearly_reduced   maps nearly reduced to reduced

       * mod_* are typically used to produce a final unique result at
         the end of a long running calculation.

       add_fast             maps unreduced x unreduced to unsigned (among others), see fold_unsigned above
       sub_fast             maps unreduced x unreduced to signed (among others), see fold_signed above
                            or, reduced x reduced to unsigned (among others), see fold_unsigned above
       mul_fast             maps unreduced x unreduced to unsigned, see fold_unsigned above
       sqr_fast             maps unreduced to unsigned, see fold_unsigned above
       neg                  maps unreduced to unreduced
                            or, reduced to reduced
       add                  maps unreduced x unreduced to unreduced
       sub                  maps unreduced x unreduced to unreduced
       mul                  maps unreduced x unreduced to unreduced
       sqr                  maps unreduced to unreduced
       scale                maps [0,2^47) x unreduced to unreduced
       scaleadd             maps unreduced x [0,2^47) x unreduced to unreduced
       invert               maps unreduced to unreduced
       is_nonzero           maps signed to [0,1]
       diagnose             maps signed to [-1,0,1]
       pow22523             maps unreduced to unreduced

       * These are used to implement HPC calculations on GF(p)
         elements. */

#define fd_r43x6_t wwl_t

FD_PROTOTYPES_BEGIN

/* fd_r43x6(x0,x1,x2,x3,x4,x5) constructs an arbitrary r43x6_t from the
   given limbs. Lanes 6 and 7 will be zero. This macro is robust.
   Note: implementing via setr was benchmarked as slightly faster than
   loading from a stack tmp (probably due to better compiler code gen). */

#define fd_r43x6(x0,x1,x2,x3,x4,x5) wwl( (x0),(x1),(x2),(x3),(x4),(x5), 0L,0L )

/* fd_r43x6_extract_limbs(x,y) extracts the limbs of an arbitrary
   fd_r43x6_t x into the longs y0-y5. This is primarily for use in
   operations that are not vectorized. This macro is robust. Note:
   implementing via extract was benchmarked as slightly faster than
   storing to a stack tmp and reloading (probably due to better compiler
   code gen). */

#define fd_r43x6_extract_limbs(x,y) do {              \
    wwl_t _x = (x);                                   \
    __m256i _xl = _mm512_extracti64x4_epi64( _x, 0 ); \
    __m256i _xh = _mm512_extracti64x4_epi64( _x, 1 ); \
    y##0 = _mm256_extract_epi64( _xl, 0 );            \
    y##1 = _mm256_extract_epi64( _xl, 1 );            \
    y##2 = _mm256_extract_epi64( _xl, 2 );            \
    y##3 = _mm256_extract_epi64( _xl, 3 );            \
    y##4 = _mm256_extract_epi64( _xh, 0 );            \
    y##5 = _mm256_extract_epi64( _xh, 1 );            \
  } while(0)
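
/* Example usage (an illustrative sketch, not part of the API): round
   tripping limbs through a vector. Note that the macro pastes the y
   token, so the caller must have declared y0-y5:

     long y0, y1, y2, y3, y4, y5;
     fd_r43x6_t x = fd_r43x6( 1L, 2L, 3L, 4L, 5L, 6L );
     fd_r43x6_extract_limbs( x, y );   // y0==1L ... y5==6L
*/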

/* fd_r43x6_zero(), fd_r43x6_one(), fd_r43x6_p(), fd_r43x6_d(),
   fd_r43x6_2d(), fd_r43x6_imag() return the fd_r43x6_t for zero
   (reduced), one (reduced), 2^255-19 (the non-trivial nearly reduced
   representation of zero), d (reduced), 2*d (reduced) and sqrt(-1)
   (reduced) respectively. d is defined as per IETF RFC 8032 Section
   5.1 (page 9) as -121665/121666. imag satisfies imag^2 mod p = p-1
   (i.e. -1 mod p). These macros are robust. Lanes 6 and 7 will be
   zero. */

#define fd_r43x6_zero() wwl_zero()
#define fd_r43x6_one()  wwl( 1L, 0L, 0L, 0L, 0L, 0L, 0L,0L )
#define fd_r43x6_p()    wwl( 8796093022189L,8796093022207L,8796093022207L,8796093022207L,8796093022207L,1099511627775L, 0L,0L )
#define fd_r43x6_d()    wwl( 6365466163363L, 253762649449L,   7518893317L, 260847760460L,7696165686388L, 704489577558L, 0L,0L )
#define fd_r43x6_2d()   wwl( 3934839304537L, 507525298899L,  15037786634L, 521695520920L,6596238350568L, 309467527341L, 0L,0L )
#define fd_r43x6_imag() wwl( 3467281080496L,6582290652611L,5210002954932L, 329084955603L,4526638806224L, 373767602335L, 0L,0L )
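
/* Sanity check on fd_r43x6_p (a worked example of the representation):
   the limbs above are (2^43-19, 2^43-1, 2^43-1, 2^43-1, 2^43-1, 2^40-1)
   so the value represented is:

     (2^43-19) + (2^43-1) 2^43 + (2^43-1) 2^86 + (2^43-1) 2^129
               + (2^43-1) 2^172 + (2^40-1) 2^215
       = (2^43-19) + (2^86-2^43) + (2^129-2^86) + (2^172-2^129)
               + (2^215-2^172) + (2^255-2^215)
       = 2^255 - 19 = p

   which telescopes as expected. */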

/* fd_r43x6_unpack(u) returns an unpacked r43x6_t corresponding to an
   arbitrary uint256 stored in a little endian 4 ulong radix 2^64 limb
   representation held in an AVX-2 vector:

     u = u0 + u1 2^64 + u2 2^128 + u3 2^192

   where un is the n-th 64-bit vector lane treated as a ulong. Returned
   lanes 6 and 7 will be zero. If u is in [0,2*p), the return will be a
   nearly reduced fd_r43x6_t. If u is in [0,p), the return will be a
   reduced fd_r43x6_t. */

FD_FN_CONST static inline fd_r43x6_t
fd_r43x6_unpack( wv_t u ) {
  wwl_t const zero   = wwl_zero();
  wwl_t const perm   = wwl( 0x3f3f050403020100L,   // r0 = bits   0: 42 (43 bits, zero extend to 64 bits)
                            0x3f3f0a0908070605L,   // r1 = bits  43: 85 (43 bits, zero extend to 64 bits)
                            0x3f100f0e0d0c0b0aL,   // r2 = bits  86:128 (43 bits, zero extend to 64 bits)
                            0x3f3f151413121110L,   // r3 = bits 129:171 (43 bits, zero extend to 64 bits)
                            0x3f3f1a1918171615L,   // r4 = bits 172:214 (43 bits, zero extend to 64 bits)
                            0x3f3f1f1e1d1c1b1aL,   // r5 = bits 215:255 (41 bits, zero extend to 64 bits)
                            0x3f3f3f3f3f3f3f3fL,   // r6 = zero
                            0x3f3f3f3f3f3f3f3fL ); // r7 = zero
  wwl_t const rshift = wwl( 0L, 3L, 6L, 1L, 4L, 7L, 0L, 0L ); // r0/r1/r2/r3/r4/r5 bit 0 is u bit 0/43/86/129/172
  wwl_t const mask   = wwl_bcast( (1L<<43)-1L );               // Keep 43 least significant bits for each lane
  return wwl_and( wwl_shru_vector( _mm512_permutexvar_epi8( perm, _mm512_inserti64x4( zero, u, 0 ) ), rshift ), mask );
}

/* fd_r43x6_pack(r) is the inverse of fd_r43x6_unpack. r should be an
   unpacked fd_r43x6_t. If r is also nearly reduced, the return will be
   in [0,2*p). If r is also a reduced fd_r43x6_t, the return will be in
   [0,p). Ignores lanes 6 and 7. */

FD_FN_CONST static inline wv_t
fd_r43x6_pack( fd_r43x6_t r ) {

  /*           43              21
       0              42 43        63
       u0 = r0_0  ... r0_42 r1_0 ... r1_20

               22              42
       0          21 22             63
       u1 = r1_21 ... r1_42 r2_0 ... r2_41

            1        43             20
       0    1     43 44             63
       u2 = r2_42 r3_0 ... r3_42 r4_0 ... r4_19

               23              41
       0          22 23             63
       u3 = r4_20 ... r4_42 r5_0 ... r5_40

             t0          t1          t2
       u0 = (r0>> 0) | (r1<<43) | (r1<<43); ... Last term redundant to keep vectorized
       u1 = (r1>>21) | (r2<<22) | (r2<<22); ... "
       u2 = (r2>>42) | (r3<< 1) | (r4<<44);
       u3 = (r4>>20) | (r5<<23) | (r5<<23); ... " */

  wwl_t t0 = wwl_shru_vector( wwl_permute( wwl( 0L, 1L, 2L, 4L, 0L,0L,0L,0L ), r ), wwl(  0L,21L,42L,20L, 0L,0L,0L,0L ) );
  wwl_t t1 = wwl_shl_vector ( wwl_permute( wwl( 1L, 2L, 3L, 5L, 0L,0L,0L,0L ), r ), wwl( 43L,22L, 1L,23L, 0L,0L,0L,0L ) );
  wwl_t t2 = wwl_shl_vector ( wwl_permute( wwl( 1L, 2L, 4L, 5L, 0L,0L,0L,0L ), r ), wwl( 43L,22L,44L,23L, 0L,0L,0L,0L ) );

  return _mm512_extracti64x4_epi64( wwl_or( wwl_or( t0, t1 ), t2 ), 0 );
}
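
/* Example (an illustrative round trip; assumes the wv constructor from
   fd_avx.h behaves as it does elsewhere in this codebase):

     wv_t       u = wv( 12345UL, 0UL, 0UL, 0UL ); // uint256 value 12345 < p
     fd_r43x6_t r = fd_r43x6_unpack( u );         // reduced (since u in [0,p))
     wv_t       v = fd_r43x6_pack( r );           // v == u
*/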

/* fd_r43x6_approx_carry_propagate_limbs(x,y) computes a signed
   fd_r43x6_t equivalent to an arbitrary fd_r43x6_t that has been
   extracted into the longs x0-x5 and stores the result into the longs
   y0-y5. On return:

     y0    in [-19*2^23,2^43+19*(2^23-1))
     y1-y4 in [   -2^20,2^43+   (2^20-1))
     y5    in [   -2^20,2^40+   (2^20-1))

   In-place operation fine. This macro is robust.

   If x is unsigned or more generally x0-x5 in [0,2^63), the result will
   be an unreduced fd_r43x6_t with:

     y0    in [0,2^43+19*(2^23-1))
     y1-y4 in [0,2^43+   (2^20-1))
     y5    in [0,2^40+   (2^20-1))

   Theory:

     x = <x0,x1,x2,x3,x4,x5>
       = <x0l,x1l,x2l,x3l,x4l,x5l> + <2^43*x0h,2^43*x1h,2^43*x2h,2^43*x3h,2^43*x4h,2^40*x5h>
       = <x0l,x1l,x2l,x3l,x4l,x5l> + <19*x5h,x0h,x1h,x2h,x3h,x4h>

   where x0h=floor(x0/2^43) and x0l = x0-2^43*x0h and similarly for
   x1-x4 while x5h = floor(x5/2^40) and x5l=x5-2^40*x5h.

   Above we used:

     2^215*2^40*x5h = 2^255*x5h = (p+19)*x5h mod p = 19*x5h mod p.

   Equivalently, x0l-x5l are the least significant {43,43,43,43,43,40}
   bits of x0-x5 and x0h-x5h are the sign extending right shifts of
   x0-x5 by the same.

   For arbitrary x we have:

     x0l-x4l in [0,2^43), x0h-x4h in [-2^20,2^20)
     x5l     in [0,2^40), x5h     in [-2^23,2^23)

   while for x0-x5 in [0,2^63) (which includes unsigned) we have:

     x0l-x4l in [0,2^43), x0h-x4h in [0,2^20)
     x5l     in [0,2^40), x5h     in [0,2^23)

   This yields the above ranges for y0-y5. There are no intermediate
   overflows in the computation.

   This is a building block for more complex mappings where x's limbs
   have already been extracted in order to minimize the number of limb
   extracts and fd_r43x6 constructs.

   Note that if we used ulongs (and thus zero padding right shifts)
   below, this same style calculation could be used on an arbitrary
   _ulong_ limbed x. The result would still be an unreduced fd_r43x6_t
   with:

     y0-y4 in [0,2^43+19*(2^24-1))
     y5    in [0,2^40+    2^21-1 ) */

#define fd_r43x6_approx_carry_propagate_limbs(x,y) do { \
    long const _m43 = (1L<<43)-1L;                      \
    long const _m40 = (1L<<40)-1L;                      \
    long _x0 = (x##0);                                  \
    long _x1 = (x##1);                                  \
    long _x2 = (x##2);                                  \
    long _x3 = (x##3);                                  \
    long _x4 = (x##4);                                  \
    long _x5 = (x##5);                                  \
    (y##0) = (_x0 & _m43) + 19L*(_x5>>40);              \
    (y##1) = (_x1 & _m43) +     (_x0>>43);              \
    (y##2) = (_x2 & _m43) +     (_x1>>43);              \
    (y##3) = (_x3 & _m43) +     (_x2>>43);              \
    (y##4) = (_x4 & _m43) +     (_x3>>43);              \
    (y##5) = (_x5 & _m40) +     (_x4>>43);              \
  } while(0)
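
/* Worked example (illustrative): take x = <2^44+1,0,0,0,0,2^41>. Then
   x0>>43 = 2, x0 & _m43 = 1, x5>>40 = 2 and x5 & _m40 = 0, so the macro
   yields y = <1+19*2,2,0,0,0,0> = <39,2,0,0,0,0>. Checking equivalence:
   x represents (2^44+1) + 2^41 2^215 = 2^256 + 2^44 + 1 and y
   represents 39 + 2*2^43 = 2^44 + 39; the difference is
   2^256 - 38 = 2*(2^255-19) = 2*p, i.e. 0 mod p as expected. */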

/* fd_r43x6_approx_carry_propagate is a vectorized version of the above.
   Returned lanes 6 and 7 are zero. If we've already extracted x's
   limbs and/or need the resulting limbs after, it is usually faster to
   do the extract and then use the scalar implementation.

   The fold_unsigned variant is the same but assumes x is unsigned or
   more generally x0-x5 in [0,2^63). This allows a faster madd52lo to
   be used instead of a slower mullo. (Hat tip to Philip Taffet for
   pointing this out.) It will also work for an arbitrary limbed ulong
   x.

   The fold_signed variant assumes x is signed or more generally:

     x0    in [-2^63+19*2^23,2^63)
     x1-x5 in [-2^63   +2^20,2^63)

   and subtracts <19*2^23,2^20,2^20,2^20,2^20,2^20> from x (which will
   not overflow) before the approx carry propagate and adds it back
   after. This will not overflow and will not change the element
   represented. But it does yield an unreduced result with limbs:

     y0    in [0,2^43+19*(2^24-1))
     y1-y4 in [0,2^43+   (2^21-1))
     y5    in [0,2^40+   (2^21-1))

   These variants are particularly useful for mapping results of
   additions and subtractions into unreduced results for subsequent
   operations. */

#if 0 /* A mullo based implementation is slightly slower ... */
#define fd_r43x6_approx_carry_propagate( x ) (__extension__({                                         \
    long const _m43 = (1L<<43)-1L;                                                                    \
    long const _m40 = (1L<<40)-1L;                                                                    \
    wwl_t _x = (x);                                                                                   \
    wwl_add( wwl_and( _x, wwl( _m43,_m43,_m43,_m43,_m43,_m40, 0L,0L ) ),                              \
             wwl_mul( wwl( 19L,1L,1L,1L,1L,1L, 0L,0L ),                                               \
                      wwl_permute( wwl( 5L,0L,1L,2L,3L,4L, 6L,7L ),                                   \
                                   wwl_shr_vector( _x, wwl( 43L,43L,43L,43L,43L,40L, 0L,0L ) ) ) ) ); \
  }))
#else /* ... than a more obtuse shift-and-add based implementation */
#define fd_r43x6_approx_carry_propagate( x ) (__extension__({                                               \
    long const _m43 = (1L<<43)-1L;                                                                          \
    long const _m40 = (1L<<40)-1L;                                                                          \
    wwl_t _x  = (x);                                                                                        \
    wwl_t _xl = wwl_and( _x, wwl( _m43,_m43,_m43,_m43,_m43,_m40, 0L,0L ) );                                 \
    wwl_t _xh = wwl_shr_vector( _x, wwl( 43L,43L,43L,43L,43L,40L, 0L,0L ) );                                \
    wwl_t _c  = wwl_select( wwl( 5L,0L,1L,2L,3L,4L, 8L,8L ), _xh, wwl_zero() );                             \
    wwl_t _d  = wwl_and( wwl_add( wwl_shl( _c, 1 ), wwl_shl( _c, 4 ) ), wwl( -1L,0L,0L,0L,0L,0L, 0L,0L ) ); \
    /* _xl = <   x0l,x1l,x2l,x3l,x4l,x5l, 0,0> */                                                           \
    /* _c  = <   x5h,x0h,x1h,x2h,x3h,x4h, 0,0> */                                                           \
    /* _d  = <18*x5h,  0,  0,  0,  0,  0, 0,0> */                                                           \
    wwl_add( wwl_add( _xl, _c ), _d );                                                                      \
  }))
#endif

#define fd_r43x6_fold_unsigned( x ) (__extension__({                                             \
    long const _m43 = (1L<<43)-1L;                                                               \
    long const _m40 = (1L<<40)-1L;                                                               \
    wwl_t _x = (x);                                                                              \
    wwl_madd52lo( wwl_and( _x, wwl( _m43,_m43,_m43,_m43,_m43,_m40, 0L,0L ) ),                    \
                  wwl( 19L,1L,1L,1L,1L,1L, 0L,0L ),                                              \
                  wwl_permute( wwl( 5L,0L,1L,2L,3L,4L, 6L,7L ),                                  \
                               wwl_shru_vector( _x, wwl( 43L,43L,43L,43L,43L,40L, 0L,0L ) ) ) ); \
  }))

#define fd_r43x6_fold_signed( x ) (__extension__({                               \
    wwl_t const _b = wwl( 19L<<23,1L<<20,1L<<20,1L<<20,1L<<20,1L<<20, 0L,0L );    \
    wwl_add( fd_r43x6_approx_carry_propagate( wwl_sub( (x), _b ) ), _b );         \
  }))
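
/* Example (an illustrative sketch): accumulating several unreduced
   (u47) values with the fast adder and folding once at the end. Each
   add grows the limbs by at most one bit, so a handful of u47 terms
   stays well inside [0,2^63) before the fold:

     fd_r43x6_t acc = fd_r43x6_add_fast( a, b );     // u48
     acc            = fd_r43x6_add_fast( acc, c );   // u49
     acc            = fd_r43x6_fold_unsigned( acc ); // unreduced again
*/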

/* fd_r43x6_biased_carry_propagate_limbs computes an equivalent
   fd_r43x6_t to a signed fd_r43x6_t that has been extracted into the
   longs x0-x5 and stores the result into the longs y0-y5. A small bias
   in [0,2^20] is subtracted from x5 before and added back after. This
   has no impact on the element represented but can impact the range
   needed for limb 5. In-place operation fine. This macro is robust.

   IMPORTANT! THIS SHOULD NOT BE APPLIED TO ARBITRARY X.

   If x is signed or more generally:

     x0    in [-2^63+19*2^23,2^63-19*(2^23-1))
     x1-x4 in [-2^63+   2^20,2^63-   (2^20-1))
     x5    in [-2^63+b,      2^63             )

   On return y will be signed but only in one limb such that it is
   almost nearly reduced:

     y0-y4 in [0,      2^43          )
     y5    in [-2^20+b,2^40+2^20-1+b )

   Thus, with b=2^20, if x is signed or more generally

     x0    in [-2^63+19*2^23,2^63-19*(2^23-1))
     x1-x4 in [-2^63+   2^20,2^63-   (2^20-1))
     x5    in [-2^63+   2^20,2^63             )

   On return y will be nearly reduced with y5 in [0,2^40+2^21-1).

   And, with b=0, if x is unsigned or more generally:

     x0    in [0,2^63-19*(2^23-1))
     x1-x4 in [0,2^63-   (2^20-1))
     x5    in [0,2^63             )

   On return y will be nearly reduced with y5 in [0,2^40+2^20-1).

   With b=0, the more restricted x is, the tighter the y5 range. If x
   is {unreduced,unpacked,nearly reduced}, with b=0, on return, y will
   be nearly reduced with y5 in {[0,2^40+2^5-1),[0,2^40+1),[0,2^40+1)}.
   If x is reduced, on return, y will be reduced.

   Under the hood, this does a serial carry propagate on the limbs.

   Theory:

   Break an arbitrary x5 into its lower and upper bits as was done for
   approx_carry_propagate. Then:

     x = <x0,x1,x2,x3,x4,x5l> + 2^40 <0,0,0,0,0,x5h>
       = <x0,x1,x2,x3,x4,x5l> + 19 <x5h,0,0,0,0,0>
       = <x0+19*x5h,x1,x2,x3,x4,x5l>

   As x5h will be in [-2^23,2^23), if the initial x0 is in
   [-2^63+19*2^23,2^63-19*(2^23-1)), this new representation can be
   computed without intermediate overflow with limb 5 in [0,2^40) and
   limb 0 in [-2^63,2^63).

   Now break an arbitrary x0 into its lower and upper bits. Then:

     x = <x0,x1,x2,x3,x4,x5>
       = <x0l,x1,x2,x3,x4,x5> + 2^43 <x0h,0,0,0,0,0>
       = <x0l,x1,x2,x3,x4,x5> + <0,x0h,0,0,0,0>
       = <x0l,x1+x0h,x2,x3,x4,x5>

   As x0h will be in [-2^20,2^20), if the initial x1 is in
   [-2^63+2^20,2^63-(2^20-1)), this new representation can be computed
   without intermediate overflow with limb 0 in [0,2^43) and limb 1 in
   [-2^63,2^63).

   We can similarly serially propagate limb 1's carries to 2, 2 to 3, 3
   to 4 and 4 to 5. As x4h will be in [-2^20,2^20), limb 5's final
   value will be in [-2^20,2^40+2^20-1).

   If x is unsigned or in the unsigned range described above, limb 4's
   carry will be in [0,2^20) such that the result will be nearly reduced
   with limb 5 in [0,2^40+2^20-1).

   If x is {unreduced,unpacked,nearly reduced}, limb 4's carry will be
   in {[0,2^5),[0,1],[0,1]} such that limb 5 will be in
   {[0,2^40+2^5-1),[0,2^40+1),[0,2^40+1)}.

   If x was reduced, all carries will be zero such that y will have the
   same limbs as x.

   This yields the above.

   Like approx_carry_propagate, this is a building block for more
   complex mappings where x's limbs have already been extracted in order
   to minimize the number of limb extracts and fd_r43x6 constructs. */

#define fd_r43x6_biased_carry_propagate_limbs(x,y,b) do { \
    long const _m43 = (1L<<43)-1L;                        \
    long const _m40 = (1L<<40)-1L;                        \
    long _y0 = (x##0);                                    \
    long _y1 = (x##1);                                    \
    long _y2 = (x##2);                                    \
    long _y3 = (x##3);                                    \
    long _y4 = (x##4);                                    \
    long _y5 = (x##5);                                    \
    long _b  = (b);                                       \
    long _c;                                              \
    _y5 -= _b;                                            \
    _c = _y5>>40; _y5 &= _m40; _y0 += 19L*_c;             \
    _c = _y0>>43; _y0 &= _m43; _y1 += _c;                 \
    _c = _y1>>43; _y1 &= _m43; _y2 += _c;                 \
    _c = _y2>>43; _y2 &= _m43; _y3 += _c;                 \
    _c = _y3>>43; _y3 &= _m43; _y4 += _c;                 \
    _c = _y4>>43; _y4 &= _m43; _y5 += _c;                 \
    _y5 += _b;                                            \
    (y##0) = _y0;                                         \
    (y##1) = _y1;                                         \
    (y##2) = _y2;                                         \
    (y##3) = _y3;                                         \
    (y##4) = _y4;                                         \
    (y##5) = _y5;                                         \
  } while(0)

/* Note: fd_r43x6_biased_carry_propagate_limbs does not have a good
   AVX-512 implementation (it is highly sequential). */

/* fd_r43x6_mod_nearly_reduced_limbs computes the reduced fd_r43x6_t for
   a nearly reduced fd_r43x6_t that has been extracted into the longs
   x0-x5 and stores the limbs in the longs y0-y5. In-place operation
   fine. This macro is robust. Theory:

   Let x = q p + r where q is an integer and r is in [0,p). Since x is
   in [0,2*p), q is in [0,1]. If x<p, q=0 and r=x, otherwise, q=1 and
   r=x-p. Using p = 2^255 - 19, x<p implies x+19<2^255. Thus, if x+19
   has bit 255 (limb 5 bit 40) clear, r=x, otherwise, r=x-p.

   In pseudo code:
     y = x + 19                ... < 2*p+19 = 2^256-19 < 2^256
     q = y>>255                ... in [0,1]
     if !q, r = x              ... x in [0,p )  -> r in [0,p)
     else   r = x - (2^255-19) ... x in [p,2*p) -> r in [0,p)

   Simplifying:
     y = x + 19
     q = y>>255
     if !q, r = y - 19
     else   r = y - 2^255

   Or:
     y = x + 19
     q = y>>255
     y -= q<<255               ... clear bit 255
     if !q, r = y - 19
     else   r = y

   Or, branchless for deterministic performance:
     y  = x + 19
     q  = y>>255
     y -= q<<255
     r  = y - if(!q,19,0) */

#define fd_r43x6_mod_nearly_reduced_limbs(x,y) do { \
    long const _m43 = (1L<<43)-1L;                  \
    long const _m40 = (1L<<40)-1L;                  \
    long _y0 = (x##0);                              \
    long _y1 = (x##1);                              \
    long _y2 = (x##2);                              \
    long _y3 = (x##3);                              \
    long _y4 = (x##4);                              \
    long _y5 = (x##5);                              \
    long _c;                                        \
                                                    \
    /* y = x + 19, q = y>>255, y -= q<<255 */       \
    _y0 += 19L;                                     \
    _c = _y0 >> 43; _y0 &= _m43; _y1 += _c;         \
    _c = _y1 >> 43; _y1 &= _m43; _y2 += _c;         \
    _c = _y2 >> 43; _y2 &= _m43; _y3 += _c;         \
    _c = _y3 >> 43; _y3 &= _m43; _y4 += _c;         \
    _c = _y4 >> 43; _y4 &= _m43; _y5 += _c;         \
    _c = _y5 >> 40; _y5 &= _m40;                    \
                                                    \
    /* r = y - if(!q,19,0) */                       \
    _y0 -= fd_long_if( !_c, 19L, 0L );              \
    _c = _y0 >> 43; _y0 &= _m43; _y1 += _c;         \
    _c = _y1 >> 43; _y1 &= _m43; _y2 += _c;         \
    _c = _y2 >> 43; _y2 &= _m43; _y3 += _c;         \
    _c = _y3 >> 43; _y3 &= _m43; _y4 += _c;         \
    _c = _y4 >> 43; _y4 &= _m43; _y5 += _c;         \
                                                    \
    (y##0) = _y0;                                   \
    (y##1) = _y1;                                   \
    (y##2) = _y2;                                   \
    (y##3) = _y3;                                   \
    (y##4) = _y4;                                   \
    (y##5) = _y5;                                   \
  } while(0)
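
/* Worked example (illustrative): feed in the non-trivial nearly
   reduced representation of zero, x = p = 2^255-19. The first carry
   pass computes y = p + 19 = 2^255, whose carry out of limb 5 bit 40
   gives q = 1, and clearing that bit leaves y = 0. Since q!=0, the
   second pass subtracts nothing, so the result is the reduced
   representation of zero, as expected. */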

/* Note: fd_r43x6_mod_nearly_reduced_limbs does not have a good AVX-512
   implementation (it is highly sequential). */

/* fd_r43x6_approx_mod(x) returns a nearly reduced fd_r43x6_t equivalent
   to an arbitrary fd_r43x6_t x. On return y5 will be in [0,2^40+2).

   fd_r43x6_approx_mod_signed(x) does the same for signed x or, more
   generally:

     x0    in [-2^63+19*2^23,2^63-19*(2^23-1))
     x1-x4 in [-2^63+   2^20,2^63-   (2^20-1))
     x5    in [-2^63+   2^20,2^63             )

   fd_r43x6_approx_mod_unsigned(x) does the same for unsigned x or, more
   generally:

     x0    in [0,2^63-19*(2^23-1))
     x1-x4 in [0,2^63-   (2^20-1))
     x5    in [0,2^63             )

   On return y5 will be in [0,2^40+2^20-1).

   fd_r43x6_approx_mod_unreduced(x) does the same for unreduced x. On
   return y5 will be in [0,2^40+2^5-1).

   fd_r43x6_approx_mod_unpacked(x) does the same for unpacked x. On
   return y5 will be in [0,2^40+1). */

FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
fd_r43x6_approx_mod( fd_r43x6_t x ) {
  long y0, y1, y2, y3, y4, y5;
  fd_r43x6_extract_limbs( x, y );

  /* At this point y is arbitrary. We do an approx carry propagate to
     reduce the range of the limbs to one suitable for a biased carry
     propagate. (Note: it is faster here to do extract then approx-cp
     than vector-approx-cp then extract.) */

  fd_r43x6_approx_carry_propagate_limbs( y, y );

  /* At this point y has:

       y0    in [-19*2^23,2^43+19*(2^23-1))
       y1-y4 in [   -2^20,2^43+   (2^20-1))
       y5    in [   -2^20,2^40+   (2^20-1))

     An unbiased carry propagate could produce y5=-1. So, we use a
     biased c-p with b=1. TODO: CONSIDER JUST USING 2^20 BIAS ALWAYS? */

  fd_r43x6_biased_carry_propagate_limbs( y, y, 1L );

  return fd_r43x6( y0, y1, y2, y3, y4, y5 );
}

FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
fd_r43x6_approx_mod_signed( fd_r43x6_t x ) {
  long y0, y1, y2, y3, y4, y5;
  fd_r43x6_extract_limbs( x, y );

  /* At this point y has:

       y0    in [-2^63+19*2^23,2^63-19*(2^23-1))
       y1-y4 in [-2^63+   2^20,2^63-   (2^20-1))
       y5    in [-2^63+   2^20,2^63             )

     so we can do a biased carry propagate on y with b=2^20 as described
     above. */

  fd_r43x6_biased_carry_propagate_limbs( y, y, 1L<<20 );

  return fd_r43x6( y0, y1, y2, y3, y4, y5 );
}

FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
fd_r43x6_approx_mod_unsigned( fd_r43x6_t x ) {
  long y0, y1, y2, y3, y4, y5;
  fd_r43x6_extract_limbs( x, y );

  /* At this point y has:

       y0    in [0,2^63-19*(2^23-1))
       y1-y4 in [0,2^63-   (2^20-1))
       y5    in [0,2^63             )

     so we can do an unbiased carry propagate as described above. */

  fd_r43x6_biased_carry_propagate_limbs( y, y, 0L );

  return fd_r43x6( y0, y1, y2, y3, y4, y5 );
}

#define fd_r43x6_approx_mod_unreduced fd_r43x6_approx_mod_unsigned /* no difference in impl, tighter y5 result described above */
#define fd_r43x6_approx_mod_unpacked  fd_r43x6_approx_mod_unsigned /* no difference in impl, tighter y5 result described above */

/* fd_r43x6_mod(x) returns the reduced fd_r43x6_t equivalent to an
   arbitrary fd_r43x6_t x.

   fd_r43x6_mod_signed(x) does the same for signed x or, more
   generally:

     x0    in [-2^63+19*2^23,2^63-19*(2^23-1))
     x1-x4 in [-2^63+   2^20,2^63-   (2^20-1))
     x5    in [-2^63+   2^20,2^63             )

   fd_r43x6_mod_unsigned(x) does the same for unsigned x or, more
   generally:

     x0    in [0,2^63-19*(2^23-1))
     x1-x4 in [0,2^63-   (2^20-1))
     x5    in [0,2^63             )

   fd_r43x6_mod_unreduced(x) does the same for unreduced x.

   fd_r43x6_mod_unpacked(x) does the same for unpacked x.

   fd_r43x6_mod_nearly_reduced(x) does the same for nearly reduced x. */

FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
fd_r43x6_mod( fd_r43x6_t x ) {
  long y0, y1, y2, y3, y4, y5;
  fd_r43x6_extract_limbs( x, y );
  fd_r43x6_approx_carry_propagate_limbs( y, y );
  fd_r43x6_biased_carry_propagate_limbs( y, y, 1L );
  /* At this point, x is nearly reduced */
  fd_r43x6_mod_nearly_reduced_limbs( y, y );
  return fd_r43x6( y0, y1, y2, y3, y4, y5 );
}

FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
fd_r43x6_mod_signed( fd_r43x6_t x ) {
  long y0, y1, y2, y3, y4, y5;
  fd_r43x6_extract_limbs( x, y );
  fd_r43x6_biased_carry_propagate_limbs( y, y, 1L<<20 );
  /* At this point, x is nearly reduced */
  fd_r43x6_mod_nearly_reduced_limbs( y, y );
  return fd_r43x6( y0, y1, y2, y3, y4, y5 );
}

FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
fd_r43x6_mod_unsigned( fd_r43x6_t x ) {
  long y0, y1, y2, y3, y4, y5;
  fd_r43x6_extract_limbs( x, y );
  fd_r43x6_biased_carry_propagate_limbs( y, y, 0L );
  /* At this point, x is nearly reduced */
  fd_r43x6_mod_nearly_reduced_limbs( y, y );
  return fd_r43x6( y0, y1, y2, y3, y4, y5 );
}

#define fd_r43x6_mod_unreduced fd_r43x6_mod_unsigned /* no difference in impl */
#define fd_r43x6_mod_unpacked  fd_r43x6_mod_unsigned /* no difference in impl */

FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
fd_r43x6_mod_nearly_reduced( fd_r43x6_t x ) {
  long y0, y1, y2, y3, y4, y5;
  fd_r43x6_extract_limbs( x, y );
  /* At this point, x is already nearly reduced */
  fd_r43x6_mod_nearly_reduced_limbs( y, y );
  return fd_r43x6( y0, y1, y2, y3, y4, y5 );
}
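
/* Example (an illustrative sketch): canonicalizing at the end of a
   computation before handing the value to an external protocol:

     fd_r43x6_t r = fd_r43x6_mod( acc ); // unique reduced representation
     wv_t       u = fd_r43x6_pack( r );  // uint256 in [0,p)
*/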

/* fd_r43x6_neg_fast(x)   returns z = -x,    computed as z = p - x
   fd_r43x6_add_fast(x,y) returns z = x + y
   fd_r43x6_sub_fast(x,y) returns z = x - y, computed as z = x + (p - y)

   These operations are also applied to lanes 6 and 7 and assume that
   adding / subtracting the corresponding limbs of x and y will produce
   a result that doesn't overflow the range of a long.

   For example, it is safe to add a large (but bounded) number of
   unpacked x to produce an unreduced sum and then do a single mod at
   the end to produce the reduced result. With 0 < ll <= mm < 63 and
   letting nn = mm+1, given the input ranges, the table below gives
   conservative output ranges.

     -ull -> sll
     -sll -> sll

     ull + umm -> unn, umm + ull -> unn
     ull + smm -> snn, umm + sll -> snn
     sll + umm -> snn, smm + ull -> snn
     sll + smm -> snn, smm + sll -> snn

     ull - umm -> smm, umm - ull -> smm
     ull - smm -> snn, umm - sll -> snn
     sll - umm -> snn, smm - ull -> snn
     sll - smm -> snn, smm - sll -> snn */

#define fd_r43x6_neg_fast( x )    wwl_sub( fd_r43x6_p(), (x) )
#define fd_r43x6_add_fast( x, y ) wwl_add( (x), (y) )
#define fd_r43x6_sub_fast( x, y ) wwl_add( (x), wwl_sub( (fd_r43x6_p()), (y) ) )
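
/* Example (illustrative): one fast add and one fast sub, each folded
   back into unreduced form for subsequent use:

     fd_r43x6_t s = fd_r43x6_fold_unsigned( fd_r43x6_add_fast( x, y ) ); // u47+u47 -> u48, fold -> unreduced
     fd_r43x6_t d = fd_r43x6_fold_signed  ( fd_r43x6_sub_fast( x, y ) ); // u47-u47 -> s48, fold -> unreduced

   (These compositions are exactly fd_r43x6_add and fd_r43x6_sub
   below.) */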

/* fd_r43x6_mul_fast(x,y) returns z = x*y as an unsigned fd_r43x6_t
   with lanes 6 and 7 zero where x and y are unreduced fd_r43x6_t's
   (i.e. in u47). Ignores lanes 6 and 7 of x and assumes lanes 6 and 7
   of y are zero. More specifically, u44/u45/u46/u47 inputs produce a
   u62/u62/u62/u63 output. */

FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
fd_r43x6_mul_fast( fd_r43x6_t x,
                   fd_r43x6_t y ) {

  /* 5x5 grade school-ish multiplication accelerated with AVX512 integer
     madd52 instructions. The basic algorithm is:

       x*y = (sum_i xi 2^(43*i))*(sum_j yj 2^(43*j))
           = sum_i sum_j xi*xj 2^(43*(i+j))
           = sum_i sum_j (pijl + 2^43 pijh) 2^(43*(i+j))
           = sum_i sum_j ( pijl 2^(43*(i+j)) + pijh 2^(43*(i+j+1)) )
           = sum_k zk 2^43 k

     where the product of xi*xj has been split such that:

       pijl + 2^43 pijh = xi*xj

     and zk has grouped all terms with the same scale factor:

       zk = sum_i sum_j ( pijl ((i+j)==k) + pijh ((i+j+1)==k) )

     Or graphically:

                                        x5   x4   x3   x2   x1   x0
                                x       y5   y4   y3   y2   y1   y0
                               -------------------------------------
                                      p50l p40l p30l p20l p10l p00l -> t0
                                 p50h p40h p30h p20h p10h p00h      \
                                 p51l p41l p31l p21l p11l p01l      /  t1
                            p51h p41h p31h p21h p11h p01h           \
                            p52l p42l p32l p22l p12l p02l           /  t2
                       p52h p42h p32h p22h p12h p02h                \
                       p53l p43l p33l p23l p13l p03l                /  t3
                  p53h p43h p33h p23h p13h p03h                     \
                  p54l p44l p34l p24l p14l p04l                     /  t4
             p54h p44h p34h p24h p14h p04h                          \
             p55l p45l p35l p25l p15l p05l                          /  t5
        p55h p45h p35h p25h p15h p05h                               -> t6
     -----------------------------------------------------------
      z11  z10   z9   z8   z7   z6   z5   z4   z3   z2   z1   z0
     \----------------/ \-----------------------------------/
              zh                          zl

     The conventional split would require xi and xj to be in [0,2^43)
     and yield pijl and pijh in [0,2^43). But we need to use a
     different split to exploit the madd52 instructions:

       ul = madd52lo(al,x,y) = LO64( al + LO52( LO52(x)*LO52(y) ) )
       uh = madd52hi(ah,x,y) = LO64( ah + HI52( LO52(x)*LO52(y) ) )

     Consider when al and ah are zero. Since x and y here are unreduced
     and thus in [0,2^47), we have:

       ul = LO52( x*y )
       uh = HI52( x*y )
       --> ul + 2^52 uh = x*y
       --> ul + 2^43 (2^9 uh) = x*y

     Thus, we can use pl=ul and ph=2^9 uh from these instructions as a
     split for the above. With this we have:

       pl in [0,2^52)
       ph in [0,2^51)

     so:

       z{0,1,2,3, 4, 5} < { 2, 5, 8,11,14,17} 2^51
       z{6,7,8,9,10,11} < {16,13,10, 7, 4, 1} 2^51

     Note: the [0,2^47) range for unreduced fd_r43x6_t was picked to
     yield pl and ph with similar ranges while allowing for fast
     reduction below.

     Note: It might seem even better to use a 5 long radix 52 limb
     representation such that madd naturally produces the desired
     splitting. If the only consideration is the above, this is
     correct.

     But this calculation is frequently used in long sequential chains
     (e.g. the repeated squarings done to compute the multiplicative
     inverse). In the 52x5 case, the zk reduction required to produce
     an output representation that can be fed into the next
     multiplication has no "headroom". All carries from limbs 0-3 must
     be fully propagated to limb 4 such that all limbs are in [0,2^52)
     to be able to use madd52 in subsequent multiply operations. This
     requires then extracting all the limbs and doing a slow sequential
     calculation as part of the zk reduction. This throws away most of
     the advantage of using instructions like madd52 in the first
     place.

     Using 43x6 is nearly the same cost as 52x5 for the above because
     we have unused AVX512 lanes and the scaling needed to tweak the
     madd52hi result is a fast shift operation. Critically, the needed
     zk reduction (described below) can be done with a fast approximate
     carry propagate to get a result that can be immediately fed into
     subsequent multiplications.

     The overall impact is that this is typically ~2-3x faster than,
     for example, the fd_ed25519_fe_t scalar multiplier on platforms
     with AVX512 support.

     This implementation is not the "textbook" style found in the
     literature. The textbook implementations accumulate zh and zl by
     interleaving alignr with the madds. The putative benefit of such
     is that it can make use of the madd52hi adder to save some adds
     over the below. Unfortunately, such requires more alignr / more
     instruction footprint / more sequential dependencies and has less
     ILP. In practice, adds are much cheaper than alignr (remember,
     data motion has been more expensive than computation for decades).
     So spending some adds to buy fewer alignr, a smaller instruction
     footprint and more ILP is a great trade and yields the faster than
     textbook implementation below. */

  wwl_t const zero = wwl_zero();

  wwl_t x0 = wwl_permute( zero,            x );
  wwl_t x1 = wwl_permute( wwl_one(),       x );
  wwl_t x2 = wwl_permute( wwl_bcast( 2L ), x );
  wwl_t x3 = wwl_permute( wwl_bcast( 3L ), x );
  wwl_t x4 = wwl_permute( wwl_bcast( 4L ), x );
  wwl_t x5 = wwl_permute( wwl_bcast( 5L ), x );

  wwl_t t0 = wwl_madd52lo( zero, x0, y );
  wwl_t t1 = wwl_madd52lo( wwl_shl( wwl_madd52hi( zero, x0, y ), 9 ), x1, y );
  wwl_t t2 = wwl_madd52lo( wwl_shl( wwl_madd52hi( zero, x1, y ), 9 ), x2, y );
  wwl_t t3 = wwl_madd52lo( wwl_shl( wwl_madd52hi( zero, x2, y ), 9 ), x3, y );
  wwl_t t4 = wwl_madd52lo( wwl_shl( wwl_madd52hi( zero, x3, y ), 9 ), x4, y );
  wwl_t t5 = wwl_madd52lo( wwl_shl( wwl_madd52hi( zero, x4, y ), 9 ), x5, y );
  wwl_t t6 =               wwl_shl( wwl_madd52hi( zero, x5, y ), 9 );

  wwl_t p0j = t0;                       /* note: q0j = 0 */
  wwl_t p1j = wwl_slide( zero, t1, 7 ); /* note: q1j = 0 */
  wwl_t p2j = wwl_slide( zero, t2, 6 ); /* note: q2j = 0 */
  wwl_t p3j = wwl_slide( zero, t3, 5 ); wwl_t q3j = wwl_slide( t3, zero, 5 );
  wwl_t p4j = wwl_slide( zero, t4, 4 ); wwl_t q4j = wwl_slide( t4, zero, 4 );
  wwl_t p5j = wwl_slide( zero, t5, 3 ); wwl_t q5j = wwl_slide( t5, zero, 3 );
  wwl_t p6j = wwl_slide( zero, t6, 2 ); wwl_t q6j = wwl_slide( t6, zero, 2 );

  wwl_t zl = wwl_add( wwl_add( wwl_add( p0j, p1j ), wwl_add( p2j, p3j ) ), wwl_add( wwl_add( p4j, p5j ), p6j ) );
  wwl_t zh = wwl_add( wwl_add( q3j, q4j ), wwl_add( q5j, q6j ) );

  /* At this point:
       z = <zl0,zl1,zl2,zl3,zl4,zl5,zl6,zl7> + 2^344 <zh0,zh1,zh2,zh3,0,0,0,0> */

  wwl_t za = wwl_and( zl, wwl( -1L,-1L,-1L,-1L,-1L,-1L, 0L,0L ) );
  wwl_t zb = wwl_slide( zl, zh, 6 );

  /* At this point:

       z = <za0,za1,za2,za3,za4,za5> + 2^258 <zb0,zb1,zb2,zb3,zb4,zb5>

     where (as shown above):

       za{0,1,2,3,4,5} < 2^51 { 2, 5, 8,11,14,17}
       zb{0,1,2,3,4,5} < 2^51 {16,13,10, 7, 4, 1}

     Using:

       2^258 mod p = (p+19) 2^3 mod p = 19*2^3 = 152

     we can reduce this to 6 limbs via:

       z = <za0,za1,za2,za3,za4,za5> + 152 <zb0,zb1,zb2,zb3,zb4,zb5>

     We can do the sum directly because:

       z{0,1,2,3,4,5} < 2^51 {2434,1981,1528,1075,622,169} < 2^63

     (These limbs are too large to use in a subsequent multiply but
     they are in the range where we can do a fd_r43x6_fold_unsigned,
     yielding:

       z0    in [0,2^43+19*(2^23-1))
       z1-z4 in [0,2^43+   (2^20-1))
       z5    in [0,2^40+   (2^20-1))

     This is an unreduced fd_r43x6_t and thus suitable for direct use
     in subsequent multiply operations. See fd_r43x6_mul.)

     Note that mullo is slow. Since 152 = 2^7 + 2^4 + 2^3 and is
     applied to all lanes, we can compute za+152*zb slightly faster via
     shift and add techniques. */

  return wwl_add( wwl_add( za, wwl_shl( zb, 7 ) ), wwl_add( wwl_shl( zb, 4 ), wwl_shl( zb, 3 ) ) );
}
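
/* Example (illustrative): chaining multiplies by folding between them
   (this composition is exactly the fd_r43x6_mul macro below):

     fd_r43x6_t t = fd_r43x6_fold_unsigned( fd_r43x6_mul_fast( x, y ) ); // unreduced (u44)
     fd_r43x6_t r = fd_r43x6_fold_unsigned( fd_r43x6_mul_fast( t, z ) ); // unreduced (u44)
*/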

/* fd_r43x6_sqr_fast(x) returns z = x^2 as an unsigned fd_r43x6_t with
   lanes 6 and 7 zero where x is an unreduced fd_r43x6_t (i.e. in u47).
   Assumes lanes 6 and 7 of x are zero. More specifically,
   u44/u45/u46/u47 inputs produce a u61/u61/u62/u62 output.

   IMPORTANT! z may not be the same representation returned by
   fd_r43x6_mul_fast(x,x). This is because doubling of the off-diagonal
   partial products is done _before_ the multiplications while it is
   done _after_ the multiplications in fd_r43x6_mul. This is faster for
   sqr and reduces the range of the outputs slightly (which can then be
   used to further optimize code using sqr). */

FD_FN_UNUSED FD_FN_CONST static fd_r43x6_t /* Work around -Winline */
fd_r43x6_sqr_fast( fd_r43x6_t x ) {
  wwl_t const zero = wwl_zero();

  /* The goal of this implementation is to compute each product once,
     but to make sure each product gets generated in the right AVX lane
     so that we don't have to do a ton of shuffling at the end. In
     exchange, we do a lot of permutevars upfront to get everything
     setup. It's possible trading permutevars for madd52s might improve
     performance.

     We'll compute
       p0 = x{0,0,0,0,0,0,3,3} * x{0,1,2,3,4,5,3,4}
       p1 = x{4,4,1,1,1,1,1,-} * x{4,5,1,2,3,4,5,-}
       p2 = x{3,-,5,-,2,2,2,2} * x{5,-,5,-,2,3,4,5}
     with the scaling of non-square terms omitted above. A dash
     indicates a don't care value, since we have to do 24
     multiplications but there are only 21 unique terms.

     All the terms of p0 belong in the low final result, but the first
     two terms of p1, and the first and third terms of p2 belong in the
     high final result. The other gotcha is that each multiplication
     has a high and a low component, so we confusingly have two
     different notions of high/low. */

  wwl_t x0 = wwl_permute( wwl( 0L, 0L, 0L, 0L, 0L, 0L, 3L, 3L ), x );
  wwl_t x1 = wwl_permute( wwl( 0L, 1L, 2L, 3L, 4L, 5L, 3L, 4L ), x );
  wwl_t x2 = wwl_permute( wwl( 4L, 4L, 1L, 1L, 1L, 1L, 1L, 7L ), x );
  wwl_t x3 = wwl_permute( wwl( 4L, 5L, 1L, 2L, 3L, 4L, 5L, 7L ), x );
  wwl_t x4 = wwl_permute( wwl( 3L, 7L, 5L, 7L, 2L, 2L, 2L, 2L ), x );
  wwl_t x5 = wwl_permute( wwl( 5L, 7L, 5L, 7L, 2L, 3L, 4L, 5L ), x );

  /* Double the non-square terms. */

  x0 = wwl_shl_vector( x0, wwl( 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L ) );
  x2 = wwl_shl_vector( x2, wwl( 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L ) );
  x4 = wwl_shl_vector( x4, wwl( 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L ) );

  wwl_t p0l = wwl_madd52lo( zero, x0, x1 );
  wwl_t p1l = wwl_madd52lo( zero, x2, x3 );
  wwl_t p2l = wwl_madd52lo( zero, x4, x5 );

  /* Use the same approach as in the multiply to generate the high bits
     of each individual product. */

  wwl_t p0h = wwl_shl( wwl_madd52hi( zero, x0, x1 ), 9 );
  wwl_t p1h = wwl_shl( wwl_madd52hi( zero, x2, x3 ), 9 );
  wwl_t p2h = wwl_shl( wwl_madd52hi( zero, x4, x5 ), 9 );

  /* Generate masks to split p_i into the terms that belong in the high
     word and low word. */

  wwl_t const mask1 = wwl( -1L,-1L, 0L, 0L, 0L, 0L, 0L,0L );
  wwl_t const mask2 = wwl( -1L, 0L,-1L, 0L, 0L, 0L, 0L,0L );
  wwl_t zll = wwl_add( p0l, wwl_add( wwl_andnot( mask1, p1l ), wwl_andnot( mask2, p2l ) ) );
  wwl_t zlh = wwl_add( p0h, wwl_add( wwl_andnot( mask1, p1h ), wwl_andnot( mask2, p2h ) ) );
  wwl_t zhl = wwl_add(      wwl_and   ( mask1, p1l ), wwl_and   ( mask2, p2l )   );
  wwl_t zhh = wwl_add(      wwl_and   ( mask1, p1h ), wwl_and   ( mask2, p2h )   );

  /* Generate zl and zh as in fd_r43x6_mul_fast */

  wwl_t zl = wwl_add( zll, wwl_slide( zero, zlh, 7 ) );
  wwl_t zh = wwl_add( zhl, wwl_add( wwl_slide( zero, zhh, 7 ), wwl_slide( zlh, zero, 7 ) ) );

  wwl_t za = wwl_and( zl, wwl( -1L,-1L,-1L,-1L,-1L,-1L, 0L,0L ) );
  wwl_t zb = wwl_slide( zl, zh, 6 );

  /* By the same type of analysis as above, we can still do the sum
     directly as:

       z{0,1,2,3,4,5} < 2^51 {1826,1371,1222,767,618,163} < 2^62

     (Note that the result fits into u62 instead of u63 like for
     mul_fast.) */

  return wwl_add( wwl_add( za, wwl_shl( zb, 7 ) ), wwl_add( wwl_shl( zb, 4 ), wwl_shl( zb, 3 ) ) );
}
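
/* Example (illustrative): computing z^(2^n) by repeated squaring, the
   pattern used in long sequential chains such as the multiplicative
   inverse mentioned above (n is a hypothetical loop bound):

     fd_r43x6_t t = z; // z unreduced (u47)
     for( int i=0; i<n; i++ ) t = fd_r43x6_fold_unsigned( fd_r43x6_sqr_fast( t ) );
*/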

/* fd_r43x6_scale_fast(x0,y) returns z = <x0,0,0,0,0,0>*y as an unsigned
   fd_r43x6_t with lanes 6 and 7 zero where x0 is in [0,2^47) and y is
   an unreduced fd_r43x6_t (i.e. in u47). Assumes lanes 6 and 7 of y
   are zero. More specifically, u47 inputs produce a u59 output. */

FD_FN_CONST static inline fd_r43x6_t
fd_r43x6_scale_fast( long       _x0,
                     fd_r43x6_t y ) {

  /* This is fd_r43x6_mul_fast with x = <_x0,0,0,0,0,0> and the zero
     values optimized out. See fd_r43x6_mul_fast for a detailed
     explanation of how this works. */

  wwl_t const zero = wwl_zero();

  wwl_t x0 = wwl_bcast( _x0 );

  wwl_t t0 = wwl_madd52lo( zero, x0, y );
  wwl_t t1 = wwl_shl( wwl_madd52hi( zero, x0, y ), 9 );

  wwl_t p0j = t0;
  wwl_t p1j = wwl_slide( zero, t1, 7 );

  wwl_t zl = wwl_add( p0j, p1j );

  wwl_t za = wwl_and( zl, wwl( -1L,-1L,-1L,-1L,-1L,-1L, 0L,0L ) );
  wwl_t zb = wwl_slide( zl, zero, 6 );

  /* At this point:

       za{0,1,2,3,4,5} < 2^51 {2,3,3,3,3,3}
       zb0             < 2^51

     such that:

       z{0,1,2,3,4,5} < 2^51 {154,3,3,3,3,3} < 2^63 */

  return wwl_add( wwl_add( za, wwl_shl( zb, 7 ) ), wwl_add( wwl_shl( zb, 4 ), wwl_shl( zb, 3 ) ) );
}
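
/* Example (an illustrative sketch): scaling by a small constant, e.g.
   the 121666 appearing in the d = -121665/121666 definition above
   (121666 < 2^47):

     fd_r43x6_t r = fd_r43x6_fold_unsigned( fd_r43x6_scale_fast( 121666L, x ) ); // 121666*x, unreduced
*/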

/* fd_r43x6_neg/add/sub/mul/sqr/scale fold the results of their fast
   counterparts above. Given unreduced r43x6_t's (i.e. in u47) with
   lanes 6 and 7 zero and/or x0 in [0,2^47), these return unreduced
   results (in u44) with lanes 6 and 7 zero. */

#define fd_r43x6_neg( x )       fd_r43x6_fold_signed  ( fd_r43x6_neg_fast  ( (x)       ) )
#define fd_r43x6_add( x, y )    fd_r43x6_fold_unsigned( fd_r43x6_add_fast  ( (x), (y)  ) )
#define fd_r43x6_sub( x, y )    fd_r43x6_fold_signed  ( fd_r43x6_sub_fast  ( (x), (y)  ) )
#define fd_r43x6_mul( x, y )    fd_r43x6_fold_unsigned( fd_r43x6_mul_fast  ( (x), (y)  ) )
#define fd_r43x6_sqr( x )       fd_r43x6_fold_unsigned( fd_r43x6_sqr_fast  ( (x)       ) )
#define fd_r43x6_scale( x0, y ) fd_r43x6_fold_unsigned( fd_r43x6_scale_fast( (x0), (y) ) )

/* fd_r43x6_invert(z) returns the multiplicative inverse of z in GF(p)
   as an unreduced fd_r43x6_t (in u44) with lanes 6 and 7 zero where z
   is an unreduced fd_r43x6_t (i.e. in u47) with lanes 6 and 7 zero. */

FD_FN_CONST fd_r43x6_t
fd_r43x6_invert( fd_r43x6_t z );
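
/* Example (an illustrative sketch): verifying an inverse. For non-zero
   z, z*invert(z) represents 1 (fd_r43x6_is_nonzero is declared below):

     fd_r43x6_t zinv = fd_r43x6_invert( z );
     fd_r43x6_t one  = fd_r43x6_mul( z, zinv ); // represents 1
     int        ok   = !fd_r43x6_is_nonzero( fd_r43x6_sub_fast( one, fd_r43x6_one() ) );
*/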

/* Miscellaneous APIs *************************************************/

/* fd_r43x6_if(c,x,y) returns x if c is non-zero and y if not.
   Branchless for deterministic timing. This macro is robust. */

#define fd_r43x6_if(c,x,y) wwl_if( (-!(c)) & 0xff, (y), (x) )

/* fd_r43x6_swap_if(c,x,y) will swap the contents of x and y if c is
   non-zero and leave the contents of x and y unchanged otherwise.
   Branchless for deterministic timing. This macro is robust. */

#define fd_r43x6_swap_if(c,x,y) do { \
    wwl_t _x = (x);                  \
    wwl_t _y = (y);                  \
    int   _m = (-!(c)) & 0xff;       \
    (x) = wwl_if( _m, _x, _y );      \
    (y) = wwl_if( _m, _y, _x );      \
  } while(0)
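
/* Example usage (an illustrative sketch): branchless select and
   conditional swap, e.g. in a constant time ladder step keyed by a
   secret bit b (a hypothetical variable):

     fd_r43x6_t r = fd_r43x6_if( b, x, y ); // r = b ? x : y
     fd_r43x6_swap_if( b, x, y );           // swap x and y iff b is non-zero
*/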

/* fd_r43x6_is_nonzero(x) reduces a signed fd_r43x6_t x (i.e. in s62)
   and returns 0 if the result is zero and 1 if the result is non-zero. */

FD_FN_UNUSED FD_FN_CONST static int /* Work around -Winline */
fd_r43x6_is_nonzero( fd_r43x6_t x ) {
  long l0, l1, l2, l3, l4, l5;
  fd_r43x6_extract_limbs( x, l );                        /* l is signed */
  fd_r43x6_biased_carry_propagate_limbs( l, l, 1L<<20 ); /* l is nearly reduced */
  fd_r43x6_mod_nearly_reduced_limbs( l, l );             /* l is reduced */
  return !!(l0|l1|l2|l3|l4|l5);
}

/* fd_r43x6_diagnose(x) reduces a signed r43x6_t x (i.e. in s62) and
   returns -1 if the result is zero and the least significant bit of the
   reduced result otherwise. */

FD_FN_UNUSED FD_FN_CONST static int /* Work around -Winline */
fd_r43x6_diagnose( fd_r43x6_t x ) {
  long l0, l1, l2, l3, l4, l5;
  fd_r43x6_extract_limbs( x, l );                        /* l is signed */
  fd_r43x6_biased_carry_propagate_limbs( l, l, 1L<<20 ); /* l is nearly reduced */
  fd_r43x6_mod_nearly_reduced_limbs( l, l );             /* l is reduced */
  return fd_int_if( !(l0|l1|l2|l3|l4|l5), -1, (int)(l0 & 1L) ); /* cmov */
}

/* fd_r43x6_pow22523(z) returns z^((p-5)/8) = z^(2^252-3) as an
   unreduced fd_r43x6_t (in u44) with lanes 6 and 7 zero where z is an
   unreduced r43x6_t (i.e. in u47) with lanes 6 and 7 zero. */

FD_FN_CONST fd_r43x6_t
fd_r43x6_pow22523( fd_r43x6_t z );

FD_PROTOTYPES_END

#include "fd_r43x6_inl.h"

#endif /* FD_HAS_AVX512 */

#endif /* HEADER_fd_src_ballet_ed25519_avx512_fd_r43x6_h */
|