LCOV - code coverage report
Current view: top level - ballet/reedsol - fd_reedsol_pi.c
Test: cov.lcov             Lines:     221 / 335    66.0 %
Date: 2025-01-08 12:08:44  Functions:  10 /  13    76.9 %

          Line data    Source code
       1             : #include "fd_reedsol_private.h"
       2             : 
       3             : /* TODO: Move this high-level overview
       4             : 
       5             :    The main Lin, et al. paper proposes a clever method for dealing with
       6             :    erasures.  Suppose there is a polynomial P(x) of degree <k, and we
       7             :    know its value at k inputs, but not necessarily at inputs 0, 1, ...
       8             :    k-1.  We construct a polynomial Pi(x) (that's the uppercase Greek
       9             :    letter, not shorthand for P_i) with zeros at the erasures, i.e.  at
      10             :    the x in [0, n) for which we don't know P(x).  We don't care what
      11             :    value Pi(x) takes at the non-erasures, only that we can compute it.
      12             :    This means we know the value of the product P(x)Pi(x) for
      13             :    each x in [0, n).  Additionally, since the order of Pi(x) is n-k, the
      14             :    order of P(x)Pi(x) is <n.  Now we're back in the regime of
      15             :    Reed-Solomon encoding: we know the first n values of a polynomial of
      16             :    order <n, so we can use the FFT and IFFT operators to evaluate this
      17             :    polynomial computationally efficiently wherever we please.
      18             : 
      19             :    However, we don't actually care about the value of P(x)Pi(x); we want
      20             :    the value of P(x).  To accomplish this, we take the formal derivative
      21             :    ("formal" in the sense that these are polynomials over a finite field
      22             :    not real numbers, but we ignore that for a minute):
      23             :         (P(x)Pi(x))' = P'(x)Pi(x) + P(x)Pi'(x).
      24             :    At erasures, Pi(x) = 0 by construction, so
      25             :         P(x) = (P(x)Pi(x))' / Pi'(x)
      26             : 
      27             :    Now then, all we need is a fast way to compute the value of the
      28             :    formal derivative at some points given its value at 0, 1, ..., n-1.
      29             :    It turns out, the special basis we use for the rest of the
      30             :    Reed-Solomon computation gives us a computationally efficient way to
      31             :    compute formal derivatives.
      32             : 
      33             :    Thus, the overall algorithm is this:
      34             :      1. Compute the value of P(x)Pi(x) at x in [0, n)
      35             :      2. Use the IFFT to represent the polynomial in the coefficient basis.
      36             :      3. Take the formal derivative in the coefficient basis.
      37             :      4. Use the FFT to compute the value of (P(x)Pi(x))' for x in [0, n).
      38             :      5. Compute Pi'(x) directly.
      39             :      6. Divide the result of step 4 by the result of step 5.
      40             : 
      41             :    That's roughly the approach this implementation uses.  In Appendix A,
      42             :    the paper gives some tips on how to optimize the computation of Pi
      43             :    and Pi' using the Fast Walsh-Hadamard transform, and this
      44             :    implementation uses those tips as well. */
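Editorial aside (not part of the source): the identity behind steps 3-6 above, written out in standard notation as a quick check.  By the product rule for formal derivatives,

  \[ (P\,\Pi)'(x) = P'(x)\,\Pi(x) + P(x)\,\Pi'(x), \]

and at an erased position \(e\) we have \(\Pi(e) = 0\), so

  \[ P(e) = \frac{(P\,\Pi)'(e)}{\Pi'(e)}. \]

This is well defined because each erasure is a simple root of \(\Pi\): writing \(\Pi(x) = (x-e)\,Q(x)\) with \(Q(e) \neq 0\) gives \(\Pi'(e) = Q(e) \neq 0\), even over a field of characteristic 2.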
      45             : 
      46             : #if FD_REEDSOL_ARITH_IMPL>0
      47             : 
      48             : /* When using AVX, the representation used for internal computation can
      49             :    be done with unsigned chars or with shorts.  They give the same
      50             :    result in the end, but have different performance characteristics.
      51             :    It's not always obvious which is faster, so this is a compile-time
      52             :    switch for it.
      53             : 
      54             :    When FD_REEDSOL_PI_USE_SHORT==1, the FWHT operates on signed
      55             :    integers, so some care is needed to make sure overflow can't happen.
      56             : 
      57             :    When FD_REEDSOL_PI_USE_SHORT==0, the FWHT operates on a slightly
      58             :    overcomplete representation of the integers mod 255 (yes,
      59             :    unfortunately not mod 256).  In particular, the value 255 is allowed,
      60             :    which is interchangeable with 0. */
      61             : 
      62             : #ifndef FD_REEDSOL_PI_USE_SHORT
      63             : #define FD_REEDSOL_PI_USE_SHORT 0
      64             : #endif
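Editorial aside: because the default above is supplied with #ifndef, a build can select the short-based path without editing this file, e.g. with a hypothetical compiler invocation along the lines of

  cc -DFD_REEDSOL_PI_USE_SHORT=1 ... -c fd_reedsol_pi.c

Per the comment above, both settings give identical results; only the performance characteristics differ.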
      65             : 
      66             : /* Define some helper macros like what we have in util/simd for a vector
      67             :    of shorts.  */
      68             : 
      69             : #include "../../util/simd/fd_sse.h"
      70             : 
      71             : #define ws_adjust_sign(a,b) _mm256_sign_epi16( (a), (b) ) /* scales elements in a by the sign of the corresponding element of b */
      72             : 
      73             : static inline ws_t
      74           0 : ws_mod255( ws_t x ) {
      75             :   /* GCC informs me that for a ushort x,
      76             :      (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
      77             :      We need at least 31 bits of precision for the product, so
      78             :      mulh_epu16 is perfect. */
      79           0 :   return ws_and( ws_bcast( 0xFF ), ws_add( x, ws_shru( wh_mulhi( x, ws_bcast( (short)0x8081 ) ), 7 ) ) );
      80           0 : }
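Editorial aside: a scalar model of the reduction above, not part of the original file.  Writing x = 255*k + r with 0 <= r < 255 gives x + k = 256*k + r, so the low byte of x + floor(x/255) is exactly x mod 255, and (x*0x8081)>>23 equals floor(x/255) for every x < 2^16 (the vector code gets the 23-bit shift as mulhi's implicit >>16 followed by the explicit >>7).

  static inline unsigned char
  scalar_mod255( unsigned short x ) {
    unsigned int q = ( (unsigned int)x * 0x8081U )>>23;   /* == x/255 for all x < 2^16 */
    return (unsigned char)( 0xFFU & ( (unsigned int)x + q ) );
  }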
      81             : 
      82             : /* The following macros implement the unscaled Fast Walsh-Hadamard
      83             :    transform.  As alluded to above, this gives us a way to compute Pi
      84             :    and Pi' in O(n lg n) time.  These are designed for use within this
      85             :    file, not external use.
      86             : 
      87             :    Unlike the rest of the similar-seeming components in fd_reedsol (e.g.
      88             :    FFT, PPT), this computes the transform within a single (or few) AVX
      89             :    vectors, not in parallel across each component of the vector. I.e. if
      90             :    FD_REEDSOL_ARITH_IMPL>0, to compute a 16-element FWHT, you pass one
      91             :    AVX vector (16*short), not 16 vectors.
      92             : 
      93             :    Also unlike the rest of the similar-seeming components in fd_reedsol,
      94             :    this works on the group Z/255Z (integers mod 255).  Since 255 is not
      95             :    a prime, this is not a field, but the FD_REEDSOL_FWHT only needs addition,
      96             :    subtraction, and division by powers of 2 (which have inverses mod
      97             :    255), so it's not a problem.
      98             : 
      99             :    The typical FWHT multiplies by a factor of 1/sqrt(2) at each step.
     100             :    To convert the unscaled version to the scaled version, divide the
     101             :    result by sqrt(2)^lg(N).  Since we often do two transforms, we need
     102             :    to divide by N ( = (sqrt(2)^lg(N))^2 ).
     103             :    */
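Editorial aside: the unscaled transform these macros implement, written as a plain scalar butterfly; this is not part of the original file, but the same loop structure appears in gen_pi_noavx_generic near the end of it.  sz must be a power of 2.  Applying the transform twice returns sz times the original input, which is why the code later multiplies by sz^-1 mod 255 after the second transform.

  static void
  fwht_scalar_ref( long * x, unsigned long sz ) {
    for( unsigned long h=1UL; h<sz; h<<=1 )
      for( unsigned long i=0UL; i<sz; i+=2UL*h )
        for( unsigned long j=i; j<i+h; j++ ) {
          long a = x[ j ];   long b = x[ j+h ];
          x[ j ] = a+b;      x[ j+h ] = a-b;
        }
  }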
     104             : 
     105             : #if FD_REEDSOL_PI_USE_SHORT
     106             : 
     107             : #define FD_REEDSOL_FWHT_16( x )  do { ws_t _x = (x);                                                                  \
     108             :   _x = ws_add( _mm256_setr_m128i( _mm256_extracti128_si256( _x, 1 ), _mm256_extracti128_si256( _x, 0 ) ), \
     109             :       ws_adjust_sign( _x, _mm256_setr_epi16( 1,1,1,1, 1,1,1,1, -1,-1,-1,-1, -1,-1,-1,-1 ) ) ); \
     110             :   _x = ws_add( _mm256_shuffle_epi32( _x, 0x4E ), \
     111             :       ws_adjust_sign( _x, _mm256_setr_epi16( 1,1,1,1, -1,-1,-1,-1, 1,1,1,1, -1,-1,-1,-1 ) ) ); \
     112             :   _x = ws_add( _mm256_shuffle_epi32( _x, 0xB1 ), \
     113             :       ws_adjust_sign( _x, _mm256_setr_epi16( 1,1,-1,-1, 1,1,-1,-1, 1,1,-1,-1, 1,1,-1,-1 ) ) ); \
     114             :   _x = ws_add( _mm256_shuffle_epi8( _x, _mm256_setr_epi8( 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, \
     115             :                                                           2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ) ), \
     116             :           ws_adjust_sign( _x, _mm256_setr_epi16( 1,-1,1,-1, 1,-1,1,-1, 1,-1,1,-1, 1,-1,1,-1 ) ) ); \
     117             :   (x) = _x; } while( 0 )
     118             : 
     119             : #define FD_REEDSOL_FWHT_32( x0, x1 )  do { \
     120             :   ws_t _y0i = (x0);                  ws_t _y1i = (x1);  \
     121             :   ws_t _y0  = ws_add( _y0i, _y1i );  ws_t _y1  = ws_sub( _y0i, _y1i ); \
     122             :   FD_REEDSOL_FWHT_16( _y0 );         FD_REEDSOL_FWHT_16( _y1 ); \
     123             :   (x0) = _y0;                        (x1) = _y1; \
     124             : } while( 0 )
     125             : 
     126             : #define FD_REEDSOL_FWHT_64( x0, x1, x2, x3 )  do { \
     127             :   ws_t _z0, _z1, _z2, _z3; ws_t _z0i, _z1i, _z2i, _z3i; \
     128             :   _z0i = (x0);                  _z1i = (x1);                  _z2i = (x2);                  _z3i = (x3);  \
     129             :   _z0  = ws_add( _z0i, _z2i );  _z1  = ws_add( _z1i, _z3i );  _z2  = ws_sub( _z0i, _z2i );  _z3  = ws_sub( _z1i, _z3i );   \
     130             :   FD_REEDSOL_FWHT_32( _z0, _z1 );                             FD_REEDSOL_FWHT_32( _z2, _z3 ); \
     131             :   (x0) = _z0;                   (x1) = _z1;                   (x2) = _z2;                   (x3) = _z3; \
     132             : } while( 0 )
     133             : 
     134             : #define FD_REEDSOL_FWHT_128( x0, x1, x2, x3, x4, x5, x6, x7 )  do { \
     135             :   ws_t _w0,  _w1,  _w2,  _w3,  _w4,  _w5,  _w6,  _w7; \
     136             :   ws_t _w0i, _w1i, _w2i, _w3i, _w4i, _w5i, _w6i, _w7i; \
     137             :   _w0i = (x0);                  _w1i = (x1);                  _w2i = (x2);                  _w3i = (x3);  \
     138             :   _w4i = (x4);                  _w5i = (x5);                  _w6i = (x6);                  _w7i = (x7);  \
     139             :   _w0  = ws_add( _w0i, _w4i );  _w1  = ws_add( _w1i, _w5i );  _w2  = ws_add( _w2i, _w6i );  _w3  = ws_add( _w3i, _w7i );   \
     140             :   _w4  = ws_sub( _w0i, _w4i );  _w5  = ws_sub( _w1i, _w5i );  _w6  = ws_sub( _w2i, _w6i );  _w7  = ws_sub( _w3i, _w7i );   \
     141             :   FD_REEDSOL_FWHT_64( _w0, _w1, _w2, _w3 );                   FD_REEDSOL_FWHT_64( _w4, _w5, _w6, _w7 ); \
     142             :   (x0) = _w0; (x1) = _w1; (x2) = _w2; (x3) = _w3; (x4) = _w4; (x5) = _w5; (x6) = _w6; (x7) = _w7; \
     143             : } while( 0 )
     144             : 
     145             : #define FD_REEDSOL_FWHT_256( x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 )  do { \
     146             :   ws_t _v0,  _v1,  _v2,  _v3,  _v4,  _v5,  _v6,  _v7,  _v8,  _v9,  _v10,  _v11,  _v12,  _v13,  _v14,  _v15; \
     147             :   ws_t _v0i, _v1i, _v2i, _v3i, _v4i, _v5i, _v6i, _v7i, _v8i, _v9i, _v10i, _v11i, _v12i, _v13i, _v14i, _v15i; \
     148             :   _v0i  = (x0);                 _v1i  = (x1);                  _v2i  = (x2);                  _v3i  = (x3);  \
     149             :   _v4i  = (x4);                 _v5i  = (x5);                  _v6i  = (x6);                  _v7i  = (x7);  \
     150             :   _v8i  = (x8);                 _v9i  = (x9);                  _v10i = (x10);                 _v11i = (x11);  \
     151             :   _v12i = (x12);                _v13i = (x13);                 _v14i = (x14);                 _v15i = (x15);  \
     152             :   _v0  = ws_add( _v0i, _v8i  ); _v1   = ws_add( _v1i, _v9i  ); _v2   = ws_add( _v2i, _v10i ); _v3   = ws_add( _v3i, _v11i );   \
     153             :   _v4  = ws_add( _v4i, _v12i ); _v5   = ws_add( _v5i, _v13i ); _v6   = ws_add( _v6i, _v14i ); _v7   = ws_add( _v7i, _v15i );   \
     154             :   _v8  = ws_sub( _v0i, _v8i  ); _v9   = ws_sub( _v1i, _v9i  ); _v10  = ws_sub( _v2i, _v10i ); _v11  = ws_sub( _v3i, _v11i );   \
     155             :   _v12 = ws_sub( _v4i, _v12i ); _v13  = ws_sub( _v5i, _v13i ); _v14  = ws_sub( _v6i, _v14i ); _v15  = ws_sub( _v7i, _v15i );   \
     156             :   FD_REEDSOL_FWHT_128( _v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7 ); FD_REEDSOL_FWHT_128( _v8, _v9, _v10, _v11, _v12, _v13, _v14, _v15 ); \
     157             :   (x0) = _v0; (x1) = _v1; (x2)  = _v2;  (x3)  = _v3;  (x4)  = _v4;  (x5)  = _v5;  (x6)  = _v6;  (x7)  = _v7; \
     158             :   (x8) = _v8; (x9) = _v9; (x10) = _v10; (x11) = _v11; (x12) = _v12; (x13) = _v13; (x14) = _v14; (x15) = _v15; \
     159             : } while( 0 )
     160             : 
     161             : #else /* FD_REEDSOL_PI_USE_SHORT */
     162             : 
     163             : static inline wb_t
     164        1698 : add_mod_255( wb_t a, wb_t b ) {
     165        1698 :   wb_t sum = wb_add( a, b );
     166        1698 :   wb_t overflowed = wb_lt( sum, a );
     167        1698 :   return wb_sub( sum, overflowed );
     168        1698 : }
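Editorial aside: a scalar model of add_mod_255, not part of the original file.  Inputs are in the overcomplete representation [0,255] described above; since 256 == 1 (mod 255), a wrap of the byte sum is repaired by adding the carry back in, which is what the wb_lt/wb_sub pair above accomplishes (subtracting the all-ones mask is the same as adding 1 in the lanes that wrapped).

  static inline unsigned char
  scalar_add_mod_255( unsigned char a, unsigned char b ) {
    unsigned int s = (unsigned int)a + (unsigned int)b;   /* 0..510 */
    return (unsigned char)( ( s & 0xFFU ) + ( s>>8 ) );   /* s if s<256, else s-255 */
  }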
     169             : 
     170         294 : #define FD_REEDSOL_FWHT_16( x ) do { wb_t _x = (x); \
     171         294 :   wb_t negated, unshifted, shifted; \
     172         294 :   /* Shift by 8 elements (8B) */ \
     173         294 :   negated = wb_sub( wb_bcast( 0xFF ), _x ); \
     174         294 :   unshifted = _mm256_blend_epi32( _x, negated, 0xCC ); \
     175         294 :   shifted = _mm256_shuffle_epi32( _x, 0x4E ); \
     176         294 :   _x = add_mod_255( unshifted, shifted ); \
     177         294 :   /* Shift by 4 elements (4B) */ \
     178         294 :   negated = wb_sub( wb_bcast( 0xFF ), _x ); \
     179         294 :   unshifted = _mm256_blend_epi32( _x, negated, 0xAA ); \
     180         294 :   shifted = _mm256_shuffle_epi32( _x, 0xB1 ); \
     181         294 :   _x = add_mod_255( unshifted, shifted ); \
     182         294 :   /* Shift by 2 elements (2B) */ \
     183         294 :   negated = wb_sub( wb_bcast( 0xFF ), _x ); \
     184         294 :   unshifted = _mm256_blend_epi16( _x, negated, 0xAA ); \
     185         294 :   shifted = wb_exch_adj_pair( _x ); \
     186         294 :   _x = add_mod_255( unshifted, shifted ); \
     187         294 :   /* Shift by 1 element (1B) */ \
     188         294 :   negated = wb_sub( wb_bcast( 0xFF ), _x ); \
     189         294 :   unshifted = _mm256_blendv_epi8( _x, negated, wb_bcast_pair( 0x01, 0xFF ) ); \
     190         294 :   shifted = wb_exch_adj( _x ); \
     191         294 :   _x = add_mod_255( unshifted, shifted ); \
     192         294 :   (x) = _x; \
     193         294 : } while( 0 )
     194             : 
     195         270 : #define FD_REEDSOL_FWHT_32( x ) do { wb_t _y = (x); \
     196         270 :   wb_t negated, unshifted, shifted; \
     197         270 :   /* Shift by 16 elements (16B) */ \
     198         270 :   negated = wb_sub( wb_bcast( 0xFF ), _y ); \
     199         270 :   unshifted = _mm256_blend_epi32( _y, negated, 0xF0 ); \
     200         270 :   shifted = _mm256_setr_m128i( _mm256_extracti128_si256( _y, 1 ), _mm256_extracti128_si256( _y, 0 ) ); \
     201         270 :   _y = add_mod_255( unshifted, shifted ); \
     202         270 :   FD_REEDSOL_FWHT_16( _y ); \
     203         270 :   (x) = _y; \
     204         270 : } while( 0 )
     205             : 
     206         102 : #define FD_REEDSOL_FWHT_64( x0, x1 ) do { wb_t _z0i = (x0); wb_t _z1i = (x1);  \
     207         102 :   wb_t _z0 = add_mod_255( _z0i, _z1i );  wb_t _z1 = add_mod_255( _z0i, wb_sub( wb_bcast( 0xFF ), _z1i ) ); \
     208         102 :   FD_REEDSOL_FWHT_32( _z0 );             FD_REEDSOL_FWHT_32( _z1 ); \
     209         102 :   (x0) = _z0;                            (x1) = _z1; \
     210         102 : } while( 0 )
     211             : 
     212          12 : #define FD_REEDSOL_FWHT_128( x0, x1, x2, x3 ) do { wb_t _w0i = (x0); wb_t _w1i = (x1); wb_t _w2i = (x2); wb_t _w3i = (x3);  \
     213          12 :   wb_t _w0, _w1, _w2, _w3; \
     214          12 :   _w0 = add_mod_255( _w0i, _w2i );                               _w1 = add_mod_255( _w1i, _w3i ); \
     215          12 :   _w2 = add_mod_255( _w0i, wb_sub( wb_bcast( 0xFF ), _w2i ) );   _w3 = add_mod_255( _w1i, wb_sub( wb_bcast( 0xFF ), _w3i ) ); \
     216          12 :   FD_REEDSOL_FWHT_64( _w0, _w1 );                                FD_REEDSOL_FWHT_64( _w2, _w3 ); \
     217          12 :   (x0) = _w0; (x1) = _w1; (x2) = _w2; (x3) = _w3; \
     218          12 : } while( 0 )
     219             : 
     220           0 : #define FD_REEDSOL_FWHT_256( x0, x1, x2, x3, x4, x5, x6, x7 )  do { \
     221           0 :   wb_t _v0,  _v1,  _v2,  _v3,  _v4,  _v5,  _v6,  _v7; \
     222           0 :   wb_t _v0i, _v1i, _v2i, _v3i, _v4i, _v5i, _v6i, _v7i; \
     223           0 :   _v0i = (x0);                  _v1i = (x1);                  _v2i = (x2);                  _v3i = (x3);  \
     224           0 :   _v4i = (x4);                  _v5i = (x5);                  _v6i = (x6);                  _v7i = (x7);  \
     225           0 :   _v0 = add_mod_255( _v0i, _v4i );                               _v1 = add_mod_255( _v1i, _v5i ); \
     226           0 :   _v2 = add_mod_255( _v2i, _v6i );                               _v3 = add_mod_255( _v3i, _v7i ); \
     227           0 :   _v4 = add_mod_255( _v0i, wb_sub( wb_bcast( 0xFF ), _v4i ) );   _v5 = add_mod_255( _v1i, wb_sub( wb_bcast( 0xFF ), _v5i ) ); \
     228           0 :   _v6 = add_mod_255( _v2i, wb_sub( wb_bcast( 0xFF ), _v6i ) );   _v7 = add_mod_255( _v3i, wb_sub( wb_bcast( 0xFF ), _v7i ) ); \
     229           0 :   FD_REEDSOL_FWHT_128( _v0, _v1, _v2, _v3 );                   FD_REEDSOL_FWHT_128( _v4, _v5, _v6, _v7 ); \
     230           0 :   (x0) = _v0; (x1) = _v1; (x2) = _v2; (x3) = _v3; (x4) = _v4; (x5) = _v5; (x6) = _v6; (x7) = _v7; \
     231           0 : } while( 0 )
     232             : #endif
     233             : 
     234             : /* Casts each element of a to a uchar, forming a 16-element uchar
     235             :    vector.  Then casts each element of b to a uchar, forming a second
     236             :    16-element uchar vector.  Concatenates the two 16-element vectors to
     237             :    form a single 32-element wb_t (a first, then b). */
     238             : static inline wb_t
     239         147 : compact_ws( ws_t a, ws_t b ) {
     240             :   /* There's also _mm256_packus_epi16, but it's no better than this */
     241         147 :   wb_t shuffled0 = _mm256_shuffle_epi8(a, wb(  0,  2,  4,  6,  8, 10, 12, 14, 128,128,128,128,128,128,128,128,
     242         147 :                                              128, 128,128,128,128,128,128,128,  0,  2,  4,  6,  8, 10, 12, 14 ) );
     243         147 :   wb_t shuffled1 = _mm256_shuffle_epi8(b, wb(  0,  2,  4,  6,  8, 10, 12, 14, 128,128,128,128,128,128,128,128,
     244         147 :                                              128, 128,128,128,128,128,128,128,  0,  2,  4,  6,  8, 10, 12, 14 ) );
     245         147 :   return _mm256_setr_m128i(
     246         147 :       _mm_or_si128( _mm256_extracti128_si256( shuffled0, 0 ), _mm256_extracti128_si256( shuffled0, 1 ) ),
     247         147 :       _mm_or_si128( _mm256_extracti128_si256( shuffled1, 0 ), _mm256_extracti128_si256( shuffled1, 1 ) ) );
     248         147 : }
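Editorial aside: a scalar model of compact_ws, not part of the original file.  Lane for lane, the shuffle/extract/or sequence above is just narrowing two 16-short vectors to bytes and concatenating them:

  static inline void
  compact_ws_scalar_ref( unsigned char out[ 32 ], short const a[ 16 ], short const b[ 16 ] ) {
    for( int i=0; i<16; i++ ) out[ i    ] = (unsigned char)a[ i ];
    for( int i=0; i<16; i++ ) out[ i+16 ] = (unsigned char)b[ i ];
  }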
     249             : 
     250             : /* exp_{n}( x ) computes n^x_i in GF(2^8) for each byte x_i in the
     251             :    vector x.  That's exponentiation, not xor. For example, exp_76
     252             :    interprets 76 as an element of GF(2^8) and x_i as an integer, and
     253             :    computes the product of multiplying GF(76) times itself x_i times.
     254             :    Recall that exponentiation is an operator from (GF(2^8) x (Z/255Z))
     255             :    -> GF(2^8), so x is interpreted mod 255.  (Equivalently, observe
     256             :    n^255=1.)  An input of x_i=255 is okay and gives the same result as x_i=0. */
     257             : static inline wb_t
     258          12 : exp_76( wb_t x ) {
     259             :   /* Decompose x = xh3*0x80 + xh2*0x40 + xh1*0x20 + xh0*0x10 + xl
     260             :      where 0<=xl<16 and xh_j in {0,1} for each j.  Then
     261             : 
     262             :        76^x = (76^xl) * (76^0x10)^xh0 * (76^0x20)^xh1 *
     263             :                         (76^0x40)^xh2 * (76^0x80)^xh3
     264             :              = (76^xl) * 2^xh0 * 4^xh1 * 16^xh2 * 29^xh3.
     265             : 
     266             :     We use vpshufb to implement the 4-bit lookup table 76^xl.  For
     267             :     the rest, we're either multiplying by a constant or not doing the
     268             :     multiply, so we can use our normal GF_MUL with a blend. */
     269             : 
     270          12 :   wb_t low = wb_and( x, wb_bcast( 0xF ) );
     271          12 :   wb_t exp_low = _mm256_shuffle_epi8( wb( 1,  76, 157,  70,  95, 253, 217, 129, 133, 168, 230, 227, 130,  81, 18,  44,
     272          12 :                                           1,  76, 157,  70,  95, 253, 217, 129, 133, 168, 230, 227, 130,  81, 18,  44 ),
     273          12 :                                       low );
     274          12 :   wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low,  2 ), _mm256_slli_epi16( x, 3 ) );
     275          12 :   wb_t with1 = _mm256_blendv_epi8( with0,   GF_MUL( with0,    4 ), _mm256_slli_epi16( x, 2 ) );
     276          12 :   wb_t with2 = _mm256_blendv_epi8( with1,   GF_MUL( with1,   16 ), _mm256_slli_epi16( x, 1 ) );
     277          12 :   wb_t with3 = _mm256_blendv_epi8( with2,   GF_MUL( with2,   29 ),                    x      );
     278          12 :   return with3;
     279          12 : }
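Editorial aside: a scalar cross-check for the exp_{n} family, not part of the original file.  The exp_2 table further down (1, 2, 4, ..., 128, 29, 58, ...) pins the field down as GF(2^8) with reduction polynomial 0x11D and primitive element 2; under that assumption, n^x is just n multiplied by itself x times.  gf_mul_scalar is a hypothetical helper, not fd_reedsol's GF_MUL, and gf_exp_scalar( 76, t ) should agree lane-by-lane with exp_76 for byte inputs t.

  static inline unsigned char
  gf_mul_scalar( unsigned char a, unsigned char b ) {
    unsigned int acc = 0U;
    for( int i=0; i<8;  i++  ) if( b   & (1U<<i) ) acc ^= ((unsigned int)a)<<i;  /* carryless product */
    for( int i=14; i>=8; i-- ) if( acc & (1U<<i) ) acc ^= 0x11DU<<(i-8);         /* reduce mod x^8+x^4+x^3+x^2+1 */
    return (unsigned char)acc;
  }

  static inline unsigned char
  gf_exp_scalar( unsigned char n, unsigned char x ) {   /* n^x; x=255 gives the same result as x=0 */
    unsigned char r = 1;
    for( unsigned int i=0U; i<x; i++ ) r = gf_mul_scalar( r, n );
    return r;
  }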
     280             : 
     281             : static inline wb_t
     282          33 : exp_29( wb_t x ) {
     283          33 :   wb_t low = wb_and( x, wb_bcast( 0xF ) );
     284          33 :   wb_t exp_low = _mm256_shuffle_epi8( wb( 1,  29,  76, 143, 157, 106,  70,  93,  95, 101, 253, 254, 217,  13, 129,  59,
     285          33 :                                           1,  29,  76, 143, 157, 106,  70,  93,  95, 101, 253, 254, 217,  13, 129,  59 ),
     286          33 :                                       low );
     287          33 :   wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 133 ), _mm256_slli_epi16( x, 3 ) );
     288          33 :   wb_t with1 = _mm256_blendv_epi8( with0,   GF_MUL( with0,     2 ), _mm256_slli_epi16( x, 2 ) );
     289          33 :   wb_t with2 = _mm256_blendv_epi8( with1,   GF_MUL( with1,     4 ), _mm256_slli_epi16( x, 1 ) );
     290          33 :   wb_t with3 = _mm256_blendv_epi8( with2,   GF_MUL( with2,    16 ),                    x      );
     291          33 :   return with3;
     292          33 : }
     293             : static inline wb_t
     294          78 : exp_16( wb_t x ) {
     295          78 :   wb_t low = wb_and( x, wb_bcast( 0xF ) );
     296          78 :   wb_t exp_low = _mm256_shuffle_epi8( wb( 1,  16,  29, 205,  76, 180, 143,  24, 157,  37, 106, 238,  70,  20,  93, 185,
     297          78 :                                           1,  16,  29, 205,  76, 180, 143,  24, 157,  37, 106, 238,  70,  20,  93, 185 ),
     298          78 :                                       low );
     299          78 :   wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low,  95 ), _mm256_slli_epi16( x, 3 ) );
     300          78 :   wb_t with1 = _mm256_blendv_epi8( with0,   GF_MUL( with0,   133 ), _mm256_slli_epi16( x, 2 ) );
     301          78 :   wb_t with2 = _mm256_blendv_epi8( with1,   GF_MUL( with1,     2 ), _mm256_slli_epi16( x, 1 ) );
     302          78 :   wb_t with3 = _mm256_blendv_epi8( with2,   GF_MUL( with2,     4 ),                    x      );
     303          78 :   return with3;
     304          78 : }
     305             : static inline wb_t
     306          24 : exp_4( wb_t x ) {
     307          24 :   wb_t low = wb_and( x, wb_bcast( 0xF ) );
     308          24 :   wb_t exp_low = _mm256_shuffle_epi8( wb( 1,   4,  16,  64,  29, 116, 205,  19,  76,  45, 180, 234, 143,   6,  24,  96,
     309          24 :                                           1,   4,  16,  64,  29, 116, 205,  19,  76,  45, 180, 234, 143,   6,  24,  96 ),
     310          24 :                                       low );
     311          24 :   wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 157 ), _mm256_slli_epi16( x, 3 ) );
     312          24 :   wb_t with1 = _mm256_blendv_epi8( with0,   GF_MUL( with0,    95 ), _mm256_slli_epi16( x, 2 ) );
     313          24 :   wb_t with2 = _mm256_blendv_epi8( with1,   GF_MUL( with1,   133 ), _mm256_slli_epi16( x, 1 ) );
     314          24 :   wb_t with3 = _mm256_blendv_epi8( with2,   GF_MUL( with2,     2 ),                    x      );
     315          24 :   return with3;
     316          24 : }
     317             : 
     318             : static inline wb_t
     319           0 : exp_2( wb_t x ) {
     320           0 :   wb_t low = wb_and( x, wb_bcast( 0xF ) );
     321           0 :   wb_t exp_low = _mm256_shuffle_epi8( wb(   1,   2,   4,   8,  16,  32,  64, 128,  29,  58, 116, 232, 205, 135,  19,  38,
     322           0 :                                             1,   2,   4,   8,  16,  32,  64, 128,  29,  58, 116, 232, 205, 135,  19,  38),
     323           0 :                                       low );
     324           0 :   wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low,  76 ), _mm256_slli_epi16( x, 3 ) );
     325           0 :   wb_t with1 = _mm256_blendv_epi8( with0,   GF_MUL( with0,   157 ), _mm256_slli_epi16( x, 2 ) );
     326           0 :   wb_t with2 = _mm256_blendv_epi8( with1,   GF_MUL( with1,    95 ), _mm256_slli_epi16( x, 1 ) );
     327           0 :   wb_t with3 = _mm256_blendv_epi8( with2,   GF_MUL( with2,   133 ),                    x      );
     328           0 :   return with3;
     329           0 : }
     330             : 
     331             : #endif /* FD_REEDSOL_ARITH_IMPL>0 */
     332             : 
     333             : /* l_twiddle_{N} stores the size N FWHT of what the paper calls L~, i.e.
     334             :          ( 0, Log(1), Log(2), Log(3), ... Log(N-1) )
     335             : 
     336             :    The discrete log uses a primitive element of 2, and the FWHT is taken
     337             :    mod 255, which means all of the values can fit in a uchar.  However,
     338             :    Intel doesn't give us a multiplication instruction for 8-bit
     339             :    integers, which means that we'd have to zero-extend these values
     340             :    anyway.
     341             : 
     342             :    Although L~ for a smaller size is a subset of L~ for a larger size,
     343             :    we store a separate table for each size because what we actually
     344             :    store is the precomputed FWHT of L~.  Perhaps a good compiler could
     345             :    constant-propagate through the AVX instructions, but it's just 5
     346             :    values of N, so I prefer not to depend on that. */
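Editorial aside: a sketch of how a table like fwht_l_twiddle_16 could be regenerated offline, not code from this file.  gf_log2[] is a hypothetical 256-entry discrete-log table for the same field and primitive element 2; index 0 is mapped to 0, matching the definition of L~ above.  As a spot check, the sum of the logs of 1..15 in GF(2^8)/0x11D is 1222, and 1222 mod 255 = 202 = 0xca, which matches the first entry of fwht_l_twiddle_16 below.

  static void
  gen_fwht_l_twiddle_ref( short * out, unsigned long sz, unsigned char const * gf_log2 ) {
    long tmp[ 256 ];
    tmp[ 0 ] = 0L;
    for( unsigned long i=1UL; i<sz; i++ ) tmp[ i ] = (long)gf_log2[ i ];
    for( unsigned long h=1UL; h<sz; h<<=1 )               /* unscaled FWHT, as in the scalar sketch above */
      for( unsigned long i=0UL; i<sz; i+=2UL*h )
        for( unsigned long j=i; j<i+h; j++ ) {
          long a = tmp[ j ];  long b = tmp[ j+h ];
          tmp[ j ] = a+b;     tmp[ j+h ] = a-b;
        }
    for( unsigned long i=0UL; i<sz; i++ ) out[ i ] = (short)( ( ( tmp[ i ] % 255L ) + 255L ) % 255L );
  }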
     347             : static const short fwht_l_twiddle_16 [  16 ] = {0xca,0xa1,0x6a,0xa9,0x73,0xfc,0xe2,0x44,0x93,0x74,0x08,0x7f,0x96,0x8c,0x42,0xf2};
     348             : static const short fwht_l_twiddle_32 [  32 ] = {0x24,0x8f,0xc2,0x7e,0x49,0x89,0x74,0xdc,0x4f,0x95,0x43,0xb4,0x09,0xba,0x03,0x83,
     349             :                                                 0x71,0xb3,0x12,0xd4,0x9d,0x70,0x51,0xab,0xd7,0x53,0xcc,0x4a,0x24,0x5e,0x81,0x62};
     350             : static const short fwht_l_twiddle_64 [  64 ] = {0x05,0x81,0x9a,0x82,0x07,0x7c,0x3c,0xbe,0xd3,0xbc,0xed,0x23,0xc2,0x24,0xee,0xc8,
     351             :                                                 0x3f,0x5d,0x11,0x18,0x8a,0xf9,0x1c,0x4b,0x0e,0x02,0x8e,0xe4,0x77,0x8c,0x97,0x6d,
     352             :                                                 0x43,0x9d,0xea,0x7a,0x8b,0x96,0xac,0xfa,0xca,0x6e,0x98,0x46,0x4f,0x51,0x17,0x3e,
     353             :                                                 0xa3,0x0a,0x13,0x91,0xb0,0xe6,0x86,0x0c,0xa1,0xa4,0x0b,0xaf,0xd0,0x30,0x6b,0x57};
     354             : static const short fwht_l_twiddle_128[ 128 ] = {0xfe,0x89,0x15,0xeb,0x48,0xea,0x04,0xfe,0x32,0xd9,0xca,0x2c,0x1e,0x58,0x8d,0xed,
     355             :                                                 0x6f,0x36,0x53,0x24,0xb2,0x27,0x3e,0x06,0xec,0x96,0x41,0x05,0xbe,0x1d,0xb1,0xdd,
     356             :                                                 0x18,0x64,0xf4,0xc3,0x16,0x0a,0x2e,0x00,0xde,0x34,0xaf,0x42,0xd7,0x5e,0x92,0x02,
     357             :                                                 0xbf,0x5a,0x6a,0x97,0xe1,0x39,0xd0,0xf6,0x66,0x86,0xb5,0x61,0x8a,0xa2,0x8f,0x49,
     358             :                                                 0x0b,0x79,0x20,0x19,0xc5,0x0e,0x74,0x7e,0x75,0x9f,0x11,0x1a,0x67,0xef,0x50,0xa3,
     359             :                                                 0x0f,0x84,0xce,0x0c,0x62,0xcc,0xf9,0x90,0x2f,0x6d,0xdb,0xc4,0x30,0xfb,0x7d,0xfc,
     360             :                                                 0x6e,0xd6,0xe0,0x31,0x01,0x23,0x2b,0xf5,0xb6,0xa8,0x81,0x4a,0xc6,0x44,0x9b,0x7a,
     361             :                                                 0x87,0xb9,0xbb,0x8b,0x7f,0x94,0x3c,0x21,0xdc,0xc2,0x60,0xfd,0x17,0xbd,0x47,0x65};
     362             : static const short fwht_l_twiddle_256[ 256 ] = {0x00,0xfc,0xfb,0x15,0x2d,0xfa,0xc1,0x14,0x62,0x2c,0xd9,0xf9,0xc0,0x45,0x13,0xe8,
     363             :                                                 0x01,0x61,0x86,0x2b,0xd8,0xba,0xf8,0x5d,0xbf,0x7a,0x44,0x6a,0x07,0x12,0xf1,0xe7,
     364             :                                                 0x00,0xdc,0x60,0x0a,0x1f,0x85,0x1c,0x2a,0x8b,0xd7,0x92,0xb9,0xf7,0x82,0x5c,0xad,
     365             :                                                 0x19,0xbe,0xb1,0x79,0x43,0x3d,0x69,0x9e,0x06,0x75,0x11,0x27,0x70,0xf0,0xd2,0xe6,
     366             :                                                 0xfe,0x2f,0xdb,0xea,0x88,0x5f,0x7c,0x09,0x0c,0x1e,0x8d,0x84,0x1b,0x3f,0x29,0xd4,
     367             :                                                 0x31,0x8a,0x8f,0xd6,0x91,0xcb,0xb8,0xc9,0xf6,0xb6,0x81,0x39,0xc7,0x5b,0x55,0xac,
     368             :                                                 0x18,0x65,0xbd,0xf4,0x22,0xb0,0xb4,0x78,0x7f,0x42,0x34,0x3c,0x68,0x37,0x9d,0x4e,
     369             :                                                 0xc5,0x05,0x96,0x74,0x10,0x59,0x26,0x9a,0x6f,0xa3,0xef,0x53,0x4b,0xd1,0xaa,0xe5,
     370             :                                                 0xfd,0x16,0x2e,0xc2,0x63,0xda,0x46,0xe9,0x02,0x87,0xbb,0x5e,0x7b,0x6b,0x08,0xf2,
     371             :                                                 0xdd,0x0b,0x20,0x1d,0x8c,0x93,0x83,0xae,0x1a,0xb2,0x3e,0x9f,0x76,0x28,0x71,0xd3,
     372             :                                                 0x30,0xeb,0x89,0x7d,0x0d,0x8e,0x40,0xd5,0x32,0x90,0xcc,0xca,0xb7,0x3a,0xc8,0x56,
     373             :                                                 0x66,0xf5,0x23,0xb5,0x80,0x35,0x38,0x4f,0xc6,0x97,0x5a,0x9b,0xa4,0x54,0x4c,0xab,
     374             :                                                 0x17,0xc3,0x64,0x47,0x03,0xbc,0x6c,0xf3,0xde,0x21,0x94,0xaf,0xb3,0xa0,0x77,0x72,
     375             :                                                 0xec,0x7e,0x0e,0x41,0x33,0xcd,0x3b,0x57,0x67,0x24,0x36,0x50,0x98,0x9c,0xa5,0x4d,
     376             :                                                 0xc4,0x48,0x04,0x6d,0xdf,0x95,0xa1,0x73,0xed,0x0f,0xce,0x58,0x25,0x51,0x99,0xa6,
     377             :                                                 0x49,0x6e,0xe0,0xa2,0xee,0xcf,0x52,0xa7,0x4a,0xe1,0xd0,0xa8,0xe2,0xa9,0xe3,0xe4};
     378             : 
     379             : #if FD_REEDSOL_ARITH_IMPL==0
     380             : static void
     381             : gen_pi_noavx_generic( uchar const * is_erased,
     382             :                       uchar       * output,
     383             :                       ulong         sz,
     384             :                       const short * l_twiddle ) {
     385             :   long scratch[ 256 ];
     386             : 
     387             :   for( ulong i=0UL; i<sz; i++ ) scratch[ i ] = is_erased[ i ];
     388             : 
     389             :   /* Unscaled FWHT */
     390             :   for( ulong h=1UL; h<sz; h<<=1 ) {
     391             :     for( ulong i=0UL; i<sz; i += 2UL*h ) for( ulong j=i; j<i+h; j++ ) {
     392             :       long x = scratch[ j   ];
     393             :       long y = scratch[ j+h ];
     394             :       scratch[ j   ] = x+y;
     395             :       scratch[ j+h ] = x-y;
     396             :     }
     397             :   }
     398             : 
     399             :   for( ulong i=0UL; i<sz; i++ ) scratch[ i ] *= l_twiddle[ i ];
     400             : 
     401             :   for( ulong h=1UL; h<sz; h<<=1 ) {
     402             :     for( ulong i=0UL; i<sz; i += 2UL*h ) for( ulong j=i; j<i+h; j++ ) {
     403             :       long x = scratch[ j   ];
     404             :       long y = scratch[ j+h ];
     405             :       scratch[ j   ] = x+y;
     406             :       scratch[ j+h ] = x-y;
     407             :     }
     408             :   }
     409             : 
     410             :   /* Negate the ones corresponding to erasures to compute 1/Pi' */
     411             :   for( ulong i=0UL; i<sz; i++ ) scratch[ i ] *= fd_long_if( is_erased[ i ], -1L, 1L );
     412             : 
     413             :   /* To fix the FWHT scaling, we need to multiply by sz^-1 mod 255.
     414             :      Given that sz is always a power of 2, this is not too bad.
     415             :      Let s = lg(sz).  Note that 2^8 == 256 == 1 (mod 255),
     416             :      so then (2^s) * (2^(8-s)) == 1    (mod 255).
     417             :      This implies that sz^-1 = 2^(8-s) (mod 255), and we can compute
     418             :      2^(8-s) as 256/sz with normal integer division (e.g. 32^-1 = 8). */
     419             :   long sz_inv = 256L/(long)sz;
     420             :   /* The % operator in C doesn't work like the mathematical mod
     421             :      operation for negative numbers, so we need to add in a shift to
     422             :      make any input non-negative.  We can compute the smallest possible
     423             :      value at this point and it's -142397, so we add 255*559=142545. */
     424             :   for( ulong i=0UL; i<sz; i++ ) scratch[ i ] = (sz_inv*(scratch[ i ] + 255L*559L)) % 255L;
     425             : 
     426             :   for( ulong i=0UL; i<sz; i++ ) output[ i ] = (uchar)gf_arith_invlog_tbl[ scratch[ i ] ];
     427             : }
     428             : #endif
     429             : 
     430             : void
     431             : fd_reedsol_private_gen_pi_16( uchar const * is_erased,
     432          12 :                               uchar       * output ) {
     433          12 : #if FD_REEDSOL_ARITH_IMPL>0
     434             : #if FD_REEDSOL_PI_USE_SHORT
     435             :   ws_t erased_vec = _mm256_cvtepu8_epi16( vb_ld( is_erased ) );
     436             : 
     437             :   ws_t transformed = erased_vec;
     438             :   FD_REEDSOL_FWHT_16( transformed ); /* FWHT( R~ ) */
     439             :   /* |transformed| <= 16 */
     440             : 
     441             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
     442             :      |product| <= 16*255, but definitely may be negative */
     443             :   ws_t product = ws_mullo( transformed, ws_ld( fwht_l_twiddle_16 ) );
     444             : 
     445             :   /* log_pi is congruent (mod 255) to what the paper calls
     446             :      R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
     447             :      Let H be the 16x16 Hadamard matrix.  Then
     448             :          log_pi = (H * diag( fwht_l_twiddle_16 ) * H) * erased_vec
     449             :                   |---------------------------------|
     450             :                                 M
     451             :      The part labeled M is a matrix known ahead of time, so we can bound
     452             :      log_pi relatively easily when combined with the fact
     453             :      0<=erased_vec<=1 (for an upper bound, zero the negative entries of
     454             :      M and multiply by the all-ones vector; for a lower bound, zero the
     455             :      positive entries instead).  This tells us |log_pi| <= 4489 */
     456             :   FD_REEDSOL_FWHT_16( product );
     457             :   ws_t log_pi = product;
     458             : 
     459             :   /* Negate the ones corresponding to erasures to compute 1/Pi' */
     460             :   log_pi = ws_adjust_sign( log_pi, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec, 1 ) ) );
     461             : 
     462             :   log_pi = ws_add( log_pi, ws_bcast( (short)(255*18) ) ); /* Now 0<= log_pi <= 9079 < 2^15 */
     463             : 
     464             :   /* GCC informs me that for a ushort x,
     465             :      (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
     466             :      We need at least 31 bits of precision for the product, so
     467             :      mulh_epu16 is perfect. */
     468             :   log_pi = ws_and( ws_bcast( 0xFF ), ws_add( log_pi, ws_shru( wh_mulhi( log_pi, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     469             :   /* Now 0<=log_pi < 255 */
     470             : 
     471             :   /* Since our FWHT implementation is unscaled, we've computed a value
     472             :      16 times larger than what we'd like.  16^-1 == 16 (mod 255), but
     473             :      we're just going to use this in the exponentiation, so we can
     474             :      compute this implicitly.
     475             :        2^(log_pi * 16^-1) = 2^(16*log_pi) = (2^16)^log_pi = 76^log_pi
     476             :      (where 2 is the primitive element we used for the logs, an element
     477             :      of GF(2^8) ). */
     478             : 
     479             :   wb_t compact_log_pi = compact_ws( log_pi, ws_zero() );
     480             :   wb_t pi = exp_76( compact_log_pi );
     481             : 
     482             :   vb_st( output, _mm256_extracti128_si256( pi, 0 ) );
     483             : 
     484             : #else
     485          12 :   wb_t erased_vec = _mm256_setr_m128i( vb_ldu( is_erased ), _mm_setzero_si128() );
     486          12 :   wb_t to_transform = erased_vec;
     487          12 :   FD_REEDSOL_FWHT_16( to_transform );
     488          12 :   ws_t transformed = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform, 0 ) );
     489             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
     490             :      0<=product<256*255, so product is interpreted as unsigned. */
     491          12 :   ws_t product = ws_mullo( transformed, ws_ld( fwht_l_twiddle_16 ) );
     492             : 
     493             :   /* Compute mod 255, using the same approach as above. */
     494          12 :   product = ws_and( ws_bcast( 0xFF ), ws_add( product, ws_shru( wh_mulhi( product, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     495          12 :   wb_t compact_product = compact_ws( product, ws_zero() );
     496             : 
     497          12 :   FD_REEDSOL_FWHT_16( compact_product );
     498             : 
     499             :   /* Negate the ones corresponding to erasures */
     500          12 :   compact_product = wb_if( wb_eq( erased_vec, wb_zero() ), compact_product, wb_sub( wb_bcast( 255 ), compact_product ) );
     501             : 
     502          12 :   wb_t pi = exp_76( compact_product );
     503          12 :   vb_st( output, _mm256_extracti128_si256( pi, 0 ) );
     504          12 : #endif
     505             : #else /* No AVX implementation */
     506             : 
     507             :   gen_pi_noavx_generic( is_erased, output, 16UL, fwht_l_twiddle_16 );
     508             : 
     509             : #endif
     510          12 : }
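Editorial aside: a hypothetical caller for the function above, to make its contract concrete; the driver is illustrative and not part of fd_reedsol.  The alignment attribute is defensive, in case the vector loads/stores above expect aligned buffers.

  static void
  example_gen_pi_16( void ) {
    uchar is_erased[ 16 ] __attribute__((aligned(32))) = { 0,1,0,0, 0,0,1,0, 0,0,0,0, 0,0,0,1 }; /* positions 1, 6, 15 lost */
    uchar pi       [ 16 ] __attribute__((aligned(32)));
    fd_reedsol_private_gen_pi_16( is_erased, pi );
    /* Per the comments in this file, pi[ i ] now holds the per-position
       factor the recover path consumes: 1/Pi'(i) at erased positions and
       (by the construction described at the top of the file) Pi(i) at the
       surviving ones, as elements of GF(2^8). */
  }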
     511             : 
     512             : void
     513             : fd_reedsol_private_gen_pi_32( uchar const * is_erased,
     514          33 :                               uchar       * output ) {
     515          33 : #if FD_REEDSOL_ARITH_IMPL>0
     516             : #if FD_REEDSOL_PI_USE_SHORT
     517             :   ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased        ) );
     518             :   ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 16UL ) );
     519             : 
     520             :   ws_t transformed0 = erased_vec0;
     521             :   ws_t transformed1 = erased_vec1;
     522             :   FD_REEDSOL_FWHT_32( transformed0, transformed1 ); /* FWHT( R~ ) */
     523             :   /* |transformed| <= 32 */
     524             : 
     525             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
     526             :      |product| <= 32*255, but definitely may be negative */
     527             :   ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_32        ) );
     528             :   ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_32 + 16UL ) );
     529             : 
     530             :   /* log_pi is congruent (mod 255) to what the paper calls
     531             :      R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
     532             :      |log_pi| <= 6945 using the same approach as above. */
     533             :   FD_REEDSOL_FWHT_32( product0, product1 );
     534             :   ws_t log_pi0 = product0;
     535             :   ws_t log_pi1 = product1;
     536             : 
     537             :   /* Negate the ones corresponding to erasures to compute 1/Pi' */
     538             :   log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
     539             :   log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );
     540             : 
     541             :   log_pi0 = ws_add( log_pi0, ws_bcast( (short)(255*28) ) );
     542             :   log_pi1 = ws_add( log_pi1, ws_bcast( (short)(255*28) ) ); /* Now 0<= log_pi <= 14085 < 2^15 */
     543             : 
     544             :   /* GCC informs me that for a ushort x,
     545             :      (x%255) == 0xFF & ( x + (x*0x8081)>>23).
     546             :      We need at least 31 bits of precision for the product, so
     547             :      mulh_epu16 is perfect. */
     548             :   log_pi0 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi0, ws_shru( wh_mulhi( log_pi0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     549             :   log_pi1 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi1, ws_shru( wh_mulhi( log_pi1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     550             :   /* Now 0<=log_pi < 255 */
     551             : 
     552             :   /* Since our FWHT implementation is unscaled, we've computed a value
     553             :      32 times larger than what we'd like.  32^-1 == 8 (mod 255), but
     554             :      we're just going to use this in the exponentiation, so we can
     555             :      compute this implicitly.
     556             :        2^(log_pi * 32^-1) = 2^(8*log_pi) = (2^8)^log_pi = 29^log_pi
     557             :      (where 2 is the primitive element we used for the logs, an element
     558             :      of GF(2^8) ). */
     559             : 
     560             :   wb_t compact_log_pi = compact_ws( log_pi0, log_pi1 );
     561             :   wb_t pi = exp_29( compact_log_pi );
     562             : 
     563             :   wb_st( output, pi );
     564             : 
     565             : #else
     566          33 :   wb_t erased_vec = wb_ld( is_erased );
     567          33 :   wb_t to_transform = erased_vec;
     568          33 :   FD_REEDSOL_FWHT_32( to_transform );
     569          33 :   ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform, 0 ) );
     570          33 :   ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform, 1 ) );
     571             : 
     572             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
     573             :      0<=product<256*255, so product is interpreted as unsigned. */
     574          33 :   ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_32        ) );
     575          33 :   ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_32 + 16UL ) );
     576             : 
     577             :   /* Compute mod 255, using the same approach as above. */
     578          33 :   product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( wh_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     579          33 :   product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( wh_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     580          33 :   wb_t compact_product = compact_ws( product0, product1 );
     581             : 
     582          33 :   FD_REEDSOL_FWHT_32( compact_product );
     583             : 
     584             :   /* Negate the ones corresponding to erasures */
     585          33 :   compact_product = wb_if( wb_eq( erased_vec, wb_zero() ), compact_product, wb_sub( wb_bcast( 255 ), compact_product ) );
     586             : 
     587          33 :   wb_t pi = exp_29( compact_product );
     588          33 :   wb_st( output, pi );
     589          33 : #endif
     590             : #else /* No AVX implementation */
     591             : 
     592             :   gen_pi_noavx_generic( is_erased, output, 32UL, fwht_l_twiddle_32 );
     593             : 
     594             : #endif
     595          33 : }
     596             : 
     597             : void
     598             : fd_reedsol_private_gen_pi_64( uchar const * is_erased,
     599          39 :                               uchar       * output ) {
     600          39 : #if FD_REEDSOL_ARITH_IMPL>0
     601             : #if FD_REEDSOL_PI_USE_SHORT
     602             :   ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased        ) );
     603             :   ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 16UL ) );
     604             :   ws_t erased_vec2 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 32UL ) );
     605             :   ws_t erased_vec3 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 48UL ) );
     606             : 
     607             :   ws_t transformed0 = erased_vec0;
     608             :   ws_t transformed1 = erased_vec1;
     609             :   ws_t transformed2 = erased_vec2;
     610             :   ws_t transformed3 = erased_vec3;
     611             :   FD_REEDSOL_FWHT_64( transformed0, transformed1, transformed2, transformed3 ); /* FWHT( R~ ) */
     612             :   /* |transformed| <= 64 */
     613             : 
     614             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
     615             :      |product| <= 64*255, but definitely may be negative */
     616             :   ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_64        ) );
     617             :   ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_64 + 16UL ) );
     618             :   ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_64 + 32UL ) );
     619             :   ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_64 + 48UL ) );
     620             : 
     621             :   /* log_pi is congruent (mod 255) to what the paper calls
     622             :      R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
     623             :      |log_pi| <= 18918 using the same approach as above. */
     624             :   FD_REEDSOL_FWHT_64( product0, product1, product2, product3 );
     625             :   ws_t log_pi0 = product0;
     626             :   ws_t log_pi1 = product1;
     627             :   ws_t log_pi2 = product2;
     628             :   ws_t log_pi3 = product3;
     629             : 
     630             :   /* Negate the ones corresponding to erasures to compute 1/Pi' */
     631             :   log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
     632             :   log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );
     633             :   log_pi2 = ws_adjust_sign( log_pi2, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec2, 1 ) ) );
     634             :   log_pi3 = ws_adjust_sign( log_pi3, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec3, 1 ) ) );
     635             : 
     636             :   log_pi0 = ws_add( log_pi0, ws_bcast( (short)(255*75) ) );
     637             :   log_pi1 = ws_add( log_pi1, ws_bcast( (short)(255*75) ) );
     638             :   log_pi2 = ws_add( log_pi2, ws_bcast( (short)(255*75) ) );
     639             :   log_pi3 = ws_add( log_pi3, ws_bcast( (short)(255*75) ) );
     640             :   /* Now 0<= log_pi <= 38043 < 2^16 (okay, since the next step treats it as unsigned) */
     641             : 
     642             :   /* GCC informs me that for a ushort x,
     643             :      (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
     644             :      We need at least 31 bits of precision for the product, so
     645             :      mulh_epu16 is perfect. */
     646             :   log_pi0 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi0, ws_shru( wh_mulhi( log_pi0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     647             :   log_pi1 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi1, ws_shru( wh_mulhi( log_pi1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     648             :   log_pi2 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi2, ws_shru( wh_mulhi( log_pi2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     649             :   log_pi3 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi3, ws_shru( wh_mulhi( log_pi3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     650             :   /* Now 0<=log_pi < 255 */
     651             : 
     652             :   /* Since our FWHT implementation is unscaled, we've computed a value
     653             :      64 times larger than what we'd like.  64^-1 == 4 (mod 255), but
     654             :      we're just going to use this in the exponentiation, so we can
     655             :      compute this implicitly.
     656             :        2^(log_pi * 64^-1) = 2^(4*log_pi) = (2^4)^log_pi = 16^log_pi
     657             :      (where 2 is the primitive element we used for the logs, an element
     658             :      of GF(2^8) ). */
     659             : 
     660             :   wb_t compact_log_pi0 = compact_ws( log_pi0, log_pi1 );
     661             :   wb_t compact_log_pi1 = compact_ws( log_pi2, log_pi3 );
     662             :   wb_t pi0 = exp_16( compact_log_pi0 );
     663             :   wb_t pi1 = exp_16( compact_log_pi1 );
     664             : 
     665             :   wb_st( output,      pi0 );
     666             :   wb_st( output+32UL, pi1 );
     667             : 
     668             : #else
     669          39 :   wb_t erased_vec0 = wb_ld( is_erased        );
     670          39 :   wb_t erased_vec1 = wb_ld( is_erased + 32UL );
     671          39 :   wb_t to_transform0 = erased_vec0;
     672          39 :   wb_t to_transform1 = erased_vec1;
     673             : 
     674          39 :   FD_REEDSOL_FWHT_64( to_transform0, to_transform1 );
     675             : 
     676          39 :   ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 0 ) );
     677          39 :   ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 1 ) );
     678          39 :   ws_t transformed2 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 0 ) );
     679          39 :   ws_t transformed3 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 1 ) );
     680             : 
     681             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
     682             :      0<=product<256*255, so product is interpreted as unsigned. */
     683          39 :   ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_64        ) );
     684          39 :   ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_64 + 16UL ) );
     685          39 :   ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_64 + 32UL ) );
     686          39 :   ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_64 + 48UL ) );
     687             : 
     688             :   /* Compute mod 255, using the same approach as above. */
     689          39 :   product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( wh_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     690          39 :   product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( wh_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     691          39 :   product2 = ws_and( ws_bcast( 0xFF ), ws_add( product2, ws_shru( wh_mulhi( product2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     692          39 :   product3 = ws_and( ws_bcast( 0xFF ), ws_add( product3, ws_shru( wh_mulhi( product3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     693             : 
     694          39 :   wb_t compact_product0 = compact_ws( product0, product1 );
     695          39 :   wb_t compact_product1 = compact_ws( product2, product3 );
     696             : 
     697          39 :   FD_REEDSOL_FWHT_64( compact_product0, compact_product1 );
     698             : 
     699             :   /* Negate the ones corresponding to erasures */
     700          39 :   compact_product0 = wb_if( wb_eq( erased_vec0, wb_zero() ), compact_product0, wb_sub( wb_bcast( 255 ), compact_product0 ) );
     701          39 :   compact_product1 = wb_if( wb_eq( erased_vec1, wb_zero() ), compact_product1, wb_sub( wb_bcast( 255 ), compact_product1 ) );
     702             : 
     703          39 :   wb_t pi0 = exp_16( compact_product0 );
     704          39 :   wb_t pi1 = exp_16( compact_product1 );
     705          39 :   wb_st( output       , pi0 );
     706          39 :   wb_st( output + 32UL, pi1 );
     707          39 : #endif
     708             : #else /* No AVX implementation */
     709             : 
     710             :   gen_pi_noavx_generic( is_erased, output, 64UL, fwht_l_twiddle_64 );
     711             : 
     712             : #endif
     713          39 : }
     714             : 
     715             : void
     716             : fd_reedsol_private_gen_pi_128( uchar const * is_erased,
     717           6 :                                uchar       * output ) {
     718           6 : #if FD_REEDSOL_ARITH_IMPL>0
     719             : #if FD_REEDSOL_PI_USE_SHORT
     720             :   ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased         ) );
     721             :   ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased +  16UL ) );
     722             :   ws_t erased_vec2 = _mm256_cvtepu8_epi16( vb_ld( is_erased +  32UL ) );
     723             :   ws_t erased_vec3 = _mm256_cvtepu8_epi16( vb_ld( is_erased +  48UL ) );
     724             :   ws_t erased_vec4 = _mm256_cvtepu8_epi16( vb_ld( is_erased +  64UL ) );
     725             :   ws_t erased_vec5 = _mm256_cvtepu8_epi16( vb_ld( is_erased +  80UL ) );
     726             :   ws_t erased_vec6 = _mm256_cvtepu8_epi16( vb_ld( is_erased +  96UL ) );
     727             :   ws_t erased_vec7 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 112UL ) );
     728             : 
     729             :   ws_t transformed0 = erased_vec0;
     730             :   ws_t transformed1 = erased_vec1;
     731             :   ws_t transformed2 = erased_vec2;
     732             :   ws_t transformed3 = erased_vec3;
     733             :   ws_t transformed4 = erased_vec4;
     734             :   ws_t transformed5 = erased_vec5;
     735             :   ws_t transformed6 = erased_vec6;
     736             :   ws_t transformed7 = erased_vec7;
     737             :   FD_REEDSOL_FWHT_128( transformed0, transformed1, transformed2, transformed3, transformed4, transformed5, transformed6, transformed7 ); /* FWHT( R~ ) */
     738             :   /* |transformed| <= 128 */
     739             : 
     740             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
     741             :      -16256 <= product <= 32512 */
     742             :   ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_128         ) );
     743             :   ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_128 +  16UL ) );
     744             :   ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_128 +  32UL ) );
     745             :   ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_128 +  48UL ) );
     746             :   ws_t product4 = ws_mullo( transformed4, ws_ld( fwht_l_twiddle_128 +  64UL ) );
     747             :   ws_t product5 = ws_mullo( transformed5, ws_ld( fwht_l_twiddle_128 +  80UL ) );
     748             :   ws_t product6 = ws_mullo( transformed6, ws_ld( fwht_l_twiddle_128 +  96UL ) );
     749             :   ws_t product7 = ws_mullo( transformed7, ws_ld( fwht_l_twiddle_128 + 112UL ) );
     750             : 
     751             :   /* We need to reduce these mod 255 to prevent overflow in the next
     752             :      step. 0 <= product+64*255 <= 48832 < 2^16.  The mod operation
     753             :      treats the input as unsigned though, so this is okay. */
     754             :   product0 = ws_add( product0, ws_bcast( (short)64*255 ) );
     755             :   product1 = ws_add( product1, ws_bcast( (short)64*255 ) );
     756             :   product2 = ws_add( product2, ws_bcast( (short)64*255 ) );
     757             :   product3 = ws_add( product3, ws_bcast( (short)64*255 ) );
     758             :   product4 = ws_add( product4, ws_bcast( (short)64*255 ) );
     759             :   product5 = ws_add( product5, ws_bcast( (short)64*255 ) );
     760             :   product6 = ws_add( product6, ws_bcast( (short)64*255 ) );
     761             :   product7 = ws_add( product7, ws_bcast( (short)64*255 ) );
     762             : 
     763             :   product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( wh_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     764             :   product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( wh_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     765             :   product2 = ws_and( ws_bcast( 0xFF ), ws_add( product2, ws_shru( wh_mulhi( product2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     766             :   product3 = ws_and( ws_bcast( 0xFF ), ws_add( product3, ws_shru( wh_mulhi( product3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     767             :   product4 = ws_and( ws_bcast( 0xFF ), ws_add( product4, ws_shru( wh_mulhi( product4, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     768             :   product5 = ws_and( ws_bcast( 0xFF ), ws_add( product5, ws_shru( wh_mulhi( product5, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     769             :   product6 = ws_and( ws_bcast( 0xFF ), ws_add( product6, ws_shru( wh_mulhi( product6, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     770             :   product7 = ws_and( ws_bcast( 0xFF ), ws_add( product7, ws_shru( wh_mulhi( product7, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     771             : 
     772             :   /* Now 0 <= product < 255 */
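/* Editor's note (illustrative sketch, not from the source): a scalar check of
   the two steps above for the stated range -16256 <= product <= 32512: adding
   64*255 makes every lane non-negative and keeps it below 2^16, and since
   64*255 is a multiple of 255 the residue mod 255 is unchanged, so the
   unsigned reduction that follows is valid. */
#include <assert.h>
static void
check_bias_then_reduce( void ) {
  for( int p=-16256; p<=32512; p++ ) {
    int biased = p + 64*255;                     /* what ws_add( product, ws_bcast( (short)64*255 ) ) computes */
    assert( 0<=biased && biased<65536 );         /* fits in an unsigned 16-bit lane */
    assert( biased%255 == ((p%255)+255)%255 );   /* the bias is a multiple of 255, so the residue is preserved */
  }
}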
     773             : 
     774             :   /* log_pi is congruent (mod 255) to what the paper calls
     775             :      R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
     776             :      |log_pi| <= 128*255 */
     777             :   FD_REEDSOL_FWHT_128( product0, product1, product2, product3, product4, product5, product6, product7 );
     778             :   ws_t log_pi0 = product0;
     779             :   ws_t log_pi1 = product1;
     780             :   ws_t log_pi2 = product2;
     781             :   ws_t log_pi3 = product3;
     782             :   ws_t log_pi4 = product4;
     783             :   ws_t log_pi5 = product5;
     784             :   ws_t log_pi6 = product6;
     785             :   ws_t log_pi7 = product7;
     786             : 
     787             :   /* Negate the ones corresponding to erasures to compute 1/Pi' */
     788             :   log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
     789             :   log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );
     790             :   log_pi2 = ws_adjust_sign( log_pi2, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec2, 1 ) ) );
     791             :   log_pi3 = ws_adjust_sign( log_pi3, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec3, 1 ) ) );
     792             :   log_pi4 = ws_adjust_sign( log_pi4, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec4, 1 ) ) );
     793             :   log_pi5 = ws_adjust_sign( log_pi5, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec5, 1 ) ) );
     794             :   log_pi6 = ws_adjust_sign( log_pi6, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec6, 1 ) ) );
     795             :   log_pi7 = ws_adjust_sign( log_pi7, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec7, 1 ) ) );
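/* Editor's note (illustrative sketch, not from the source): the multiplier
   1 - 2*erased used above is +1 for kept positions and -1 for erasures
   (assuming is_erased holds 0/1 flags), so log_pi is negated exactly at the
   erased positions.  This assumes ws_adjust_sign follows the convention of
   _mm256_sign_epi16: copy where the second operand is positive, negate where
   it is negative.  Scalar equivalent of one lane: */
static short
adjust_sign_scalar( short log_pi, short erased ) {  /* erased is 0 or 1 */
  short sign = (short)(1-2*erased);                 /* +1 if kept, -1 if erased */
  return (short)(log_pi*sign);                      /* negate only at erasures */
}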
     796             : 
     797             :   log_pi0 = ws_add( log_pi0, ws_bcast( (short)(255*128) ) );
     798             :   log_pi1 = ws_add( log_pi1, ws_bcast( (short)(255*128) ) );
     799             :   log_pi2 = ws_add( log_pi2, ws_bcast( (short)(255*128) ) );
     800             :   log_pi3 = ws_add( log_pi3, ws_bcast( (short)(255*128) ) );
     801             :   log_pi4 = ws_add( log_pi4, ws_bcast( (short)(255*128) ) );
     802             :   log_pi5 = ws_add( log_pi5, ws_bcast( (short)(255*128) ) );
     803             :   log_pi6 = ws_add( log_pi6, ws_bcast( (short)(255*128) ) );
     804             :   log_pi7 = ws_add( log_pi7, ws_bcast( (short)(255*128) ) ); /* Now 0<= log_pi <= 65152 < 2^16 */
     805             : 
     806             :   /* GCC informs me that for a ushort x,
     807             :      (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
     808             :      We need at least 31 bits of precision for the product, so
     809             :      mulhi_epu16 is perfect. */
     810             :   log_pi0 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi0, ws_shru( wh_mulhi( log_pi0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     811             :   log_pi1 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi1, ws_shru( wh_mulhi( log_pi1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     812             :   log_pi2 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi2, ws_shru( wh_mulhi( log_pi2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     813             :   log_pi3 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi3, ws_shru( wh_mulhi( log_pi3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     814             :   log_pi4 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi4, ws_shru( wh_mulhi( log_pi4, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     815             :   log_pi5 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi5, ws_shru( wh_mulhi( log_pi5, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     816             :   log_pi6 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi6, ws_shru( wh_mulhi( log_pi6, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     817             :   log_pi7 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi7, ws_shru( wh_mulhi( log_pi7, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     818             :   /* Now 0<=log_pi < 255 */
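/* Editor's note (illustrative sketch, not from the source): an exhaustive
   scalar check of the reduction identity quoted above, with explicit
   parentheses.  wh_mulhi is presumed to return the high 16 bits of the
   unsigned 16x16 product, i.e. (x*0x8081)>>16, and the ws_shru by 7 completes
   the >>23; the 16-bit lane addition may wrap, but the final & 0xFF depends
   only on the low byte, so the scalar form below matches. */
#include <assert.h>
static void
check_mod255_identity( void ) {
  for( unsigned x=0U; x<65536U; x++ ) {
    unsigned q = (x*0x8081U)>>23;        /* equals x/255 for every 16-bit x */
    assert( (0xFFU & (x+q)) == x%255U );
  }
}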
     819             : 
     820             :   /* Since our FWHT implementation is unscaled, we've computed a value
     821             :      128 times larger than what we'd like.  128^-1 == 2 (mod 255), but
     822             :      we're just going to use this in the exponentiation, so we can
     823             :      compute this implicitly.
     824             :        2^(log_pi * 128^-1) = 2^(2*log_pi) = (2^2)^log_pi = 4^log_pi
     825             :      (where 2 is the primitive element we used for the logs, an element
     826             :      of GF(2^8) ). */
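/* Editor's note (illustrative sketch, not from the source): a scalar check of
   the rescaling argument above.  128*2 == 256 == 1 (mod 255), so doubling the
   log undoes the factor-128 FWHT scaling, and folding that factor of 2 into
   the base is what turns 2^log into 4^log (hence exp_4 below). */
#include <assert.h>
static void
check_fwht128_rescale( void ) {
  assert( (128*2)%255==1 );            /* 128^-1 == 2 (mod 255) */
  for( unsigned L=0U; L<255U; L++ ) {
    unsigned scaled = (128U*L)%255U;   /* the unscaled FWHT result, reduced mod 255 */
    assert( (2U*scaled)%255U==L );     /* doubling the log recovers the true exponent */
  }
}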
     827             : 
     828             :   wb_t compact_log_pi0 = compact_ws( log_pi0, log_pi1 );
     829             :   wb_t compact_log_pi1 = compact_ws( log_pi2, log_pi3 );
     830             :   wb_t compact_log_pi2 = compact_ws( log_pi4, log_pi5 );
     831             :   wb_t compact_log_pi3 = compact_ws( log_pi6, log_pi7 );
     832             :   wb_t pi0 = exp_4( compact_log_pi0 );
     833             :   wb_t pi1 = exp_4( compact_log_pi1 );
     834             :   wb_t pi2 = exp_4( compact_log_pi2 );
     835             :   wb_t pi3 = exp_4( compact_log_pi3 );
     836             : 
     837             :   wb_st( output,        pi0 );
     838             :   wb_st( output + 32UL, pi1 );
     839             :   wb_st( output + 64UL, pi2 );
     840             :   wb_st( output + 96UL, pi3 );
     841             : 
     842             : #else
     843           6 :   wb_t erased_vec0 = wb_ld( is_erased        );
     844           6 :   wb_t erased_vec1 = wb_ld( is_erased + 32UL );
     845           6 :   wb_t erased_vec2 = wb_ld( is_erased + 64UL );
     846           6 :   wb_t erased_vec3 = wb_ld( is_erased + 96UL );
     847           6 :   wb_t to_transform0 = erased_vec0;
     848           6 :   wb_t to_transform1 = erased_vec1;
     849           6 :   wb_t to_transform2 = erased_vec2;
     850           6 :   wb_t to_transform3 = erased_vec3;
     851             : 
     852           6 :   FD_REEDSOL_FWHT_128( to_transform0, to_transform1, to_transform2, to_transform3 );
     853             : 
     854           6 :   ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 0 ) );
     855           6 :   ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 1 ) );
     856           6 :   ws_t transformed2 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 0 ) );
     857           6 :   ws_t transformed3 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 1 ) );
     858           6 :   ws_t transformed4 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 0 ) );
     859           6 :   ws_t transformed5 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 1 ) );
     860           6 :   ws_t transformed6 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 0 ) );
     861           6 :   ws_t transformed7 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 1 ) );
     862             : 
     863             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
     864             :      0<=product<256*255, so product is interpreted as unsigned. */
     865           6 :   ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_128         ) );
     866           6 :   ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_128 +  16UL ) );
     867           6 :   ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_128 +  32UL ) );
     868           6 :   ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_128 +  48UL ) );
     869           6 :   ws_t product4 = ws_mullo( transformed4, ws_ld( fwht_l_twiddle_128 +  64UL ) );
     870           6 :   ws_t product5 = ws_mullo( transformed5, ws_ld( fwht_l_twiddle_128 +  80UL ) );
     871           6 :   ws_t product6 = ws_mullo( transformed6, ws_ld( fwht_l_twiddle_128 +  96UL ) );
     872           6 :   ws_t product7 = ws_mullo( transformed7, ws_ld( fwht_l_twiddle_128 + 112UL ) );
     873             : 
     874             :   /* Compute mod 255, using the same approach as above. */
     875           6 :   product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( wh_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     876           6 :   product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( wh_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     877           6 :   product2 = ws_and( ws_bcast( 0xFF ), ws_add( product2, ws_shru( wh_mulhi( product2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     878           6 :   product3 = ws_and( ws_bcast( 0xFF ), ws_add( product3, ws_shru( wh_mulhi( product3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     879           6 :   product4 = ws_and( ws_bcast( 0xFF ), ws_add( product4, ws_shru( wh_mulhi( product4, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     880           6 :   product5 = ws_and( ws_bcast( 0xFF ), ws_add( product5, ws_shru( wh_mulhi( product5, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     881           6 :   product6 = ws_and( ws_bcast( 0xFF ), ws_add( product6, ws_shru( wh_mulhi( product6, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     882           6 :   product7 = ws_and( ws_bcast( 0xFF ), ws_add( product7, ws_shru( wh_mulhi( product7, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     883           6 :   wb_t compact_product0 = compact_ws( product0, product1 );
     884           6 :   wb_t compact_product1 = compact_ws( product2, product3 );
     885           6 :   wb_t compact_product2 = compact_ws( product4, product5 );
     886           6 :   wb_t compact_product3 = compact_ws( product6, product7 );
     887             : 
     888           6 :   FD_REEDSOL_FWHT_128( compact_product0, compact_product1, compact_product2, compact_product3 );
     889             : 
     890             :   /* Negate the ones corresponding to erasures */
     891           6 :   compact_product0 = wb_if( wb_eq( erased_vec0, wb_zero() ), compact_product0, wb_sub( wb_bcast( 255 ), compact_product0 ) );
     892           6 :   compact_product1 = wb_if( wb_eq( erased_vec1, wb_zero() ), compact_product1, wb_sub( wb_bcast( 255 ), compact_product1 ) );
     893           6 :   compact_product2 = wb_if( wb_eq( erased_vec2, wb_zero() ), compact_product2, wb_sub( wb_bcast( 255 ), compact_product2 ) );
     894           6 :   compact_product3 = wb_if( wb_eq( erased_vec3, wb_zero() ), compact_product3, wb_sub( wb_bcast( 255 ), compact_product3 ) );
     895             : 
     896           6 :   wb_t pi0 = exp_4( compact_product0 );
     897           6 :   wb_t pi1 = exp_4( compact_product1 );
     898           6 :   wb_t pi2 = exp_4( compact_product2 );
     899           6 :   wb_t pi3 = exp_4( compact_product3 );
     900           6 :   wb_st( output,        pi0 );
     901           6 :   wb_st( output + 32UL, pi1 );
     902           6 :   wb_st( output + 64UL, pi2 );
     903           6 :   wb_st( output + 96UL, pi3 );
     904           6 : #endif
     905             : #else /* No AVX implementation */
     906             : 
     907             :   gen_pi_noavx_generic( is_erased, output, 128UL, fwht_l_twiddle_128 );
     908             : 
     909             : #endif
     910           6 : }
     911             : 
     912             : void
     913             : fd_reedsol_private_gen_pi_256( uchar const * is_erased,
     914           0 :                                uchar       * output ) {
     915           0 : #if FD_REEDSOL_ARITH_IMPL>0
     916             : #if FD_REEDSOL_PI_USE_SHORT
     917             :   ws_t erased_vec0  = _mm256_cvtepu8_epi16( vb_ld( is_erased         ) );
     918             :   ws_t erased_vec1  = _mm256_cvtepu8_epi16( vb_ld( is_erased +  16UL ) );
     919             :   ws_t erased_vec2  = _mm256_cvtepu8_epi16( vb_ld( is_erased +  32UL ) );
     920             :   ws_t erased_vec3  = _mm256_cvtepu8_epi16( vb_ld( is_erased +  48UL ) );
     921             :   ws_t erased_vec4  = _mm256_cvtepu8_epi16( vb_ld( is_erased +  64UL ) );
     922             :   ws_t erased_vec5  = _mm256_cvtepu8_epi16( vb_ld( is_erased +  80UL ) );
     923             :   ws_t erased_vec6  = _mm256_cvtepu8_epi16( vb_ld( is_erased +  96UL ) );
     924             :   ws_t erased_vec7  = _mm256_cvtepu8_epi16( vb_ld( is_erased + 112UL ) );
     925             :   ws_t erased_vec8  = _mm256_cvtepu8_epi16( vb_ld( is_erased + 128UL ) );
     926             :   ws_t erased_vec9  = _mm256_cvtepu8_epi16( vb_ld( is_erased + 144UL ) );
     927             :   ws_t erased_vec10 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 160UL ) );
     928             :   ws_t erased_vec11 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 176UL ) );
     929             :   ws_t erased_vec12 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 192UL ) );
     930             :   ws_t erased_vec13 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 208UL ) );
     931             :   ws_t erased_vec14 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 224UL ) );
     932             :   ws_t erased_vec15 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 240UL ) );
     933             : 
     934             :   ws_t transformed0 = erased_vec0;
     935             :   ws_t transformed1 = erased_vec1;
     936             :   ws_t transformed2 = erased_vec2;
     937             :   ws_t transformed3 = erased_vec3;
     938             :   ws_t transformed4 = erased_vec4;
     939             :   ws_t transformed5 = erased_vec5;
     940             :   ws_t transformed6 = erased_vec6;
     941             :   ws_t transformed7 = erased_vec7;
     942             :   ws_t transformed8 = erased_vec8;
     943             :   ws_t transformed9 = erased_vec9;
     944             :   ws_t transformed10 = erased_vec10;
     945             :   ws_t transformed11 = erased_vec11;
     946             :   ws_t transformed12 = erased_vec12;
     947             :   ws_t transformed13 = erased_vec13;
     948             :   ws_t transformed14 = erased_vec14;
     949             :   ws_t transformed15 = erased_vec15;
     950             :   FD_REEDSOL_FWHT_256( transformed0, transformed1, transformed2, transformed3, transformed4, transformed5, transformed6, transformed7,
     951             :       transformed8, transformed9, transformed10, transformed11, transformed12, transformed13, transformed14, transformed15 ); /* FWHT( R~ ) */
     952             :   /* |transformed| <= 256 */
     953             : 
     954             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
     955             :      -32512 <= product <= 32512 */
     956             :   ws_t product0  = ws_mullo( transformed0,  ws_ld( fwht_l_twiddle_256         ) );
     957             :   ws_t product1  = ws_mullo( transformed1,  ws_ld( fwht_l_twiddle_256 +  16UL ) );
     958             :   ws_t product2  = ws_mullo( transformed2,  ws_ld( fwht_l_twiddle_256 +  32UL ) );
     959             :   ws_t product3  = ws_mullo( transformed3,  ws_ld( fwht_l_twiddle_256 +  48UL ) );
     960             :   ws_t product4  = ws_mullo( transformed4,  ws_ld( fwht_l_twiddle_256 +  64UL ) );
     961             :   ws_t product5  = ws_mullo( transformed5,  ws_ld( fwht_l_twiddle_256 +  80UL ) );
     962             :   ws_t product6  = ws_mullo( transformed6,  ws_ld( fwht_l_twiddle_256 +  96UL ) );
     963             :   ws_t product7  = ws_mullo( transformed7,  ws_ld( fwht_l_twiddle_256 + 112UL ) );
     964             :   ws_t product8  = ws_mullo( transformed8,  ws_ld( fwht_l_twiddle_256 + 128UL ) );
     965             :   ws_t product9  = ws_mullo( transformed9,  ws_ld( fwht_l_twiddle_256 + 144UL ) );
     966             :   ws_t product10 = ws_mullo( transformed10, ws_ld( fwht_l_twiddle_256 + 160UL ) );
     967             :   ws_t product11 = ws_mullo( transformed11, ws_ld( fwht_l_twiddle_256 + 176UL ) );
     968             :   ws_t product12 = ws_mullo( transformed12, ws_ld( fwht_l_twiddle_256 + 192UL ) );
     969             :   ws_t product13 = ws_mullo( transformed13, ws_ld( fwht_l_twiddle_256 + 208UL ) );
     970             :   ws_t product14 = ws_mullo( transformed14, ws_ld( fwht_l_twiddle_256 + 224UL ) );
     971             :   ws_t product15 = ws_mullo( transformed15, ws_ld( fwht_l_twiddle_256 + 240UL ) );
     972             : 
     973             :   /* We need to reduce these mod 255 to prevent overflow in the next
     974             :      step. 0 <= product+128*255 <= 65152 < 2^16.  The mod operation
     975             :      treats the input as unsigned though, so this is okay (but hanging
     976             :      in there by a thread!). */
     977             :   product0  = ws_mod255( ws_add( product0,  ws_bcast( (short)128*255 ) ) );
     978             :   product1  = ws_mod255( ws_add( product1,  ws_bcast( (short)128*255 ) ) );
     979             :   product2  = ws_mod255( ws_add( product2,  ws_bcast( (short)128*255 ) ) );
     980             :   product3  = ws_mod255( ws_add( product3,  ws_bcast( (short)128*255 ) ) );
     981             :   product4  = ws_mod255( ws_add( product4,  ws_bcast( (short)128*255 ) ) );
     982             :   product5  = ws_mod255( ws_add( product5,  ws_bcast( (short)128*255 ) ) );
     983             :   product6  = ws_mod255( ws_add( product6,  ws_bcast( (short)128*255 ) ) );
     984             :   product7  = ws_mod255( ws_add( product7,  ws_bcast( (short)128*255 ) ) );
     985             :   product8  = ws_mod255( ws_add( product8,  ws_bcast( (short)128*255 ) ) );
     986             :   product9  = ws_mod255( ws_add( product9,  ws_bcast( (short)128*255 ) ) );
     987             :   product10 = ws_mod255( ws_add( product10, ws_bcast( (short)128*255 ) ) );
     988             :   product11 = ws_mod255( ws_add( product11, ws_bcast( (short)128*255 ) ) );
     989             :   product12 = ws_mod255( ws_add( product12, ws_bcast( (short)128*255 ) ) );
     990             :   product13 = ws_mod255( ws_add( product13, ws_bcast( (short)128*255 ) ) );
     991             :   product14 = ws_mod255( ws_add( product14, ws_bcast( (short)128*255 ) ) );
     992             :   product15 = ws_mod255( ws_add( product15, ws_bcast( (short)128*255 ) ) );
     993             : 
     994             :   /* Now 0 <= product < 255 */
     995             : 
     996             :   /* log_pi is congruent (mod 255) to what the paper calls
     997             :      R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
     998             :      If we do the FWHT in the normal way, it might overflow, so we inline the first layer by hand and reduce mod 255 in the middle. */
     999             :   ws_t log_pi0  = ws_mod255( ws_add( product0, product8  ) );  ws_t log_pi1 = ws_mod255( ws_add( product1, product9  ) );
    1000             :   ws_t log_pi2  = ws_mod255( ws_add( product2, product10 ) );  ws_t log_pi3 = ws_mod255( ws_add( product3, product11 ) );
    1001             :   ws_t log_pi4  = ws_mod255( ws_add( product4, product12 ) );  ws_t log_pi5 = ws_mod255( ws_add( product5, product13 ) );
    1002             :   ws_t log_pi6  = ws_mod255( ws_add( product6, product14 ) );  ws_t log_pi7 = ws_mod255( ws_add( product7, product15 ) );
    1003             :   ws_t log_pi8  = ws_mod255( ws_add( ws_sub( product0, product8 ), ws_bcast( (short)255*2 ) ) );
    1004             :   ws_t log_pi9  = ws_mod255( ws_add( ws_sub( product1, product9 ), ws_bcast( (short)255*2 ) ) );
    1005             :   ws_t log_pi10 = ws_mod255( ws_add( ws_sub( product2, product10 ), ws_bcast( (short)255*2 ) ) );
    1006             :   ws_t log_pi11 = ws_mod255( ws_add( ws_sub( product3, product11 ), ws_bcast( (short)255*2 ) ) );
    1007             :   ws_t log_pi12 = ws_mod255( ws_add( ws_sub( product4, product12 ), ws_bcast( (short)255*2 ) ) );
    1008             :   ws_t log_pi13 = ws_mod255( ws_add( ws_sub( product5, product13 ), ws_bcast( (short)255*2 ) ) );
    1009             :   ws_t log_pi14 = ws_mod255( ws_add( ws_sub( product6, product14 ), ws_bcast( (short)255*2 ) ) );
    1010             :   ws_t log_pi15 = ws_mod255( ws_add( ws_sub( product7, product15 ), ws_bcast( (short)255*2 ) ) );
    1011             : 
    1012             :   FD_REEDSOL_FWHT_128( log_pi0, log_pi1, log_pi2,  log_pi3,  log_pi4,  log_pi5,  log_pi6,  log_pi7 );
    1013             :   FD_REEDSOL_FWHT_128( log_pi8, log_pi9, log_pi10, log_pi11, log_pi12, log_pi13, log_pi14, log_pi15 );
    1014             :   /* Now |log_pi| <= 128*255 */
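/* Editor's note (illustrative sketch, not from the source): why the first
   FWHT-256 layer is written out by hand with a mod in the middle.  The inputs
   are < 255, so one butterfly layer can produce values up to 2*254; a
   subsequent unscaled 128-point FWHT can grow magnitudes by another factor of
   128, and 128*508 would overflow a signed 16-bit lane, while 128*254 == 32512
   does not.  Scalar version of one butterfly with the mid-stage reduction
   (the 255*2 bias keeps the difference non-negative, matching the
   ws_sub/ws_add lines above): */
static void
fwht256_first_layer_scalar( unsigned a, unsigned b,              /* both < 255 */
                            unsigned * sum, unsigned * diff ) {
  *sum  = ( a+b           )%255U;  /* like ws_mod255( ws_add( a, b ) ) */
  *diff = ( a+(255U*2U)-b )%255U;  /* a-b biased to stay non-negative, like ws_mod255( ws_add( ws_sub( a, b ), ws_bcast( 255*2 ) ) ) */
}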
    1015             : 
    1016             :   /* Negate the ones corresponding to erasures to compute 1/Pi' */
    1017             :   log_pi0  = ws_adjust_sign( log_pi0,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0,  1 ) ) );
    1018             :   log_pi1  = ws_adjust_sign( log_pi1,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1,  1 ) ) );
    1019             :   log_pi2  = ws_adjust_sign( log_pi2,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec2,  1 ) ) );
    1020             :   log_pi3  = ws_adjust_sign( log_pi3,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec3,  1 ) ) );
    1021             :   log_pi4  = ws_adjust_sign( log_pi4,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec4,  1 ) ) );
    1022             :   log_pi5  = ws_adjust_sign( log_pi5,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec5,  1 ) ) );
    1023             :   log_pi6  = ws_adjust_sign( log_pi6,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec6,  1 ) ) );
    1024             :   log_pi7  = ws_adjust_sign( log_pi7,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec7,  1 ) ) );
    1025             :   log_pi8  = ws_adjust_sign( log_pi8,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec8,  1 ) ) );
    1026             :   log_pi9  = ws_adjust_sign( log_pi9,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec9,  1 ) ) );
    1027             :   log_pi10 = ws_adjust_sign( log_pi10, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec10, 1 ) ) );
    1028             :   log_pi11 = ws_adjust_sign( log_pi11, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec11, 1 ) ) );
    1029             :   log_pi12 = ws_adjust_sign( log_pi12, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec12, 1 ) ) );
    1030             :   log_pi13 = ws_adjust_sign( log_pi13, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec13, 1 ) ) );
    1031             :   log_pi14 = ws_adjust_sign( log_pi14, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec14, 1 ) ) );
    1032             :   log_pi15 = ws_adjust_sign( log_pi15, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec15, 1 ) ) );
    1033             : 
    1034             :   /* After the addition below, 0<= log_pi <= 65152 < 2^16. The mod
    1035             :      brings it back to 0 <= log_pi < 255. */
    1036             :   log_pi0  = ws_mod255( ws_add( log_pi0,  ws_bcast( (short)(255*128) ) ) );
    1037             :   log_pi1  = ws_mod255( ws_add( log_pi1,  ws_bcast( (short)(255*128) ) ) );
    1038             :   log_pi2  = ws_mod255( ws_add( log_pi2,  ws_bcast( (short)(255*128) ) ) );
    1039             :   log_pi3  = ws_mod255( ws_add( log_pi3,  ws_bcast( (short)(255*128) ) ) );
    1040             :   log_pi4  = ws_mod255( ws_add( log_pi4,  ws_bcast( (short)(255*128) ) ) );
    1041             :   log_pi5  = ws_mod255( ws_add( log_pi5,  ws_bcast( (short)(255*128) ) ) );
    1042             :   log_pi6  = ws_mod255( ws_add( log_pi6,  ws_bcast( (short)(255*128) ) ) );
    1043             :   log_pi7  = ws_mod255( ws_add( log_pi7,  ws_bcast( (short)(255*128) ) ) );
    1044             :   log_pi8  = ws_mod255( ws_add( log_pi8,  ws_bcast( (short)(255*128) ) ) );
    1045             :   log_pi9  = ws_mod255( ws_add( log_pi9,  ws_bcast( (short)(255*128) ) ) );
    1046             :   log_pi10 = ws_mod255( ws_add( log_pi10, ws_bcast( (short)(255*128) ) ) );
    1047             :   log_pi11 = ws_mod255( ws_add( log_pi11, ws_bcast( (short)(255*128) ) ) );
    1048             :   log_pi12 = ws_mod255( ws_add( log_pi12, ws_bcast( (short)(255*128) ) ) );
    1049             :   log_pi13 = ws_mod255( ws_add( log_pi13, ws_bcast( (short)(255*128) ) ) );
    1050             :   log_pi14 = ws_mod255( ws_add( log_pi14, ws_bcast( (short)(255*128) ) ) );
    1051             :   log_pi15 = ws_mod255( ws_add( log_pi15, ws_bcast( (short)(255*128) ) ) );
    1052             : 
    1053             :   /* Since our FWHT implementation is unscaled, we've computed a value
    1054             :      256 times larger than what we'd like.  256^-1 == 1^-1 == 1 (mod 255),
    1055             :      so we don't need to do anything special. */
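/* Editor's note (illustrative sketch, not from the source): the factor-256
   FWHT scaling really is invisible mod 255, since 256 == 1 (mod 255); a quick
   scalar check that no base change is required here (hence exp_2 rather than
   exp_4): */
#include <assert.h>
static void
check_fwht256_rescale( void ) {
  for( unsigned L=0U; L<255U; L++ ) assert( (256U*L)%255U==L );
}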
    1056             : 
    1057             :   wb_t compact_log_pi0 = compact_ws( log_pi0,  log_pi1  );
    1058             :   wb_t compact_log_pi1 = compact_ws( log_pi2,  log_pi3  );
    1059             :   wb_t compact_log_pi2 = compact_ws( log_pi4,  log_pi5  );
    1060             :   wb_t compact_log_pi3 = compact_ws( log_pi6,  log_pi7  );
    1061             :   wb_t compact_log_pi4 = compact_ws( log_pi8,  log_pi9  );
    1062             :   wb_t compact_log_pi5 = compact_ws( log_pi10, log_pi11 );
    1063             :   wb_t compact_log_pi6 = compact_ws( log_pi12, log_pi13 );
    1064             :   wb_t compact_log_pi7 = compact_ws( log_pi14, log_pi15 );
    1065             :   wb_t pi0 = exp_2( compact_log_pi0 );
    1066             :   wb_t pi1 = exp_2( compact_log_pi1 );
    1067             :   wb_t pi2 = exp_2( compact_log_pi2 );
    1068             :   wb_t pi3 = exp_2( compact_log_pi3 );
    1069             :   wb_t pi4 = exp_2( compact_log_pi4 );
    1070             :   wb_t pi5 = exp_2( compact_log_pi5 );
    1071             :   wb_t pi6 = exp_2( compact_log_pi6 );
    1072             :   wb_t pi7 = exp_2( compact_log_pi7 );
    1073             : 
    1074             :   wb_st( output,         pi0 );
    1075             :   wb_st( output +  32UL, pi1 );
    1076             :   wb_st( output +  64UL, pi2 );
    1077             :   wb_st( output +  96UL, pi3 );
    1078             :   wb_st( output + 128UL, pi4 );
    1079             :   wb_st( output + 160UL, pi5 );
    1080             :   wb_st( output + 192UL, pi6 );
    1081             :   wb_st( output + 224UL, pi7 );
    1082             : 
    1083             : #else
    1084           0 :   wb_t erased_vec0 = wb_ld( is_erased         );
    1085           0 :   wb_t erased_vec1 = wb_ld( is_erased +  32UL );
    1086           0 :   wb_t erased_vec2 = wb_ld( is_erased +  64UL );
    1087           0 :   wb_t erased_vec3 = wb_ld( is_erased +  96UL );
    1088           0 :   wb_t erased_vec4 = wb_ld( is_erased + 128UL );
    1089           0 :   wb_t erased_vec5 = wb_ld( is_erased + 160UL );
    1090           0 :   wb_t erased_vec6 = wb_ld( is_erased + 192UL );
    1091           0 :   wb_t erased_vec7 = wb_ld( is_erased + 224UL );
    1092           0 :   wb_t to_transform0 = erased_vec0;
    1093           0 :   wb_t to_transform1 = erased_vec1;
    1094           0 :   wb_t to_transform2 = erased_vec2;
    1095           0 :   wb_t to_transform3 = erased_vec3;
    1096           0 :   wb_t to_transform4 = erased_vec4;
    1097           0 :   wb_t to_transform5 = erased_vec5;
    1098           0 :   wb_t to_transform6 = erased_vec6;
    1099           0 :   wb_t to_transform7 = erased_vec7;
    1100             : 
    1101           0 :   FD_REEDSOL_FWHT_256( to_transform0, to_transform1, to_transform2, to_transform3,
    1102           0 :                        to_transform4, to_transform5, to_transform6, to_transform7 );
    1103             : 
    1104           0 :   ws_t transformed0  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 0 ) );
    1105           0 :   ws_t transformed1  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 1 ) );
    1106           0 :   ws_t transformed2  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 0 ) );
    1107           0 :   ws_t transformed3  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 1 ) );
    1108           0 :   ws_t transformed4  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 0 ) );
    1109           0 :   ws_t transformed5  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 1 ) );
    1110           0 :   ws_t transformed6  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 0 ) );
    1111           0 :   ws_t transformed7  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 1 ) );
    1112           0 :   ws_t transformed8  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform4, 0 ) );
    1113           0 :   ws_t transformed9  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform4, 1 ) );
    1114           0 :   ws_t transformed10 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform5, 0 ) );
    1115           0 :   ws_t transformed11 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform5, 1 ) );
    1116           0 :   ws_t transformed12 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform6, 0 ) );
    1117           0 :   ws_t transformed13 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform6, 1 ) );
    1118           0 :   ws_t transformed14 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform7, 0 ) );
    1119           0 :   ws_t transformed15 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform7, 1 ) );
    1120             : 
    1121             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
    1122             :      0<=product<256*255, so product is interpreted as unsigned. */
    1123           0 :   ws_t product0  = ws_mod255( ws_mullo( transformed0,  ws_ld( fwht_l_twiddle_256         ) ) );
    1124           0 :   ws_t product1  = ws_mod255( ws_mullo( transformed1,  ws_ld( fwht_l_twiddle_256 +  16UL ) ) );
    1125           0 :   ws_t product2  = ws_mod255( ws_mullo( transformed2,  ws_ld( fwht_l_twiddle_256 +  32UL ) ) );
    1126           0 :   ws_t product3  = ws_mod255( ws_mullo( transformed3,  ws_ld( fwht_l_twiddle_256 +  48UL ) ) );
    1127           0 :   ws_t product4  = ws_mod255( ws_mullo( transformed4,  ws_ld( fwht_l_twiddle_256 +  64UL ) ) );
    1128           0 :   ws_t product5  = ws_mod255( ws_mullo( transformed5,  ws_ld( fwht_l_twiddle_256 +  80UL ) ) );
    1129           0 :   ws_t product6  = ws_mod255( ws_mullo( transformed6,  ws_ld( fwht_l_twiddle_256 +  96UL ) ) );
    1130           0 :   ws_t product7  = ws_mod255( ws_mullo( transformed7,  ws_ld( fwht_l_twiddle_256 + 112UL ) ) );
    1131           0 :   ws_t product8  = ws_mod255( ws_mullo( transformed8,  ws_ld( fwht_l_twiddle_256 + 128UL ) ) );
    1132           0 :   ws_t product9  = ws_mod255( ws_mullo( transformed9,  ws_ld( fwht_l_twiddle_256 + 144UL ) ) );
    1133           0 :   ws_t product10 = ws_mod255( ws_mullo( transformed10, ws_ld( fwht_l_twiddle_256 + 160UL ) ) );
    1134           0 :   ws_t product11 = ws_mod255( ws_mullo( transformed11, ws_ld( fwht_l_twiddle_256 + 176UL ) ) );
    1135           0 :   ws_t product12 = ws_mod255( ws_mullo( transformed12, ws_ld( fwht_l_twiddle_256 + 192UL ) ) );
    1136           0 :   ws_t product13 = ws_mod255( ws_mullo( transformed13, ws_ld( fwht_l_twiddle_256 + 208UL ) ) );
    1137           0 :   ws_t product14 = ws_mod255( ws_mullo( transformed14, ws_ld( fwht_l_twiddle_256 + 224UL ) ) );
    1138           0 :   ws_t product15 = ws_mod255( ws_mullo( transformed15, ws_ld( fwht_l_twiddle_256 + 240UL ) ) );
    1139             : 
    1140           0 :   wb_t compact_product0 = compact_ws( product0,  product1  );
    1141           0 :   wb_t compact_product1 = compact_ws( product2,  product3  );
    1142           0 :   wb_t compact_product2 = compact_ws( product4,  product5  );
    1143           0 :   wb_t compact_product3 = compact_ws( product6,  product7  );
    1144           0 :   wb_t compact_product4 = compact_ws( product8,  product9  );
    1145           0 :   wb_t compact_product5 = compact_ws( product10, product11 );
    1146           0 :   wb_t compact_product6 = compact_ws( product12, product13 );
    1147           0 :   wb_t compact_product7 = compact_ws( product14, product15 );
    1148             : 
    1149           0 :   FD_REEDSOL_FWHT_256( compact_product0, compact_product1, compact_product2, compact_product3,
    1150           0 :                        compact_product4, compact_product5, compact_product6, compact_product7 );
    1151             : 
    1152             :   /* Negate the ones corresponding to erasures */
    1153           0 :   compact_product0 = wb_if( wb_eq( erased_vec0, wb_zero() ), compact_product0, wb_sub( wb_bcast( 255 ), compact_product0 ) );
    1154           0 :   compact_product1 = wb_if( wb_eq( erased_vec1, wb_zero() ), compact_product1, wb_sub( wb_bcast( 255 ), compact_product1 ) );
    1155           0 :   compact_product2 = wb_if( wb_eq( erased_vec2, wb_zero() ), compact_product2, wb_sub( wb_bcast( 255 ), compact_product2 ) );
    1156           0 :   compact_product3 = wb_if( wb_eq( erased_vec3, wb_zero() ), compact_product3, wb_sub( wb_bcast( 255 ), compact_product3 ) );
    1157           0 :   compact_product4 = wb_if( wb_eq( erased_vec4, wb_zero() ), compact_product4, wb_sub( wb_bcast( 255 ), compact_product4 ) );
    1158           0 :   compact_product5 = wb_if( wb_eq( erased_vec5, wb_zero() ), compact_product5, wb_sub( wb_bcast( 255 ), compact_product5 ) );
    1159           0 :   compact_product6 = wb_if( wb_eq( erased_vec6, wb_zero() ), compact_product6, wb_sub( wb_bcast( 255 ), compact_product6 ) );
    1160           0 :   compact_product7 = wb_if( wb_eq( erased_vec7, wb_zero() ), compact_product7, wb_sub( wb_bcast( 255 ), compact_product7 ) );
    1161             : 
    1162           0 :   wb_t pi0 = exp_2( compact_product0 );
    1163           0 :   wb_t pi1 = exp_2( compact_product1 );
    1164           0 :   wb_t pi2 = exp_2( compact_product2 );
    1165           0 :   wb_t pi3 = exp_2( compact_product3 );
    1166           0 :   wb_t pi4 = exp_2( compact_product4 );
    1167           0 :   wb_t pi5 = exp_2( compact_product5 );
    1168           0 :   wb_t pi6 = exp_2( compact_product6 );
    1169           0 :   wb_t pi7 = exp_2( compact_product7 );
    1170           0 :   wb_st( output,         pi0 );
    1171           0 :   wb_st( output +  32UL, pi1 );
    1172           0 :   wb_st( output +  64UL, pi2 );
    1173           0 :   wb_st( output +  96UL, pi3 );
    1174           0 :   wb_st( output + 128UL, pi4 );
    1175           0 :   wb_st( output + 160UL, pi5 );
    1176           0 :   wb_st( output + 192UL, pi6 );
    1177           0 :   wb_st( output + 224UL, pi7 );
    1178           0 : #endif
    1179             : #else /* No AVX implementation */
    1180             : 
    1181             :   gen_pi_noavx_generic( is_erased, output, 256UL, fwht_l_twiddle_256 );
    1182             : 
    1183             : #endif
    1184           0 : }

Generated by: LCOV version 1.14