LCOV - code coverage report
Current view: top level - ballet/reedsol - fd_reedsol_pi.c (source / functions)
Test: cov.lcov              Lines:     226 / 343   (65.9 %)
Date: 2024-11-13 11:58:15   Functions:  11 / 17    (64.7 %)

          Line data    Source code
       1             : #include "fd_reedsol_private.h"
       2             : 
       3             : /* TODO: Move this high-level overview
       4             : 
       5             :    The main Lin, et al. paper proposes a clever method for dealing with
       6             :    erasures.  Suppose there is a polynomial P(x) of degree <k, and we
       7             :    know its value at k inputs, but not necessarily at inputs 0, 1, ...
       8             :    k-1.  We construct a polynomial Pi(x) (that's the uppercase Greek
       9             :    letter, not shorthand for P_i) with zeros at the erasures, i.e.  at
      10             :    the x in [0, n) for which we don't know P(x).  We don't care what the
      11             :    value of Pi(x) is at the non-erasures, only that we can compute it
      12             :    there.  This means we know the value of the product P(x)Pi(x) for
      13             :    each x in [0, n).  Additionally, since the order of Pi(x) is n-k, the
      14             :    order of P(x)Pi(x) is <n.  Now we're back in the regime of
      15             :    Reed-Solomon encoding: we know the first n values of a polynomial of
      16             :    order <n, so we can use the FFT and IFFT operators to evaluate this
      17             :    polynomial computationally efficiently wherever we please.
      18             : 
      19             :    However, we don't actually care about the value of P(x)Pi(x); we want
      20             :    the value of P(x).  To accomplish this, we take the formal derivative
      21             :    ("formal" in the sense that these are polynomials over a finite field
      22             :    not real numbers, but we ignore that for a minute):
      23             :         (P(x)Pi(x))' = P'(x)Pi(x) + P(x)Pi'(x).
      24             :    At erasures, Pi(x) = 0 by construction, so
      25             :         P(x) = (P(x)Pi(x))' / Pi'(x)
      26             : 
      27             :    Now then, all we need is a fast way to compute the value of the
      28             :    formal derivative at some points given its value at 0, 1, ..., n-1.
      29             :    It turns out, the special basis we use for the rest of the
      30             :    Reed-Solomon computation gives us a computationally efficient way to
      31             :    compute formal derivatives.
      32             : 
      33             :    Thus, the overall algorithm is this:
      34             :      1. Compute the value of P(x)Pi(x) at x in [0, n)
      35             :      2. Use the IFFT to represent the polynomial in the coefficient basis.
      36             :      3. Take the formal derivative in the coefficient basis.
      37             :      4. Use the FFT to compute the value of (P(x)Pi(x))' for x in [0, n).
      38             :      5. Compute Pi'(x) directly.
      39             :      6. Divide the results of step 4 and 5.
      40             : 
      41             :    That's roughly the approach this implementation uses.  Appendix A of
      42             :    the paper gives some tips on how to optimize the computation of Pi
      43             :    and Pi' using the Fast Walsh-Hadamard transform, which this
      44             :    implementation also uses. */
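/* A minimal illustrative sketch (not part of this file): the derivative
   trick over the reals instead of GF(2^8).  Take P(x) = 3 + 2x (degree
   < k = 2) and a single erasure at x = 5, so Pi(x) = x - 5 and
   Pi'(x) = 1.  Evaluating (P(x)Pi(x))' at the erasure and dividing by
   Pi'(5) recovers P(5), exactly as in step 6 above. */

#include <assert.h>

static double example_P   ( double x ) { return 3.0 + 2.0*x;  } /* the "unknown" polynomial                */
static double example_Pi  ( double x ) { return x - 5.0;      } /* vanishes at the erasure by construction */
static double example_dPPi( double x ) { return 4.0*x - 7.0;  } /* (P(x)Pi(x))' = (2x^2-7x-15)' = 4x-7     */
static double example_dPi ( double x ) { (void)x; return 1.0; } /* Pi'(x) = 1                              */

static void
example_derivative_trick( void ) {
  double x = 5.0;                                     /* the erased evaluation point       */
  assert( example_Pi( x )==0.0 );                     /* Pi is zero there by construction  */
  assert( example_dPPi( x )/example_dPi( x )==13.0 ); /* (P Pi)'/Pi' at x=5 ...            */
  assert( example_P( x )==13.0 );                     /* ... equals P(5) = 3 + 2*5         */
}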
      45             : 
      46             : #if FD_REEDSOL_ARITH_IMPL>0
      47             : 
      48             : /* When using AVX, the internal computation can be done with unsigned
      49             :    chars or with shorts.  They give the same
      50             :    result in the end, but have different performance characteristics.
      51             :    It's not always obvious which is faster, so this is a compile-time
      52             :    switch for it.
      53             : 
      54             :    When FD_REEDSOL_PI_USE_SHORT==1, the FWHT operates on signed
      55             :    integers, so some care is needed to make sure overflow can't happen.
      56             : 
      57             :    When FD_REEDSOL_PI_USE_SHORT==0, the FWHT operates on a slightly
      58             :    overcomplete representation of the integers mod 255 (yes,
      59             :    unfortunately not mod 256).  In particular, the value 255 is allowed,
      60             :    which is interchangeable with 0. */
      61             : 
      62             : #ifndef FD_REEDSOL_PI_USE_SHORT
      63             : #define FD_REEDSOL_PI_USE_SHORT 0
      64             : #endif
      65             : 
      66             : /* Define some helper macros like what we have in util/simd for a vector
      67             :    of shorts.  */
      68             : 
      69             : #include "../../util/simd/fd_sse.h"
      70             : 
      71         564 : #define ws_t __m256i
      72             : #define ws_add(a,b)         _mm256_add_epi16( (a), (b) )
      73             : #define ws_sub(a,b)         _mm256_sub_epi16( (a), (b) )
      74             : #define ws_bcast(s0)        _mm256_set1_epi16( (s0) )
      75             : #define ws_adjust_sign(a,b) _mm256_sign_epi16( (a), (b) ) /* negates (or zeroes) each element of a based on the sign of the corresponding element of b */
      76         282 : #define ws_mullo(a,b)       _mm256_mullo_epi16( (a), (b) )
      77             : #define ws_mulhi(a,b)       _mm256_mulhi_epu16( (a), (b) )
      78             : #define ws_shl(a,imm)       _mm256_slli_epi16( (a), (imm) )
      79         282 : #define ws_and(a,b)         _mm256_and_si256(    (a), (b) )
      80             : #define ws_shru(a,imm)      _mm256_srli_epi16( (a), (imm) )
      81          12 : #define ws_zero()           _mm256_setzero_si256() /* Return [ 0 0 0 0 0 ... 0 0 ] */
      82             : 
      83         282 : FD_FN_UNUSED static inline ws_t ws_ld(  short const * p   ) { return _mm256_load_si256(  (__m256i const *)p ); }
      84           0 : FD_FN_UNUSED static inline ws_t ws_ldu( short const * p   ) { return _mm256_loadu_si256( (__m256i const *)p ); }
      85           0 : FD_FN_UNUSED static inline void ws_st(  short * p, ws_t i ) { _mm256_store_si256(  (__m256i *)p, i ); }
      86           0 : FD_FN_UNUSED static inline void ws_stu( short * p, ws_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
      87             : 
      88             : static inline ws_t
      89           0 : ws_mod255( ws_t x ) {
      90             :   /* GCC informs me that for a ushort x,
      91             :      (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
      92             :      We need at least 31 bits of precision for the product, so
      93             :      mulh_epu16 is perfect. */
      94           0 :   return ws_and( ws_bcast( 0xFF ), ws_add( x, ws_shru( ws_mulhi( x, ws_bcast( (short)0x8081 ) ), 7 ) ) );
      95           0 : }
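/* A minimal illustrative sketch (not part of this file): exhaustively
   check the scalar identity that ws_mod255 relies on.  Note that
   ws_mulhi (a >>16) followed by ws_shru( ., 7 ) is exactly the >>23
   below. */

#include <assert.h>
#include <stdint.h>

static void
check_mod255_identity( void ) {
  for( uint32_t x=0U; x<65536U; x++ ) {
    uint32_t q = (x*0x8081U)>>23;                /* == x/255 for any 16-bit x       */
    assert( (0xFFU & (x+q)) == x%255U );         /* the identity used in ws_mod255  */
  }
}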
      96             : 
      97             : /* The following macros implement the unscaled Fast Walsh-Hadamard
      98             :    transform.  As alluded to above, this gives us a way to compute Pi
      99             :    and Pi' in O(n lg n) time.  These are designed for use within this
     100             :    file, not external use.
     101             : 
     102             :    Unlike the rest of the similar-seeming components in fd_reedsol (e.g.
      103             :    FFT, PPT), this computes the transform within a single (or a few) AVX
      104             :    vectors, not in parallel across each component of the vector.  I.e. if
      105             :    FD_REEDSOL_ARITH_IMPL>0, to compute a 16-element FWHT, you pass one
      106             :    AVX vector (16*short), not 16 vectors.
     107             : 
     108             :    Also unlike the rest of the similar-seeming components in fd_reedsol,
     109             :    this works on the group Z/255Z (integers mod 255).  Since 255 is not
     110             :    a prime, this is not a field, but the FD_REEDSOL_FWHT only needs addition,
     111             :    subtraction, and division by powers of 2 (which have inverses mod
     112             :    255), so it's not a problem.
     113             : 
     114             :    The typical FWHT multiplies by a factor of 1/sqrt(2) at each step.
     115             :    To convert the unscaled version to the scaled version, divide the
     116             :    result by sqrt(2)^lg(N).  Since we often do two transforms, we need
     117             :    to divide by N ( = (sqrt(2)^lg(N))^2 ).
     118             :    */
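/* A minimal illustrative sketch (not part of this file): the unscaled
   FWHT butterfly in scalar form (the same butterfly the noavx generic
   code below uses), plus a check of the scaling note above: applying
   the unscaled transform twice multiplies every element by N, so a pair
   of transforms needs a single division by N to undo the scaling. */

#include <assert.h>

static void
scalar_fwht( long * x, unsigned long n ) {  /* n must be a power of 2 */
  for( unsigned long h=1UL; h<n; h<<=1 )
    for( unsigned long i=0UL; i<n; i+=2UL*h )
      for( unsigned long j=i; j<i+h; j++ ) {
        long a = x[ j ]; long b = x[ j+h ];
        x[ j ] = a+b;    x[ j+h ] = a-b;
      }
}

static void
check_fwht_scaling( void ) {
  long v[ 16 ] = { 3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9, 3 };
  long w[ 16 ];
  for( int i=0; i<16; i++ ) w[ i ] = v[ i ];
  scalar_fwht( w, 16UL );  scalar_fwht( w, 16UL );  /* two unscaled transforms */
  for( int i=0; i<16; i++ ) assert( w[ i ]==16L*v[ i ] );
}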
     119             : 
     120             : #if FD_REEDSOL_PI_USE_SHORT
     121             : 
     122             : #define FD_REEDSOL_FWHT_16( x )  do { ws_t _x = (x);                                                                  \
     123             :   _x = ws_add( _mm256_setr_m128i( _mm256_extracti128_si256( _x, 1 ), _mm256_extracti128_si256( _x, 0 ) ), \
     124             :       ws_adjust_sign( _x, _mm256_setr_epi16( 1,1,1,1, 1,1,1,1, -1,-1,-1,-1, -1,-1,-1,-1 ) ) ); \
     125             :   _x = ws_add( _mm256_shuffle_epi32( _x, 0x4E ), \
     126             :       ws_adjust_sign( _x, _mm256_setr_epi16( 1,1,1,1, -1,-1,-1,-1, 1,1,1,1, -1,-1,-1,-1 ) ) ); \
     127             :   _x = ws_add( _mm256_shuffle_epi32( _x, 0xB1 ), \
     128             :       ws_adjust_sign( _x, _mm256_setr_epi16( 1,1,-1,-1, 1,1,-1,-1, 1,1,-1,-1, 1,1,-1,-1 ) ) ); \
     129             :   _x = ws_add( _mm256_shuffle_epi8( _x, _mm256_setr_epi8( 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, \
     130             :                                                           2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ) ), \
     131             :           ws_adjust_sign( _x, _mm256_setr_epi16( 1,-1,1,-1, 1,-1,1,-1, 1,-1,1,-1, 1,-1,1,-1 ) ) ); \
     132             :   (x) = _x; } while( 0 )
     133             : 
     134             : #define FD_REEDSOL_FWHT_32( x0, x1 )  do { \
     135             :   ws_t _y0i = (x0);                  ws_t _y1i = (x1);  \
     136             :   ws_t _y0  = ws_add( _y0i, _y1i );  ws_t _y1  = ws_sub( _y0i, _y1i ); \
     137             :   FD_REEDSOL_FWHT_16( _y0 );         FD_REEDSOL_FWHT_16( _y1 ); \
     138             :   (x0) = _y0;                        (x1) = _y1; \
     139             : } while( 0 )
     140             : 
     141             : #define FD_REEDSOL_FWHT_64( x0, x1, x2, x3 )  do { \
     142             :   ws_t _z0, _z1, _z2, _z3; ws_t _z0i, _z1i, _z2i, _z3i; \
     143             :   _z0i = (x0);                  _z1i = (x1);                  _z2i = (x2);                  _z3i = (x3);  \
     144             :   _z0  = ws_add( _z0i, _z2i );  _z1  = ws_add( _z1i, _z3i );  _z2  = ws_sub( _z0i, _z2i );  _z3  = ws_sub( _z1i, _z3i );   \
     145             :   FD_REEDSOL_FWHT_32( _z0, _z1 );                             FD_REEDSOL_FWHT_32( _z2, _z3 ); \
     146             :   (x0) = _z0;                   (x1) = _z1;                   (x2) = _z2;                   (x3) = _z3; \
     147             : } while( 0 )
     148             : 
     149             : #define FD_REEDSOL_FWHT_128( x0, x1, x2, x3, x4, x5, x6, x7 )  do { \
     150             :   ws_t _w0,  _w1,  _w2,  _w3,  _w4,  _w5,  _w6,  _w7; \
     151             :   ws_t _w0i, _w1i, _w2i, _w3i, _w4i, _w5i, _w6i, _w7i; \
     152             :   _w0i = (x0);                  _w1i = (x1);                  _w2i = (x2);                  _w3i = (x3);  \
     153             :   _w4i = (x4);                  _w5i = (x5);                  _w6i = (x6);                  _w7i = (x7);  \
     154             :   _w0  = ws_add( _w0i, _w4i );  _w1  = ws_add( _w1i, _w5i );  _w2  = ws_add( _w2i, _w6i );  _w3  = ws_add( _w3i, _w7i );   \
     155             :   _w4  = ws_sub( _w0i, _w4i );  _w5  = ws_sub( _w1i, _w5i );  _w6  = ws_sub( _w2i, _w6i );  _w7  = ws_sub( _w3i, _w7i );   \
     156             :   FD_REEDSOL_FWHT_64( _w0, _w1, _w2, _w3 );                   FD_REEDSOL_FWHT_64( _w4, _w5, _w6, _w7 ); \
     157             :   (x0) = _w0; (x1) = _w1; (x2) = _w2; (x3) = _w3; (x4) = _w4; (x5) = _w5; (x6) = _w6; (x7) = _w7; \
     158             : } while( 0 )
     159             : 
     160             : #define FD_REEDSOL_FWHT_256( x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 )  do { \
     161             :   ws_t _v0,  _v1,  _v2,  _v3,  _v4,  _v5,  _v6,  _v7,  _v8,  _v9,  _v10,  _v11,  _v12,  _v13,  _v14,  _v15; \
     162             :   ws_t _v0i, _v1i, _v2i, _v3i, _v4i, _v5i, _v6i, _v7i, _v8i, _v9i, _v10i, _v11i, _v12i, _v13i, _v14i, _v15i; \
     163             :   _v0i  = (x0);                 _v1i  = (x1);                  _v2i  = (x2);                  _v3i  = (x3);  \
     164             :   _v4i  = (x4);                 _v5i  = (x5);                  _v6i  = (x6);                  _v7i  = (x7);  \
     165             :   _v8i  = (x8);                 _v9i  = (x9);                  _v10i = (x10);                 _v11i = (x11);  \
     166             :   _v12i = (x12);                _v13i = (x13);                 _v14i = (x14);                 _v15i = (x15);  \
     167             :   _v0  = ws_add( _v0i, _v8i  ); _v1   = ws_add( _v1i, _v9i  ); _v2   = ws_add( _v2i, _v10i ); _v3   = ws_add( _v3i, _v11i );   \
     168             :   _v4  = ws_add( _v4i, _v12i ); _v5   = ws_add( _v5i, _v13i ); _v6   = ws_add( _v6i, _v14i ); _v7   = ws_add( _v7i, _v15i );   \
     169             :   _v8  = ws_sub( _v0i, _v8i  ); _v9   = ws_sub( _v1i, _v9i  ); _v10  = ws_sub( _v2i, _v10i ); _v11  = ws_sub( _v3i, _v11i );   \
     170             :   _v12 = ws_sub( _v4i, _v12i ); _v13  = ws_sub( _v5i, _v13i ); _v14  = ws_sub( _v6i, _v14i ); _v15  = ws_sub( _v7i, _v15i );   \
     171             :   FD_REEDSOL_FWHT_128( _v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7 ); FD_REEDSOL_FWHT_128( _v8, _v9, _v10, _v11, _v12, _v13, _v14, _v15 ); \
     172             :   (x0) = _v0; (x1) = _v1; (x2)  = _v2;  (x3)  = _v3;  (x4)  = _v4;  (x5)  = _v5;  (x6)  = _v6;  (x7)  = _v7; \
     173             :   (x8) = _v8; (x9) = _v9; (x10) = _v10; (x11) = _v11; (x12) = _v12; (x13) = _v13; (x14) = _v14; (x15) = _v15; \
     174             : } while( 0 )
     175             : 
     176             : #else /* FD_REEDSOL_PI_USE_SHORT */
     177             : 
     178             : static inline wb_t
     179        1698 : add_mod_255( wb_t a, wb_t b ) {
     180        1698 :   wb_t sum = wb_add( a, b );
     181        1698 :   wb_t overflowed = wb_lt( sum, a );
     182        1698 :   return wb_sub( sum, overflowed );
     183        1698 : }
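/* A minimal illustrative sketch (not part of this file): the scalar
   analogue of add_mod_255 above.  Inputs live in the slightly
   overcomplete representation (0 and 255 both mean 0).  If the 8-bit
   add wraps, the true sum exceeded 255; subtracting the all-ones
   overflow mask (i.e. adding 1 mod 256) lands on the right residue. */

#include <assert.h>
#include <stdint.h>

static uint8_t
scalar_add_mod_255( uint8_t a, uint8_t b ) {
  uint8_t sum        = (uint8_t)( a+b );               /* wraps mod 256          */
  uint8_t overflowed = (uint8_t)( sum<a ? 0xFF : 0 );  /* analogue of wb_lt mask */
  return (uint8_t)( sum-overflowed );                  /* sum+1 if it wrapped    */
}

static void
check_add_mod_255( void ) {
  for( int a=0; a<256; a++ ) for( int b=0; b<256; b++ )
    assert( scalar_add_mod_255( (uint8_t)a, (uint8_t)b )%255 == (a%255+b%255)%255 );
}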
     184             : 
     185         294 : #define FD_REEDSOL_FWHT_16( x ) do { wb_t _x = (x); \
     186         294 :   wb_t negated, unshifted, shifted; \
     187         294 :   /* Shift by 8 elements (8B) */ \
     188         294 :   negated = wb_sub( wb_bcast( 0xFF ), _x ); \
     189         294 :   unshifted = _mm256_blend_epi32( _x, negated, 0xCC ); \
     190         294 :   shifted = _mm256_shuffle_epi32( _x, 0x4E ); \
     191         294 :   _x = add_mod_255( unshifted, shifted ); \
     192         294 :   /* Shift by 4 elements (4B) */ \
     193         294 :   negated = wb_sub( wb_bcast( 0xFF ), _x ); \
     194         294 :   unshifted = _mm256_blend_epi32( _x, negated, 0xAA ); \
     195         294 :   shifted = _mm256_shuffle_epi32( _x, 0xB1 ); \
     196         294 :   _x = add_mod_255( unshifted, shifted ); \
     197         294 :   /* Shift by 2 elements (2B) */ \
     198         294 :   negated = wb_sub( wb_bcast( 0xFF ), _x ); \
     199         294 :   unshifted = _mm256_blend_epi16( _x, negated, 0xAA ); \
     200         294 :   shifted = wb_exch_adj_pair( _x ); \
     201         294 :   _x = add_mod_255( unshifted, shifted ); \
     202         294 :   /* Shift by 1 element (1B) */ \
     203         294 :   negated = wb_sub( wb_bcast( 0xFF ), _x ); \
     204         294 :   unshifted = _mm256_blendv_epi8( _x, negated, wb_bcast_pair( 0x01, 0xFF ) ); \
     205         294 :   shifted = wb_exch_adj( _x ); \
     206         294 :   _x = add_mod_255( unshifted, shifted ); \
     207         294 :   (x) = _x; \
     208         294 : } while( 0 )
     209             : 
     210         270 : #define FD_REEDSOL_FWHT_32( x ) do { wb_t _y = (x); \
     211         270 :   wb_t negated, unshifted, shifted; \
     212         270 :   /* Shift by 16 elements (16B) */ \
     213         270 :   negated = wb_sub( wb_bcast( 0xFF ), _y ); \
     214         270 :   unshifted = _mm256_blend_epi32( _y, negated, 0xF0 ); \
     215         270 :   shifted = _mm256_setr_m128i( _mm256_extracti128_si256( _y, 1 ), _mm256_extracti128_si256( _y, 0 ) ); \
     216         270 :   _y = add_mod_255( unshifted, shifted ); \
     217         270 :   FD_REEDSOL_FWHT_16( _y ); \
     218         270 :   (x) = _y; \
     219         270 : } while( 0 )
     220             : 
     221         102 : #define FD_REEDSOL_FWHT_64( x0, x1 ) do { wb_t _z0i = (x0); wb_t _z1i = (x1);  \
     222         102 :   wb_t _z0 = add_mod_255( _z0i, _z1i );  wb_t _z1 = add_mod_255( _z0i, wb_sub( wb_bcast( 0xFF ), _z1i ) ); \
     223         102 :   FD_REEDSOL_FWHT_32( _z0 );             FD_REEDSOL_FWHT_32( _z1 ); \
     224         102 :   (x0) = _z0;                            (x1) = _z1; \
     225         102 : } while( 0 )
     226             : 
     227          12 : #define FD_REEDSOL_FWHT_128( x0, x1, x2, x3 ) do { wb_t _w0i = (x0); wb_t _w1i = (x1); wb_t _w2i = (x2); wb_t _w3i = (x3);  \
     228          12 :   wb_t _w0, _w1, _w2, _w3; \
     229          12 :   _w0 = add_mod_255( _w0i, _w2i );                               _w1 = add_mod_255( _w1i, _w3i ); \
     230          12 :   _w2 = add_mod_255( _w0i, wb_sub( wb_bcast( 0xFF ), _w2i ) );   _w3 = add_mod_255( _w1i, wb_sub( wb_bcast( 0xFF ), _w3i ) ); \
     231          12 :   FD_REEDSOL_FWHT_64( _w0, _w1 );                                FD_REEDSOL_FWHT_64( _w2, _w3 ); \
     232          12 :   (x0) = _w0; (x1) = _w1; (x2) = _w2; (x3) = _w3; \
     233          12 : } while( 0 )
     234             : 
     235           0 : #define FD_REEDSOL_FWHT_256( x0, x1, x2, x3, x4, x5, x6, x7 )  do { \
     236           0 :   wb_t _v0,  _v1,  _v2,  _v3,  _v4,  _v5,  _v6,  _v7; \
     237           0 :   wb_t _v0i, _v1i, _v2i, _v3i, _v4i, _v5i, _v6i, _v7i; \
     238           0 :   _v0i = (x0);                  _v1i = (x1);                  _v2i = (x2);                  _v3i = (x3);  \
     239           0 :   _v4i = (x4);                  _v5i = (x5);                  _v6i = (x6);                  _v7i = (x7);  \
     240           0 :   _v0 = add_mod_255( _v0i, _v4i );                               _v1 = add_mod_255( _v1i, _v5i ); \
     241           0 :   _v2 = add_mod_255( _v2i, _v6i );                               _v3 = add_mod_255( _v3i, _v7i ); \
     242           0 :   _v4 = add_mod_255( _v0i, wb_sub( wb_bcast( 0xFF ), _v4i ) );   _v5 = add_mod_255( _v1i, wb_sub( wb_bcast( 0xFF ), _v5i ) ); \
     243           0 :   _v6 = add_mod_255( _v2i, wb_sub( wb_bcast( 0xFF ), _v6i ) );   _v7 = add_mod_255( _v3i, wb_sub( wb_bcast( 0xFF ), _v7i ) ); \
     244           0 :   FD_REEDSOL_FWHT_128( _v0, _v1, _v2, _v3 );                   FD_REEDSOL_FWHT_128( _v4, _v5, _v6, _v7 ); \
     245           0 :   (x0) = _v0; (x1) = _v1; (x2) = _v2; (x3) = _v3; (x4) = _v4; (x5) = _v5; (x6) = _v6; (x7) = _v7; \
     246           0 : } while( 0 )
     247             : #endif
     248             : 
      249             : /* Casts each element of a to a uchar, forming a 16-element uchar
      250             :    vector.  Then casts each element of b to a uchar, forming a second
      251             :    16-element uchar vector.  Concatenates the two 16-element vectors to
      252             :    form a single 32-element wb_t (a first, then b). */
     253             : static inline wb_t
     254         147 : compact_ws( ws_t a, ws_t b ) {
     255             :   /* There's also _mm256_packus_epi16, but it's no better than this */
     256         147 :   wb_t shuffled0 = _mm256_shuffle_epi8(a, wb(  0,  2,  4,  6,  8, 10, 12, 14, 128,128,128,128,128,128,128,128,
     257         147 :                                              128, 128,128,128,128,128,128,128,  0,  2,  4,  6,  8, 10, 12, 14 ) );
     258         147 :   wb_t shuffled1 = _mm256_shuffle_epi8(b, wb(  0,  2,  4,  6,  8, 10, 12, 14, 128,128,128,128,128,128,128,128,
     259         147 :                                              128, 128,128,128,128,128,128,128,  0,  2,  4,  6,  8, 10, 12, 14 ) );
     260         147 :   return _mm256_setr_m128i(
     261         147 :       _mm_or_si128( _mm256_extracti128_si256( shuffled0, 0 ), _mm256_extracti128_si256( shuffled0, 1 ) ),
     262         147 :       _mm_or_si128( _mm256_extracti128_si256( shuffled1, 0 ), _mm256_extracti128_si256( shuffled1, 1 ) ) );
     263         147 : }
     264             : 
     265             : /* exp_{n}( x ) computes n^x_i in GF(2^8) for each byte x_i in the
     266             :    vector x.  That's exponentiation, not xor. For example, exp_76
     267             :    interprets 76 as an element of GF(2^8) and x_i as an integer, and
     268             :    computes the product of multiplying GF(76) times itself x_i times.
     269             :    Recall that exponentiation is an operator from (GF(2^8) x (Z/255Z))
     270             :    -> GF(2^8), so x is interpreted mod 255.  (equivalently, observe
      271             :    n^255=1).  As an input, x_i = 255 is okay and gives the same result as x_i = 0. */
     272             : static inline wb_t
     273          12 : exp_76( wb_t x ) {
      274             :   /* Decompose x = xh3*0x80 + xh2*0x40 + xh1*0x20 + xh0*0x10 + xl
      275             :      where 0<=xl<16 and 0<=xh_j<=1 for each j.  Then
      276             : 
      277             :        76^x = (76^xl) * (76^0x10)^xh0 * (76^0x20)^xh1 *
      278             :                         (76^0x40)^xh2 * (76^0x80)^xh3
      279             :              = (76^xl) * 2^xh0 * 4^xh1 * 16^xh2 * 29^xh3.
      280             : 
      281             :      We use vpshufb to implement the 4-bit lookup table 76^xl.  For the
      282             :      rest, we're either multiplying by a constant or not doing the
      283             :      multiply, so we can use our normal GF_MUL with a blend. */
     284             : 
     285          12 :   wb_t low = wb_and( x, wb_bcast( 0xF ) );
     286          12 :   wb_t exp_low = _mm256_shuffle_epi8( wb( 1,  76, 157,  70,  95, 253, 217, 129, 133, 168, 230, 227, 130,  81, 18,  44,
     287          12 :                                           1,  76, 157,  70,  95, 253, 217, 129, 133, 168, 230, 227, 130,  81, 18,  44 ),
     288          12 :                                       low );
     289          12 :   wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low,  2 ), _mm256_slli_epi16( x, 3 ) );
     290          12 :   wb_t with1 = _mm256_blendv_epi8( with0,   GF_MUL( with0,    4 ), _mm256_slli_epi16( x, 2 ) );
     291          12 :   wb_t with2 = _mm256_blendv_epi8( with1,   GF_MUL( with1,   16 ), _mm256_slli_epi16( x, 1 ) );
     292          12 :   wb_t with3 = _mm256_blendv_epi8( with2,   GF_MUL( with2,   29 ),                    x      );
     293          12 :   return with3;
     294          12 : }
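/* A minimal illustrative sketch (not part of this file): a scalar check
   of the decomposition exp_76 uses.  gf_mul_slow is a plain
   shift-and-reduce multiply assuming the reduction polynomial 0x11D
   (consistent with the exp_2 table below, where 2^8 = 29).  Since
   76 = 2^16, we have 76^0x10 = 2, 76^0x20 = 4, 76^0x40 = 16 and
   76^0x80 = 29, so 76^x factors into a 4-bit table lookup on the low
   nibble times up to four constant multiplies. */

#include <assert.h>
#include <stdint.h>

static uint8_t
gf_mul_slow( uint8_t a, uint8_t b ) {
  uint32_t acc = 0U;
  uint32_t aa  = a;
  for( int i=0; i<8; i++ ) {
    if( b & 1 ) acc ^= aa;
    b = (uint8_t)( b>>1 );
    aa <<= 1;
    if( aa & 0x100U ) aa ^= 0x11DU;  /* reduce mod x^8+x^4+x^3+x^2+1 */
  }
  return (uint8_t)acc;
}

static uint8_t
gf_pow_slow( uint8_t base, uint32_t e ) {  /* repeated multiplication; e is an ordinary integer */
  uint8_t r = 1;
  for( uint32_t i=0U; i<e; i++ ) r = gf_mul_slow( r, base );
  return r;
}

static void
check_exp_76_decomposition( void ) {
  assert( gf_pow_slow( 2, 16U )==76 );                 /* 76 is 2^16 in GF(2^8)   */
  for( uint32_t x=0U; x<256U; x++ ) {
    uint8_t r = gf_pow_slow( 76, x & 0xFU );           /* the 4-bit vpshufb table */
    if( x & 0x10U ) r = gf_mul_slow( r,  2 );          /* 76^0x10 == 2            */
    if( x & 0x20U ) r = gf_mul_slow( r,  4 );          /* 76^0x20 == 4            */
    if( x & 0x40U ) r = gf_mul_slow( r, 16 );          /* 76^0x40 == 16           */
    if( x & 0x80U ) r = gf_mul_slow( r, 29 );          /* 76^0x80 == 29           */
    assert( r==gf_pow_slow( 76, x ) );
  }
}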
     295             : 
     296             : static inline wb_t
     297          33 : exp_29( wb_t x ) {
     298          33 :   wb_t low = wb_and( x, wb_bcast( 0xF ) );
     299          33 :   wb_t exp_low = _mm256_shuffle_epi8( wb( 1,  29,  76, 143, 157, 106,  70,  93,  95, 101, 253, 254, 217,  13, 129,  59,
     300          33 :                                           1,  29,  76, 143, 157, 106,  70,  93,  95, 101, 253, 254, 217,  13, 129,  59 ),
     301          33 :                                       low );
     302          33 :   wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 133 ), _mm256_slli_epi16( x, 3 ) );
     303          33 :   wb_t with1 = _mm256_blendv_epi8( with0,   GF_MUL( with0,     2 ), _mm256_slli_epi16( x, 2 ) );
     304          33 :   wb_t with2 = _mm256_blendv_epi8( with1,   GF_MUL( with1,     4 ), _mm256_slli_epi16( x, 1 ) );
     305          33 :   wb_t with3 = _mm256_blendv_epi8( with2,   GF_MUL( with2,    16 ),                    x      );
     306          33 :   return with3;
     307          33 : }
     308             : static inline wb_t
     309          78 : exp_16( wb_t x ) {
     310          78 :   wb_t low = wb_and( x, wb_bcast( 0xF ) );
     311          78 :   wb_t exp_low = _mm256_shuffle_epi8( wb( 1,  16,  29, 205,  76, 180, 143,  24, 157,  37, 106, 238,  70,  20,  93, 185,
     312          78 :                                           1,  16,  29, 205,  76, 180, 143,  24, 157,  37, 106, 238,  70,  20,  93, 185 ),
     313          78 :                                       low );
     314          78 :   wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low,  95 ), _mm256_slli_epi16( x, 3 ) );
     315          78 :   wb_t with1 = _mm256_blendv_epi8( with0,   GF_MUL( with0,   133 ), _mm256_slli_epi16( x, 2 ) );
     316          78 :   wb_t with2 = _mm256_blendv_epi8( with1,   GF_MUL( with1,     2 ), _mm256_slli_epi16( x, 1 ) );
     317          78 :   wb_t with3 = _mm256_blendv_epi8( with2,   GF_MUL( with2,     4 ),                    x      );
     318          78 :   return with3;
     319          78 : }
     320             : static inline wb_t
     321          24 : exp_4( wb_t x ) {
     322          24 :   wb_t low = wb_and( x, wb_bcast( 0xF ) );
     323          24 :   wb_t exp_low = _mm256_shuffle_epi8( wb( 1,   4,  16,  64,  29, 116, 205,  19,  76,  45, 180, 234, 143,   6,  24,  96,
     324          24 :                                           1,   4,  16,  64,  29, 116, 205,  19,  76,  45, 180, 234, 143,   6,  24,  96 ),
     325          24 :                                       low );
     326          24 :   wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 157 ), _mm256_slli_epi16( x, 3 ) );
     327          24 :   wb_t with1 = _mm256_blendv_epi8( with0,   GF_MUL( with0,    95 ), _mm256_slli_epi16( x, 2 ) );
     328          24 :   wb_t with2 = _mm256_blendv_epi8( with1,   GF_MUL( with1,   133 ), _mm256_slli_epi16( x, 1 ) );
     329          24 :   wb_t with3 = _mm256_blendv_epi8( with2,   GF_MUL( with2,     2 ),                    x      );
     330          24 :   return with3;
     331          24 : }
     332             : 
     333             : static inline wb_t
     334           0 : exp_2( wb_t x ) {
     335           0 :   wb_t low = wb_and( x, wb_bcast( 0xF ) );
     336           0 :   wb_t exp_low = _mm256_shuffle_epi8( wb(   1,   2,   4,   8,  16,  32,  64, 128,  29,  58, 116, 232, 205, 135,  19,  38,
     337           0 :                                             1,   2,   4,   8,  16,  32,  64, 128,  29,  58, 116, 232, 205, 135,  19,  38),
     338           0 :                                       low );
     339           0 :   wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low,  76 ), _mm256_slli_epi16( x, 3 ) );
     340           0 :   wb_t with1 = _mm256_blendv_epi8( with0,   GF_MUL( with0,   157 ), _mm256_slli_epi16( x, 2 ) );
     341           0 :   wb_t with2 = _mm256_blendv_epi8( with1,   GF_MUL( with1,    95 ), _mm256_slli_epi16( x, 1 ) );
     342           0 :   wb_t with3 = _mm256_blendv_epi8( with2,   GF_MUL( with2,   133 ),                    x      );
     343           0 :   return with3;
     344           0 : }
     345             : 
     346             : #endif /* FD_REEDSOL_ARITH_IMPL>0 */
     347             : 
     348             : /* l_twiddle_{N} stores the size N FWHT of what the paper calls L~, i.e.
     349             :          ( 0, Log(1), Log(2), Log(3), ... Log(N-1) )
     350             : 
     351             :    The discrete log uses a primitive element of 2, and the FWHT is taken
     352             :    mod 255, which means all of the values can fit in a uchar.  However,
     353             :    Intel doesn't give us a multiplication instruction for 8-bit
     354             :    integers, which means that we'd have to zero-extend these values
     355             :    anyways.
     356             : 
     357             :    Although L~ for a smaller size is a subset of that for a larger size,
     358             :    because we also precompute the value of the FWHT, we store the
     359             :    variables separately.  Perhaps a good compiler could
     360             :    constant-propagate through the AVX instructions, but it's just 5
     361             :    values of N, so I prefer not to depend on that. */
     362             : static const short fwht_l_twiddle_16 [  16 ] = {0xca,0xa1,0x6a,0xa9,0x73,0xfc,0xe2,0x44,0x93,0x74,0x08,0x7f,0x96,0x8c,0x42,0xf2};
     363             : static const short fwht_l_twiddle_32 [  32 ] = {0x24,0x8f,0xc2,0x7e,0x49,0x89,0x74,0xdc,0x4f,0x95,0x43,0xb4,0x09,0xba,0x03,0x83,
     364             :                                                 0x71,0xb3,0x12,0xd4,0x9d,0x70,0x51,0xab,0xd7,0x53,0xcc,0x4a,0x24,0x5e,0x81,0x62};
     365             : static const short fwht_l_twiddle_64 [  64 ] = {0x05,0x81,0x9a,0x82,0x07,0x7c,0x3c,0xbe,0xd3,0xbc,0xed,0x23,0xc2,0x24,0xee,0xc8,
     366             :                                                 0x3f,0x5d,0x11,0x18,0x8a,0xf9,0x1c,0x4b,0x0e,0x02,0x8e,0xe4,0x77,0x8c,0x97,0x6d,
     367             :                                                 0x43,0x9d,0xea,0x7a,0x8b,0x96,0xac,0xfa,0xca,0x6e,0x98,0x46,0x4f,0x51,0x17,0x3e,
     368             :                                                 0xa3,0x0a,0x13,0x91,0xb0,0xe6,0x86,0x0c,0xa1,0xa4,0x0b,0xaf,0xd0,0x30,0x6b,0x57};
     369             : static const short fwht_l_twiddle_128[ 128 ] = {0xfe,0x89,0x15,0xeb,0x48,0xea,0x04,0xfe,0x32,0xd9,0xca,0x2c,0x1e,0x58,0x8d,0xed,
     370             :                                                 0x6f,0x36,0x53,0x24,0xb2,0x27,0x3e,0x06,0xec,0x96,0x41,0x05,0xbe,0x1d,0xb1,0xdd,
     371             :                                                 0x18,0x64,0xf4,0xc3,0x16,0x0a,0x2e,0x00,0xde,0x34,0xaf,0x42,0xd7,0x5e,0x92,0x02,
     372             :                                                 0xbf,0x5a,0x6a,0x97,0xe1,0x39,0xd0,0xf6,0x66,0x86,0xb5,0x61,0x8a,0xa2,0x8f,0x49,
     373             :                                                 0x0b,0x79,0x20,0x19,0xc5,0x0e,0x74,0x7e,0x75,0x9f,0x11,0x1a,0x67,0xef,0x50,0xa3,
     374             :                                                 0x0f,0x84,0xce,0x0c,0x62,0xcc,0xf9,0x90,0x2f,0x6d,0xdb,0xc4,0x30,0xfb,0x7d,0xfc,
     375             :                                                 0x6e,0xd6,0xe0,0x31,0x01,0x23,0x2b,0xf5,0xb6,0xa8,0x81,0x4a,0xc6,0x44,0x9b,0x7a,
     376             :                                                 0x87,0xb9,0xbb,0x8b,0x7f,0x94,0x3c,0x21,0xdc,0xc2,0x60,0xfd,0x17,0xbd,0x47,0x65};
     377             : static const short fwht_l_twiddle_256[ 256 ] = {0x00,0xfc,0xfb,0x15,0x2d,0xfa,0xc1,0x14,0x62,0x2c,0xd9,0xf9,0xc0,0x45,0x13,0xe8,
     378             :                                                 0x01,0x61,0x86,0x2b,0xd8,0xba,0xf8,0x5d,0xbf,0x7a,0x44,0x6a,0x07,0x12,0xf1,0xe7,
     379             :                                                 0x00,0xdc,0x60,0x0a,0x1f,0x85,0x1c,0x2a,0x8b,0xd7,0x92,0xb9,0xf7,0x82,0x5c,0xad,
     380             :                                                 0x19,0xbe,0xb1,0x79,0x43,0x3d,0x69,0x9e,0x06,0x75,0x11,0x27,0x70,0xf0,0xd2,0xe6,
     381             :                                                 0xfe,0x2f,0xdb,0xea,0x88,0x5f,0x7c,0x09,0x0c,0x1e,0x8d,0x84,0x1b,0x3f,0x29,0xd4,
     382             :                                                 0x31,0x8a,0x8f,0xd6,0x91,0xcb,0xb8,0xc9,0xf6,0xb6,0x81,0x39,0xc7,0x5b,0x55,0xac,
     383             :                                                 0x18,0x65,0xbd,0xf4,0x22,0xb0,0xb4,0x78,0x7f,0x42,0x34,0x3c,0x68,0x37,0x9d,0x4e,
     384             :                                                 0xc5,0x05,0x96,0x74,0x10,0x59,0x26,0x9a,0x6f,0xa3,0xef,0x53,0x4b,0xd1,0xaa,0xe5,
     385             :                                                 0xfd,0x16,0x2e,0xc2,0x63,0xda,0x46,0xe9,0x02,0x87,0xbb,0x5e,0x7b,0x6b,0x08,0xf2,
     386             :                                                 0xdd,0x0b,0x20,0x1d,0x8c,0x93,0x83,0xae,0x1a,0xb2,0x3e,0x9f,0x76,0x28,0x71,0xd3,
     387             :                                                 0x30,0xeb,0x89,0x7d,0x0d,0x8e,0x40,0xd5,0x32,0x90,0xcc,0xca,0xb7,0x3a,0xc8,0x56,
     388             :                                                 0x66,0xf5,0x23,0xb5,0x80,0x35,0x38,0x4f,0xc6,0x97,0x5a,0x9b,0xa4,0x54,0x4c,0xab,
     389             :                                                 0x17,0xc3,0x64,0x47,0x03,0xbc,0x6c,0xf3,0xde,0x21,0x94,0xaf,0xb3,0xa0,0x77,0x72,
     390             :                                                 0xec,0x7e,0x0e,0x41,0x33,0xcd,0x3b,0x57,0x67,0x24,0x36,0x50,0x98,0x9c,0xa5,0x4d,
     391             :                                                 0xc4,0x48,0x04,0x6d,0xdf,0x95,0xa1,0x73,0xed,0x0f,0xce,0x58,0x25,0x51,0x99,0xa6,
     392             :                                                 0x49,0x6e,0xe0,0xa2,0xee,0xcf,0x52,0xa7,0x4a,0xe1,0xd0,0xa8,0xe2,0xa9,0xe3,0xe4};
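/* A minimal illustrative sketch (not part of this file): how the
   fwht_l_twiddle_16 table above can be rederived.  Build the discrete
   log table for the primitive element 2 (assuming the reduction
   polynomial 0x11D), form L~ = ( 0, Log(1), ..., Log(15) ), apply the
   unscaled FWHT butterfly, and reduce mod 255. */

#include <assert.h>
#include <stdint.h>

static void
check_fwht_l_twiddle_16( void ) {
  short log_tbl[ 256 ];
  log_tbl[ 0 ] = 0;                              /* Log(0) is undefined; L~ uses 0 there */
  uint32_t p = 1U;
  for( short i=0; i<255; i++ ) {
    log_tbl[ p ] = i;                            /* p == 2^i in GF(2^8)                  */
    p <<= 1; if( p & 0x100U ) p ^= 0x11DU;
  }

  long l[ 16 ];
  for( int i=0; i<16; i++ ) l[ i ] = log_tbl[ i ];
  for( int h=1; h<16; h<<=1 )                    /* unscaled FWHT, same butterfly as     */
    for( int i=0; i<16; i+=2*h )                 /* gen_pi_noavx_generic below           */
      for( int j=i; j<i+h; j++ ) { long a=l[j], b=l[j+h]; l[j]=a+b; l[j+h]=a-b; }

  for( int i=0; i<16; i++ )
    assert( ((l[ i ]%255L)+255L)%255L == (long)fwht_l_twiddle_16[ i ] );
}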
     393             : 
     394             : #if FD_REEDSOL_ARITH_IMPL==0
     395             : static void
     396             : gen_pi_noavx_generic( uchar const * is_erased,
     397             :                       uchar       * output,
     398             :                       ulong         sz,
     399             :                       const short * l_twiddle ) {
     400             :   long scratch[ 256 ];
     401             : 
     402             :   for( ulong i=0UL; i<sz; i++ ) scratch[ i ] = is_erased[ i ];
     403             : 
     404             :   /* Unscaled FWHT */
     405             :   for( ulong h=1UL; h<sz; h<<=1 ) {
     406             :     for( ulong i=0UL; i<sz; i += 2UL*h ) for( ulong j=i; j<i+h; j++ ) {
     407             :       long x = scratch[ j   ];
     408             :       long y = scratch[ j+h ];
     409             :       scratch[ j   ] = x+y;
     410             :       scratch[ j+h ] = x-y;
     411             :     }
     412             :   }
     413             : 
     414             :   for( ulong i=0UL; i<sz; i++ ) scratch[ i ] *= l_twiddle[ i ];
     415             : 
     416             :   for( ulong h=1UL; h<sz; h<<=1 ) {
     417             :     for( ulong i=0UL; i<sz; i += 2UL*h ) for( ulong j=i; j<i+h; j++ ) {
     418             :       long x = scratch[ j   ];
     419             :       long y = scratch[ j+h ];
     420             :       scratch[ j   ] = x+y;
     421             :       scratch[ j+h ] = x-y;
     422             :     }
     423             :   }
     424             : 
     425             :   /* Negate the ones corresponding to erasures to compute 1/Pi' */
     426             :   for( ulong i=0UL; i<sz; i++ ) scratch[ i ] *= fd_long_if( is_erased[ i ], -1L, 1L );
     427             : 
     428             :   /* To fix the FWHT scaling, we need to multiply by sz^-1 mod 255.
     429             :      Given that sz is always a power of 2, this is not too bad.
     430             :      Let s = lg(sz).  Note that 2^8 == 256 == 1 (mod 255),
     431             :      so then (2^s) * (2^(8-s)) == 1    (mod 255).
     432             :      This implies that sz^-1 = 2^(8-s) (mod 255), and we can compute
      433             :      2^(8-s) as 256/sz with normal integer division (e.g. 256/32 = 8 = 2^3). */
     434             :   long sz_inv = 256L/(long)sz;
     435             :   /* The % operator in C doesn't work like the mathematical mod
     436             :      operation for negative numbers, so we need to add in a shift to
     437             :      make any input non-negative.  We can compute the smallest possible
     438             :      value at this point and it's -142397, so we add 255*559=142545. */
     439             :   for( ulong i=0UL; i<sz; i++ ) scratch[ i ] = (sz_inv*(scratch[ i ] + 255L*559L)) % 255L;
     440             : 
     441             :   for( ulong i=0UL; i<sz; i++ ) output[ i ] = (uchar)gf_arith_invlog_tbl[ scratch[ i ] ];
     442             : }
     443             : #endif
     444             : 
     445             : void
     446             : fd_reedsol_private_gen_pi_16( uchar const * is_erased,
     447          12 :                               uchar       * output ) {
     448          12 : #if FD_REEDSOL_ARITH_IMPL>0
     449             : #if FD_REEDSOL_PI_USE_SHORT
     450             :   ws_t erased_vec = _mm256_cvtepu8_epi16( vb_ld( is_erased ) );
     451             : 
     452             :   ws_t transformed = erased_vec;
     453             :   FD_REEDSOL_FWHT_16( transformed ); /* FWHT( R~ ) */
     454             :   /* |transformed| <= 16 */
     455             : 
     456             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
     457             :      |product| <= 16*255, but definitely may be negative */
     458             :   ws_t product = ws_mullo( transformed, ws_ld( fwht_l_twiddle_16 ) );
     459             : 
     460             :   /* log_pi is congruent (mod 255) to what the paper calls
     461             :      R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
     462             :      Let H be the 16x16 Hadamard matrix.  Then
     463             :          log_pi = (H * diag( fwht_l_twiddle_16 ) * H) * erased_vec
     464             :                   |---------------------------------|
     465             :                                 M
     466             :      The part labeled M is a matrix known ahead of time, so we can bound
     467             :      log_pi relatively easily when combined with the fact
     468             :      0<=erased_vec<=1 (just set the negative entries to 0, and multiply
     469             :      by all 1s. Then do the same, setting the positive entries to 0
     470             :      instead).  This tells us |log_pi| <= 4489 */
     471             :   FD_REEDSOL_FWHT_16( product );
     472             :   ws_t log_pi = product;
     473             : 
     474             :   /* Negate the ones corresponding to erasures to compute 1/Pi' */
     475             :   log_pi = ws_adjust_sign( log_pi, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec, 1 ) ) );
     476             : 
     477             :   log_pi = ws_add( log_pi, ws_bcast( (short)(255*18) ) ); /* Now 0<= log_pi <= 9079 < 2^15 */
     478             : 
     479             :   /* GCC informs me that for a ushort x,
      480             :      (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
     481             :      We need at least 31 bits of precision for the product, so
     482             :      mulh_epu16 is perfect. */
     483             :   log_pi = ws_and( ws_bcast( 0xFF ), ws_add( log_pi, ws_shru( ws_mulhi( log_pi, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     484             :   /* Now 0<=log_pi < 255 */
     485             : 
     486             :   /* Since our FWHT implementation is unscaled, we've computed a value
     487             :      16 times larger than what we'd like.  16^-1 == 16 (mod 255), but
     488             :      we're just going to use this in the exponentiation, so we can
     489             :      compute this implicitly.
     490             :        2^(log_pi * 16^-1) = 2^(16*log_pi) = (2^16)^log_pi = 76^log_pi
     491             :      (where 2 is the primitive element we used for the logs, an element
     492             :      of GF(2^8) ). */
     493             : 
     494             :   wb_t compact_log_pi = compact_ws( log_pi, ws_zero() );
     495             :   wb_t pi = exp_76( compact_log_pi );
     496             : 
     497             :   vb_st( output, _mm256_extracti128_si256( pi, 0 ) );
     498             : 
     499             : #else
     500          12 :   wb_t erased_vec = _mm256_setr_m128i( vb_ldu( is_erased ), _mm_setzero_si128() );
     501          12 :   wb_t to_transform = erased_vec;
     502          12 :   FD_REEDSOL_FWHT_16( to_transform );
     503          12 :   ws_t transformed = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform, 0 ) );
     504             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
     505             :      0<=product<256*255, so product is interpreted as unsigned. */
     506          12 :   ws_t product = ws_mullo( transformed, ws_ld( fwht_l_twiddle_16 ) );
     507             : 
     508             :   /* Compute mod 255, using the same approach as above. */
     509          12 :   product = ws_and( ws_bcast( 0xFF ), ws_add( product, ws_shru( ws_mulhi( product, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     510          12 :   wb_t compact_product = compact_ws( product, ws_zero() );
     511             : 
     512          12 :   FD_REEDSOL_FWHT_16( compact_product );
     513             : 
     514             :   /* Negate the ones corresponding to erasures */
     515          12 :   compact_product = wb_if( wb_eq( erased_vec, wb_zero() ), compact_product, wb_sub( wb_bcast( 255 ), compact_product ) );
     516             : 
     517          12 :   wb_t pi = exp_76( compact_product );
     518          12 :   vb_st( output, _mm256_extracti128_si256( pi, 0 ) );
     519          12 : #endif
     520             : #else /* No AVX implementation */
     521             : 
     522             :   gen_pi_noavx_generic( is_erased, output, 16UL, fwht_l_twiddle_16 );
     523             : 
     524             : #endif
     525          12 : }
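/* A minimal usage sketch (illustrative, not part of this file) of the
   function above: generate the 16 Pi-related bytes for a 16-symbol
   block with two erasures.  The 32-byte alignment and the particular
   erased indices are assumptions made just for the example. */

static void
example_gen_pi_16_usage( void ) {
  uchar is_erased[ 16 ] __attribute__((aligned(32))) = { 0 };
  uchar pi       [ 16 ] __attribute__((aligned(32)));
  is_erased[ 3 ] = 1;  is_erased[ 7 ] = 1;   /* symbols 3 and 7 were not received */
  fd_reedsol_private_gen_pi_16( is_erased, pi );
  /* pi[ i ] now holds the per-position byte (derived from the FWHT-based
     log computation above) that the recover path consumes. */
}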
     526             : 
     527             : void
     528             : fd_reedsol_private_gen_pi_32( uchar const * is_erased,
     529          33 :                               uchar       * output ) {
     530          33 : #if FD_REEDSOL_ARITH_IMPL>0
     531             : #if FD_REEDSOL_PI_USE_SHORT
     532             :   ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased        ) );
     533             :   ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 16UL ) );
     534             : 
     535             :   ws_t transformed0 = erased_vec0;
     536             :   ws_t transformed1 = erased_vec1;
     537             :   FD_REEDSOL_FWHT_32( transformed0, transformed1 ); /* FWHT( R~ ) */
     538             :   /* |transformed| <= 32 */
     539             : 
     540             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
     541             :      |product| <= 32*255, but definitely may be negative */
     542             :   ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_32        ) );
     543             :   ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_32 + 16UL ) );
     544             : 
     545             :   /* log_pi is congruent (mod 255) to what the paper calls
     546             :      R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
     547             :      |log_pi| <= 6945 using the same approach as above. */
     548             :   FD_REEDSOL_FWHT_32( product0, product1 );
     549             :   ws_t log_pi0 = product0;
     550             :   ws_t log_pi1 = product1;
     551             : 
     552             :   /* Negate the ones corresponding to erasures to compute 1/Pi' */
     553             :   log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
     554             :   log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );
     555             : 
     556             :   log_pi0 = ws_add( log_pi0, ws_bcast( (short)(255*28) ) );
     557             :   log_pi1 = ws_add( log_pi1, ws_bcast( (short)(255*28) ) ); /* Now 0<= log_pi <= 14085 < 2^15 */
     558             : 
     559             :   /* GCC informs me that for a ushort x,
      560             :      (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
     561             :      We need at least 31 bits of precision for the product, so
     562             :      mulh_epu16 is perfect. */
     563             :   log_pi0 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi0, ws_shru( ws_mulhi( log_pi0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     564             :   log_pi1 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi1, ws_shru( ws_mulhi( log_pi1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     565             :   /* Now 0<=log_pi < 255 */
     566             : 
     567             :   /* Since our FWHT implementation is unscaled, we've computed a value
     568             :      32 times larger than what we'd like.  32^-1 == 8 (mod 255), but
     569             :      we're just going to use this in the exponentiation, so we can
     570             :      compute this implicitly.
     571             :        2^(log_pi * 32^-1) = 2^(8*log_pi) = (2^8)^log_pi = 29^log_pi
     572             :      (where 2 is the primitive element we used for the logs, an element
     573             :      of GF(2^8) ). */
     574             : 
     575             :   wb_t compact_log_pi = compact_ws( log_pi0, log_pi1 );
     576             :   wb_t pi = exp_29( compact_log_pi );
     577             : 
     578             :   wb_st( output, pi );
     579             : 
     580             : #else
     581          33 :   wb_t erased_vec = wb_ld( is_erased );
     582          33 :   wb_t to_transform = erased_vec;
     583          33 :   FD_REEDSOL_FWHT_32( to_transform );
     584          33 :   ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform, 0 ) );
     585          33 :   ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform, 1 ) );
     586             : 
     587             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
     588             :      0<=product<256*255, so product is interpreted as unsigned. */
     589          33 :   ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_32        ) );
     590          33 :   ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_32 + 16UL ) );
     591             : 
     592             :   /* Compute mod 255, using the same approach as above. */
     593          33 :   product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( ws_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     594          33 :   product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( ws_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     595          33 :   wb_t compact_product = compact_ws( product0, product1 );
     596             : 
     597          33 :   FD_REEDSOL_FWHT_32( compact_product );
     598             : 
     599             :   /* Negate the ones corresponding to erasures */
     600          33 :   compact_product = wb_if( wb_eq( erased_vec, wb_zero() ), compact_product, wb_sub( wb_bcast( 255 ), compact_product ) );
     601             : 
     602          33 :   wb_t pi = exp_29( compact_product );
     603          33 :   wb_st( output, pi );
     604          33 : #endif
     605             : #else /* No AVX implementation */
     606             : 
     607             :   gen_pi_noavx_generic( is_erased, output, 32UL, fwht_l_twiddle_32 );
     608             : 
     609             : #endif
     610          33 : }
     611             : 
     612             : void
     613             : fd_reedsol_private_gen_pi_64( uchar const * is_erased,
     614          39 :                               uchar       * output ) {
     615          39 : #if FD_REEDSOL_ARITH_IMPL>0
     616             : #if FD_REEDSOL_PI_USE_SHORT
     617             :   ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased        ) );
     618             :   ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 16UL ) );
     619             :   ws_t erased_vec2 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 32UL ) );
     620             :   ws_t erased_vec3 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 48UL ) );
     621             : 
     622             :   ws_t transformed0 = erased_vec0;
     623             :   ws_t transformed1 = erased_vec1;
     624             :   ws_t transformed2 = erased_vec2;
     625             :   ws_t transformed3 = erased_vec3;
     626             :   FD_REEDSOL_FWHT_64( transformed0, transformed1, transformed2, transformed3 ); /* FWHT( R~ ) */
     627             :   /* |transformed| <= 64 */
     628             : 
     629             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
     630             :      |product| <= 64*255, but definitely may be negative */
     631             :   ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_64        ) );
     632             :   ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_64 + 16UL ) );
     633             :   ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_64 + 32UL ) );
     634             :   ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_64 + 48UL ) );
     635             : 
     636             :   /* log_pi is congruent (mod 255) to what the paper calls
     637             :      R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
     638             :      |log_pi| <= 18918 using the same approach as above. */
     639             :   FD_REEDSOL_FWHT_64( product0, product1, product2, product3 );
     640             :   ws_t log_pi0 = product0;
     641             :   ws_t log_pi1 = product1;
     642             :   ws_t log_pi2 = product2;
     643             :   ws_t log_pi3 = product3;
     644             : 
     645             :   /* Negate the ones corresponding to erasures to compute 1/Pi' */
     646             :   log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
     647             :   log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );
     648             :   log_pi2 = ws_adjust_sign( log_pi2, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec2, 1 ) ) );
     649             :   log_pi3 = ws_adjust_sign( log_pi3, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec3, 1 ) ) );
     650             : 
     651             :   log_pi0 = ws_add( log_pi0, ws_bcast( (short)(255*75) ) );
     652             :   log_pi1 = ws_add( log_pi1, ws_bcast( (short)(255*75) ) );
     653             :   log_pi2 = ws_add( log_pi2, ws_bcast( (short)(255*75) ) );
     654             :   log_pi3 = ws_add( log_pi3, ws_bcast( (short)(255*75) ) );
      655             :   /* Now 0<= log_pi <= 38043 < 2^16 (okay, since the next step treats it as unsigned) */
     656             : 
     657             :   /* GCC informs me that for a ushort x,
      658             :      (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
     659             :      We need at least 31 bits of precision for the product, so
     660             :      mulh_epu16 is perfect. */
     661             :   log_pi0 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi0, ws_shru( ws_mulhi( log_pi0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     662             :   log_pi1 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi1, ws_shru( ws_mulhi( log_pi1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     663             :   log_pi2 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi2, ws_shru( ws_mulhi( log_pi2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     664             :   log_pi3 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi3, ws_shru( ws_mulhi( log_pi3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     665             :   /* Now 0<=log_pi < 255 */
     666             : 
     667             :   /* Since our FWHT implementation is unscaled, we've computed a value
     668             :      64 times larger than what we'd like.  64^-1 == 4 (mod 255), but
     669             :      we're just going to use this in the exponentiation, so we can
     670             :      compute this implicitly.
     671             :        2^(log_pi * 64^-1) = 2^(4*log_pi) = (2^4)^log_pi = 16^log_pi
     672             :      (where 2 is the primitive element we used for the logs, an element
     673             :      of GF(2^8) ). */
     674             : 
     675             :   wb_t compact_log_pi0 = compact_ws( log_pi0, log_pi1 );
     676             :   wb_t compact_log_pi1 = compact_ws( log_pi2, log_pi3 );
     677             :   wb_t pi0 = exp_16( compact_log_pi0 );
     678             :   wb_t pi1 = exp_16( compact_log_pi1 );
     679             : 
     680             :   wb_st( output,      pi0 );
     681             :   wb_st( output+32UL, pi1 );
     682             : 
     683             : #else
     684          39 :   wb_t erased_vec0 = wb_ld( is_erased        );
     685          39 :   wb_t erased_vec1 = wb_ld( is_erased + 32UL );
     686          39 :   wb_t to_transform0 = erased_vec0;
     687          39 :   wb_t to_transform1 = erased_vec1;
     688             : 
     689          39 :   FD_REEDSOL_FWHT_64( to_transform0, to_transform1 );
     690             : 
     691          39 :   ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 0 ) );
     692          39 :   ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 1 ) );
     693          39 :   ws_t transformed2 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 0 ) );
     694          39 :   ws_t transformed3 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 1 ) );
     695             : 
     696             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
     697             :      0<=product<256*255, so product is interpreted as unsigned. */
     698          39 :   ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_64        ) );
     699          39 :   ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_64 + 16UL ) );
     700          39 :   ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_64 + 32UL ) );
     701          39 :   ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_64 + 48UL ) );
     702             : 
     703             :   /* Compute mod 255, using the same approach as above. */
     704          39 :   product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( ws_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     705          39 :   product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( ws_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     706          39 :   product2 = ws_and( ws_bcast( 0xFF ), ws_add( product2, ws_shru( ws_mulhi( product2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     707          39 :   product3 = ws_and( ws_bcast( 0xFF ), ws_add( product3, ws_shru( ws_mulhi( product3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     708             : 
     709          39 :   wb_t compact_product0 = compact_ws( product0, product1 );
     710          39 :   wb_t compact_product1 = compact_ws( product2, product3 );
     711             : 
     712          39 :   FD_REEDSOL_FWHT_64( compact_product0, compact_product1 );
     713             : 
     714             :   /* Negate the ones corresponding to erasures */
     715          39 :   compact_product0 = wb_if( wb_eq( erased_vec0, wb_zero() ), compact_product0, wb_sub( wb_bcast( 255 ), compact_product0 ) );
     716          39 :   compact_product1 = wb_if( wb_eq( erased_vec1, wb_zero() ), compact_product1, wb_sub( wb_bcast( 255 ), compact_product1 ) );
     717             : 
     718          39 :   wb_t pi0 = exp_16( compact_product0 );
     719          39 :   wb_t pi1 = exp_16( compact_product1 );
     720          39 :   wb_st( output       , pi0 );
     721          39 :   wb_st( output + 32UL, pi1 );
     722          39 : #endif
     723             : #else /* No AVX implementation */
     724             : 
     725             :   gen_pi_noavx_generic( is_erased, output, 64UL, fwht_l_twiddle_64 );
     726             : 
     727             : #endif
     728          39 : }
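/* A minimal illustrative sketch (not part of this file): the per-size
   constants behind the "unscaled FWHT" fix-ups above.  Each gen_pi_N
   over-scales by N, and N^-1 mod 255 is folded into the exponentiation
   base: 76 = 2^16 for N=16, 29 = 2^8 for N=32, 16 = 2^4 for N=64 (and
   presumably exp_4 / exp_2 play the same role for N=128 / N=256). */

#include <assert.h>

static void
check_fwht_inverse_constants( void ) {
  assert( ( 16*16)%255==1 );   /* gen_pi_16 : 16^-1 == 16, so 2^(16*log) == 76^log */
  assert( ( 32* 8)%255==1 );   /* gen_pi_32 : 32^-1 ==  8, so 2^( 8*log) == 29^log */
  assert( ( 64* 4)%255==1 );   /* gen_pi_64 : 64^-1 ==  4, so 2^( 4*log) == 16^log */
  assert( (128* 2)%255==1 );   /* 128^-1 == 2 */
  assert( (256* 1)%255==1 );   /* 256^-1 == 1 */
}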
     729             : 
     730             : void
     731             : fd_reedsol_private_gen_pi_128( uchar const * is_erased,
     732           6 :                                uchar       * output ) {
     733           6 : #if FD_REEDSOL_ARITH_IMPL>0
     734             : #if FD_REEDSOL_PI_USE_SHORT
     735             :   ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased         ) );
     736             :   ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased +  16UL ) );
     737             :   ws_t erased_vec2 = _mm256_cvtepu8_epi16( vb_ld( is_erased +  32UL ) );
     738             :   ws_t erased_vec3 = _mm256_cvtepu8_epi16( vb_ld( is_erased +  48UL ) );
     739             :   ws_t erased_vec4 = _mm256_cvtepu8_epi16( vb_ld( is_erased +  64UL ) );
     740             :   ws_t erased_vec5 = _mm256_cvtepu8_epi16( vb_ld( is_erased +  80UL ) );
     741             :   ws_t erased_vec6 = _mm256_cvtepu8_epi16( vb_ld( is_erased +  96UL ) );
     742             :   ws_t erased_vec7 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 112UL ) );
     743             : 
     744             :   ws_t transformed0 = erased_vec0;
     745             :   ws_t transformed1 = erased_vec1;
     746             :   ws_t transformed2 = erased_vec2;
     747             :   ws_t transformed3 = erased_vec3;
     748             :   ws_t transformed4 = erased_vec4;
     749             :   ws_t transformed5 = erased_vec5;
     750             :   ws_t transformed6 = erased_vec6;
     751             :   ws_t transformed7 = erased_vec7;
     752             :   FD_REEDSOL_FWHT_128( transformed0, transformed1, transformed2, transformed3, transformed4, transformed5, transformed6, transformed7 ); /* FWHT( R~ ) */
     753             :   /* |transformed| <= 128 */
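/* [Editorial aside -- illustrative sketch.]  Across its eight register
   arguments (128 16-bit lanes in total), FD_REEDSOL_FWHT_128 computes
   an unscaled 128-point Walsh-Hadamard transform.  A scalar version of
   the same butterfly network, ignoring the exact lane ordering the
   vector macro uses (hypothetical helper, n a power of two): */

static void
fwht_sketch( short * v, int n ) {
  for( int h=1; h<n; h<<=1 )
    for( int i=0; i<n; i+=2*h )
      for( int j=i; j<i+h; j++ ) {
        short a = v[ j ], b = v[ j+h ];
        v[ j ] = (short)( a+b );  v[ j+h ] = (short)( a-b );
      }
}

/* With a 0/1 input (the erasure indicator), the all-plus output lane
   is at most 128 and every other lane is a balanced +/- sum in
   [-64,64], which is where the |transformed| <= 128 bound comes from. */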
     754             : 
     755             :   /* product is congruent to FWHT( R~ ) * FWHT( L~ ) mod 255.
     756             :      -16256 <= product <= 32512 */
     757             :   ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_128         ) );
     758             :   ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_128 +  16UL ) );
     759             :   ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_128 +  32UL ) );
     760             :   ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_128 +  48UL ) );
     761             :   ws_t product4 = ws_mullo( transformed4, ws_ld( fwht_l_twiddle_128 +  64UL ) );
     762             :   ws_t product5 = ws_mullo( transformed5, ws_ld( fwht_l_twiddle_128 +  80UL ) );
     763             :   ws_t product6 = ws_mullo( transformed6, ws_ld( fwht_l_twiddle_128 +  96UL ) );
     764             :   ws_t product7 = ws_mullo( transformed7, ws_ld( fwht_l_twiddle_128 + 112UL ) );
     765             : 
     766             :   /* We need to reduce these mod 255 to prevent overflow in the next
     767             :      step. 0 <= product+64*255 <= 48832 < 2^16.  The mod operation
     768             :      treats the input as unsigned though, so this is okay. */
     769             :   product0 = ws_add( product0, ws_bcast( (short)64*255 ) );
     770             :   product1 = ws_add( product1, ws_bcast( (short)64*255 ) );
     771             :   product2 = ws_add( product2, ws_bcast( (short)64*255 ) );
     772             :   product3 = ws_add( product3, ws_bcast( (short)64*255 ) );
     773             :   product4 = ws_add( product4, ws_bcast( (short)64*255 ) );
     774             :   product5 = ws_add( product5, ws_bcast( (short)64*255 ) );
     775             :   product6 = ws_add( product6, ws_bcast( (short)64*255 ) );
     776             :   product7 = ws_add( product7, ws_bcast( (short)64*255 ) );
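/* [Editorial aside -- illustrative sketch.]  The bias above exists
   because the products can be negative while the 0x8081 reduction that
   follows treats each lane as unsigned.  Adding a multiple of 255 big
   enough to clear the most negative case keeps the value congruent mod
   255 and inside [0, 2^16).  Scalar equivalent, using the bounds
   stated in the comments: */

static inline unsigned short
bias_for_mod255_sketch( short product ) {       /* -16256 <= product <= 32512 */
  return (unsigned short)( product + 64*255 );  /* now 64 <= result <= 48832  */
}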
     777             : 
     778             :   product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( ws_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     779             :   product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( ws_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     780             :   product2 = ws_and( ws_bcast( 0xFF ), ws_add( product2, ws_shru( ws_mulhi( product2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     781             :   product3 = ws_and( ws_bcast( 0xFF ), ws_add( product3, ws_shru( ws_mulhi( product3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     782             :   product4 = ws_and( ws_bcast( 0xFF ), ws_add( product4, ws_shru( ws_mulhi( product4, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     783             :   product5 = ws_and( ws_bcast( 0xFF ), ws_add( product5, ws_shru( ws_mulhi( product5, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     784             :   product6 = ws_and( ws_bcast( 0xFF ), ws_add( product6, ws_shru( ws_mulhi( product6, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     785             :   product7 = ws_and( ws_bcast( 0xFF ), ws_add( product7, ws_shru( ws_mulhi( product7, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     786             : 
     787             :   /* Now 0 <= product < 255 */
     788             : 
     789             :   /* log_pi is congruent (mod 255) to what the paper calls
     790             :      R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
     791             :      |log_pi| <= 128*255 */
     792             :   FD_REEDSOL_FWHT_128( product0, product1, product2, product3, product4, product5, product6, product7 );
     793             :   ws_t log_pi0 = product0;
     794             :   ws_t log_pi1 = product1;
     795             :   ws_t log_pi2 = product2;
     796             :   ws_t log_pi3 = product3;
     797             :   ws_t log_pi4 = product4;
     798             :   ws_t log_pi5 = product5;
     799             :   ws_t log_pi6 = product6;
     800             :   ws_t log_pi7 = product7;
     801             : 
     802             :   /* Negate the ones corresponding to erasures to compute 1/Pi' */
     803             :   log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
     804             :   log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );
     805             :   log_pi2 = ws_adjust_sign( log_pi2, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec2, 1 ) ) );
     806             :   log_pi3 = ws_adjust_sign( log_pi3, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec3, 1 ) ) );
     807             :   log_pi4 = ws_adjust_sign( log_pi4, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec4, 1 ) ) );
     808             :   log_pi5 = ws_adjust_sign( log_pi5, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec5, 1 ) ) );
     809             :   log_pi6 = ws_adjust_sign( log_pi6, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec6, 1 ) ) );
     810             :   log_pi7 = ws_adjust_sign( log_pi7, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec7, 1 ) ) );
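/* [Editorial aside -- illustrative sketch.]  Assuming ws_adjust_sign
   follows the usual vpsignw convention (negate a lane where the
   control lane is negative, pass it through where positive), the
   control value 1 - 2*erased is +1 for kept positions and -1 for
   erasures, so the eight statements above are a branch-free
   conditional negation.  Scalar equivalent: */

static inline short
negate_if_erased_sketch( short log_pi, int erased ) { /* erased is 0 or 1   */
  return (short)( log_pi * ( 1 - 2*erased ) );        /* +log_pi or -log_pi */
}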
     811             : 
     812             :   log_pi0 = ws_add( log_pi0, ws_bcast( (short)(255*128) ) );
     813             :   log_pi1 = ws_add( log_pi1, ws_bcast( (short)(255*128) ) );
     814             :   log_pi2 = ws_add( log_pi2, ws_bcast( (short)(255*128) ) );
     815             :   log_pi3 = ws_add( log_pi3, ws_bcast( (short)(255*128) ) );
     816             :   log_pi4 = ws_add( log_pi4, ws_bcast( (short)(255*128) ) );
     817             :   log_pi5 = ws_add( log_pi5, ws_bcast( (short)(255*128) ) );
     818             :   log_pi6 = ws_add( log_pi6, ws_bcast( (short)(255*128) ) );
     819             :   log_pi7 = ws_add( log_pi7, ws_bcast( (short)(255*128) ) ); /* Now 0<= log_pi <= 65152 < 2^16 */
     820             : 
     821             :   /* GCC informs me that for a ushort x,
     822             :      (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
     823             :      We need at least 31 bits of precision for the product, so
     824             :      mulhi_epu16 is perfect. */
     825             :   log_pi0 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi0, ws_shru( ws_mulhi( log_pi0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     826             :   log_pi1 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi1, ws_shru( ws_mulhi( log_pi1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     827             :   log_pi2 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi2, ws_shru( ws_mulhi( log_pi2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     828             :   log_pi3 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi3, ws_shru( ws_mulhi( log_pi3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     829             :   log_pi4 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi4, ws_shru( ws_mulhi( log_pi4, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     830             :   log_pi5 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi5, ws_shru( ws_mulhi( log_pi5, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     831             :   log_pi6 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi6, ws_shru( ws_mulhi( log_pi6, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     832             :   log_pi7 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi7, ws_shru( ws_mulhi( log_pi7, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     833             :   /* Now 0<=log_pi < 255 */
     834             : 
     835             :   /* Since our FWHT implementation is unscaled, we've computed a value
     836             :      128 times larger than what we'd like.  128^-1 == 2 (mod 255), but
     837             :      we're just going to use this in the exponentiation, so we can
     838             :      compute this implicitly.
     839             :        2^(log_pi * 128^-1) = 2^(2*log_pi) = (2^2)^log_pi = 4^log_pi
     840             :      (where 2 is the primitive element we used for the logs, an element
     841             :      of GF(2^8) ). */
     842             : 
     843             :   wb_t compact_log_pi0 = compact_ws( log_pi0, log_pi1 );
     844             :   wb_t compact_log_pi1 = compact_ws( log_pi2, log_pi3 );
     845             :   wb_t compact_log_pi2 = compact_ws( log_pi4, log_pi5 );
     846             :   wb_t compact_log_pi3 = compact_ws( log_pi6, log_pi7 );
     847             :   wb_t pi0 = exp_4( compact_log_pi0 );
     848             :   wb_t pi1 = exp_4( compact_log_pi1 );
     849             :   wb_t pi2 = exp_4( compact_log_pi2 );
     850             :   wb_t pi3 = exp_4( compact_log_pi3 );
     851             : 
     852             :   wb_st( output,        pi0 );
     853             :   wb_st( output + 32UL, pi1 );
     854             :   wb_st( output + 64UL, pi2 );
     855             :   wb_st( output + 96UL, pi3 );
     856             : 
     857             : #else
     858           6 :   wb_t erased_vec0 = wb_ld( is_erased        );
     859           6 :   wb_t erased_vec1 = wb_ld( is_erased + 32UL );
     860           6 :   wb_t erased_vec2 = wb_ld( is_erased + 64UL );
     861           6 :   wb_t erased_vec3 = wb_ld( is_erased + 96UL );
     862           6 :   wb_t to_transform0 = erased_vec0;
     863           6 :   wb_t to_transform1 = erased_vec1;
     864           6 :   wb_t to_transform2 = erased_vec2;
     865           6 :   wb_t to_transform3 = erased_vec3;
     866             : 
     867           6 :   FD_REEDSOL_FWHT_128( to_transform0, to_transform1, to_transform2, to_transform3 );
     868             : 
     869           6 :   ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 0 ) );
     870           6 :   ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 1 ) );
     871           6 :   ws_t transformed2 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 0 ) );
     872           6 :   ws_t transformed3 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 1 ) );
     873           6 :   ws_t transformed4 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 0 ) );
     874           6 :   ws_t transformed5 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 1 ) );
     875           6 :   ws_t transformed6 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 0 ) );
     876           6 :   ws_t transformed7 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 1 ) );
     877             : 
     878             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
     879             :      0<=product<256*255, so product is interpreted as unsigned. */
     880           6 :   ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_128         ) );
     881           6 :   ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_128 +  16UL ) );
     882           6 :   ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_128 +  32UL ) );
     883           6 :   ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_128 +  48UL ) );
     884           6 :   ws_t product4 = ws_mullo( transformed4, ws_ld( fwht_l_twiddle_128 +  64UL ) );
     885           6 :   ws_t product5 = ws_mullo( transformed5, ws_ld( fwht_l_twiddle_128 +  80UL ) );
     886           6 :   ws_t product6 = ws_mullo( transformed6, ws_ld( fwht_l_twiddle_128 +  96UL ) );
     887           6 :   ws_t product7 = ws_mullo( transformed7, ws_ld( fwht_l_twiddle_128 + 112UL ) );
     888             : 
     889             :   /* Compute mod 255, using the same approach as above. */
     890           6 :   product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( ws_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     891           6 :   product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( ws_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     892           6 :   product2 = ws_and( ws_bcast( 0xFF ), ws_add( product2, ws_shru( ws_mulhi( product2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     893           6 :   product3 = ws_and( ws_bcast( 0xFF ), ws_add( product3, ws_shru( ws_mulhi( product3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     894           6 :   product4 = ws_and( ws_bcast( 0xFF ), ws_add( product4, ws_shru( ws_mulhi( product4, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     895           6 :   product5 = ws_and( ws_bcast( 0xFF ), ws_add( product5, ws_shru( ws_mulhi( product5, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     896           6 :   product6 = ws_and( ws_bcast( 0xFF ), ws_add( product6, ws_shru( ws_mulhi( product6, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     897           6 :   product7 = ws_and( ws_bcast( 0xFF ), ws_add( product7, ws_shru( ws_mulhi( product7, ws_bcast( (short)0x8081 ) ), 7 ) ) );
     898           6 :   wb_t compact_product0 = compact_ws( product0, product1 );
     899           6 :   wb_t compact_product1 = compact_ws( product2, product3 );
     900           6 :   wb_t compact_product2 = compact_ws( product4, product5 );
     901           6 :   wb_t compact_product3 = compact_ws( product6, product7 );
     902             : 
     903           6 :   FD_REEDSOL_FWHT_128( compact_product0, compact_product1, compact_product2, compact_product3 );
     904             : 
     905             :   /* Negate the ones corresponding to erasures */
     906           6 :   compact_product0 = wb_if( wb_eq( erased_vec0, wb_zero() ), compact_product0, wb_sub( wb_bcast( 255 ), compact_product0 ) );
     907           6 :   compact_product1 = wb_if( wb_eq( erased_vec1, wb_zero() ), compact_product1, wb_sub( wb_bcast( 255 ), compact_product1 ) );
     908           6 :   compact_product2 = wb_if( wb_eq( erased_vec2, wb_zero() ), compact_product2, wb_sub( wb_bcast( 255 ), compact_product2 ) );
     909           6 :   compact_product3 = wb_if( wb_eq( erased_vec3, wb_zero() ), compact_product3, wb_sub( wb_bcast( 255 ), compact_product3 ) );
     910             : 
     911           6 :   wb_t pi0 = exp_4( compact_product0 );
     912           6 :   wb_t pi1 = exp_4( compact_product1 );
     913           6 :   wb_t pi2 = exp_4( compact_product2 );
     914           6 :   wb_t pi3 = exp_4( compact_product3 );
     915           6 :   wb_st( output,        pi0 );
     916           6 :   wb_st( output + 32UL, pi1 );
     917           6 :   wb_st( output + 64UL, pi2 );
     918           6 :   wb_st( output + 96UL, pi3 );
     919           6 : #endif
     920             : #else /* No AVX implementation */
     921             : 
     922             :   gen_pi_noavx_generic( is_erased, output, 128UL, fwht_l_twiddle_128 );
     923             : 
     924             : #endif
     925           6 : }
     926             : 
     927             : void
     928             : fd_reedsol_private_gen_pi_256( uchar const * is_erased,
     929           0 :                                uchar       * output ) {
     930           0 : #if FD_REEDSOL_ARITH_IMPL>0
     931             : #if FD_REEDSOL_PI_USE_SHORT
     932             :   ws_t erased_vec0  = _mm256_cvtepu8_epi16( vb_ld( is_erased         ) );
     933             :   ws_t erased_vec1  = _mm256_cvtepu8_epi16( vb_ld( is_erased +  16UL ) );
     934             :   ws_t erased_vec2  = _mm256_cvtepu8_epi16( vb_ld( is_erased +  32UL ) );
     935             :   ws_t erased_vec3  = _mm256_cvtepu8_epi16( vb_ld( is_erased +  48UL ) );
     936             :   ws_t erased_vec4  = _mm256_cvtepu8_epi16( vb_ld( is_erased +  64UL ) );
     937             :   ws_t erased_vec5  = _mm256_cvtepu8_epi16( vb_ld( is_erased +  80UL ) );
     938             :   ws_t erased_vec6  = _mm256_cvtepu8_epi16( vb_ld( is_erased +  96UL ) );
     939             :   ws_t erased_vec7  = _mm256_cvtepu8_epi16( vb_ld( is_erased + 112UL ) );
     940             :   ws_t erased_vec8  = _mm256_cvtepu8_epi16( vb_ld( is_erased + 128UL ) );
     941             :   ws_t erased_vec9  = _mm256_cvtepu8_epi16( vb_ld( is_erased + 144UL ) );
     942             :   ws_t erased_vec10 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 160UL ) );
     943             :   ws_t erased_vec11 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 176UL ) );
     944             :   ws_t erased_vec12 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 192UL ) );
     945             :   ws_t erased_vec13 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 208UL ) );
     946             :   ws_t erased_vec14 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 224UL ) );
     947             :   ws_t erased_vec15 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 240UL ) );
     948             : 
     949             :   ws_t transformed0 = erased_vec0;
     950             :   ws_t transformed1 = erased_vec1;
     951             :   ws_t transformed2 = erased_vec2;
     952             :   ws_t transformed3 = erased_vec3;
     953             :   ws_t transformed4 = erased_vec4;
     954             :   ws_t transformed5 = erased_vec5;
     955             :   ws_t transformed6 = erased_vec6;
     956             :   ws_t transformed7 = erased_vec7;
     957             :   ws_t transformed8 = erased_vec8;
     958             :   ws_t transformed9 = erased_vec9;
     959             :   ws_t transformed10 = erased_vec10;
     960             :   ws_t transformed11 = erased_vec11;
     961             :   ws_t transformed12 = erased_vec12;
     962             :   ws_t transformed13 = erased_vec13;
     963             :   ws_t transformed14 = erased_vec14;
     964             :   ws_t transformed15 = erased_vec15;
     965             :   FD_REEDSOL_FWHT_256( transformed0, transformed1, transformed2, transformed3, transformed4, transformed5, transformed6, transformed7,
     966             :       transformed8, transformed9, transformed10, transformed11, transformed12, transformed13, transformed14, transformed15 ); /* FWHT( R~ ) */
     967             :   /* |transformed| <= 256 */
     968             : 
     969             :   /* product is congruent to FWHT( R~ ) * FWHT( L~ ) mod 255.
     970             :      -32512 <= product <= 32512 */
     971             :   ws_t product0  = ws_mullo( transformed0,  ws_ld( fwht_l_twiddle_256         ) );
     972             :   ws_t product1  = ws_mullo( transformed1,  ws_ld( fwht_l_twiddle_256 +  16UL ) );
     973             :   ws_t product2  = ws_mullo( transformed2,  ws_ld( fwht_l_twiddle_256 +  32UL ) );
     974             :   ws_t product3  = ws_mullo( transformed3,  ws_ld( fwht_l_twiddle_256 +  48UL ) );
     975             :   ws_t product4  = ws_mullo( transformed4,  ws_ld( fwht_l_twiddle_256 +  64UL ) );
     976             :   ws_t product5  = ws_mullo( transformed5,  ws_ld( fwht_l_twiddle_256 +  80UL ) );
     977             :   ws_t product6  = ws_mullo( transformed6,  ws_ld( fwht_l_twiddle_256 +  96UL ) );
     978             :   ws_t product7  = ws_mullo( transformed7,  ws_ld( fwht_l_twiddle_256 + 112UL ) );
     979             :   ws_t product8  = ws_mullo( transformed8,  ws_ld( fwht_l_twiddle_256 + 128UL ) );
     980             :   ws_t product9  = ws_mullo( transformed9,  ws_ld( fwht_l_twiddle_256 + 144UL ) );
     981             :   ws_t product10 = ws_mullo( transformed10, ws_ld( fwht_l_twiddle_256 + 160UL ) );
     982             :   ws_t product11 = ws_mullo( transformed11, ws_ld( fwht_l_twiddle_256 + 176UL ) );
     983             :   ws_t product12 = ws_mullo( transformed12, ws_ld( fwht_l_twiddle_256 + 192UL ) );
     984             :   ws_t product13 = ws_mullo( transformed13, ws_ld( fwht_l_twiddle_256 + 208UL ) );
     985             :   ws_t product14 = ws_mullo( transformed14, ws_ld( fwht_l_twiddle_256 + 224UL ) );
     986             :   ws_t product15 = ws_mullo( transformed15, ws_ld( fwht_l_twiddle_256 + 240UL ) );
     987             : 
     988             :   /* We need to reduce these mod 255 to prevent overflow in the next
     989             :      step. 0 <= product+128*255 <= 65152 < 2^16.  The mod operation
     990             :      treats the input as unsigned though, so this is okay (but hanging
     991             :      on by a thread!). */
     992             :   product0  = ws_mod255( ws_add( product0,  ws_bcast( (short)128*255 ) ) );
     993             :   product1  = ws_mod255( ws_add( product1,  ws_bcast( (short)128*255 ) ) );
     994             :   product2  = ws_mod255( ws_add( product2,  ws_bcast( (short)128*255 ) ) );
     995             :   product3  = ws_mod255( ws_add( product3,  ws_bcast( (short)128*255 ) ) );
     996             :   product4  = ws_mod255( ws_add( product4,  ws_bcast( (short)128*255 ) ) );
     997             :   product5  = ws_mod255( ws_add( product5,  ws_bcast( (short)128*255 ) ) );
     998             :   product6  = ws_mod255( ws_add( product6,  ws_bcast( (short)128*255 ) ) );
     999             :   product7  = ws_mod255( ws_add( product7,  ws_bcast( (short)128*255 ) ) );
    1000             :   product8  = ws_mod255( ws_add( product8,  ws_bcast( (short)128*255 ) ) );
    1001             :   product9  = ws_mod255( ws_add( product9,  ws_bcast( (short)128*255 ) ) );
    1002             :   product10 = ws_mod255( ws_add( product10, ws_bcast( (short)128*255 ) ) );
    1003             :   product11 = ws_mod255( ws_add( product11, ws_bcast( (short)128*255 ) ) );
    1004             :   product12 = ws_mod255( ws_add( product12, ws_bcast( (short)128*255 ) ) );
    1005             :   product13 = ws_mod255( ws_add( product13, ws_bcast( (short)128*255 ) ) );
    1006             :   product14 = ws_mod255( ws_add( product14, ws_bcast( (short)128*255 ) ) );
    1007             :   product15 = ws_mod255( ws_add( product15, ws_bcast( (short)128*255 ) ) );
    1008             : 
    1009             :   /* Now 0 <= product < 255 */
    1010             : 
    1011             :   /* log_pi is congruent (mod 255) to what the paper calls
    1012             :      R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
    1013             :      If we do the FWHT in the normal way, it might overflow, so we inline the first stage and stick a mod in the middle (see the scalar sketch below). */
    1014             :   ws_t log_pi0  = ws_mod255( ws_add( product0, product8  ) );  ws_t log_pi1 = ws_mod255( ws_add( product1, product9  ) );
    1015             :   ws_t log_pi2  = ws_mod255( ws_add( product2, product10 ) );  ws_t log_pi3 = ws_mod255( ws_add( product3, product11 ) );
    1016             :   ws_t log_pi4  = ws_mod255( ws_add( product4, product12 ) );  ws_t log_pi5 = ws_mod255( ws_add( product5, product13 ) );
    1017             :   ws_t log_pi6  = ws_mod255( ws_add( product6, product14 ) );  ws_t log_pi7 = ws_mod255( ws_add( product7, product15 ) );
    1018             :   ws_t log_pi8  = ws_mod255( ws_add( ws_sub( product0, product8 ), ws_bcast( (short)255*2 ) ) );
    1019             :   ws_t log_pi9  = ws_mod255( ws_add( ws_sub( product1, product9 ), ws_bcast( (short)255*2 ) ) );
    1020             :   ws_t log_pi10 = ws_mod255( ws_add( ws_sub( product2, product10 ), ws_bcast( (short)255*2 ) ) );
    1021             :   ws_t log_pi11 = ws_mod255( ws_add( ws_sub( product3, product11 ), ws_bcast( (short)255*2 ) ) );
    1022             :   ws_t log_pi12 = ws_mod255( ws_add( ws_sub( product4, product12 ), ws_bcast( (short)255*2 ) ) );
    1023             :   ws_t log_pi13 = ws_mod255( ws_add( ws_sub( product5, product13 ), ws_bcast( (short)255*2 ) ) );
    1024             :   ws_t log_pi14 = ws_mod255( ws_add( ws_sub( product6, product14 ), ws_bcast( (short)255*2 ) ) );
    1025             :   ws_t log_pi15 = ws_mod255( ws_add( ws_sub( product7, product15 ), ws_bcast( (short)255*2 ) ) );
    1026             : 
    1027             :   FD_REEDSOL_FWHT_128( log_pi0, log_pi1, log_pi2,  log_pi3,  log_pi4,  log_pi5,  log_pi6,  log_pi7 );
    1028             :   FD_REEDSOL_FWHT_128( log_pi8, log_pi9, log_pi10, log_pi11, log_pi12, log_pi13, log_pi14, log_pi15 );
    1029             :   /* Now |log_pi| <= 128*255 */
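/* [Editorial aside -- illustrative sketch.]  A 256-point unscaled FWHT
   factors into one cross-half butterfly stage followed by two
   independent 128-point FWHTs, which is exactly the split above: doing
   the first stage by hand lets a mod-255 reduction sit between the
   stages so the 16-bit lanes cannot overflow.  Scalar analogue,
   ignoring the exact lane ordering the vector macros use and reusing
   the hypothetical fwht_sketch helper from earlier: */

static void
fwht256_with_mod_sketch( short * v ) {  /* 256 lanes, each in [0,255) on entry */
  for( int i=0; i<128; i++ ) {
    int a = v[ i ], b = v[ i+128 ];
    v[ i     ] = (short)( ( a + b         ) % 255 );
    v[ i+128 ] = (short)( ( a - b + 2*255 ) % 255 );  /* a-b, shifted nonnegative */
  }
  fwht_sketch( v,       128 );  /* halves now transform independently;        */
  fwht_sketch( v + 128, 128 );  /* afterwards |lane| <= 128*254, still a short */
}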
    1030             : 
    1031             :   /* Negate the ones corresponding to erasures to compute 1/Pi' */
    1032             :   log_pi0  = ws_adjust_sign( log_pi0,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0,  1 ) ) );
    1033             :   log_pi1  = ws_adjust_sign( log_pi1,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1,  1 ) ) );
    1034             :   log_pi2  = ws_adjust_sign( log_pi2,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec2,  1 ) ) );
    1035             :   log_pi3  = ws_adjust_sign( log_pi3,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec3,  1 ) ) );
    1036             :   log_pi4  = ws_adjust_sign( log_pi4,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec4,  1 ) ) );
    1037             :   log_pi5  = ws_adjust_sign( log_pi5,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec5,  1 ) ) );
    1038             :   log_pi6  = ws_adjust_sign( log_pi6,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec6,  1 ) ) );
    1039             :   log_pi7  = ws_adjust_sign( log_pi7,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec7,  1 ) ) );
    1040             :   log_pi8  = ws_adjust_sign( log_pi8,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec8,  1 ) ) );
    1041             :   log_pi9  = ws_adjust_sign( log_pi9,  ws_sub( ws_bcast( 1 ), ws_shl( erased_vec9,  1 ) ) );
    1042             :   log_pi10 = ws_adjust_sign( log_pi10, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec10, 1 ) ) );
    1043             :   log_pi11 = ws_adjust_sign( log_pi11, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec11, 1 ) ) );
    1044             :   log_pi12 = ws_adjust_sign( log_pi12, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec12, 1 ) ) );
    1045             :   log_pi13 = ws_adjust_sign( log_pi13, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec13, 1 ) ) );
    1046             :   log_pi14 = ws_adjust_sign( log_pi14, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec14, 1 ) ) );
    1047             :   log_pi15 = ws_adjust_sign( log_pi15, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec15, 1 ) ) );
    1048             : 
    1049             :   /* After the addition below, 0<= log_pi <= 65152 < 2^16. The mod
    1050             :      brings it back to 0 <= log_pi < 255. */
    1051             :   log_pi0  = ws_mod255( ws_add( log_pi0,  ws_bcast( (short)(255*128) ) ) );
    1052             :   log_pi1  = ws_mod255( ws_add( log_pi1,  ws_bcast( (short)(255*128) ) ) );
    1053             :   log_pi2  = ws_mod255( ws_add( log_pi2,  ws_bcast( (short)(255*128) ) ) );
    1054             :   log_pi3  = ws_mod255( ws_add( log_pi3,  ws_bcast( (short)(255*128) ) ) );
    1055             :   log_pi4  = ws_mod255( ws_add( log_pi4,  ws_bcast( (short)(255*128) ) ) );
    1056             :   log_pi5  = ws_mod255( ws_add( log_pi5,  ws_bcast( (short)(255*128) ) ) );
    1057             :   log_pi6  = ws_mod255( ws_add( log_pi6,  ws_bcast( (short)(255*128) ) ) );
    1058             :   log_pi7  = ws_mod255( ws_add( log_pi7,  ws_bcast( (short)(255*128) ) ) );
    1059             :   log_pi8  = ws_mod255( ws_add( log_pi8,  ws_bcast( (short)(255*128) ) ) );
    1060             :   log_pi9  = ws_mod255( ws_add( log_pi9,  ws_bcast( (short)(255*128) ) ) );
    1061             :   log_pi10 = ws_mod255( ws_add( log_pi10, ws_bcast( (short)(255*128) ) ) );
    1062             :   log_pi11 = ws_mod255( ws_add( log_pi11, ws_bcast( (short)(255*128) ) ) );
    1063             :   log_pi12 = ws_mod255( ws_add( log_pi12, ws_bcast( (short)(255*128) ) ) );
    1064             :   log_pi13 = ws_mod255( ws_add( log_pi13, ws_bcast( (short)(255*128) ) ) );
    1065             :   log_pi14 = ws_mod255( ws_add( log_pi14, ws_bcast( (short)(255*128) ) ) );
    1066             :   log_pi15 = ws_mod255( ws_add( log_pi15, ws_bcast( (short)(255*128) ) ) );
    1067             : 
    1068             :   /* Since our FWHT implementation is unscaled, we've computed a value
    1069             :      256 times larger than what we'd like.  256^-1 == 1^-1 == 1 (mod 255),
    1070             :      so we don't need to do anything special. */
    1071             : 
    1072             :   wb_t compact_log_pi0 = compact_ws( log_pi0,  log_pi1  );
    1073             :   wb_t compact_log_pi1 = compact_ws( log_pi2,  log_pi3  );
    1074             :   wb_t compact_log_pi2 = compact_ws( log_pi4,  log_pi5  );
    1075             :   wb_t compact_log_pi3 = compact_ws( log_pi6,  log_pi7  );
    1076             :   wb_t compact_log_pi4 = compact_ws( log_pi8,  log_pi9  );
    1077             :   wb_t compact_log_pi5 = compact_ws( log_pi10, log_pi11 );
    1078             :   wb_t compact_log_pi6 = compact_ws( log_pi12, log_pi13 );
    1079             :   wb_t compact_log_pi7 = compact_ws( log_pi14, log_pi15 );
    1080             :   wb_t pi0 = exp_2( compact_log_pi0 );
    1081             :   wb_t pi1 = exp_2( compact_log_pi1 );
    1082             :   wb_t pi2 = exp_2( compact_log_pi2 );
    1083             :   wb_t pi3 = exp_2( compact_log_pi3 );
    1084             :   wb_t pi4 = exp_2( compact_log_pi4 );
    1085             :   wb_t pi5 = exp_2( compact_log_pi5 );
    1086             :   wb_t pi6 = exp_2( compact_log_pi6 );
    1087             :   wb_t pi7 = exp_2( compact_log_pi7 );
    1088             : 
    1089             :   wb_st( output,         pi0 );
    1090             :   wb_st( output +  32UL, pi1 );
    1091             :   wb_st( output +  64UL, pi2 );
    1092             :   wb_st( output +  96UL, pi3 );
    1093             :   wb_st( output + 128UL, pi4 );
    1094             :   wb_st( output + 160UL, pi5 );
    1095             :   wb_st( output + 192UL, pi6 );
    1096             :   wb_st( output + 224UL, pi7 );
    1097             : 
    1098             : #else
    1099           0 :   wb_t erased_vec0 = wb_ld( is_erased         );
    1100           0 :   wb_t erased_vec1 = wb_ld( is_erased +  32UL );
    1101           0 :   wb_t erased_vec2 = wb_ld( is_erased +  64UL );
    1102           0 :   wb_t erased_vec3 = wb_ld( is_erased +  96UL );
    1103           0 :   wb_t erased_vec4 = wb_ld( is_erased + 128UL );
    1104           0 :   wb_t erased_vec5 = wb_ld( is_erased + 160UL );
    1105           0 :   wb_t erased_vec6 = wb_ld( is_erased + 192UL );
    1106           0 :   wb_t erased_vec7 = wb_ld( is_erased + 224UL );
    1107           0 :   wb_t to_transform0 = erased_vec0;
    1108           0 :   wb_t to_transform1 = erased_vec1;
    1109           0 :   wb_t to_transform2 = erased_vec2;
    1110           0 :   wb_t to_transform3 = erased_vec3;
    1111           0 :   wb_t to_transform4 = erased_vec4;
    1112           0 :   wb_t to_transform5 = erased_vec5;
    1113           0 :   wb_t to_transform6 = erased_vec6;
    1114           0 :   wb_t to_transform7 = erased_vec7;
    1115             : 
    1116           0 :   FD_REEDSOL_FWHT_256( to_transform0, to_transform1, to_transform2, to_transform3,
    1117           0 :                        to_transform4, to_transform5, to_transform6, to_transform7 );
    1118             : 
    1119           0 :   ws_t transformed0  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 0 ) );
    1120           0 :   ws_t transformed1  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 1 ) );
    1121           0 :   ws_t transformed2  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 0 ) );
    1122           0 :   ws_t transformed3  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 1 ) );
    1123           0 :   ws_t transformed4  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 0 ) );
    1124           0 :   ws_t transformed5  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 1 ) );
    1125           0 :   ws_t transformed6  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 0 ) );
    1126           0 :   ws_t transformed7  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 1 ) );
    1127           0 :   ws_t transformed8  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform4, 0 ) );
    1128           0 :   ws_t transformed9  = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform4, 1 ) );
    1129           0 :   ws_t transformed10 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform5, 0 ) );
    1130           0 :   ws_t transformed11 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform5, 1 ) );
    1131           0 :   ws_t transformed12 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform6, 0 ) );
    1132           0 :   ws_t transformed13 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform6, 1 ) );
    1133           0 :   ws_t transformed14 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform7, 0 ) );
    1134           0 :   ws_t transformed15 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform7, 1 ) );
    1135             : 
    1136             :   /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
    1137             :      0<=product<256*255, so product is interpreted as unsigned. */
    1138           0 :   ws_t product0  = ws_mod255( ws_mullo( transformed0,  ws_ld( fwht_l_twiddle_256         ) ) );
    1139           0 :   ws_t product1  = ws_mod255( ws_mullo( transformed1,  ws_ld( fwht_l_twiddle_256 +  16UL ) ) );
    1140           0 :   ws_t product2  = ws_mod255( ws_mullo( transformed2,  ws_ld( fwht_l_twiddle_256 +  32UL ) ) );
    1141           0 :   ws_t product3  = ws_mod255( ws_mullo( transformed3,  ws_ld( fwht_l_twiddle_256 +  48UL ) ) );
    1142           0 :   ws_t product4  = ws_mod255( ws_mullo( transformed4,  ws_ld( fwht_l_twiddle_256 +  64UL ) ) );
    1143           0 :   ws_t product5  = ws_mod255( ws_mullo( transformed5,  ws_ld( fwht_l_twiddle_256 +  80UL ) ) );
    1144           0 :   ws_t product6  = ws_mod255( ws_mullo( transformed6,  ws_ld( fwht_l_twiddle_256 +  96UL ) ) );
    1145           0 :   ws_t product7  = ws_mod255( ws_mullo( transformed7,  ws_ld( fwht_l_twiddle_256 + 112UL ) ) );
    1146           0 :   ws_t product8  = ws_mod255( ws_mullo( transformed8,  ws_ld( fwht_l_twiddle_256 + 128UL ) ) );
    1147           0 :   ws_t product9  = ws_mod255( ws_mullo( transformed9,  ws_ld( fwht_l_twiddle_256 + 144UL ) ) );
    1148           0 :   ws_t product10 = ws_mod255( ws_mullo( transformed10, ws_ld( fwht_l_twiddle_256 + 160UL ) ) );
    1149           0 :   ws_t product11 = ws_mod255( ws_mullo( transformed11, ws_ld( fwht_l_twiddle_256 + 176UL ) ) );
    1150           0 :   ws_t product12 = ws_mod255( ws_mullo( transformed12, ws_ld( fwht_l_twiddle_256 + 192UL ) ) );
    1151           0 :   ws_t product13 = ws_mod255( ws_mullo( transformed13, ws_ld( fwht_l_twiddle_256 + 208UL ) ) );
    1152           0 :   ws_t product14 = ws_mod255( ws_mullo( transformed14, ws_ld( fwht_l_twiddle_256 + 224UL ) ) );
    1153           0 :   ws_t product15 = ws_mod255( ws_mullo( transformed15, ws_ld( fwht_l_twiddle_256 + 240UL ) ) );
    1154             : 
    1155           0 :   wb_t compact_product0 = compact_ws( product0,  product1  );
    1156           0 :   wb_t compact_product1 = compact_ws( product2,  product3  );
    1157           0 :   wb_t compact_product2 = compact_ws( product4,  product5  );
    1158           0 :   wb_t compact_product3 = compact_ws( product6,  product7  );
    1159           0 :   wb_t compact_product4 = compact_ws( product8,  product9  );
    1160           0 :   wb_t compact_product5 = compact_ws( product10, product11 );
    1161           0 :   wb_t compact_product6 = compact_ws( product12, product13 );
    1162           0 :   wb_t compact_product7 = compact_ws( product14, product15 );
    1163             : 
    1164           0 :   FD_REEDSOL_FWHT_256( compact_product0, compact_product1, compact_product2, compact_product3,
    1165           0 :                        compact_product4, compact_product5, compact_product6, compact_product7 );
    1166             : 
    1167             :   /* Negate the ones corresponding to erasures */
    1168           0 :   compact_product0 = wb_if( wb_eq( erased_vec0, wb_zero() ), compact_product0, wb_sub( wb_bcast( 255 ), compact_product0 ) );
    1169           0 :   compact_product1 = wb_if( wb_eq( erased_vec1, wb_zero() ), compact_product1, wb_sub( wb_bcast( 255 ), compact_product1 ) );
    1170           0 :   compact_product2 = wb_if( wb_eq( erased_vec2, wb_zero() ), compact_product2, wb_sub( wb_bcast( 255 ), compact_product2 ) );
    1171           0 :   compact_product3 = wb_if( wb_eq( erased_vec3, wb_zero() ), compact_product3, wb_sub( wb_bcast( 255 ), compact_product3 ) );
    1172           0 :   compact_product4 = wb_if( wb_eq( erased_vec4, wb_zero() ), compact_product4, wb_sub( wb_bcast( 255 ), compact_product4 ) );
    1173           0 :   compact_product5 = wb_if( wb_eq( erased_vec5, wb_zero() ), compact_product5, wb_sub( wb_bcast( 255 ), compact_product5 ) );
    1174           0 :   compact_product6 = wb_if( wb_eq( erased_vec6, wb_zero() ), compact_product6, wb_sub( wb_bcast( 255 ), compact_product6 ) );
    1175           0 :   compact_product7 = wb_if( wb_eq( erased_vec7, wb_zero() ), compact_product7, wb_sub( wb_bcast( 255 ), compact_product7 ) );
    1176             : 
    1177           0 :   wb_t pi0 = exp_2( compact_product0 );
    1178           0 :   wb_t pi1 = exp_2( compact_product1 );
    1179           0 :   wb_t pi2 = exp_2( compact_product2 );
    1180           0 :   wb_t pi3 = exp_2( compact_product3 );
    1181           0 :   wb_t pi4 = exp_2( compact_product4 );
    1182           0 :   wb_t pi5 = exp_2( compact_product5 );
    1183           0 :   wb_t pi6 = exp_2( compact_product6 );
    1184           0 :   wb_t pi7 = exp_2( compact_product7 );
    1185           0 :   wb_st( output,         pi0 );
    1186           0 :   wb_st( output +  32UL, pi1 );
    1187           0 :   wb_st( output +  64UL, pi2 );
    1188           0 :   wb_st( output +  96UL, pi3 );
    1189           0 :   wb_st( output + 128UL, pi4 );
    1190           0 :   wb_st( output + 160UL, pi5 );
    1191           0 :   wb_st( output + 192UL, pi6 );
    1192           0 :   wb_st( output + 224UL, pi7 );
    1193           0 : #endif
    1194             : #else /* No AVX implementation */
    1195             : 
    1196             :   gen_pi_noavx_generic( is_erased, output, 256UL, fwht_l_twiddle_256 );
    1197             : 
    1198             : #endif
    1199           0 : }
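/* [Editorial aside -- hypothetical usage sketch, not from this file.]
   A caller marks erased positions in a byte array sized for the
   variant that covers the total shred count and reads the per-position
   factors back.  The 32-byte alignment below is a guess made to
   satisfy the vector loads/stores above; the real contract lives in
   fd_reedsol_private.h. */

static void
gen_pi_usage_sketch( void ) {
  uchar is_erased[ 128 ] __attribute__((aligned(32))) = { 0 };
  uchar pi_vals  [ 128 ] __attribute__((aligned(32)));
  is_erased[  3 ] = 1;  /* shred  3 missing */
  is_erased[ 70 ] = 1;  /* shred 70 missing */
  fd_reedsol_private_gen_pi_128( is_erased, pi_vals );
  /* pi_vals[ i ] now holds Pi( i ) at known positions and the
     1/Pi'-flavored value at erasures, per the "Negate the ones
     corresponding to erasures to compute 1/Pi'" comments above. */
}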

Generated by: LCOV version 1.14