1 : #include "fd_reedsol_private.h"
2 :
3 : /* TODO: Move this high-level overview
4 :
5 : The main Lin, et al. paper proposes a clever method for dealing with
6 : erasures. Suppose there is a polynomial P(x) of degree <k, and we
7 : know its value at k inputs, but not necessarily at inputs 0, 1, ...
8 : k-1. We construct a polynomial Pi(x) (that's the uppercase Greek
9 : letter, not shorthand for P_i) with zeros at the erasures, i.e. at
10 : the x in [0, n) for which we don't know P(x). We don't care what the
11 : value of Pi(x) is at the non-erasures, only that we know it. This
12 : means we know the value of the product P(x)Pi(x) for each
13 : x in [0, n). Additionally, since the degree of Pi(x) is n-k, the
14 : degree of P(x)Pi(x) is <n. Now we're back in the regime of
15 : Reed-Solomon encoding: we know the first n values of a polynomial of
16 : degree <n, so we can use the FFT and IFFT operators to evaluate this
17 : polynomial computationally efficiently wherever we please.
18 :
19 : However, we don't actually care about the value of P(x)Pi(x); we want
20 : the value of P(x). To accomplish this, we take the formal derivative
21 : ("formal" in the sense that these are polynomials over a finite field
22 : not real numbers, but we ignore that for a minute):
23 : (P(x)Pi(x))' = P'(x)Pi(x) + P(x)Pi'(x).
24 : At erasures, Pi(x) = 0 by construction, so the P'(x)Pi(x) term
25 : vanishes there, leaving P(x) = (P(x)Pi(x))' / Pi'(x).
26 :
27 : Now then, all we need is a fast way to compute the value of the
28 : formal derivative at some points given its value at 0, 1, ..., n-1.
29 : It turns out, the special basis we use for the rest of the
30 : Reed-Solomon computation gives us a computationally efficient way to
31 : compute formal derivatives.
32 :
33 : Thus, the overall algorithm is this:
34 : 1. Compute the value of P(x)Pi(x) at x in [0, n)
35 : 2. Use the IFFT to represent the polynomial in the coefficient basis.
36 : 3. Take the formal derivative in the coefficient basis.
37 : 4. Use the FFT to compute the value of (P(x)Pi(x))' for x in [0, n).
38 : 5. Compute Pi'(x) directly.
39 : 6. Divide the results of step 4 and 5.
40 :
41 : That's roughly the approach this implementation uses. In Appendix A,
42 : the paper gives some tips on how to optimize the computation of Pi
43 : and Pi' using the Fast Walsh-Hadamard transform, which this
44 : implementation also uses. */
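/* As a scalar illustration of the six steps above (a sketch only, not
   part of the build: ifft_ref, fft_ref, and deriv_ref stand in for
   the coefficient-basis operators defined elsewhere, and gf_mul,
   gf_inv, and pi_prime are hypothetical scalar GF(2^8) helpers):

     uchar v[ n ];
     for( ulong x=0UL; x<n; x++ ) v[ x ] = gf_mul( p[ x ], pi[ x ] );  // step 1
     ifft_ref( v, n );                                                 // step 2
     deriv_ref( v, n );                                                // step 3
     fft_ref( v, n );                                                  // step 4
     for( ulong x=0UL; x<n; x++ )                                      // steps 5-6
       if( is_erased[ x ] )
         p[ x ] = gf_mul( v[ x ], gf_inv( pi_prime[ x ] ) );           // P = (P*Pi)' / Pi'
*/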
45 :
46 : #if FD_REEDSOL_ARITH_IMPL>0
47 :
48 : /* When using AVX, the internal computation can be done with
49 : unsigned chars or with shorts. They give the same
50 : result in the end, but have different performance characteristics.
51 : It's not always obvious which is faster, so this is a compile-time
52 : switch for it.
53 :
54 : When FD_REEDSOL_PI_USE_SHORT==1, the FWHT operates on signed
55 : integers, so some care is needed to make sure overflow can't happen.
56 :
57 : When FD_REEDSOL_PI_USE_SHORT==0, the FWHT operates on a slightly
58 : overcomplete representation of the integers mod 255 (yes,
59 : unfortunately not mod 256). In particular, the value 255 is allowed,
60 : which is interchangeable with 0. */
61 :
62 : #ifndef FD_REEDSOL_PI_USE_SHORT
63 : #define FD_REEDSOL_PI_USE_SHORT 0
64 : #endif
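/* For example (illustration only), a build can opt into the short
   representation from the compiler command line:

     cc ... -DFD_REEDSOL_PI_USE_SHORT=1 ...

   Otherwise the default above selects the uchar path. */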
65 :
66 : /* Define some helper macros like what we have in util/simd for a vector
67 : of shorts. */
68 :
69 : #include "../../util/simd/fd_sse.h"
70 :
71 564 : #define ws_t __m256i
72 : #define ws_add(a,b) _mm256_add_epi16( (a), (b) )
73 : #define ws_sub(a,b) _mm256_sub_epi16( (a), (b) )
74 : #define ws_bcast(s0) _mm256_set1_epi16( (s0) )
75 : #define ws_adjust_sign(a,b) _mm256_sign_epi16( (a), (b) ) /* negates elements of a where the corresponding element of b is negative, zeros them where b is zero, else passes them through */
76 282 : #define ws_mullo(a,b) _mm256_mullo_epi16( (a), (b) )
77 : #define ws_mulhi(a,b) _mm256_mulhi_epu16( (a), (b) )
78 : #define ws_shl(a,imm) _mm256_slli_epi16( (a), (imm) )
79 282 : #define ws_and(a,b) _mm256_and_si256( (a), (b) )
80 : #define ws_shru(a,imm) _mm256_srli_epi16( (a), (imm) )
81 12 : #define ws_zero() _mm256_setzero_si256() /* Return [ 0 0 0 0 0 ... 0 0 ] */
82 :
83 282 : FD_FN_UNUSED static inline ws_t ws_ld( short const * p ) { return _mm256_load_si256( (__m256i const *)p ); }
84 0 : FD_FN_UNUSED static inline ws_t ws_ldu( short const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
85 0 : FD_FN_UNUSED static inline void ws_st( short * p, ws_t i ) { _mm256_store_si256( (__m256i *)p, i ); }
86 0 : FD_FN_UNUSED static inline void ws_stu( short * p, ws_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
87 :
88 : static inline ws_t
89 0 : ws_mod255( ws_t x ) {
90 : /* GCC informs me that for a ushort x,
91 : (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
92 : We need at least 31 bits of precision for the product, so
93 : mulhi_epu16 is perfect. */
94 0 : return ws_and( ws_bcast( 0xFF ), ws_add( x, ws_shru( ws_mulhi( x, ws_bcast( (short)0x8081 ) ), 7 ) ) );
95 0 : }
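/* Scalar reference for the reduction above (illustration only;
   ws_mod255_scalar_ref is not used by the real code paths). E.g.
   x=510: 510*0x8081 = 16777470, >>23 gives 2, and (510+2)&0xFF = 0
   = 510%255. */
FD_FN_UNUSED static inline ushort
ws_mod255_scalar_ref( ushort x ) {
  return (ushort)( 0xFFU & ( (uint)x + ( ( (uint)x*0x8081U )>>23 ) ) );
}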
96 :
97 : /* The following macros implement the unscaled Fast Walsh-Hadamard
98 : transform. As alluded to above, this gives us a way to compute Pi
99 : and Pi' in O(n lg n) time. These are designed for use within this
100 : file, not external use.
101 :
102 : Unlike the rest of the similar-seeming components in fd_reedsol (e.g.
103 : FFT, PPT), this computes the transform within a single (or few) AVX
104 : vectors, not in parallel across each component of the vector. I.e. if
105 : FD_REEDSOL_ARITH_IMPL>0, to compute a 16-element FWHT, you pass one
106 : AVX vector (16 shorts), not 16 vectors.
107 :
108 : Also unlike the rest of the similar-seeming components in fd_reedsol,
109 : this works on the group Z/255Z (integers mod 255). Since 255 is not
110 : a prime, this is not a field, but the FD_REEDSOL_FWHT only needs addition,
111 : subtraction, and division by powers of 2 (which have inverses mod
112 : 255), so it's not a problem.
113 :
114 : The typical FWHT multiplies by a factor of 1/sqrt(2) at each step.
115 : To convert the unscaled version to the scaled version, divide the
116 : result by sqrt(2)^lg(N). Since we often do two transforms, we need
117 : to divide by N ( = (sqrt(2)^lg(N))^2 ).
118 : */
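/* For reference, the unscaled FWHT these macros compute is the
   standard in-place butterfly network, shown here in scalar form for
   an array a of length n (cf. gen_pi_noavx_generic below):

     for( ulong h=1UL; h<n; h<<=1 )
       for( ulong i=0UL; i<n; i+=2UL*h )
         for( ulong j=i; j<i+h; j++ ) {
           long x = a[ j ]; long y = a[ j+h ];
           a[ j ] = x+y; a[ j+h ] = x-y;
         }
*/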
119 :
120 : #if FD_REEDSOL_PI_USE_SHORT
121 :
122 : #define FD_REEDSOL_FWHT_16( x ) do { ws_t _x = (x); \
123 : _x = ws_add( _mm256_setr_m128i( _mm256_extracti128_si256( _x, 1 ), _mm256_extracti128_si256( _x, 0 ) ), \
124 : ws_adjust_sign( _x, _mm256_setr_epi16( 1,1,1,1, 1,1,1,1, -1,-1,-1,-1, -1,-1,-1,-1 ) ) ); \
125 : _x = ws_add( _mm256_shuffle_epi32( _x, 0x4E ), \
126 : ws_adjust_sign( _x, _mm256_setr_epi16( 1,1,1,1, -1,-1,-1,-1, 1,1,1,1, -1,-1,-1,-1 ) ) ); \
127 : _x = ws_add( _mm256_shuffle_epi32( _x, 0xB1 ), \
128 : ws_adjust_sign( _x, _mm256_setr_epi16( 1,1,-1,-1, 1,1,-1,-1, 1,1,-1,-1, 1,1,-1,-1 ) ) ); \
129 : _x = ws_add( _mm256_shuffle_epi8( _x, _mm256_setr_epi8( 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, \
130 : 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ) ), \
131 : ws_adjust_sign( _x, _mm256_setr_epi16( 1,-1,1,-1, 1,-1,1,-1, 1,-1,1,-1, 1,-1,1,-1 ) ) ); \
132 : (x) = _x; } while( 0 )
133 :
134 : #define FD_REEDSOL_FWHT_32( x0, x1 ) do { \
135 : ws_t _y0i = (x0); ws_t _y1i = (x1); \
136 : ws_t _y0 = ws_add( _y0i, _y1i ); ws_t _y1 = ws_sub( _y0i, _y1i ); \
137 : FD_REEDSOL_FWHT_16( _y0 ); FD_REEDSOL_FWHT_16( _y1 ); \
138 : (x0) = _y0; (x1) = _y1; \
139 : } while( 0 )
140 :
141 : #define FD_REEDSOL_FWHT_64( x0, x1, x2, x3 ) do { \
142 : ws_t _z0, _z1, _z2, _z3; ws_t _z0i, _z1i, _z2i, _z3i; \
143 : _z0i = (x0); _z1i = (x1); _z2i = (x2); _z3i = (x3); \
144 : _z0 = ws_add( _z0i, _z2i ); _z1 = ws_add( _z1i, _z3i ); _z2 = ws_sub( _z0i, _z2i ); _z3 = ws_sub( _z1i, _z3i ); \
145 : FD_REEDSOL_FWHT_32( _z0, _z1 ); FD_REEDSOL_FWHT_32( _z2, _z3 ); \
146 : (x0) = _z0; (x1) = _z1; (x2) = _z2; (x3) = _z3; \
147 : } while( 0 )
148 :
149 : #define FD_REEDSOL_FWHT_128( x0, x1, x2, x3, x4, x5, x6, x7 ) do { \
150 : ws_t _w0, _w1, _w2, _w3, _w4, _w5, _w6, _w7; \
151 : ws_t _w0i, _w1i, _w2i, _w3i, _w4i, _w5i, _w6i, _w7i; \
152 : _w0i = (x0); _w1i = (x1); _w2i = (x2); _w3i = (x3); \
153 : _w4i = (x4); _w5i = (x5); _w6i = (x6); _w7i = (x7); \
154 : _w0 = ws_add( _w0i, _w4i ); _w1 = ws_add( _w1i, _w5i ); _w2 = ws_add( _w2i, _w6i ); _w3 = ws_add( _w3i, _w7i ); \
155 : _w4 = ws_sub( _w0i, _w4i ); _w5 = ws_sub( _w1i, _w5i ); _w6 = ws_sub( _w2i, _w6i ); _w7 = ws_sub( _w3i, _w7i ); \
156 : FD_REEDSOL_FWHT_64( _w0, _w1, _w2, _w3 ); FD_REEDSOL_FWHT_64( _w4, _w5, _w6, _w7 ); \
157 : (x0) = _w0; (x1) = _w1; (x2) = _w2; (x3) = _w3; (x4) = _w4; (x5) = _w5; (x6) = _w6; (x7) = _w7; \
158 : } while( 0 )
159 :
160 : #define FD_REEDSOL_FWHT_256( x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 ) do { \
161 : ws_t _v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7, _v8, _v9, _v10, _v11, _v12, _v13, _v14, _v15; \
162 : ws_t _v0i, _v1i, _v2i, _v3i, _v4i, _v5i, _v6i, _v7i, _v8i, _v9i, _v10i, _v11i, _v12i, _v13i, _v14i, _v15i; \
163 : _v0i = (x0); _v1i = (x1); _v2i = (x2); _v3i = (x3); \
164 : _v4i = (x4); _v5i = (x5); _v6i = (x6); _v7i = (x7); \
165 : _v8i = (x8); _v9i = (x9); _v10i = (x10); _v11i = (x11); \
166 : _v12i = (x12); _v13i = (x13); _v14i = (x14); _v15i = (x15); \
167 : _v0 = ws_add( _v0i, _v8i ); _v1 = ws_add( _v1i, _v9i ); _v2 = ws_add( _v2i, _v10i ); _v3 = ws_add( _v3i, _v11i ); \
168 : _v4 = ws_add( _v4i, _v12i ); _v5 = ws_add( _v5i, _v13i ); _v6 = ws_add( _v6i, _v14i ); _v7 = ws_add( _v7i, _v15i ); \
169 : _v8 = ws_sub( _v0i, _v8i ); _v9 = ws_sub( _v1i, _v9i ); _v10 = ws_sub( _v2i, _v10i ); _v11 = ws_sub( _v3i, _v11i ); \
170 : _v12 = ws_sub( _v4i, _v12i ); _v13 = ws_sub( _v5i, _v13i ); _v14 = ws_sub( _v6i, _v14i ); _v15 = ws_sub( _v7i, _v15i ); \
171 : FD_REEDSOL_FWHT_128( _v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7 ); FD_REEDSOL_FWHT_128( _v8, _v9, _v10, _v11, _v12, _v13, _v14, _v15 ); \
172 : (x0) = _v0; (x1) = _v1; (x2) = _v2; (x3) = _v3; (x4) = _v4; (x5) = _v5; (x6) = _v6; (x7) = _v7; \
173 : (x8) = _v8; (x9) = _v9; (x10) = _v10; (x11) = _v11; (x12) = _v12; (x13) = _v13; (x14) = _v14; (x15) = _v15; \
174 : } while( 0 )
175 :
176 : #else /* FD_REEDSOL_PI_USE_SHORT */
177 :
178 : static inline wb_t
179 1698 : add_mod_255( wb_t a, wb_t b ) {
180 1698 : wb_t sum = wb_add( a, b ); /* a+b mod 256; since 256==1 (mod 255), wrapped lanes are short by exactly 1 */
181 1698 : wb_t overflowed = wb_lt( sum, a ); /* 0xFF in lanes that wrapped, 0x00 elsewhere */
182 1698 : return wb_sub( sum, overflowed ); /* subtracting 0xFF == adding 1 (mod 256), fixing up wrapped lanes */
183 1698 : }
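/* Scalar sketch of the same trick (illustration only;
   add_mod_255_scalar_ref is not used by the real code paths): a uchar
   add wraps mod 256, and since 256 == 1 (mod 255) a wrapped lane is
   short by exactly 1. */
FD_FN_UNUSED static inline uchar
add_mod_255_scalar_ref( uchar a, uchar b ) {
  uint s = (uint)a + (uint)b;             /* 0 <= s <= 510 */
  return (uchar)( s>=256U ? s-255U : s ); /* stays in the overcomplete [0,255] representation */
}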
184 :
185 294 : #define FD_REEDSOL_FWHT_16( x ) do { wb_t _x = (x); \
186 294 : wb_t negated, unshifted, shifted; \
187 294 : /* Shift by 8 elements (8B) */ \
188 294 : negated = wb_sub( wb_bcast( 0xFF ), _x ); \
189 294 : unshifted = _mm256_blend_epi32( _x, negated, 0xCC ); \
190 294 : shifted = _mm256_shuffle_epi32( _x, 0x4E ); \
191 294 : _x = add_mod_255( unshifted, shifted ); \
192 294 : /* Shift by 4 elements (4B) */ \
193 294 : negated = wb_sub( wb_bcast( 0xFF ), _x ); \
194 294 : unshifted = _mm256_blend_epi32( _x, negated, 0xAA ); \
195 294 : shifted = _mm256_shuffle_epi32( _x, 0xB1 ); \
196 294 : _x = add_mod_255( unshifted, shifted ); \
197 294 : /* Shift by 2 elements (2B) */ \
198 294 : negated = wb_sub( wb_bcast( 0xFF ), _x ); \
199 294 : unshifted = _mm256_blend_epi16( _x, negated, 0xAA ); \
200 294 : shifted = wb_exch_adj_pair( _x ); \
201 294 : _x = add_mod_255( unshifted, shifted ); \
202 294 : /* Shift by 1 element (1B) */ \
203 294 : negated = wb_sub( wb_bcast( 0xFF ), _x ); \
204 294 : unshifted = _mm256_blendv_epi8( _x, negated, wb_bcast_pair( 0x01, 0xFF ) ); \
205 294 : shifted = wb_exch_adj( _x ); \
206 294 : _x = add_mod_255( unshifted, shifted ); \
207 294 : (x) = _x; \
208 294 : } while( 0 )
209 :
210 270 : #define FD_REEDSOL_FWHT_32( x ) do { wb_t _y = (x); \
211 270 : wb_t negated, unshifted, shifted; \
212 270 : /* Shift by 16 elements (16B) */ \
213 270 : negated = wb_sub( wb_bcast( 0xFF ), _y ); \
214 270 : unshifted = _mm256_blend_epi32( _y, negated, 0xF0 ); \
215 270 : shifted = _mm256_setr_m128i( _mm256_extracti128_si256( _y, 1 ), _mm256_extracti128_si256( _y, 0 ) ); \
216 270 : _y = add_mod_255( unshifted, shifted ); \
217 270 : FD_REEDSOL_FWHT_16( _y ); \
218 270 : (x) = _y; \
219 270 : } while( 0 )
220 :
221 102 : #define FD_REEDSOL_FWHT_64( x0, x1 ) do { wb_t _z0i = (x0); wb_t _z1i = (x1); \
222 102 : wb_t _z0 = add_mod_255( _z0i, _z1i ); wb_t _z1 = add_mod_255( _z0i, wb_sub( wb_bcast( 0xFF ), _z1i ) ); \
223 102 : FD_REEDSOL_FWHT_32( _z0 ); FD_REEDSOL_FWHT_32( _z1 ); \
224 102 : (x0) = _z0; (x1) = _z1; \
225 102 : } while( 0 )
226 :
227 12 : #define FD_REEDSOL_FWHT_128( x0, x1, x2, x3 ) do { wb_t _w0i = (x0); wb_t _w1i = (x1); wb_t _w2i = (x2); wb_t _w3i = (x3); \
228 12 : wb_t _w0, _w1, _w2, _w3; \
229 12 : _w0 = add_mod_255( _w0i, _w2i ); _w1 = add_mod_255( _w1i, _w3i ); \
230 12 : _w2 = add_mod_255( _w0i, wb_sub( wb_bcast( 0xFF ), _w2i ) ); _w3 = add_mod_255( _w1i, wb_sub( wb_bcast( 0xFF ), _w3i ) ); \
231 12 : FD_REEDSOL_FWHT_64( _w0, _w1 ); FD_REEDSOL_FWHT_64( _w2, _w3 ); \
232 12 : (x0) = _w0; (x1) = _w1; (x2) = _w2; (x3) = _w3; \
233 12 : } while( 0 )
234 :
235 0 : #define FD_REEDSOL_FWHT_256( x0, x1, x2, x3, x4, x5, x6, x7 ) do { \
236 0 : wb_t _v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7; \
237 0 : wb_t _v0i, _v1i, _v2i, _v3i, _v4i, _v5i, _v6i, _v7i; \
238 0 : _v0i = (x0); _v1i = (x1); _v2i = (x2); _v3i = (x3); \
239 0 : _v4i = (x4); _v5i = (x5); _v6i = (x6); _v7i = (x7); \
240 0 : _v0 = add_mod_255( _v0i, _v4i ); _v1 = add_mod_255( _v1i, _v5i ); \
241 0 : _v2 = add_mod_255( _v2i, _v6i ); _v3 = add_mod_255( _v3i, _v7i ); \
242 0 : _v4 = add_mod_255( _v0i, wb_sub( wb_bcast( 0xFF ), _v4i ) ); _v5 = add_mod_255( _v1i, wb_sub( wb_bcast( 0xFF ), _v5i ) ); \
243 0 : _v6 = add_mod_255( _v2i, wb_sub( wb_bcast( 0xFF ), _v6i ) ); _v7 = add_mod_255( _v3i, wb_sub( wb_bcast( 0xFF ), _v7i ) ); \
244 0 : FD_REEDSOL_FWHT_128( _v0, _v1, _v2, _v3 ); FD_REEDSOL_FWHT_128( _v4, _v5, _v6, _v7 ); \
245 0 : (x0) = _v0; (x1) = _v1; (x2) = _v2; (x3) = _v3; (x4) = _v4; (x5) = _v5; (x6) = _v6; (x7) = _v7; \
246 0 : } while( 0 )
247 : #endif
248 :
249 : /* Casts each element of a to a uchar, forming a 16-element uchar
250 : vector. Then casts each element of b to a uchar, forming a second
251 : 16-element uchar vector. Concatenates the two 16-element vectors
252 : to form a single 32-element wb_t (a first, then b). */
253 : static inline wb_t
254 147 : compact_ws( ws_t a, ws_t b ) {
255 : /* There's also _mm256_packus_epi16, but it's no better than this */
256 147 : wb_t shuffled0 = _mm256_shuffle_epi8(a, wb( 0, 2, 4, 6, 8, 10, 12, 14, 128,128,128,128,128,128,128,128,
257 147 : 128, 128,128,128,128,128,128,128, 0, 2, 4, 6, 8, 10, 12, 14 ) );
258 147 : wb_t shuffled1 = _mm256_shuffle_epi8(b, wb( 0, 2, 4, 6, 8, 10, 12, 14, 128,128,128,128,128,128,128,128,
259 147 : 128, 128,128,128,128,128,128,128, 0, 2, 4, 6, 8, 10, 12, 14 ) );
260 147 : return _mm256_setr_m128i(
261 147 : _mm_or_si128( _mm256_extracti128_si256( shuffled0, 0 ), _mm256_extracti128_si256( shuffled0, 1 ) ),
262 147 : _mm_or_si128( _mm256_extracti128_si256( shuffled1, 0 ), _mm256_extracti128_si256( shuffled1, 1 ) ) );
263 147 : }
264 :
265 : /* exp_{n}( x ) computes n^x_i in GF(2^8) for each byte x_i in the
266 : vector x. That's exponentiation, not xor. For example, exp_76
267 : interprets 76 as an element of GF(2^8) and x_i as an integer, and
268 : computes the product of multiplying GF(76) times itself x_i times.
269 : Recall that exponentiation is an operator from (GF(2^8) x (Z/255Z))
270 : -> GF(2^8), so x is interpreted mod 255. (equivalently, observe
271 : n^255=1). As an input, x=255 is okay and is the same as x=0. */
272 : static inline wb_t
273 12 : exp_76( wb_t x ) {
274 : /* Decompose x = xh3*0x80 + xh2*0x40 + xh1*0x20 + xh0*0x10 + xl
275 : where 0<=xl<16 and 0<=xh_j<=1 for each j. Then
276 :
277 : 76^x = (76^xl) * (76^0x10)^xh0 * (76^0x20)^xh1 *
278 : (76^0x40)^xh2 * (76^0x80)^xh3
279 : = (76^xl) * 2^xh0 * 4^xh1 * 16^xh2 * 29^xh3.
280 :
281 : We use vpshufb to implement the 4-bit lookup table 76^xl. For
282 : the rest, we're either multiplying by a constant or not doing the
283 : multiply, so we can use our normal GF_MUL with a blend. */
284 :
285 12 : wb_t low = wb_and( x, wb_bcast( 0xF ) );
286 12 : wb_t exp_low = _mm256_shuffle_epi8( wb( 1, 76, 157, 70, 95, 253, 217, 129, 133, 168, 230, 227, 130, 81, 18, 44,
287 12 : 1, 76, 157, 70, 95, 253, 217, 129, 133, 168, 230, 227, 130, 81, 18, 44 ),
288 12 : low );
289 12 : wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 2 ), _mm256_slli_epi16( x, 3 ) );
290 12 : wb_t with1 = _mm256_blendv_epi8( with0, GF_MUL( with0, 4 ), _mm256_slli_epi16( x, 2 ) );
291 12 : wb_t with2 = _mm256_blendv_epi8( with1, GF_MUL( with1, 16 ), _mm256_slli_epi16( x, 1 ) );
292 12 : wb_t with3 = _mm256_blendv_epi8( with2, GF_MUL( with2, 29 ), x );
293 12 : return with3;
294 12 : }
295 :
296 : static inline wb_t
297 33 : exp_29( wb_t x ) {
298 33 : wb_t low = wb_and( x, wb_bcast( 0xF ) );
299 33 : wb_t exp_low = _mm256_shuffle_epi8( wb( 1, 29, 76, 143, 157, 106, 70, 93, 95, 101, 253, 254, 217, 13, 129, 59,
300 33 : 1, 29, 76, 143, 157, 106, 70, 93, 95, 101, 253, 254, 217, 13, 129, 59 ),
301 33 : low );
302 33 : wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 133 ), _mm256_slli_epi16( x, 3 ) );
303 33 : wb_t with1 = _mm256_blendv_epi8( with0, GF_MUL( with0, 2 ), _mm256_slli_epi16( x, 2 ) );
304 33 : wb_t with2 = _mm256_blendv_epi8( with1, GF_MUL( with1, 4 ), _mm256_slli_epi16( x, 1 ) );
305 33 : wb_t with3 = _mm256_blendv_epi8( with2, GF_MUL( with2, 16 ), x );
306 33 : return with3;
307 33 : }
308 : static inline wb_t
309 78 : exp_16( wb_t x ) {
310 78 : wb_t low = wb_and( x, wb_bcast( 0xF ) );
311 78 : wb_t exp_low = _mm256_shuffle_epi8( wb( 1, 16, 29, 205, 76, 180, 143, 24, 157, 37, 106, 238, 70, 20, 93, 185,
312 78 : 1, 16, 29, 205, 76, 180, 143, 24, 157, 37, 106, 238, 70, 20, 93, 185 ),
313 78 : low );
314 78 : wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 95 ), _mm256_slli_epi16( x, 3 ) );
315 78 : wb_t with1 = _mm256_blendv_epi8( with0, GF_MUL( with0, 133 ), _mm256_slli_epi16( x, 2 ) );
316 78 : wb_t with2 = _mm256_blendv_epi8( with1, GF_MUL( with1, 2 ), _mm256_slli_epi16( x, 1 ) );
317 78 : wb_t with3 = _mm256_blendv_epi8( with2, GF_MUL( with2, 4 ), x );
318 78 : return with3;
319 78 : }
320 : static inline wb_t
321 24 : exp_4( wb_t x ) {
322 24 : wb_t low = wb_and( x, wb_bcast( 0xF ) );
323 24 : wb_t exp_low = _mm256_shuffle_epi8( wb( 1, 4, 16, 64, 29, 116, 205, 19, 76, 45, 180, 234, 143, 6, 24, 96,
324 24 : 1, 4, 16, 64, 29, 116, 205, 19, 76, 45, 180, 234, 143, 6, 24, 96 ),
325 24 : low );
326 24 : wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 157 ), _mm256_slli_epi16( x, 3 ) );
327 24 : wb_t with1 = _mm256_blendv_epi8( with0, GF_MUL( with0, 95 ), _mm256_slli_epi16( x, 2 ) );
328 24 : wb_t with2 = _mm256_blendv_epi8( with1, GF_MUL( with1, 133 ), _mm256_slli_epi16( x, 1 ) );
329 24 : wb_t with3 = _mm256_blendv_epi8( with2, GF_MUL( with2, 2 ), x );
330 24 : return with3;
331 24 : }
332 :
333 : static inline wb_t
334 0 : exp_2( wb_t x ) {
335 0 : wb_t low = wb_and( x, wb_bcast( 0xF ) );
336 0 : wb_t exp_low = _mm256_shuffle_epi8( wb( 1, 2, 4, 8, 16, 32, 64, 128, 29, 58, 116, 232, 205, 135, 19, 38,
337 0 : 1, 2, 4, 8, 16, 32, 64, 128, 29, 58, 116, 232, 205, 135, 19, 38),
338 0 : low );
339 0 : wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 76 ), _mm256_slli_epi16( x, 3 ) );
340 0 : wb_t with1 = _mm256_blendv_epi8( with0, GF_MUL( with0, 157 ), _mm256_slli_epi16( x, 2 ) );
341 0 : wb_t with2 = _mm256_blendv_epi8( with1, GF_MUL( with1, 95 ), _mm256_slli_epi16( x, 1 ) );
342 0 : wb_t with3 = _mm256_blendv_epi8( with2, GF_MUL( with2, 133 ), x );
343 0 : return with3;
344 0 : }
345 :
346 : #endif /* FD_REEDSOL_ARITH_IMPL>0 */
347 :
348 : /* l_twiddle_{N} stores the size N FWHT of what the paper calls L~, i.e.
349 : ( 0, Log(1), Log(2), Log(3), ... Log(N-1) )
350 :
351 : The discrete log uses a primitive element of 2, and the FWHT is taken
352 : mod 255, which means all of the values can fit in a uchar. However,
353 : Intel doesn't give us a multiplication instruction for 8-bit
354 : integers, which means that we'd have to zero-extend these values
355 : anyway.
356 :
357 : Although L~ for a smaller size is a subset of that for a larger size,
358 : because we also precompute the value of the FWHT, we store the
359 : variables separately. Perhaps a good compiler could
360 : constant-propagate through the AVX instructions, but it's just 5
361 : values of N, so I prefer not to depend on that. */
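/* Sketch of how a table like fwht_l_twiddle_16 can be regenerated
   offline (illustration only; log_tbl is a hypothetical discrete log
   table over GF(2^8) with primitive element 2, so log_tbl[ 1 ]==0,
   log_tbl[ 2 ]==1, ...):

     short l[ 16 ] = { 0 };
     for( ulong i=1UL; i<16UL; i++ ) l[ i ] = log_tbl[ i ];
     // apply the unscaled FWHT butterfly from above to l, then
     // reduce each entry mod 255
*/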
362 : static const short fwht_l_twiddle_16 [ 16 ] = {0xca,0xa1,0x6a,0xa9,0x73,0xfc,0xe2,0x44,0x93,0x74,0x08,0x7f,0x96,0x8c,0x42,0xf2};
363 : static const short fwht_l_twiddle_32 [ 32 ] = {0x24,0x8f,0xc2,0x7e,0x49,0x89,0x74,0xdc,0x4f,0x95,0x43,0xb4,0x09,0xba,0x03,0x83,
364 : 0x71,0xb3,0x12,0xd4,0x9d,0x70,0x51,0xab,0xd7,0x53,0xcc,0x4a,0x24,0x5e,0x81,0x62};
365 : static const short fwht_l_twiddle_64 [ 64 ] = {0x05,0x81,0x9a,0x82,0x07,0x7c,0x3c,0xbe,0xd3,0xbc,0xed,0x23,0xc2,0x24,0xee,0xc8,
366 : 0x3f,0x5d,0x11,0x18,0x8a,0xf9,0x1c,0x4b,0x0e,0x02,0x8e,0xe4,0x77,0x8c,0x97,0x6d,
367 : 0x43,0x9d,0xea,0x7a,0x8b,0x96,0xac,0xfa,0xca,0x6e,0x98,0x46,0x4f,0x51,0x17,0x3e,
368 : 0xa3,0x0a,0x13,0x91,0xb0,0xe6,0x86,0x0c,0xa1,0xa4,0x0b,0xaf,0xd0,0x30,0x6b,0x57};
369 : static const short fwht_l_twiddle_128[ 128 ] = {0xfe,0x89,0x15,0xeb,0x48,0xea,0x04,0xfe,0x32,0xd9,0xca,0x2c,0x1e,0x58,0x8d,0xed,
370 : 0x6f,0x36,0x53,0x24,0xb2,0x27,0x3e,0x06,0xec,0x96,0x41,0x05,0xbe,0x1d,0xb1,0xdd,
371 : 0x18,0x64,0xf4,0xc3,0x16,0x0a,0x2e,0x00,0xde,0x34,0xaf,0x42,0xd7,0x5e,0x92,0x02,
372 : 0xbf,0x5a,0x6a,0x97,0xe1,0x39,0xd0,0xf6,0x66,0x86,0xb5,0x61,0x8a,0xa2,0x8f,0x49,
373 : 0x0b,0x79,0x20,0x19,0xc5,0x0e,0x74,0x7e,0x75,0x9f,0x11,0x1a,0x67,0xef,0x50,0xa3,
374 : 0x0f,0x84,0xce,0x0c,0x62,0xcc,0xf9,0x90,0x2f,0x6d,0xdb,0xc4,0x30,0xfb,0x7d,0xfc,
375 : 0x6e,0xd6,0xe0,0x31,0x01,0x23,0x2b,0xf5,0xb6,0xa8,0x81,0x4a,0xc6,0x44,0x9b,0x7a,
376 : 0x87,0xb9,0xbb,0x8b,0x7f,0x94,0x3c,0x21,0xdc,0xc2,0x60,0xfd,0x17,0xbd,0x47,0x65};
377 : static const short fwht_l_twiddle_256[ 256 ] = {0x00,0xfc,0xfb,0x15,0x2d,0xfa,0xc1,0x14,0x62,0x2c,0xd9,0xf9,0xc0,0x45,0x13,0xe8,
378 : 0x01,0x61,0x86,0x2b,0xd8,0xba,0xf8,0x5d,0xbf,0x7a,0x44,0x6a,0x07,0x12,0xf1,0xe7,
379 : 0x00,0xdc,0x60,0x0a,0x1f,0x85,0x1c,0x2a,0x8b,0xd7,0x92,0xb9,0xf7,0x82,0x5c,0xad,
380 : 0x19,0xbe,0xb1,0x79,0x43,0x3d,0x69,0x9e,0x06,0x75,0x11,0x27,0x70,0xf0,0xd2,0xe6,
381 : 0xfe,0x2f,0xdb,0xea,0x88,0x5f,0x7c,0x09,0x0c,0x1e,0x8d,0x84,0x1b,0x3f,0x29,0xd4,
382 : 0x31,0x8a,0x8f,0xd6,0x91,0xcb,0xb8,0xc9,0xf6,0xb6,0x81,0x39,0xc7,0x5b,0x55,0xac,
383 : 0x18,0x65,0xbd,0xf4,0x22,0xb0,0xb4,0x78,0x7f,0x42,0x34,0x3c,0x68,0x37,0x9d,0x4e,
384 : 0xc5,0x05,0x96,0x74,0x10,0x59,0x26,0x9a,0x6f,0xa3,0xef,0x53,0x4b,0xd1,0xaa,0xe5,
385 : 0xfd,0x16,0x2e,0xc2,0x63,0xda,0x46,0xe9,0x02,0x87,0xbb,0x5e,0x7b,0x6b,0x08,0xf2,
386 : 0xdd,0x0b,0x20,0x1d,0x8c,0x93,0x83,0xae,0x1a,0xb2,0x3e,0x9f,0x76,0x28,0x71,0xd3,
387 : 0x30,0xeb,0x89,0x7d,0x0d,0x8e,0x40,0xd5,0x32,0x90,0xcc,0xca,0xb7,0x3a,0xc8,0x56,
388 : 0x66,0xf5,0x23,0xb5,0x80,0x35,0x38,0x4f,0xc6,0x97,0x5a,0x9b,0xa4,0x54,0x4c,0xab,
389 : 0x17,0xc3,0x64,0x47,0x03,0xbc,0x6c,0xf3,0xde,0x21,0x94,0xaf,0xb3,0xa0,0x77,0x72,
390 : 0xec,0x7e,0x0e,0x41,0x33,0xcd,0x3b,0x57,0x67,0x24,0x36,0x50,0x98,0x9c,0xa5,0x4d,
391 : 0xc4,0x48,0x04,0x6d,0xdf,0x95,0xa1,0x73,0xed,0x0f,0xce,0x58,0x25,0x51,0x99,0xa6,
392 : 0x49,0x6e,0xe0,0xa2,0xee,0xcf,0x52,0xa7,0x4a,0xe1,0xd0,0xa8,0xe2,0xa9,0xe3,0xe4};
393 :
394 : #if FD_REEDSOL_ARITH_IMPL==0
395 : static void
396 : gen_pi_noavx_generic( uchar const * is_erased,
397 : uchar * output,
398 : ulong sz,
399 : const short * l_twiddle ) {
400 : long scratch[ 256 ];
401 :
402 : for( ulong i=0UL; i<sz; i++ ) scratch[ i ] = is_erased[ i ];
403 :
404 : /* Unscaled FWHT */
405 : for( ulong h=1UL; h<sz; h<<=1 ) {
406 : for( ulong i=0UL; i<sz; i += 2UL*h ) for( ulong j=i; j<i+h; j++ ) {
407 : long x = scratch[ j ];
408 : long y = scratch[ j+h ];
409 : scratch[ j ] = x+y;
410 : scratch[ j+h ] = x-y;
411 : }
412 : }
413 :
414 : for( ulong i=0UL; i<sz; i++ ) scratch[ i ] *= l_twiddle[ i ];
415 :
416 : for( ulong h=1UL; h<sz; h<<=1 ) {
417 : for( ulong i=0UL; i<sz; i += 2UL*h ) for( ulong j=i; j<i+h; j++ ) {
418 : long x = scratch[ j ];
419 : long y = scratch[ j+h ];
420 : scratch[ j ] = x+y;
421 : scratch[ j+h ] = x-y;
422 : }
423 : }
424 :
425 : /* Negate the ones corresponding to erasures to compute 1/Pi' */
426 : for( ulong i=0UL; i<sz; i++ ) scratch[ i ] *= fd_long_if( is_erased[ i ], -1L, 1L );
427 :
428 : /* To fix the FWHT scaling, we need to multiply by sz^-1 mod 255.
429 : Given that sz is always a power of 2, this is not too bad.
430 : Let s = lg(sz). Note that 2^8 == 256 == 1 (mod 255),
431 : so then (2^s) * (2^(8-s)) == 1 (mod 255).
432 : This implies that sz^-1 == 2^(8-s) (mod 255), and we can compute
433 : 2^(8-s) as 256/sz with normal integer division (e.g. sz=16: sz^-1 == 2^4 == 16 == 256/16). */
434 : long sz_inv = 256L/(long)sz;
435 : /* The % operator in C doesn't work like the mathematical mod
436 : operation for negative numbers, so we need to add in a shift to
437 : make any input non-negative. We can compute the smallest possible
438 : value at this point and it's -142397, so we add 255*559=142545. */
439 : for( ulong i=0UL; i<sz; i++ ) scratch[ i ] = (sz_inv*(scratch[ i ] + 255L*559L)) % 255L;
440 :
441 : for( ulong i=0UL; i<sz; i++ ) output[ i ] = (uchar)gf_arith_invlog_tbl[ scratch[ i ] ];
442 : }
443 : #endif
444 :
445 : void
446 : fd_reedsol_private_gen_pi_16( uchar const * is_erased,
447 12 : uchar * output ) {
448 12 : #if FD_REEDSOL_ARITH_IMPL>0
449 : #if FD_REEDSOL_PI_USE_SHORT
450 : ws_t erased_vec = _mm256_cvtepu8_epi16( vb_ld( is_erased ) );
451 :
452 : ws_t transformed = erased_vec;
453 : FD_REEDSOL_FWHT_16( transformed ); /* FWHT( R~ ) */
454 : /* |transformed| <= 16 */
455 :
456 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
457 : |product| <= 16*255, but definitely may be negative */
458 : ws_t product = ws_mullo( transformed, ws_ld( fwht_l_twiddle_16 ) );
459 :
460 : /* log_pi is congruent (mod 255) to what the paper calls
461 : R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
462 : Let H be the 16x16 Hadamard matrix. Then
463 : log_pi = (H * diag( fwht_l_twiddle_16 ) * H) * erased_vec
464 : |---------------------------------|
465 : M
466 : The part labeled M is a matrix known ahead of time, so we can bound
467 : log_pi relatively easily when combined with the fact
468 : 0<=erased_vec<=1 (for an upper bound, zero out the negative entries
469 : of M and multiply by the all-ones vector; for a lower bound, zero
470 : out the positive entries instead). This tells us |log_pi| <= 4489 */
471 : FD_REEDSOL_FWHT_16( product );
472 : ws_t log_pi = product;
473 :
474 : /* Negate the ones corresponding to erasures to compute 1/Pi' */
475 : log_pi = ws_adjust_sign( log_pi, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec, 1 ) ) );
476 :
477 : log_pi = ws_add( log_pi, ws_bcast( (short)(255*18) ) ); /* Now 0<= log_pi <= 9079 < 2^15 */
478 :
479 : /* GCC informs me that for a ushort x,
480 : (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
481 : We need at least 31 bits of precision for the product, so
482 : mulhi_epu16 is perfect. */
483 : log_pi = ws_and( ws_bcast( 0xFF ), ws_add( log_pi, ws_shru( ws_mulhi( log_pi, ws_bcast( (short)0x8081 ) ), 7 ) ) );
484 : /* Now 0<=log_pi < 255 */
485 :
486 : /* Since our FWHT implementation is unscaled, we've computed a value
487 : 16 times larger than what we'd like. 16^-1 == 16 (mod 255), but
488 : we're just going to use this in the exponentiation, so we can
489 : compute this implicitly.
490 : 2^(log_pi * 16^-1) = 2^(16*log_pi) = (2^16)^log_pi = 76^log_pi
491 : (where 2 is the primitive element we used for the logs, an element
492 : of GF(2^8) ). */
493 :
494 : wb_t compact_log_pi = compact_ws( log_pi, ws_zero() );
495 : wb_t pi = exp_76( compact_log_pi );
496 :
497 : vb_st( output, _mm256_extracti128_si256( pi, 0 ) );
498 :
499 : #else
500 12 : wb_t erased_vec = _mm256_setr_m128i( vb_ldu( is_erased ), _mm_setzero_si128() );
501 12 : wb_t to_transform = erased_vec;
502 12 : FD_REEDSOL_FWHT_16( to_transform );
503 12 : ws_t transformed = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform, 0 ) );
504 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
505 : 0<=product<256*255, so product is interpreted as unsigned. */
506 12 : ws_t product = ws_mullo( transformed, ws_ld( fwht_l_twiddle_16 ) );
507 :
508 : /* Compute mod 255, using the same approach as above. */
509 12 : product = ws_and( ws_bcast( 0xFF ), ws_add( product, ws_shru( ws_mulhi( product, ws_bcast( (short)0x8081 ) ), 7 ) ) );
510 12 : wb_t compact_product = compact_ws( product, ws_zero() );
511 :
512 12 : FD_REEDSOL_FWHT_16( compact_product );
513 :
514 : /* Negate the ones corresponding to erasures */
515 12 : compact_product = wb_if( wb_eq( erased_vec, wb_zero() ), compact_product, wb_sub( wb_bcast( 255 ), compact_product ) );
516 :
517 12 : wb_t pi = exp_76( compact_product );
518 12 : vb_st( output, _mm256_extracti128_si256( pi, 0 ) );
519 12 : #endif
520 : #else /* No AVX implementation */
521 :
522 : gen_pi_noavx_generic( is_erased, output, 16UL, fwht_l_twiddle_16 );
523 :
524 : #endif
525 12 : }
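/* Usage sketch (illustration only): given a 0/1 erasure map, this
   produces what the comments above describe, roughly Pi(x) at
   non-erasures and the negated-log form used for 1/Pi'(x) at
   erasures. Note that vb_st does an aligned store, so output needs
   16-byte alignment:

     uchar is_erased[ 16 ] __attribute__((aligned(16))) = { 0, 1, 0, ... };
     uchar pi       [ 16 ] __attribute__((aligned(16)));
     fd_reedsol_private_gen_pi_16( is_erased, pi );
*/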
526 :
527 : void
528 : fd_reedsol_private_gen_pi_32( uchar const * is_erased,
529 33 : uchar * output ) {
530 33 : #if FD_REEDSOL_ARITH_IMPL>0
531 : #if FD_REEDSOL_PI_USE_SHORT
532 : ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased ) );
533 : ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 16UL ) );
534 :
535 : ws_t transformed0 = erased_vec0;
536 : ws_t transformed1 = erased_vec1;
537 : FD_REEDSOL_FWHT_32( transformed0, transformed1 ); /* FWHT( R~ ) */
538 : /* |transformed| <= 32 */
539 :
540 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
541 : |product| <= 32*255, but definitely may be negative */
542 : ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_32 ) );
543 : ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_32 + 16UL ) );
544 :
545 : /* log_pi is congruent (mod 255) to what the paper calls
546 : R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
547 : |log_pi| <= 6945 using the same approach as above. */
548 : FD_REEDSOL_FWHT_32( product0, product1 );
549 : ws_t log_pi0 = product0;
550 : ws_t log_pi1 = product1;
551 :
552 : /* Negate the ones corresponding to erasures to compute 1/Pi' */
553 : log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
554 : log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );
555 :
556 : log_pi0 = ws_add( log_pi0, ws_bcast( (short)(255*28) ) );
557 : log_pi1 = ws_add( log_pi1, ws_bcast( (short)(255*28) ) ); /* Now 0<= log_pi <= 14085 < 2^15 */
558 :
559 : /* GCC informs me that for a ushort x,
560 : (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
561 : We need at least 31 bits of precision for the product, so
562 : mulhi_epu16 is perfect. */
563 : log_pi0 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi0, ws_shru( ws_mulhi( log_pi0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
564 : log_pi1 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi1, ws_shru( ws_mulhi( log_pi1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
565 : /* Now 0<=log_pi < 255 */
566 :
567 : /* Since our FWHT implementation is unscaled, we've computed a value
568 : 32 times larger than what we'd like. 32^-1 == 8 (mod 255), but
569 : we're just going to use this in the exponentiation, so we can
570 : compute this implicitly.
571 : 2^(log_pi * 32^-1) = 2^(8*log_pi) = (2^8)^log_pi = 29^log_pi
572 : (where 2 is the primitive element we used for the logs, an element
573 : of GF(2^8) ). */
574 :
575 : wb_t compact_log_pi = compact_ws( log_pi0, log_pi1 );
576 : wb_t pi = exp_29( compact_log_pi );
577 :
578 : wb_st( output, pi );
579 :
580 : #else
581 33 : wb_t erased_vec = wb_ld( is_erased );
582 33 : wb_t to_transform = erased_vec;
583 33 : FD_REEDSOL_FWHT_32( to_transform );
584 33 : ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform, 0 ) );
585 33 : ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform, 1 ) );
586 :
587 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
588 : 0<=product<256*255, so product is interpreted as unsigned. */
589 33 : ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_32 ) );
590 33 : ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_32 + 16UL ) );
591 :
592 : /* Compute mod 255, using the same approach as above. */
593 33 : product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( ws_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
594 33 : product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( ws_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
595 33 : wb_t compact_product = compact_ws( product0, product1 );
596 :
597 33 : FD_REEDSOL_FWHT_32( compact_product );
598 :
599 : /* Negate the ones corresponding to erasures */
600 33 : compact_product = wb_if( wb_eq( erased_vec, wb_zero() ), compact_product, wb_sub( wb_bcast( 255 ), compact_product ) );
601 :
602 33 : wb_t pi = exp_29( compact_product );
603 33 : wb_st( output, pi );
604 33 : #endif
605 : #else /* No AVX implementation */
606 :
607 : gen_pi_noavx_generic( is_erased, output, 32UL, fwht_l_twiddle_32 );
608 :
609 : #endif
610 33 : }
611 :
612 : void
613 : fd_reedsol_private_gen_pi_64( uchar const * is_erased,
614 39 : uchar * output ) {
615 39 : #if FD_REEDSOL_ARITH_IMPL>0
616 : #if FD_REEDSOL_PI_USE_SHORT
617 : ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased ) );
618 : ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 16UL ) );
619 : ws_t erased_vec2 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 32UL ) );
620 : ws_t erased_vec3 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 48UL ) );
621 :
622 : ws_t transformed0 = erased_vec0;
623 : ws_t transformed1 = erased_vec1;
624 : ws_t transformed2 = erased_vec2;
625 : ws_t transformed3 = erased_vec3;
626 : FD_REEDSOL_FWHT_64( transformed0, transformed1, transformed2, transformed3 ); /* FWHT( R~ ) */
627 : /* |transformed| <= 64 */
628 :
629 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
630 : |product| <= 64*255, but definitely may be negative */
631 : ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_64 ) );
632 : ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_64 + 16UL ) );
633 : ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_64 + 32UL ) );
634 : ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_64 + 48UL ) );
635 :
636 : /* log_pi is congruent (mod 255) to what the paper calls
637 : R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
638 : |log_pi| <= 18918 using the same approach as above. */
639 : FD_REEDSOL_FWHT_64( product0, product1, product2, product3 );
640 : ws_t log_pi0 = product0;
641 : ws_t log_pi1 = product1;
642 : ws_t log_pi2 = product2;
643 : ws_t log_pi3 = product3;
644 :
645 : /* Negate the ones corresponding to erasures to compute 1/Pi' */
646 : log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
647 : log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );
648 : log_pi2 = ws_adjust_sign( log_pi2, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec2, 1 ) ) );
649 : log_pi3 = ws_adjust_sign( log_pi3, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec3, 1 ) ) );
650 :
651 : log_pi0 = ws_add( log_pi0, ws_bcast( (short)(255*75) ) );
652 : log_pi1 = ws_add( log_pi1, ws_bcast( (short)(255*75) ) );
653 : log_pi2 = ws_add( log_pi2, ws_bcast( (short)(255*75) ) );
654 : log_pi3 = ws_add( log_pi3, ws_bcast( (short)(255*75) ) );
655 : /* Now 0<= log_pi <= 38043 < 2^16 (okay, since the next step treats it as unsigned) */
656 :
657 : /* GCC informs me that for a ushort x,
658 : (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
659 : We need at least 31 bits of precision for the product, so
660 : mulhi_epu16 is perfect. */
661 : log_pi0 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi0, ws_shru( ws_mulhi( log_pi0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
662 : log_pi1 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi1, ws_shru( ws_mulhi( log_pi1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
663 : log_pi2 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi2, ws_shru( ws_mulhi( log_pi2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
664 : log_pi3 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi3, ws_shru( ws_mulhi( log_pi3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
665 : /* Now 0<=log_pi < 255 */
666 :
667 : /* Since our FWHT implementation is unscaled, we've computed a value
668 : 64 times larger than what we'd like. 64^-1 == 4 (mod 255), but
669 : we're just going to use this in the exponentiation, so we can
670 : compute this implicitly.
671 : 2^(log_pi * 64^-1) = 2^(4*log_pi) = (2^4)^log_pi = 16^log_pi
672 : (where 2 is the primitive element we used for the logs, an element
673 : of GF(2^8) ). */
674 :
675 : wb_t compact_log_pi0 = compact_ws( log_pi0, log_pi1 );
676 : wb_t compact_log_pi1 = compact_ws( log_pi2, log_pi3 );
677 : wb_t pi0 = exp_16( compact_log_pi0 );
678 : wb_t pi1 = exp_16( compact_log_pi1 );
679 :
680 : wb_st( output, pi0 );
681 : wb_st( output+32UL, pi1 );
682 :
683 : #else
684 39 : wb_t erased_vec0 = wb_ld( is_erased );
685 39 : wb_t erased_vec1 = wb_ld( is_erased + 32UL );
686 39 : wb_t to_transform0 = erased_vec0;
687 39 : wb_t to_transform1 = erased_vec1;
688 :
689 39 : FD_REEDSOL_FWHT_64( to_transform0, to_transform1 );
690 :
691 39 : ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 0 ) );
692 39 : ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 1 ) );
693 39 : ws_t transformed2 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 0 ) );
694 39 : ws_t transformed3 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 1 ) );
695 :
696 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
697 : 0<=product<256*255, so product is interpreted as unsigned. */
698 39 : ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_64 ) );
699 39 : ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_64 + 16UL ) );
700 39 : ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_64 + 32UL ) );
701 39 : ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_64 + 48UL ) );
702 :
703 : /* Compute mod 255, using the same approach as above. */
704 39 : product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( ws_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
705 39 : product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( ws_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
706 39 : product2 = ws_and( ws_bcast( 0xFF ), ws_add( product2, ws_shru( ws_mulhi( product2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
707 39 : product3 = ws_and( ws_bcast( 0xFF ), ws_add( product3, ws_shru( ws_mulhi( product3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
708 :
709 39 : wb_t compact_product0 = compact_ws( product0, product1 );
710 39 : wb_t compact_product1 = compact_ws( product2, product3 );
711 :
712 39 : FD_REEDSOL_FWHT_64( compact_product0, compact_product1 );
713 :
714 : /* Negate the ones corresponding to erasures */
715 39 : compact_product0 = wb_if( wb_eq( erased_vec0, wb_zero() ), compact_product0, wb_sub( wb_bcast( 255 ), compact_product0 ) );
716 39 : compact_product1 = wb_if( wb_eq( erased_vec1, wb_zero() ), compact_product1, wb_sub( wb_bcast( 255 ), compact_product1 ) );
717 :
718 39 : wb_t pi0 = exp_16( compact_product0 );
719 39 : wb_t pi1 = exp_16( compact_product1 );
720 39 : wb_st( output , pi0 );
721 39 : wb_st( output + 32UL, pi1 );
722 39 : #endif
723 : #else /* No AVX implementation */
724 :
725 : gen_pi_noavx_generic( is_erased, output, 64UL, fwht_l_twiddle_64 );
726 :
727 : #endif
728 39 : }
729 :
730 : void
731 : fd_reedsol_private_gen_pi_128( uchar const * is_erased,
732 6 : uchar * output ) {
733 6 : #if FD_REEDSOL_ARITH_IMPL>0
734 : #if FD_REEDSOL_PI_USE_SHORT
735 : ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased ) );
736 : ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 16UL ) );
737 : ws_t erased_vec2 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 32UL ) );
738 : ws_t erased_vec3 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 48UL ) );
739 : ws_t erased_vec4 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 64UL ) );
740 : ws_t erased_vec5 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 80UL ) );
741 : ws_t erased_vec6 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 96UL ) );
742 : ws_t erased_vec7 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 112UL ) );
743 :
744 : ws_t transformed0 = erased_vec0;
745 : ws_t transformed1 = erased_vec1;
746 : ws_t transformed2 = erased_vec2;
747 : ws_t transformed3 = erased_vec3;
748 : ws_t transformed4 = erased_vec4;
749 : ws_t transformed5 = erased_vec5;
750 : ws_t transformed6 = erased_vec6;
751 : ws_t transformed7 = erased_vec7;
752 : FD_REEDSOL_FWHT_128( transformed0, transformed1, transformed2, transformed3, transformed4, transformed5, transformed6, transformed7 ); /* FWHT( R~ ) */
753 : /* |transformed| <= 128 */
754 :
755 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
756 : -16256 <= product <= 32512 */
757 : ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_128 ) );
758 : ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_128 + 16UL ) );
759 : ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_128 + 32UL ) );
760 : ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_128 + 48UL ) );
761 : ws_t product4 = ws_mullo( transformed4, ws_ld( fwht_l_twiddle_128 + 64UL ) );
762 : ws_t product5 = ws_mullo( transformed5, ws_ld( fwht_l_twiddle_128 + 80UL ) );
763 : ws_t product6 = ws_mullo( transformed6, ws_ld( fwht_l_twiddle_128 + 96UL ) );
764 : ws_t product7 = ws_mullo( transformed7, ws_ld( fwht_l_twiddle_128 + 112UL ) );
765 :
766 : /* We need to reduce these mod 255 to prevent overflow in the next
767 : step. 0 <= product+64*255 <= 48832 < 2^16. The mod operation
768 : treats the input as unsigned though, so this is okay. */
769 : product0 = ws_add( product0, ws_bcast( (short)64*255 ) );
770 : product1 = ws_add( product1, ws_bcast( (short)64*255 ) );
771 : product2 = ws_add( product2, ws_bcast( (short)64*255 ) );
772 : product3 = ws_add( product3, ws_bcast( (short)64*255 ) );
773 : product4 = ws_add( product4, ws_bcast( (short)64*255 ) );
774 : product5 = ws_add( product5, ws_bcast( (short)64*255 ) );
775 : product6 = ws_add( product6, ws_bcast( (short)64*255 ) );
776 : product7 = ws_add( product7, ws_bcast( (short)64*255 ) );
777 :
778 : product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( ws_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
779 : product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( ws_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
780 : product2 = ws_and( ws_bcast( 0xFF ), ws_add( product2, ws_shru( ws_mulhi( product2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
781 : product3 = ws_and( ws_bcast( 0xFF ), ws_add( product3, ws_shru( ws_mulhi( product3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
782 : product4 = ws_and( ws_bcast( 0xFF ), ws_add( product4, ws_shru( ws_mulhi( product4, ws_bcast( (short)0x8081 ) ), 7 ) ) );
783 : product5 = ws_and( ws_bcast( 0xFF ), ws_add( product5, ws_shru( ws_mulhi( product5, ws_bcast( (short)0x8081 ) ), 7 ) ) );
784 : product6 = ws_and( ws_bcast( 0xFF ), ws_add( product6, ws_shru( ws_mulhi( product6, ws_bcast( (short)0x8081 ) ), 7 ) ) );
785 : product7 = ws_and( ws_bcast( 0xFF ), ws_add( product7, ws_shru( ws_mulhi( product7, ws_bcast( (short)0x8081 ) ), 7 ) ) );
786 :
787 : /* Now 0 <= product < 255 */
788 :
789 : /* log_pi is congruent (mod 255) to what the paper calls
790 : R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
791 : |log_pi| <= 128*255 */
792 : FD_REEDSOL_FWHT_128( product0, product1, product2, product3, product4, product5, product6, product7 );
793 : ws_t log_pi0 = product0;
794 : ws_t log_pi1 = product1;
795 : ws_t log_pi2 = product2;
796 : ws_t log_pi3 = product3;
797 : ws_t log_pi4 = product4;
798 : ws_t log_pi5 = product5;
799 : ws_t log_pi6 = product6;
800 : ws_t log_pi7 = product7;
801 :
802 : /* Negate the ones corresponding to erasures to compute 1/Pi' */
803 : log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
804 : log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );
805 : log_pi2 = ws_adjust_sign( log_pi2, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec2, 1 ) ) );
806 : log_pi3 = ws_adjust_sign( log_pi3, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec3, 1 ) ) );
807 : log_pi4 = ws_adjust_sign( log_pi4, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec4, 1 ) ) );
808 : log_pi5 = ws_adjust_sign( log_pi5, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec5, 1 ) ) );
809 : log_pi6 = ws_adjust_sign( log_pi6, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec6, 1 ) ) );
810 : log_pi7 = ws_adjust_sign( log_pi7, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec7, 1 ) ) );
811 :
812 : log_pi0 = ws_add( log_pi0, ws_bcast( (short)(255*128) ) );
813 : log_pi1 = ws_add( log_pi1, ws_bcast( (short)(255*128) ) );
814 : log_pi2 = ws_add( log_pi2, ws_bcast( (short)(255*128) ) );
815 : log_pi3 = ws_add( log_pi3, ws_bcast( (short)(255*128) ) );
816 : log_pi4 = ws_add( log_pi4, ws_bcast( (short)(255*128) ) );
817 : log_pi5 = ws_add( log_pi5, ws_bcast( (short)(255*128) ) );
818 : log_pi6 = ws_add( log_pi6, ws_bcast( (short)(255*128) ) );
819 : log_pi7 = ws_add( log_pi7, ws_bcast( (short)(255*128) ) ); /* Now 0<= log_pi <= 65152 < 2^16 */
820 :
821 : /* GCC informs me that for a ushort x,
822 : (x%255) == 0xFF & ( x + ((x*0x8081)>>23) ).
823 : We need at least 31 bits of precision for the product, so
824 : mulhi_epu16 is perfect. */
825 : log_pi0 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi0, ws_shru( ws_mulhi( log_pi0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
826 : log_pi1 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi1, ws_shru( ws_mulhi( log_pi1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
827 : log_pi2 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi2, ws_shru( ws_mulhi( log_pi2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
828 : log_pi3 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi3, ws_shru( ws_mulhi( log_pi3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
829 : log_pi4 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi4, ws_shru( ws_mulhi( log_pi4, ws_bcast( (short)0x8081 ) ), 7 ) ) );
830 : log_pi5 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi5, ws_shru( ws_mulhi( log_pi5, ws_bcast( (short)0x8081 ) ), 7 ) ) );
831 : log_pi6 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi6, ws_shru( ws_mulhi( log_pi6, ws_bcast( (short)0x8081 ) ), 7 ) ) );
832 : log_pi7 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi7, ws_shru( ws_mulhi( log_pi7, ws_bcast( (short)0x8081 ) ), 7 ) ) );
833 : /* Now 0<=log_pi < 255 */
834 :
835 : /* Since our FWHT implementation is unscaled, we've computed a value
836 : 128 times larger than what we'd like. 128^-1 == 2 (mod 255), but
837 : we're just going to use this in the exponentiation, so we can
838 : compute this implicitly.
839 : 2^(log_pi * 128^-1) = 2^(2*log_pi) = (2^2)^log_pi = 4^log_pi
840 : (where 2 is the primitive element we used for the logs, an element
841 : of GF(2^8) ). */
842 :
843 : wb_t compact_log_pi0 = compact_ws( log_pi0, log_pi1 );
844 : wb_t compact_log_pi1 = compact_ws( log_pi2, log_pi3 );
845 : wb_t compact_log_pi2 = compact_ws( log_pi4, log_pi5 );
846 : wb_t compact_log_pi3 = compact_ws( log_pi6, log_pi7 );
847 : wb_t pi0 = exp_4( compact_log_pi0 );
848 : wb_t pi1 = exp_4( compact_log_pi1 );
849 : wb_t pi2 = exp_4( compact_log_pi2 );
850 : wb_t pi3 = exp_4( compact_log_pi3 );
851 :
852 : wb_st( output, pi0 );
853 : wb_st( output + 32UL, pi1 );
854 : wb_st( output + 64UL, pi2 );
855 : wb_st( output + 96UL, pi3 );
856 :
857 : #else
858 6 : wb_t erased_vec0 = wb_ld( is_erased );
859 6 : wb_t erased_vec1 = wb_ld( is_erased + 32UL );
860 6 : wb_t erased_vec2 = wb_ld( is_erased + 64UL );
861 6 : wb_t erased_vec3 = wb_ld( is_erased + 96UL );
862 6 : wb_t to_transform0 = erased_vec0;
863 6 : wb_t to_transform1 = erased_vec1;
864 6 : wb_t to_transform2 = erased_vec2;
865 6 : wb_t to_transform3 = erased_vec3;
866 :
867 6 : FD_REEDSOL_FWHT_128( to_transform0, to_transform1, to_transform2, to_transform3 );
868 :
869 6 : ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 0 ) );
870 6 : ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 1 ) );
871 6 : ws_t transformed2 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 0 ) );
872 6 : ws_t transformed3 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 1 ) );
873 6 : ws_t transformed4 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 0 ) );
874 6 : ws_t transformed5 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 1 ) );
875 6 : ws_t transformed6 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 0 ) );
876 6 : ws_t transformed7 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 1 ) );
877 :
878 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
879 : 0<=product<256*255, so product is interpreted as unsigned. */
880 6 : ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_128 ) );
881 6 : ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_128 + 16UL ) );
882 6 : ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_128 + 32UL ) );
883 6 : ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_128 + 48UL ) );
884 6 : ws_t product4 = ws_mullo( transformed4, ws_ld( fwht_l_twiddle_128 + 64UL ) );
885 6 : ws_t product5 = ws_mullo( transformed5, ws_ld( fwht_l_twiddle_128 + 80UL ) );
886 6 : ws_t product6 = ws_mullo( transformed6, ws_ld( fwht_l_twiddle_128 + 96UL ) );
887 6 : ws_t product7 = ws_mullo( transformed7, ws_ld( fwht_l_twiddle_128 + 112UL ) );
888 :
889 : /* Compute mod 255, using the same approach as above. */
890 6 : product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( ws_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
891 6 : product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( ws_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
892 6 : product2 = ws_and( ws_bcast( 0xFF ), ws_add( product2, ws_shru( ws_mulhi( product2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
893 6 : product3 = ws_and( ws_bcast( 0xFF ), ws_add( product3, ws_shru( ws_mulhi( product3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
894 6 : product4 = ws_and( ws_bcast( 0xFF ), ws_add( product4, ws_shru( ws_mulhi( product4, ws_bcast( (short)0x8081 ) ), 7 ) ) );
895 6 : product5 = ws_and( ws_bcast( 0xFF ), ws_add( product5, ws_shru( ws_mulhi( product5, ws_bcast( (short)0x8081 ) ), 7 ) ) );
896 6 : product6 = ws_and( ws_bcast( 0xFF ), ws_add( product6, ws_shru( ws_mulhi( product6, ws_bcast( (short)0x8081 ) ), 7 ) ) );
897 6 : product7 = ws_and( ws_bcast( 0xFF ), ws_add( product7, ws_shru( ws_mulhi( product7, ws_bcast( (short)0x8081 ) ), 7 ) ) );
898 6 : wb_t compact_product0 = compact_ws( product0, product1 );
899 6 : wb_t compact_product1 = compact_ws( product2, product3 );
900 6 : wb_t compact_product2 = compact_ws( product4, product5 );
901 6 : wb_t compact_product3 = compact_ws( product6, product7 );
902 :
903 6 : FD_REEDSOL_FWHT_128( compact_product0, compact_product1, compact_product2, compact_product3 );
904 :
905 : /* Negate the ones corresponding to erasures */
906 6 : compact_product0 = wb_if( wb_eq( erased_vec0, wb_zero() ), compact_product0, wb_sub( wb_bcast( 255 ), compact_product0 ) );
907 6 : compact_product1 = wb_if( wb_eq( erased_vec1, wb_zero() ), compact_product1, wb_sub( wb_bcast( 255 ), compact_product1 ) );
908 6 : compact_product2 = wb_if( wb_eq( erased_vec2, wb_zero() ), compact_product2, wb_sub( wb_bcast( 255 ), compact_product2 ) );
909 6 : compact_product3 = wb_if( wb_eq( erased_vec3, wb_zero() ), compact_product3, wb_sub( wb_bcast( 255 ), compact_product3 ) );
910 :
911 6 : wb_t pi0 = exp_4( compact_product0 );
912 6 : wb_t pi1 = exp_4( compact_product1 );
913 6 : wb_t pi2 = exp_4( compact_product2 );
914 6 : wb_t pi3 = exp_4( compact_product3 );
915 6 : wb_st( output, pi0 );
916 6 : wb_st( output + 32UL, pi1 );
917 6 : wb_st( output + 64UL, pi2 );
918 6 : wb_st( output + 96UL, pi3 );
919 6 : #endif
920 : #else /* No AVX implementation */
921 :
922 : gen_pi_noavx_generic( is_erased, output, 128UL, fwht_l_twiddle_128 );
923 :
924 : #endif
925 6 : }
926 :
927 : void
928 : fd_reedsol_private_gen_pi_256( uchar const * is_erased,
929 0 : uchar * output ) {
930 0 : #if FD_REEDSOL_ARITH_IMPL>0
931 : #if FD_REEDSOL_PI_USE_SHORT
932 : ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased ) );
933 : ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 16UL ) );
934 : ws_t erased_vec2 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 32UL ) );
935 : ws_t erased_vec3 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 48UL ) );
936 : ws_t erased_vec4 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 64UL ) );
937 : ws_t erased_vec5 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 80UL ) );
938 : ws_t erased_vec6 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 96UL ) );
939 : ws_t erased_vec7 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 112UL ) );
940 : ws_t erased_vec8 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 128UL ) );
941 : ws_t erased_vec9 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 144UL ) );
942 : ws_t erased_vec10 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 160UL ) );
943 : ws_t erased_vec11 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 176UL ) );
944 : ws_t erased_vec12 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 192UL ) );
945 : ws_t erased_vec13 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 208UL ) );
946 : ws_t erased_vec14 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 224UL ) );
947 : ws_t erased_vec15 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 240UL ) );
948 :
949 : ws_t transformed0 = erased_vec0;
950 : ws_t transformed1 = erased_vec1;
951 : ws_t transformed2 = erased_vec2;
952 : ws_t transformed3 = erased_vec3;
953 : ws_t transformed4 = erased_vec4;
954 : ws_t transformed5 = erased_vec5;
955 : ws_t transformed6 = erased_vec6;
956 : ws_t transformed7 = erased_vec7;
957 : ws_t transformed8 = erased_vec8;
958 : ws_t transformed9 = erased_vec9;
959 : ws_t transformed10 = erased_vec10;
960 : ws_t transformed11 = erased_vec11;
961 : ws_t transformed12 = erased_vec12;
962 : ws_t transformed13 = erased_vec13;
963 : ws_t transformed14 = erased_vec14;
964 : ws_t transformed15 = erased_vec15;
965 : FD_REEDSOL_FWHT_256( transformed0, transformed1, transformed2, transformed3, transformed4, transformed5, transformed6, transformed7,
966 : transformed8, transformed9, transformed10, transformed11, transformed12, transformed13, transformed14, transformed15 ); /* FWHT( R~ ) */
967 : /* |transformed| <= 256 */
968 :
969 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
970 : -32512 <= product <= 32512 */
971 : ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_256 ) );
972 : ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_256 + 16UL ) );
973 : ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_256 + 32UL ) );
974 : ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_256 + 48UL ) );
975 : ws_t product4 = ws_mullo( transformed4, ws_ld( fwht_l_twiddle_256 + 64UL ) );
976 : ws_t product5 = ws_mullo( transformed5, ws_ld( fwht_l_twiddle_256 + 80UL ) );
977 : ws_t product6 = ws_mullo( transformed6, ws_ld( fwht_l_twiddle_256 + 96UL ) );
978 : ws_t product7 = ws_mullo( transformed7, ws_ld( fwht_l_twiddle_256 + 112UL ) );
979 : ws_t product8 = ws_mullo( transformed8, ws_ld( fwht_l_twiddle_256 + 128UL ) );
980 : ws_t product9 = ws_mullo( transformed9, ws_ld( fwht_l_twiddle_256 + 144UL ) );
981 : ws_t product10 = ws_mullo( transformed10, ws_ld( fwht_l_twiddle_256 + 160UL ) );
982 : ws_t product11 = ws_mullo( transformed11, ws_ld( fwht_l_twiddle_256 + 176UL ) );
983 : ws_t product12 = ws_mullo( transformed12, ws_ld( fwht_l_twiddle_256 + 192UL ) );
984 : ws_t product13 = ws_mullo( transformed13, ws_ld( fwht_l_twiddle_256 + 208UL ) );
985 : ws_t product14 = ws_mullo( transformed14, ws_ld( fwht_l_twiddle_256 + 224UL ) );
986 : ws_t product15 = ws_mullo( transformed15, ws_ld( fwht_l_twiddle_256 + 240UL ) );
987 :
988 : /* We need to reduce these mod 255 to prevent overflow in the next
989 : step. 0 <= product+128*255 <= 65152 < 2^16. The mod operation
990 : treats the input as unsigned though, so this is okay (but hanging
991 : on by a thread!). */
992 : product0 = ws_mod255( ws_add( product0, ws_bcast( (short)128*255 ) ) );
993 : product1 = ws_mod255( ws_add( product1, ws_bcast( (short)128*255 ) ) );
994 : product2 = ws_mod255( ws_add( product2, ws_bcast( (short)128*255 ) ) );
995 : product3 = ws_mod255( ws_add( product3, ws_bcast( (short)128*255 ) ) );
996 : product4 = ws_mod255( ws_add( product4, ws_bcast( (short)128*255 ) ) );
997 : product5 = ws_mod255( ws_add( product5, ws_bcast( (short)128*255 ) ) );
998 : product6 = ws_mod255( ws_add( product6, ws_bcast( (short)128*255 ) ) );
999 : product7 = ws_mod255( ws_add( product7, ws_bcast( (short)128*255 ) ) );
1000 : product8 = ws_mod255( ws_add( product8, ws_bcast( (short)128*255 ) ) );
1001 : product9 = ws_mod255( ws_add( product9, ws_bcast( (short)128*255 ) ) );
1002 : product10 = ws_mod255( ws_add( product10, ws_bcast( (short)128*255 ) ) );
1003 : product11 = ws_mod255( ws_add( product11, ws_bcast( (short)128*255 ) ) );
1004 : product12 = ws_mod255( ws_add( product12, ws_bcast( (short)128*255 ) ) );
1005 : product13 = ws_mod255( ws_add( product13, ws_bcast( (short)128*255 ) ) );
1006 : product14 = ws_mod255( ws_add( product14, ws_bcast( (short)128*255 ) ) );
1007 : product15 = ws_mod255( ws_add( product15, ws_bcast( (short)128*255 ) ) );
1008 :
1009 : /* Now 0 <= product < 255 */
1010 :
1011 : /* log_pi is congruent (mod 255) to what the paper calls
1012 :    R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
1013 :    Doing the FWHT the normal way could overflow, so we inline its first stage and stick a mod 255 in the middle. */
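 : /* (The code below hand-inlines the outermost butterfly of the 256-point
 :    FWHT: each pair (product_j, product_(j+8)) becomes
 :    (product_j + product_(j+8), product_j - product_(j+8)).  With
 :    0 <= product < 255, the sum is < 510 and the difference is > -255,
 :    so adding 2*255 before the unsigned mod keeps every lane in
 :    [0, 2^16).) */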
1014 : ws_t log_pi0 = ws_mod255( ws_add( product0, product8 ) ); ws_t log_pi1 = ws_mod255( ws_add( product1, product9 ) );
1015 : ws_t log_pi2 = ws_mod255( ws_add( product2, product10 ) ); ws_t log_pi3 = ws_mod255( ws_add( product3, product11 ) );
1016 : ws_t log_pi4 = ws_mod255( ws_add( product4, product12 ) ); ws_t log_pi5 = ws_mod255( ws_add( product5, product13 ) );
1017 : ws_t log_pi6 = ws_mod255( ws_add( product6, product14 ) ); ws_t log_pi7 = ws_mod255( ws_add( product7, product15 ) );
1018 : ws_t log_pi8 = ws_mod255( ws_add( ws_sub( product0, product8 ), ws_bcast( (short)255*2 ) ) );
1019 : ws_t log_pi9 = ws_mod255( ws_add( ws_sub( product1, product9 ), ws_bcast( (short)255*2 ) ) );
1020 : ws_t log_pi10 = ws_mod255( ws_add( ws_sub( product2, product10 ), ws_bcast( (short)255*2 ) ) );
1021 : ws_t log_pi11 = ws_mod255( ws_add( ws_sub( product3, product11 ), ws_bcast( (short)255*2 ) ) );
1022 : ws_t log_pi12 = ws_mod255( ws_add( ws_sub( product4, product12 ), ws_bcast( (short)255*2 ) ) );
1023 : ws_t log_pi13 = ws_mod255( ws_add( ws_sub( product5, product13 ), ws_bcast( (short)255*2 ) ) );
1024 : ws_t log_pi14 = ws_mod255( ws_add( ws_sub( product6, product14 ), ws_bcast( (short)255*2 ) ) );
1025 : ws_t log_pi15 = ws_mod255( ws_add( ws_sub( product7, product15 ), ws_bcast( (short)255*2 ) ) );
1026 :
1027 : FD_REEDSOL_FWHT_128( log_pi0, log_pi1, log_pi2, log_pi3, log_pi4, log_pi5, log_pi6, log_pi7 );
1028 : FD_REEDSOL_FWHT_128( log_pi8, log_pi9, log_pi10, log_pi11, log_pi12, log_pi13, log_pi14, log_pi15 );
1029 : /* Now |log_pi| <= 128*255 */
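 : /* (Each FWHT_128 output lane is a +/-1-weighted sum of 128 inputs in
 :    [0, 255), hence the 128*255 bound.) */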
1030 :
1031 : /* Negate the ones corresponding to erasures to compute 1/Pi' */
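 : /* (1 - 2*erased is +1 for non-erased lanes and -1 for erased ones;
 :    ws_adjust_sign, presumably a la _mm256_sign_epi16, negates the lanes
 :    given a -1 multiplier.  Negating in the log domain inverts the field
 :    element: if pi = 2^log_pi in GF(2^8) then 2^(-log_pi) = 1/pi, so the
 :    erased lanes end up holding log( 1/Pi' ).) */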
1032 : log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
1033 : log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );
1034 : log_pi2 = ws_adjust_sign( log_pi2, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec2, 1 ) ) );
1035 : log_pi3 = ws_adjust_sign( log_pi3, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec3, 1 ) ) );
1036 : log_pi4 = ws_adjust_sign( log_pi4, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec4, 1 ) ) );
1037 : log_pi5 = ws_adjust_sign( log_pi5, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec5, 1 ) ) );
1038 : log_pi6 = ws_adjust_sign( log_pi6, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec6, 1 ) ) );
1039 : log_pi7 = ws_adjust_sign( log_pi7, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec7, 1 ) ) );
1040 : log_pi8 = ws_adjust_sign( log_pi8, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec8, 1 ) ) );
1041 : log_pi9 = ws_adjust_sign( log_pi9, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec9, 1 ) ) );
1042 : log_pi10 = ws_adjust_sign( log_pi10, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec10, 1 ) ) );
1043 : log_pi11 = ws_adjust_sign( log_pi11, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec11, 1 ) ) );
1044 : log_pi12 = ws_adjust_sign( log_pi12, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec12, 1 ) ) );
1045 : log_pi13 = ws_adjust_sign( log_pi13, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec13, 1 ) ) );
1046 : log_pi14 = ws_adjust_sign( log_pi14, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec14, 1 ) ) );
1047 : log_pi15 = ws_adjust_sign( log_pi15, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec15, 1 ) ) );
1048 :
1049 : /* After the addition below, 0 <= log_pi <= 65152 < 2^16.  The mod
1050 :    brings it back to 0 <= log_pi < 255. */
1051 : log_pi0 = ws_mod255( ws_add( log_pi0, ws_bcast( (short)(255*128) ) ) );
1052 : log_pi1 = ws_mod255( ws_add( log_pi1, ws_bcast( (short)(255*128) ) ) );
1053 : log_pi2 = ws_mod255( ws_add( log_pi2, ws_bcast( (short)(255*128) ) ) );
1054 : log_pi3 = ws_mod255( ws_add( log_pi3, ws_bcast( (short)(255*128) ) ) );
1055 : log_pi4 = ws_mod255( ws_add( log_pi4, ws_bcast( (short)(255*128) ) ) );
1056 : log_pi5 = ws_mod255( ws_add( log_pi5, ws_bcast( (short)(255*128) ) ) );
1057 : log_pi6 = ws_mod255( ws_add( log_pi6, ws_bcast( (short)(255*128) ) ) );
1058 : log_pi7 = ws_mod255( ws_add( log_pi7, ws_bcast( (short)(255*128) ) ) );
1059 : log_pi8 = ws_mod255( ws_add( log_pi8, ws_bcast( (short)(255*128) ) ) );
1060 : log_pi9 = ws_mod255( ws_add( log_pi9, ws_bcast( (short)(255*128) ) ) );
1061 : log_pi10 = ws_mod255( ws_add( log_pi10, ws_bcast( (short)(255*128) ) ) );
1062 : log_pi11 = ws_mod255( ws_add( log_pi11, ws_bcast( (short)(255*128) ) ) );
1063 : log_pi12 = ws_mod255( ws_add( log_pi12, ws_bcast( (short)(255*128) ) ) );
1064 : log_pi13 = ws_mod255( ws_add( log_pi13, ws_bcast( (short)(255*128) ) ) );
1065 : log_pi14 = ws_mod255( ws_add( log_pi14, ws_bcast( (short)(255*128) ) ) );
1066 : log_pi15 = ws_mod255( ws_add( log_pi15, ws_bcast( (short)(255*128) ) ) );
1067 :
1068 : /* Since our FWHT implementation is unscaled, we've computed a value
1069 :    256 times larger than what we'd like.  But 256 == 1 (mod 255), so
1070 :    the scale factor is a no-op and nothing special is needed. */
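 : /* (Concretely, the unscaled transform matrix H of size n satisfies
 :    H*H = n*I; here n = 256 and 256*v == v (mod 255) for every lane.) */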
1071 :
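 : /* compact_ws packs two 16-lane short vectors into one 32-lane byte
 :    vector, and exp_2 maps each log to the field element 2^log in
 :    GF(2^8), recovering Pi itself (both helpers are defined earlier in
 :    this file). */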
1072 : wb_t compact_log_pi0 = compact_ws( log_pi0, log_pi1 );
1073 : wb_t compact_log_pi1 = compact_ws( log_pi2, log_pi3 );
1074 : wb_t compact_log_pi2 = compact_ws( log_pi4, log_pi5 );
1075 : wb_t compact_log_pi3 = compact_ws( log_pi6, log_pi7 );
1076 : wb_t compact_log_pi4 = compact_ws( log_pi8, log_pi9 );
1077 : wb_t compact_log_pi5 = compact_ws( log_pi10, log_pi11 );
1078 : wb_t compact_log_pi6 = compact_ws( log_pi12, log_pi13 );
1079 : wb_t compact_log_pi7 = compact_ws( log_pi14, log_pi15 );
1080 : wb_t pi0 = exp_2( compact_log_pi0 );
1081 : wb_t pi1 = exp_2( compact_log_pi1 );
1082 : wb_t pi2 = exp_2( compact_log_pi2 );
1083 : wb_t pi3 = exp_2( compact_log_pi3 );
1084 : wb_t pi4 = exp_2( compact_log_pi4 );
1085 : wb_t pi5 = exp_2( compact_log_pi5 );
1086 : wb_t pi6 = exp_2( compact_log_pi6 );
1087 : wb_t pi7 = exp_2( compact_log_pi7 );
1088 :
1089 : wb_st( output, pi0 );
1090 : wb_st( output + 32UL, pi1 );
1091 : wb_st( output + 64UL, pi2 );
1092 : wb_st( output + 96UL, pi3 );
1093 : wb_st( output + 128UL, pi4 );
1094 : wb_st( output + 160UL, pi5 );
1095 : wb_st( output + 192UL, pi6 );
1096 : wb_st( output + 224UL, pi7 );
1097 :
1098 : #else
1099 0 : wb_t erased_vec0 = wb_ld( is_erased );
1100 0 : wb_t erased_vec1 = wb_ld( is_erased + 32UL );
1101 0 : wb_t erased_vec2 = wb_ld( is_erased + 64UL );
1102 0 : wb_t erased_vec3 = wb_ld( is_erased + 96UL );
1103 0 : wb_t erased_vec4 = wb_ld( is_erased + 128UL );
1104 0 : wb_t erased_vec5 = wb_ld( is_erased + 160UL );
1105 0 : wb_t erased_vec6 = wb_ld( is_erased + 192UL );
1106 0 : wb_t erased_vec7 = wb_ld( is_erased + 224UL );
1107 0 : wb_t to_transform0 = erased_vec0;
1108 0 : wb_t to_transform1 = erased_vec1;
1109 0 : wb_t to_transform2 = erased_vec2;
1110 0 : wb_t to_transform3 = erased_vec3;
1111 0 : wb_t to_transform4 = erased_vec4;
1112 0 : wb_t to_transform5 = erased_vec5;
1113 0 : wb_t to_transform6 = erased_vec6;
1114 0 : wb_t to_transform7 = erased_vec7;
1115 :
1116 0 : FD_REEDSOL_FWHT_256( to_transform0, to_transform1, to_transform2, to_transform3,
1117 0 : to_transform4, to_transform5, to_transform6, to_transform7 );
1118 :
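 : /* Split each 32-byte FWHT output into two 16-lane short vectors by
 :    zero-extension so that the twiddle multiply below can be done in
 :    16-bit lanes without overflow. */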
1119 0 : ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 0 ) );
1120 0 : ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 1 ) );
1121 0 : ws_t transformed2 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 0 ) );
1122 0 : ws_t transformed3 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 1 ) );
1123 0 : ws_t transformed4 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 0 ) );
1124 0 : ws_t transformed5 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 1 ) );
1125 0 : ws_t transformed6 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 0 ) );
1126 0 : ws_t transformed7 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 1 ) );
1127 0 : ws_t transformed8 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform4, 0 ) );
1128 0 : ws_t transformed9 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform4, 1 ) );
1129 0 : ws_t transformed10 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform5, 0 ) );
1130 0 : ws_t transformed11 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform5, 1 ) );
1131 0 : ws_t transformed12 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform6, 0 ) );
1132 0 : ws_t transformed13 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform6, 1 ) );
1133 0 : ws_t transformed14 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform7, 0 ) );
1134 0 : ws_t transformed15 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform7, 1 ) );
1135 :
1136 : /* product is congruent to FWHT( R~ ) * FWHT( L~ ) mod 255.  Both factors
1137 :    fit in a byte, so 0 <= product < 256*255 < 2^16 and product is correctly interpreted as unsigned. */
1138 0 : ws_t product0 = ws_mod255( ws_mullo( transformed0, ws_ld( fwht_l_twiddle_256 ) ) );
1139 0 : ws_t product1 = ws_mod255( ws_mullo( transformed1, ws_ld( fwht_l_twiddle_256 + 16UL ) ) );
1140 0 : ws_t product2 = ws_mod255( ws_mullo( transformed2, ws_ld( fwht_l_twiddle_256 + 32UL ) ) );
1141 0 : ws_t product3 = ws_mod255( ws_mullo( transformed3, ws_ld( fwht_l_twiddle_256 + 48UL ) ) );
1142 0 : ws_t product4 = ws_mod255( ws_mullo( transformed4, ws_ld( fwht_l_twiddle_256 + 64UL ) ) );
1143 0 : ws_t product5 = ws_mod255( ws_mullo( transformed5, ws_ld( fwht_l_twiddle_256 + 80UL ) ) );
1144 0 : ws_t product6 = ws_mod255( ws_mullo( transformed6, ws_ld( fwht_l_twiddle_256 + 96UL ) ) );
1145 0 : ws_t product7 = ws_mod255( ws_mullo( transformed7, ws_ld( fwht_l_twiddle_256 + 112UL ) ) );
1146 0 : ws_t product8 = ws_mod255( ws_mullo( transformed8, ws_ld( fwht_l_twiddle_256 + 128UL ) ) );
1147 0 : ws_t product9 = ws_mod255( ws_mullo( transformed9, ws_ld( fwht_l_twiddle_256 + 144UL ) ) );
1148 0 : ws_t product10 = ws_mod255( ws_mullo( transformed10, ws_ld( fwht_l_twiddle_256 + 160UL ) ) );
1149 0 : ws_t product11 = ws_mod255( ws_mullo( transformed11, ws_ld( fwht_l_twiddle_256 + 176UL ) ) );
1150 0 : ws_t product12 = ws_mod255( ws_mullo( transformed12, ws_ld( fwht_l_twiddle_256 + 192UL ) ) );
1151 0 : ws_t product13 = ws_mod255( ws_mullo( transformed13, ws_ld( fwht_l_twiddle_256 + 208UL ) ) );
1152 0 : ws_t product14 = ws_mod255( ws_mullo( transformed14, ws_ld( fwht_l_twiddle_256 + 224UL ) ) );
1153 0 : ws_t product15 = ws_mod255( ws_mullo( transformed15, ws_ld( fwht_l_twiddle_256 + 240UL ) ) );
1154 :
1155 0 : wb_t compact_product0 = compact_ws( product0, product1 );
1156 0 : wb_t compact_product1 = compact_ws( product2, product3 );
1157 0 : wb_t compact_product2 = compact_ws( product4, product5 );
1158 0 : wb_t compact_product3 = compact_ws( product6, product7 );
1159 0 : wb_t compact_product4 = compact_ws( product8, product9 );
1160 0 : wb_t compact_product5 = compact_ws( product10, product11 );
1161 0 : wb_t compact_product6 = compact_ws( product12, product13 );
1162 0 : wb_t compact_product7 = compact_ws( product14, product15 );
1163 :
1164 0 : FD_REEDSOL_FWHT_256( compact_product0, compact_product1, compact_product2, compact_product3,
1165 0 : compact_product4, compact_product5, compact_product6, compact_product7 );
1166 :
1167 : /* Negate the ones corresponding to erasures */
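 : /* (wb_sub( wb_bcast( 255 ), x ) negates mod 255, since 255 - x == -x
 :    (mod 255); erased lanes then hold log( 1/Pi' ), mirroring the
 :    short-path code above.) */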
1168 0 : compact_product0 = wb_if( wb_eq( erased_vec0, wb_zero() ), compact_product0, wb_sub( wb_bcast( 255 ), compact_product0 ) );
1169 0 : compact_product1 = wb_if( wb_eq( erased_vec1, wb_zero() ), compact_product1, wb_sub( wb_bcast( 255 ), compact_product1 ) );
1170 0 : compact_product2 = wb_if( wb_eq( erased_vec2, wb_zero() ), compact_product2, wb_sub( wb_bcast( 255 ), compact_product2 ) );
1171 0 : compact_product3 = wb_if( wb_eq( erased_vec3, wb_zero() ), compact_product3, wb_sub( wb_bcast( 255 ), compact_product3 ) );
1172 0 : compact_product4 = wb_if( wb_eq( erased_vec4, wb_zero() ), compact_product4, wb_sub( wb_bcast( 255 ), compact_product4 ) );
1173 0 : compact_product5 = wb_if( wb_eq( erased_vec5, wb_zero() ), compact_product5, wb_sub( wb_bcast( 255 ), compact_product5 ) );
1174 0 : compact_product6 = wb_if( wb_eq( erased_vec6, wb_zero() ), compact_product6, wb_sub( wb_bcast( 255 ), compact_product6 ) );
1175 0 : compact_product7 = wb_if( wb_eq( erased_vec7, wb_zero() ), compact_product7, wb_sub( wb_bcast( 255 ), compact_product7 ) );
1176 :
1177 0 : wb_t pi0 = exp_2( compact_product0 );
1178 0 : wb_t pi1 = exp_2( compact_product1 );
1179 0 : wb_t pi2 = exp_2( compact_product2 );
1180 0 : wb_t pi3 = exp_2( compact_product3 );
1181 0 : wb_t pi4 = exp_2( compact_product4 );
1182 0 : wb_t pi5 = exp_2( compact_product5 );
1183 0 : wb_t pi6 = exp_2( compact_product6 );
1184 0 : wb_t pi7 = exp_2( compact_product7 );
1185 0 : wb_st( output, pi0 );
1186 0 : wb_st( output + 32UL, pi1 );
1187 0 : wb_st( output + 64UL, pi2 );
1188 0 : wb_st( output + 96UL, pi3 );
1189 0 : wb_st( output + 128UL, pi4 );
1190 0 : wb_st( output + 160UL, pi5 );
1191 0 : wb_st( output + 192UL, pi6 );
1192 0 : wb_st( output + 224UL, pi7 );
1193 0 : #endif
1194 : #else /* No AVX implementation */
1195 :
1196 : gen_pi_noavx_generic( is_erased, output, 256UL, fwht_l_twiddle_256 );
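 :
 : /* For reference, a minimal scalar sketch of the same computation (this
 :    is illustrative only, not the actual body of gen_pi_noavx_generic):
 :
 :      static void
 :      fwht_mod255_scalar( ushort * x, ulong n ) {   // unscaled FWHT mod 255
 :        for( ulong h=1UL; h<n; h<<=1 )
 :          for( ulong i=0UL; i<n; i+=2UL*h )
 :            for( ulong j=i; j<i+h; j++ ) {
 :              ushort a = x[j]; ushort b = x[j+h];
 :              x[j  ] = (ushort)((a+b     )%255U);    // butterfly sum
 :              x[j+h] = (ushort)((a+255U-b)%255U);    // butterfly diff
 :            }
 :      }
 :
 :    i.e. transform the erasure indicator, multiply lane-by-lane by
 :    fwht_l_twiddle_256 (reducing mod 255), transform again, negate the
 :    erased lanes, and exponentiate, just as the vector paths above do. */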
1197 :
1198 : #endif
1199 0 : }