Line data Source code
1 : #include "fd_reedsol_private.h"
2 :
3 : /* TODO: Move this high-level overview
4 :
5 : The main Lin, et al. paper proposes a clever method for dealing with
6 : erasures. Suppose there is a polynomial P(x) of degree <k, and we
know its value at k inputs, but not necessarily at inputs 0, 1, ...
8 : k-1. We construct a polynomial Pi(x) (that's the uppercase Greek
9 : letter, not shorthand for P_i) with zeros at the erasures, i.e. at
10 : the x in [0, n) for which we don't know P(x). We don't care what the
11 : value of Pi(x) is at the non-erasures other than that we know the
12 : value. This means we know the value of the product P(x)Pi(x) for
13 : each x in [0, n). Additionally, since the order of Pi(x) is n-k, the
14 : order of P(x)Pi(x) is <n. Now we're back in the regime of
15 : Reed-Solomon encoding: we know the first n values of a polynomial of
16 : order <n, so we can use the FFT and IFFT operators to evaluate this
17 : polynomial computationally efficiently wherever we please.
18 :
19 : However, we don't actually care about the value of P(x)Pi(x); we want
20 : the value of P(x). To accomplish this, we take the formal derivative
21 : ("formal" in the sense that these are polynomials over a finite field
22 : not real numbers, but we ignore that for a minute):
23 : (P(x)Pi(x))' = P'(x)Pi(x) + P(x)Pi'(x).
24 : At erasures, Pi(x) = 0 by construction, so
25 : P(x) = (P(x)Pi(x))' / Pi'(x)
26 :
27 : Now then, all we need is a fast way to compute the value of the
28 : formal derivative at some points given its value at 0, 1, ..., n-1.
29 : It turns out, the special basis we use for the rest of the
30 : Reed-Solomon computation gives us a computationally efficient way to
31 : compute formal derivatives.
32 :
33 : Thus, the overall algorithm is this:
34 : 1. Compute the value of P(x)Pi(x) at x in [0, n)
35 : 2. Use the IFFT to represent the polynomial in the coefficient basis.
36 : 3. Take the formal derivative in the coefficient basis.
37 : 4. Use the FFT to compute the value of (P(x)Pi(x))' for x in [0, n).
38 : 5. Compute Pi'(x) directly.
39 : 6. Divide the results of step 4 and 5.
40 :
41 : That's roughly the approach this implementation uses. The paper gives
42 : some tips on how to optimize the computation of Pi and Pi' using the
43 : Fast Walsh-Hadamard transform in Appendix A that the code in this
44 : implementation also uses. */
45 :
46 : #if FD_REEDSOL_ARITH_IMPL>0
47 :
48 : /* When using AVX, the representation used for internal computation can
49 : be done with unsigned chars or with shorts. They give the same
50 : result in the end, but have different performance characteristics.
51 : It's not always obvious which is faster, so this is a compile-time
52 : switch for it.
53 :
54 : When FD_REEDSOL_PI_USE_SHORT==1, the FWHT operates on signed
55 : integers, so some care is needed to make sure overflow can't happen.
56 :
When FD_REEDSOL_PI_USE_SHORT==0, the FWHT operates on a slightly
58 : overcomplete representation of the integers mod 255 (yes,
59 : unfortunately not mod 256). In particular, the value 255 is allowed,
60 : which is interchangeable with 0. */
61 :
62 : #ifndef FD_REEDSOL_PI_USE_SHORT
63 : #define FD_REEDSOL_PI_USE_SHORT 0
64 : #endif
65 :
66 : /* Define some helper macros like what we have in util/simd for a vector
67 : of shorts. */
68 :
69 : #include "../../util/simd/fd_sse.h"
70 :
/* ws_adjust_sign(a,b): for each short lane, returns a where the
   corresponding lane of b is positive, -a where it is negative, and 0
   where it is zero (vpsignw semantics). */
#define ws_adjust_sign(a,b) _mm256_sign_epi16( (a), (b) ) /* scales elements in a by the sign of the corresponding element of b */
72 :
/* ws_mod255: reduces each short lane of x mod 255 (lanes interpreted
   as unsigned), returning lanes in [0,255). */
static inline ws_t
ws_mod255( ws_t x ) {
  /* GCC informs me that for a ushort x,
     (x%255) == 0xFF & ( x + (x*0x8081)>>23).
     We need at least 31 bits of precision for the product, so
     mulh_epu16 is perfect.  wh_mulhi already discards the low 16 bits
     of the product, so the additional shift by 7 gives the full >>23. */
  return ws_and( ws_bcast( 0xFF ), ws_add( x, ws_shru( wh_mulhi( x, ws_bcast( (short)0x8081 ) ), 7 ) ) );
}
81 :
82 : /* The following macros implement the unscaled Fast Walsh-Hadamard
83 : transform. As alluded to above, this gives us a way to compute Pi
84 : and Pi' in O(n lg n) time. These are designed for use within this
85 : file, not external use.
86 :
87 : Unlike the rest of the similar-seeming components in fd_reedsol (e.g.
88 : FFT, PPT), this computes the transform within a single (or few) AVX
89 : vectors, not in parallel across each component of the vector. I.e. if
FD_REEDSOL_ARITH_IMPL>0, to compute a 16-element FWHT, you pass one
91 : AVX vector (16*short), not 16 vectors.
92 :
93 : Also unlike the rest of the similar-seeming components in fd_reedsol,
94 : this works on the group Z/255Z (integers mod 255). Since 255 is not
95 : a prime, this is not a field, but the FD_REEDSOL_FWHT only needs addition,
96 : subtraction, and division by powers of 2 (which have inverses mod
97 : 255), so it's not a problem.
98 :
99 : The typical FWHT multiplies by a factor of 1/sqrt(2) at each step.
100 : To convert the unscaled version to the scaled version, divide the
101 : result by sqrt(2)^lg(N). Since we often do two transforms, we need
102 : to divide by N ( = (sqrt(2)^lg(N))^2 ).
103 : */
104 :
105 : #if FD_REEDSOL_PI_USE_SHORT
106 :
/* FD_REEDSOL_FWHT_16: in-place unscaled 16-point FWHT of one AVX
   vector of 16 shorts.  Each stage pairs lanes at distance 8, 4, 2,
   then 1: the permutation (128-bit half swap, 32-bit shuffles, byte
   shuffle) fetches each lane's butterfly partner and ws_adjust_sign
   applies the +/- pattern of that stage. */
#define FD_REEDSOL_FWHT_16( x ) do { ws_t _x = (x); \
  _x = ws_add( _mm256_setr_m128i( _mm256_extracti128_si256( _x, 1 ), _mm256_extracti128_si256( _x, 0 ) ), \
      ws_adjust_sign( _x, _mm256_setr_epi16( 1,1,1,1, 1,1,1,1, -1,-1,-1,-1, -1,-1,-1,-1 ) ) ); \
  _x = ws_add( _mm256_shuffle_epi32( _x, 0x4E ), \
      ws_adjust_sign( _x, _mm256_setr_epi16( 1,1,1,1, -1,-1,-1,-1, 1,1,1,1, -1,-1,-1,-1 ) ) ); \
  _x = ws_add( _mm256_shuffle_epi32( _x, 0xB1 ), \
      ws_adjust_sign( _x, _mm256_setr_epi16( 1,1,-1,-1, 1,1,-1,-1, 1,1,-1,-1, 1,1,-1,-1 ) ) ); \
  _x = ws_add( _mm256_shuffle_epi8( _x, _mm256_setr_epi8( 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, \
          2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ) ), \
      ws_adjust_sign( _x, _mm256_setr_epi16( 1,-1,1,-1, 1,-1,1,-1, 1,-1,1,-1, 1,-1,1,-1 ) ) ); \
  (x) = _x; } while( 0 )
118 :
/* FD_REEDSOL_FWHT_32: in-place unscaled 32-point FWHT across two
   vectors of 16 shorts: one cross-vector butterfly stage, then a
   16-point FWHT on each half. */
#define FD_REEDSOL_FWHT_32( x0, x1 ) do { \
    ws_t _y0i = (x0); ws_t _y1i = (x1); \
    ws_t _y0 = ws_add( _y0i, _y1i ); ws_t _y1 = ws_sub( _y0i, _y1i ); \
    FD_REEDSOL_FWHT_16( _y0 ); FD_REEDSOL_FWHT_16( _y1 ); \
    (x0) = _y0; (x1) = _y1; \
  } while( 0 )
125 :
/* FD_REEDSOL_FWHT_64: in-place unscaled 64-point FWHT across four
   vectors of 16 shorts: one butterfly stage pairing vectors at
   distance 2, then a 32-point FWHT on each half. */
#define FD_REEDSOL_FWHT_64( x0, x1, x2, x3 ) do { \
    ws_t _z0, _z1, _z2, _z3; ws_t _z0i, _z1i, _z2i, _z3i; \
    _z0i = (x0); _z1i = (x1); _z2i = (x2); _z3i = (x3); \
    _z0 = ws_add( _z0i, _z2i ); _z1 = ws_add( _z1i, _z3i ); _z2 = ws_sub( _z0i, _z2i ); _z3 = ws_sub( _z1i, _z3i ); \
    FD_REEDSOL_FWHT_32( _z0, _z1 ); FD_REEDSOL_FWHT_32( _z2, _z3 ); \
    (x0) = _z0; (x1) = _z1; (x2) = _z2; (x3) = _z3; \
  } while( 0 )
133 :
/* FD_REEDSOL_FWHT_128: in-place unscaled 128-point FWHT across eight
   vectors of 16 shorts: one butterfly stage pairing vectors at
   distance 4, then a 64-point FWHT on each half. */
#define FD_REEDSOL_FWHT_128( x0, x1, x2, x3, x4, x5, x6, x7 ) do { \
    ws_t _w0, _w1, _w2, _w3, _w4, _w5, _w6, _w7; \
    ws_t _w0i, _w1i, _w2i, _w3i, _w4i, _w5i, _w6i, _w7i; \
    _w0i = (x0); _w1i = (x1); _w2i = (x2); _w3i = (x3); \
    _w4i = (x4); _w5i = (x5); _w6i = (x6); _w7i = (x7); \
    _w0 = ws_add( _w0i, _w4i ); _w1 = ws_add( _w1i, _w5i ); _w2 = ws_add( _w2i, _w6i ); _w3 = ws_add( _w3i, _w7i ); \
    _w4 = ws_sub( _w0i, _w4i ); _w5 = ws_sub( _w1i, _w5i ); _w6 = ws_sub( _w2i, _w6i ); _w7 = ws_sub( _w3i, _w7i ); \
    FD_REEDSOL_FWHT_64( _w0, _w1, _w2, _w3 ); FD_REEDSOL_FWHT_64( _w4, _w5, _w6, _w7 ); \
    (x0) = _w0; (x1) = _w1; (x2) = _w2; (x3) = _w3; (x4) = _w4; (x5) = _w5; (x6) = _w6; (x7) = _w7; \
  } while( 0 )
144 :
/* FD_REEDSOL_FWHT_256: in-place unscaled 256-point FWHT across sixteen
   vectors of 16 shorts: one butterfly stage pairing vectors at
   distance 8, then a 128-point FWHT on each half. */
#define FD_REEDSOL_FWHT_256( x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 ) do { \
    ws_t _v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7, _v8, _v9, _v10, _v11, _v12, _v13, _v14, _v15; \
    ws_t _v0i, _v1i, _v2i, _v3i, _v4i, _v5i, _v6i, _v7i, _v8i, _v9i, _v10i, _v11i, _v12i, _v13i, _v14i, _v15i; \
    _v0i = (x0); _v1i = (x1); _v2i = (x2); _v3i = (x3); \
    _v4i = (x4); _v5i = (x5); _v6i = (x6); _v7i = (x7); \
    _v8i = (x8); _v9i = (x9); _v10i = (x10); _v11i = (x11); \
    _v12i = (x12); _v13i = (x13); _v14i = (x14); _v15i = (x15); \
    _v0 = ws_add( _v0i, _v8i ); _v1 = ws_add( _v1i, _v9i ); _v2 = ws_add( _v2i, _v10i ); _v3 = ws_add( _v3i, _v11i ); \
    _v4 = ws_add( _v4i, _v12i ); _v5 = ws_add( _v5i, _v13i ); _v6 = ws_add( _v6i, _v14i ); _v7 = ws_add( _v7i, _v15i ); \
    _v8 = ws_sub( _v0i, _v8i ); _v9 = ws_sub( _v1i, _v9i ); _v10 = ws_sub( _v2i, _v10i ); _v11 = ws_sub( _v3i, _v11i ); \
    _v12 = ws_sub( _v4i, _v12i ); _v13 = ws_sub( _v5i, _v13i ); _v14 = ws_sub( _v6i, _v14i ); _v15 = ws_sub( _v7i, _v15i ); \
    FD_REEDSOL_FWHT_128( _v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7 ); FD_REEDSOL_FWHT_128( _v8, _v9, _v10, _v11, _v12, _v13, _v14, _v15 ); \
    (x0) = _v0; (x1) = _v1; (x2) = _v2; (x3) = _v3; (x4) = _v4; (x5) = _v5; (x6) = _v6; (x7) = _v7; \
    (x8) = _v8; (x9) = _v9; (x10) = _v10; (x11) = _v11; (x12) = _v12; (x13) = _v13; (x14) = _v14; (x15) = _v15; \
  } while( 0 )
160 :
161 : #else /* FD_REEDSOL_PI_USE_SHORT */
162 :
163 : static inline wb_t
164 1698 : add_mod_255( wb_t a, wb_t b ) {
165 1698 : wb_t sum = wb_add( a, b );
166 1698 : wb_t overflowed = wb_lt( sum, a );
167 1698 : return wb_sub( sum, overflowed );
168 1698 : }
169 :
/* FD_REEDSOL_FWHT_16 (uchar variant): performs two independent
   in-place unscaled 16-point FWHTs over Z/255Z, one on each 16-byte
   half of the 32-uchar vector x.  At each stage the butterfly
   (a+b, a-b) is computed as (a+b, a + (255-b)): "negated" is the
   mod-255 negation of every lane, "unshifted" blends in the negated
   value for the lanes that need a subtraction, and "shifted" is each
   lane's butterfly partner. */
#define FD_REEDSOL_FWHT_16( x ) do { wb_t _x = (x); \
    wb_t negated, unshifted, shifted; \
    /* Shift by 8 elements (8B) */ \
    negated = wb_sub( wb_bcast( 0xFF ), _x ); \
    unshifted = _mm256_blend_epi32( _x, negated, 0xCC ); \
    shifted = _mm256_shuffle_epi32( _x, 0x4E ); \
    _x = add_mod_255( unshifted, shifted ); \
    /* Shift by 4 elements (4B) */ \
    negated = wb_sub( wb_bcast( 0xFF ), _x ); \
    unshifted = _mm256_blend_epi32( _x, negated, 0xAA ); \
    shifted = _mm256_shuffle_epi32( _x, 0xB1 ); \
    _x = add_mod_255( unshifted, shifted ); \
    /* Shift by 2 elements (2B) */ \
    negated = wb_sub( wb_bcast( 0xFF ), _x ); \
    unshifted = _mm256_blend_epi16( _x, negated, 0xAA ); \
    shifted = wb_exch_adj_pair( _x ); \
    _x = add_mod_255( unshifted, shifted ); \
    /* Shift by 1 element (1B) */ \
    negated = wb_sub( wb_bcast( 0xFF ), _x ); \
    unshifted = _mm256_blendv_epi8( _x, negated, wb_bcast_pair( 0x01, 0xFF ) ); \
    shifted = wb_exch_adj( _x ); \
    _x = add_mod_255( unshifted, shifted ); \
    (x) = _x; \
  } while( 0 )
194 :
/* FD_REEDSOL_FWHT_32 (uchar variant): in-place unscaled 32-point FWHT
   over Z/255Z of one 32-uchar vector: one butterfly stage pairing the
   two 128-bit halves (the 128-bit half swap fetches the partner), then
   a 16-point FWHT on each half. */
#define FD_REEDSOL_FWHT_32( x ) do { wb_t _y = (x); \
    wb_t negated, unshifted, shifted; \
    /* Shift by 16 elements (16B) */ \
    negated = wb_sub( wb_bcast( 0xFF ), _y ); \
    unshifted = _mm256_blend_epi32( _y, negated, 0xF0 ); \
    shifted = _mm256_setr_m128i( _mm256_extracti128_si256( _y, 1 ), _mm256_extracti128_si256( _y, 0 ) ); \
    _y = add_mod_255( unshifted, shifted ); \
    FD_REEDSOL_FWHT_16( _y ); \
    (x) = _y; \
  } while( 0 )
205 :
/* FD_REEDSOL_FWHT_64 (uchar variant): in-place unscaled 64-point FWHT
   over Z/255Z across two 32-uchar vectors: one cross-vector butterfly
   stage (subtraction done as addition of the mod-255 negation), then a
   32-point FWHT on each half. */
#define FD_REEDSOL_FWHT_64( x0, x1 ) do { wb_t _z0i = (x0); wb_t _z1i = (x1); \
    wb_t _z0 = add_mod_255( _z0i, _z1i ); wb_t _z1 = add_mod_255( _z0i, wb_sub( wb_bcast( 0xFF ), _z1i ) ); \
    FD_REEDSOL_FWHT_32( _z0 ); FD_REEDSOL_FWHT_32( _z1 ); \
    (x0) = _z0; (x1) = _z1; \
  } while( 0 )
211 :
/* FD_REEDSOL_FWHT_128 (uchar variant): in-place unscaled 128-point
   FWHT over Z/255Z across four 32-uchar vectors: one butterfly stage
   pairing vectors at distance 2, then a 64-point FWHT on each half. */
#define FD_REEDSOL_FWHT_128( x0, x1, x2, x3 ) do { wb_t _w0i = (x0); wb_t _w1i = (x1); wb_t _w2i = (x2); wb_t _w3i = (x3); \
    wb_t _w0, _w1, _w2, _w3; \
    _w0 = add_mod_255( _w0i, _w2i ); _w1 = add_mod_255( _w1i, _w3i ); \
    _w2 = add_mod_255( _w0i, wb_sub( wb_bcast( 0xFF ), _w2i ) ); _w3 = add_mod_255( _w1i, wb_sub( wb_bcast( 0xFF ), _w3i ) ); \
    FD_REEDSOL_FWHT_64( _w0, _w1 ); FD_REEDSOL_FWHT_64( _w2, _w3 ); \
    (x0) = _w0; (x1) = _w1; (x2) = _w2; (x3) = _w3; \
  } while( 0 )
219 :
/* FD_REEDSOL_FWHT_256 (uchar variant): in-place unscaled 256-point
   FWHT over Z/255Z across eight 32-uchar vectors: one butterfly stage
   pairing vectors at distance 4, then a 128-point FWHT on each half. */
#define FD_REEDSOL_FWHT_256( x0, x1, x2, x3, x4, x5, x6, x7 ) do { \
    wb_t _v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7; \
    wb_t _v0i, _v1i, _v2i, _v3i, _v4i, _v5i, _v6i, _v7i; \
    _v0i = (x0); _v1i = (x1); _v2i = (x2); _v3i = (x3); \
    _v4i = (x4); _v5i = (x5); _v6i = (x6); _v7i = (x7); \
    _v0 = add_mod_255( _v0i, _v4i ); _v1 = add_mod_255( _v1i, _v5i ); \
    _v2 = add_mod_255( _v2i, _v6i ); _v3 = add_mod_255( _v3i, _v7i ); \
    _v4 = add_mod_255( _v0i, wb_sub( wb_bcast( 0xFF ), _v4i ) ); _v5 = add_mod_255( _v1i, wb_sub( wb_bcast( 0xFF ), _v5i ) ); \
    _v6 = add_mod_255( _v2i, wb_sub( wb_bcast( 0xFF ), _v6i ) ); _v7 = add_mod_255( _v3i, wb_sub( wb_bcast( 0xFF ), _v7i ) ); \
    FD_REEDSOL_FWHT_128( _v0, _v1, _v2, _v3 ); FD_REEDSOL_FWHT_128( _v4, _v5, _v6, _v7 ); \
    (x0) = _v0; (x1) = _v1; (x2) = _v2; (x3) = _v3; (x4) = _v4; (x5) = _v5; (x6) = _v6; (x7) = _v7; \
  } while( 0 )
232 : #endif
233 :
/* Truncates each element of a to a uchar (keeping the low byte of each
   16-bit lane), forming a 16-element uchar vector.  Then does the same
   for b, forming a second 16-element uchar vector.  Concatenates the
   two 16-element vectors to form a single 32-element wb_t (a first,
   then b). */
static inline wb_t
compact_ws( ws_t a, ws_t b ) {
  /* There's also _mm256_packus_epi16, but it's no better than this */
  wb_t shuffled0 = _mm256_shuffle_epi8(a, wb( 0, 2, 4, 6, 8, 10, 12, 14, 128,128,128,128,128,128,128,128,
                                              128, 128,128,128,128,128,128,128, 0, 2, 4, 6, 8, 10, 12, 14 ) );
  wb_t shuffled1 = _mm256_shuffle_epi8(b, wb( 0, 2, 4, 6, 8, 10, 12, 14, 128,128,128,128,128,128,128,128,
                                              128, 128,128,128,128,128,128,128, 0, 2, 4, 6, 8, 10, 12, 14 ) );
  /* Each 128-bit half of shuffledN holds 8 useful bytes (the indices
     >=128 zero the remaining bytes), so OR-ing a vector's two halves
     packs all 16 truncated bytes into one 128-bit value. */
  return _mm256_setr_m128i(
      _mm_or_si128( _mm256_extracti128_si256( shuffled0, 0 ), _mm256_extracti128_si256( shuffled0, 1 ) ),
      _mm_or_si128( _mm256_extracti128_si256( shuffled1, 0 ), _mm256_extracti128_si256( shuffled1, 1 ) ) );
}
249 :
250 : /* exp_{n}( x ) computes n^x_i in GF(2^8) for each byte x_i in the
251 : vector x. That's exponentiation, not xor. For example, exp_76
252 : interprets 76 as an element of GF(2^8) and x_i as an integer, and
253 : computes the product of multiplying GF(76) times itself x_i times.
254 : Recall that exponentiation is an operator from (GF(2^8) x (Z/255Z))
255 : -> GF(2^8), so x is interpreted mod 255. (equivalently, observe
256 : n^255=1). As an input, x^255 is okay and is the same as x^0. */
static inline wb_t
exp_76( wb_t x ) {
  /* Decompose x = xh3*0x80 + xh2*0x40 + xh1*0x20 + xh0*0x10 + xl
     where 0<=xl<16 and 0<=xh_j<=1 for each j. Then

     76^x = (76^xl) * (76^0x10)^xh0 * (76^0x20)^xh1 *
              (76^0x40)^xh2 * (76^0x80)^xh3
          = (76^xl) * 2^xh0 * 4^xh1 * 16^xh2 * 29^xh3.

     We use vpshufb to implement the 4-bit lookup table 76^xl.  For
     the rest, we're either multiplying by a constant or not doing the
     multiply, so we can use our normal GF_MUL with a blend. */

  wb_t low = wb_and( x, wb_bcast( 0xF ) );
  wb_t exp_low = _mm256_shuffle_epi8( wb( 1, 76, 157, 70, 95, 253, 217, 129, 133, 168, 230, 227, 130, 81, 18, 44,
                                          1, 76, 157, 70, 95, 253, 217, 129, 133, 168, 230, 227, 130, 81, 18, 44 ),
                                      low );
  /* _mm256_slli_epi16( x, 3 ) puts the 0x10 bit of each byte in that
     byte's sign bit, which is the bit vpblendvb tests; similarly for
     the other shifts below. */
  wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 2 ), _mm256_slli_epi16( x, 3 ) );
  wb_t with1 = _mm256_blendv_epi8( with0, GF_MUL( with0, 4 ), _mm256_slli_epi16( x, 2 ) );
  wb_t with2 = _mm256_blendv_epi8( with1, GF_MUL( with1, 16 ), _mm256_slli_epi16( x, 1 ) );
  wb_t with3 = _mm256_blendv_epi8( with2, GF_MUL( with2, 29 ), x );
  return with3;
}
280 :
/* exp_29: same decomposition as exp_76 but for base 29.  29 = 2^8
   (see the exp_2 lookup table), so the high-bit multipliers are
   29^0x10 = 2^128 = 133, 29^0x20 = 2^256 = 2, 29^0x40 = 4, and
   29^0x80 = 16 (exponents of 2 reduced mod 255). */
static inline wb_t
exp_29( wb_t x ) {
  wb_t low = wb_and( x, wb_bcast( 0xF ) );
  wb_t exp_low = _mm256_shuffle_epi8( wb( 1, 29, 76, 143, 157, 106, 70, 93, 95, 101, 253, 254, 217, 13, 129, 59,
                                          1, 29, 76, 143, 157, 106, 70, 93, 95, 101, 253, 254, 217, 13, 129, 59 ),
                                      low );
  wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 133 ), _mm256_slli_epi16( x, 3 ) );
  wb_t with1 = _mm256_blendv_epi8( with0, GF_MUL( with0, 2 ), _mm256_slli_epi16( x, 2 ) );
  wb_t with2 = _mm256_blendv_epi8( with1, GF_MUL( with1, 4 ), _mm256_slli_epi16( x, 1 ) );
  wb_t with3 = _mm256_blendv_epi8( with2, GF_MUL( with2, 16 ), x );
  return with3;
}
/* exp_16: same decomposition as exp_76 but for base 16 = 2^4, giving
   high-bit multipliers 16^0x10 = 2^64 = 95, 16^0x20 = 2^128 = 133,
   16^0x40 = 2^256 = 2, 16^0x80 = 4. */
static inline wb_t
exp_16( wb_t x ) {
  wb_t low = wb_and( x, wb_bcast( 0xF ) );
  wb_t exp_low = _mm256_shuffle_epi8( wb( 1, 16, 29, 205, 76, 180, 143, 24, 157, 37, 106, 238, 70, 20, 93, 185,
                                          1, 16, 29, 205, 76, 180, 143, 24, 157, 37, 106, 238, 70, 20, 93, 185 ),
                                      low );
  wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 95 ), _mm256_slli_epi16( x, 3 ) );
  wb_t with1 = _mm256_blendv_epi8( with0, GF_MUL( with0, 133 ), _mm256_slli_epi16( x, 2 ) );
  wb_t with2 = _mm256_blendv_epi8( with1, GF_MUL( with1, 2 ), _mm256_slli_epi16( x, 1 ) );
  wb_t with3 = _mm256_blendv_epi8( with2, GF_MUL( with2, 4 ), x );
  return with3;
}
/* exp_4: same decomposition as exp_76 but for base 4 = 2^2, giving
   high-bit multipliers 4^0x10 = 2^32 = 157, 4^0x20 = 2^64 = 95,
   4^0x40 = 2^128 = 133, 4^0x80 = 2^256 = 2. */
static inline wb_t
exp_4( wb_t x ) {
  wb_t low = wb_and( x, wb_bcast( 0xF ) );
  wb_t exp_low = _mm256_shuffle_epi8( wb( 1, 4, 16, 64, 29, 116, 205, 19, 76, 45, 180, 234, 143, 6, 24, 96,
                                          1, 4, 16, 64, 29, 116, 205, 19, 76, 45, 180, 234, 143, 6, 24, 96 ),
                                      low );
  wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 157 ), _mm256_slli_epi16( x, 3 ) );
  wb_t with1 = _mm256_blendv_epi8( with0, GF_MUL( with0, 95 ), _mm256_slli_epi16( x, 2 ) );
  wb_t with2 = _mm256_blendv_epi8( with1, GF_MUL( with1, 133 ), _mm256_slli_epi16( x, 1 ) );
  wb_t with3 = _mm256_blendv_epi8( with2, GF_MUL( with2, 2 ), x );
  return with3;
}
317 :
/* exp_2: same decomposition as exp_76 for the primitive element 2
   itself; the low table is 2^0 .. 2^15 and the high-bit multipliers
   are 2^16 = 76, 2^32 = 157, 2^64 = 95, 2^128 = 133. */
static inline wb_t
exp_2( wb_t x ) {
  wb_t low = wb_and( x, wb_bcast( 0xF ) );
  wb_t exp_low = _mm256_shuffle_epi8( wb( 1, 2, 4, 8, 16, 32, 64, 128, 29, 58, 116, 232, 205, 135, 19, 38,
                                          1, 2, 4, 8, 16, 32, 64, 128, 29, 58, 116, 232, 205, 135, 19, 38),
                                      low );
  wb_t with0 = _mm256_blendv_epi8( exp_low, GF_MUL( exp_low, 76 ), _mm256_slli_epi16( x, 3 ) );
  wb_t with1 = _mm256_blendv_epi8( with0, GF_MUL( with0, 157 ), _mm256_slli_epi16( x, 2 ) );
  wb_t with2 = _mm256_blendv_epi8( with1, GF_MUL( with1, 95 ), _mm256_slli_epi16( x, 1 ) );
  wb_t with3 = _mm256_blendv_epi8( with2, GF_MUL( with2, 133 ), x );
  return with3;
}
330 :
331 : #endif /* FD_REEDSOL_ARITH_IMPL>0 */
332 :
333 : /* l_twiddle_{N} stores the size N FWHT of what the paper calls L~, i.e.
334 : ( 0, Log(1), Log(2), Log(3), ... Log(N-1) )
335 :
336 : The discrete log uses a primitive element of 2, and the FWHT is taken
337 : mod 255, which means all of the values can fit in a uchar. However,
338 : Intel doesn't give us a multiplication instruction for 8-bit
339 : integers, which means that we'd have to zero-extend these values
340 : anyways.
341 :
342 : Although L~ for a smaller size is a subset of that for a larger size,
343 : because we also precompute the value of the FWHT, we store the
344 : variables separately. Perhaps a good compiler could
345 : constant-propagate through the AVX instructions, but it's just 5
346 : values of N, so I prefer not to depend on that. */
347 : static const short fwht_l_twiddle_16 [ 16 ] = {0xca,0xa1,0x6a,0xa9,0x73,0xfc,0xe2,0x44,0x93,0x74,0x08,0x7f,0x96,0x8c,0x42,0xf2};
348 : static const short fwht_l_twiddle_32 [ 32 ] = {0x24,0x8f,0xc2,0x7e,0x49,0x89,0x74,0xdc,0x4f,0x95,0x43,0xb4,0x09,0xba,0x03,0x83,
349 : 0x71,0xb3,0x12,0xd4,0x9d,0x70,0x51,0xab,0xd7,0x53,0xcc,0x4a,0x24,0x5e,0x81,0x62};
350 : static const short fwht_l_twiddle_64 [ 64 ] = {0x05,0x81,0x9a,0x82,0x07,0x7c,0x3c,0xbe,0xd3,0xbc,0xed,0x23,0xc2,0x24,0xee,0xc8,
351 : 0x3f,0x5d,0x11,0x18,0x8a,0xf9,0x1c,0x4b,0x0e,0x02,0x8e,0xe4,0x77,0x8c,0x97,0x6d,
352 : 0x43,0x9d,0xea,0x7a,0x8b,0x96,0xac,0xfa,0xca,0x6e,0x98,0x46,0x4f,0x51,0x17,0x3e,
353 : 0xa3,0x0a,0x13,0x91,0xb0,0xe6,0x86,0x0c,0xa1,0xa4,0x0b,0xaf,0xd0,0x30,0x6b,0x57};
354 : static const short fwht_l_twiddle_128[ 128 ] = {0xfe,0x89,0x15,0xeb,0x48,0xea,0x04,0xfe,0x32,0xd9,0xca,0x2c,0x1e,0x58,0x8d,0xed,
355 : 0x6f,0x36,0x53,0x24,0xb2,0x27,0x3e,0x06,0xec,0x96,0x41,0x05,0xbe,0x1d,0xb1,0xdd,
356 : 0x18,0x64,0xf4,0xc3,0x16,0x0a,0x2e,0x00,0xde,0x34,0xaf,0x42,0xd7,0x5e,0x92,0x02,
357 : 0xbf,0x5a,0x6a,0x97,0xe1,0x39,0xd0,0xf6,0x66,0x86,0xb5,0x61,0x8a,0xa2,0x8f,0x49,
358 : 0x0b,0x79,0x20,0x19,0xc5,0x0e,0x74,0x7e,0x75,0x9f,0x11,0x1a,0x67,0xef,0x50,0xa3,
359 : 0x0f,0x84,0xce,0x0c,0x62,0xcc,0xf9,0x90,0x2f,0x6d,0xdb,0xc4,0x30,0xfb,0x7d,0xfc,
360 : 0x6e,0xd6,0xe0,0x31,0x01,0x23,0x2b,0xf5,0xb6,0xa8,0x81,0x4a,0xc6,0x44,0x9b,0x7a,
361 : 0x87,0xb9,0xbb,0x8b,0x7f,0x94,0x3c,0x21,0xdc,0xc2,0x60,0xfd,0x17,0xbd,0x47,0x65};
362 : static const short fwht_l_twiddle_256[ 256 ] = {0x00,0xfc,0xfb,0x15,0x2d,0xfa,0xc1,0x14,0x62,0x2c,0xd9,0xf9,0xc0,0x45,0x13,0xe8,
363 : 0x01,0x61,0x86,0x2b,0xd8,0xba,0xf8,0x5d,0xbf,0x7a,0x44,0x6a,0x07,0x12,0xf1,0xe7,
364 : 0x00,0xdc,0x60,0x0a,0x1f,0x85,0x1c,0x2a,0x8b,0xd7,0x92,0xb9,0xf7,0x82,0x5c,0xad,
365 : 0x19,0xbe,0xb1,0x79,0x43,0x3d,0x69,0x9e,0x06,0x75,0x11,0x27,0x70,0xf0,0xd2,0xe6,
366 : 0xfe,0x2f,0xdb,0xea,0x88,0x5f,0x7c,0x09,0x0c,0x1e,0x8d,0x84,0x1b,0x3f,0x29,0xd4,
367 : 0x31,0x8a,0x8f,0xd6,0x91,0xcb,0xb8,0xc9,0xf6,0xb6,0x81,0x39,0xc7,0x5b,0x55,0xac,
368 : 0x18,0x65,0xbd,0xf4,0x22,0xb0,0xb4,0x78,0x7f,0x42,0x34,0x3c,0x68,0x37,0x9d,0x4e,
369 : 0xc5,0x05,0x96,0x74,0x10,0x59,0x26,0x9a,0x6f,0xa3,0xef,0x53,0x4b,0xd1,0xaa,0xe5,
370 : 0xfd,0x16,0x2e,0xc2,0x63,0xda,0x46,0xe9,0x02,0x87,0xbb,0x5e,0x7b,0x6b,0x08,0xf2,
371 : 0xdd,0x0b,0x20,0x1d,0x8c,0x93,0x83,0xae,0x1a,0xb2,0x3e,0x9f,0x76,0x28,0x71,0xd3,
372 : 0x30,0xeb,0x89,0x7d,0x0d,0x8e,0x40,0xd5,0x32,0x90,0xcc,0xca,0xb7,0x3a,0xc8,0x56,
373 : 0x66,0xf5,0x23,0xb5,0x80,0x35,0x38,0x4f,0xc6,0x97,0x5a,0x9b,0xa4,0x54,0x4c,0xab,
374 : 0x17,0xc3,0x64,0x47,0x03,0xbc,0x6c,0xf3,0xde,0x21,0x94,0xaf,0xb3,0xa0,0x77,0x72,
375 : 0xec,0x7e,0x0e,0x41,0x33,0xcd,0x3b,0x57,0x67,0x24,0x36,0x50,0x98,0x9c,0xa5,0x4d,
376 : 0xc4,0x48,0x04,0x6d,0xdf,0x95,0xa1,0x73,0xed,0x0f,0xce,0x58,0x25,0x51,0x99,0xa6,
377 : 0x49,0x6e,0xe0,0xa2,0xee,0xcf,0x52,0xa7,0x4a,0xe1,0xd0,0xa8,0xe2,0xa9,0xe3,0xe4};
378 :
379 : #if FD_REEDSOL_ARITH_IMPL==0
380 : static void
381 : gen_pi_noavx_generic( uchar const * is_erased,
382 : uchar * output,
383 : ulong sz,
384 : const short * l_twiddle ) {
385 : long scratch[ 256 ];
386 :
387 : for( ulong i=0UL; i<sz; i++ ) scratch[ i ] = is_erased[ i ];
388 :
389 : /* Unscaled FWHT */
390 : for( ulong h=1UL; h<sz; h<<=1 ) {
391 : for( ulong i=0UL; i<sz; i += 2UL*h ) for( ulong j=i; j<i+h; j++ ) {
392 : long x = scratch[ j ];
393 : long y = scratch[ j+h ];
394 : scratch[ j ] = x+y;
395 : scratch[ j+h ] = x-y;
396 : }
397 : }
398 :
399 : for( ulong i=0UL; i<sz; i++ ) scratch[ i ] *= l_twiddle[ i ];
400 :
401 : for( ulong h=1UL; h<sz; h<<=1 ) {
402 : for( ulong i=0UL; i<sz; i += 2UL*h ) for( ulong j=i; j<i+h; j++ ) {
403 : long x = scratch[ j ];
404 : long y = scratch[ j+h ];
405 : scratch[ j ] = x+y;
406 : scratch[ j+h ] = x-y;
407 : }
408 : }
409 :
410 : /* Negate the ones corresponding to erasures to compute 1/Pi' */
411 : for( ulong i=0UL; i<sz; i++ ) scratch[ i ] *= fd_long_if( is_erased[ i ], -1L, 1L );
412 :
413 : /* To fix the FWHT scaling, we need to multiply by sz^-1 mod 255.
414 : Given that sz is always a power of 2, this is not too bad.
415 : Let s = lg(sz). Note that 2^8 == 256 == 1 (mod 255),
416 : so then (2^s) * (2^(8-s)) == 1 (mod 255).
417 : This implies that sz^-1 = 2^(8-s) (mod 255), and we can compute
418 : 2^(8-s) by 256/s with normal integer division. */
419 : long sz_inv = 256L/(long)sz;
420 : /* The % operator in C doesn't work like the mathematical mod
421 : operation for negative numbers, so we need to add in a shift to
422 : make any input non-negative. We can compute the smallest possible
423 : value at this point and it's -142397, so we add 255*559=142545. */
424 : for( ulong i=0UL; i<sz; i++ ) scratch[ i ] = (sz_inv*(scratch[ i ] + 255L*559L)) % 255L;
425 :
426 : for( ulong i=0UL; i<sz; i++ ) output[ i ] = (uchar)gf_arith_invlog_tbl[ scratch[ i ] ];
427 : }
428 : #endif
429 :
/* fd_reedsol_private_gen_pi_16: given is_erased[ i ] in {0,1} for i in
   [0,16), writes 16 GF(2^8) bytes to output.  Per the construction
   described at the top of this file: the sign flip at erased positions
   turns the log into the log of the reciprocal, so output[ i ] is the
   derivative-related quantity the decoder divides by at erasures (the
   comment below calls it 1/Pi') and the Pi value elsewhere. */
void
fd_reedsol_private_gen_pi_16( uchar const * is_erased,
                              uchar       * output ) {
#if FD_REEDSOL_ARITH_IMPL>0
#if FD_REEDSOL_PI_USE_SHORT
  ws_t erased_vec = _mm256_cvtepu8_epi16( vb_ld( is_erased ) );

  ws_t transformed = erased_vec;
  FD_REEDSOL_FWHT_16( transformed ); /* FWHT( R~ ) */
  /* |transformed| <= 16 */

  /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
     |product| <= 16*255, but definitely may be negative */
  ws_t product = ws_mullo( transformed, ws_ld( fwht_l_twiddle_16 ) );

  /* log_pi is congruent (mod 255) to what the paper calls
     R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
     Let H be the 16x16 Hadamard matrix. Then
         log_pi = (H * diag( fwht_l_twiddle_16 ) * H) * erased_vec
                  |---------------------------------|
                                  M
     The part labeled M is a matrix known ahead of time, so we can bound
     log_pi relatively easily when combined with the fact
     0<=erased_vec<=1 (just set the negative entries to 0, and multiply
     by all 1s. Then do the same, setting the positive entries to 0
     instead). This tells us |log_pi| <= 4489 */
  FD_REEDSOL_FWHT_16( product );
  ws_t log_pi = product;

  /* Negate the ones corresponding to erasures to compute 1/Pi'
     (the multiplier is +1 where erased_vec==0 and -1 where it's 1) */
  log_pi = ws_adjust_sign( log_pi, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec, 1 ) ) );

  log_pi = ws_add( log_pi, ws_bcast( (short)(255*18) ) ); /* Now 0<= log_pi <= 9079 < 2^15 */

  /* GCC informs me that for a ushort x,
     (x%255) == 0xFF & ( x + (x*0x8081)>>23).
     We need at least 31 bits of precision for the product, so
     mulh_epu16 is perfect.  (This is ws_mod255 inlined.) */
  log_pi = ws_and( ws_bcast( 0xFF ), ws_add( log_pi, ws_shru( wh_mulhi( log_pi, ws_bcast( (short)0x8081 ) ), 7 ) ) );
  /* Now 0<=log_pi < 255 */

  /* Since our FWHT implementation is unscaled, we've computed a value
     16 times larger than what we'd like. 16^-1 == 16 (mod 255), but
     we're just going to use this in the exponentiation, so we can
     compute this implicitly.
         2^(log_pi * 16^-1) = 2^(16*log_pi) = (2^16)^log_pi = 76^log_pi
     (where 2 is the primitive element we used for the logs, an element
     of GF(2^8) ). */

  wb_t compact_log_pi = compact_ws( log_pi, ws_zero() );
  wb_t pi = exp_76( compact_log_pi );

  vb_st( output, _mm256_extracti128_si256( pi, 0 ) );

#else
  /* Load the 16 flags in the low 128 bits; upper half is zero so the
     two independent half-FWHTs in FD_REEDSOL_FWHT_16 don't interact. */
  wb_t erased_vec = _mm256_setr_m128i( vb_ldu( is_erased ), _mm_setzero_si128() );
  wb_t to_transform = erased_vec;
  FD_REEDSOL_FWHT_16( to_transform );
  ws_t transformed = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform, 0 ) );
  /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
     0<=product<256*255, so product is interpreted as unsigned. */
  ws_t product = ws_mullo( transformed, ws_ld( fwht_l_twiddle_16 ) );

  /* Compute mod 255, using the same approach as above. */
  product = ws_and( ws_bcast( 0xFF ), ws_add( product, ws_shru( wh_mulhi( product, ws_bcast( (short)0x8081 ) ), 7 ) ) );
  wb_t compact_product = compact_ws( product, ws_zero() );

  FD_REEDSOL_FWHT_16( compact_product );

  /* Negate the ones corresponding to erasures (255-x is negation in
     the mod-255 representation) */
  compact_product = wb_if( wb_eq( erased_vec, wb_zero() ), compact_product, wb_sub( wb_bcast( 255 ), compact_product ) );

  wb_t pi = exp_76( compact_product );
  vb_st( output, _mm256_extracti128_si256( pi, 0 ) );
#endif
#else /* No AVX implementation */

  gen_pi_noavx_generic( is_erased, output, 16UL, fwht_l_twiddle_16 );

#endif
}
511 :
/* fd_reedsol_private_gen_pi_32: same contract as
   fd_reedsol_private_gen_pi_16 but for 32 positions: given
   is_erased[ i ] in {0,1} for i in [0,32), writes 32 GF(2^8) bytes to
   output (1/Pi' values at erasures per the sign flip below, Pi values
   elsewhere). */
void
fd_reedsol_private_gen_pi_32( uchar const * is_erased,
                              uchar       * output ) {
#if FD_REEDSOL_ARITH_IMPL>0
#if FD_REEDSOL_PI_USE_SHORT
  ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased ) );
  ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 16UL ) );

  ws_t transformed0 = erased_vec0;
  ws_t transformed1 = erased_vec1;
  FD_REEDSOL_FWHT_32( transformed0, transformed1 ); /* FWHT( R~ ) */
  /* |transformed| <= 32 */

  /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
     |product| <= 32*255, but definitely may be negative */
  ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_32 ) );
  ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_32 + 16UL ) );

  /* log_pi is congruent (mod 255) to what the paper calls
     R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
     |log_pi| <= 6945 using the same approach as above. */
  FD_REEDSOL_FWHT_32( product0, product1 );
  ws_t log_pi0 = product0;
  ws_t log_pi1 = product1;

  /* Negate the ones corresponding to erasures to compute 1/Pi' */
  log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
  log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );

  log_pi0 = ws_add( log_pi0, ws_bcast( (short)(255*28) ) );
  log_pi1 = ws_add( log_pi1, ws_bcast( (short)(255*28) ) ); /* Now 0<= log_pi <= 14085 < 2^15 */

  /* GCC informs me that for a ushort x,
     (x%255) == 0xFF & ( x + (x*0x8081)>>23).
     We need at least 31 bits of precision for the product, so
     mulh_epu16 is perfect.  (This is ws_mod255 inlined.) */
  log_pi0 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi0, ws_shru( wh_mulhi( log_pi0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
  log_pi1 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi1, ws_shru( wh_mulhi( log_pi1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
  /* Now 0<=log_pi < 255 */

  /* Since our FWHT implementation is unscaled, we've computed a value
     32 times larger than what we'd like. 32^-1 == 8 (mod 255), but
     we're just going to use this in the exponentiation, so we can
     compute this implicitly.
         2^(log_pi * 32^-1) = 2^(8*log_pi) = (2^8)^log_pi = 29^log_pi
     (where 2 is the primitive element we used for the logs, an element
     of GF(2^8) ). */

  wb_t compact_log_pi = compact_ws( log_pi0, log_pi1 );
  wb_t pi = exp_29( compact_log_pi );

  wb_st( output, pi );

#else
  wb_t erased_vec = wb_ld( is_erased );
  wb_t to_transform = erased_vec;
  FD_REEDSOL_FWHT_32( to_transform );
  ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform, 0 ) );
  ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform, 1 ) );

  /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
     0<=product<256*255, so product is interpreted as unsigned. */
  ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_32 ) );
  ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_32 + 16UL ) );

  /* Compute mod 255, using the same approach as above. */
  product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( wh_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
  product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( wh_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
  wb_t compact_product = compact_ws( product0, product1 );

  FD_REEDSOL_FWHT_32( compact_product );

  /* Negate the ones corresponding to erasures (255-x is negation in
     the mod-255 representation) */
  compact_product = wb_if( wb_eq( erased_vec, wb_zero() ), compact_product, wb_sub( wb_bcast( 255 ), compact_product ) );

  wb_t pi = exp_29( compact_product );
  wb_st( output, pi );
#endif
#else /* No AVX implementation */

  gen_pi_noavx_generic( is_erased, output, 32UL, fwht_l_twiddle_32 );

#endif
}
596 :
/* fd_reedsol_private_gen_pi_64: given a 64-byte 0/1 erasure indicator
   vector is_erased (1 marks an erased position), writes the 64
   corresponding Pi values (GF(2^8) bytes) to output.  Per the overview
   at the top of this file, the work happens in the log domain mod 255:
   FWHT the indicator (R~), multiply pointwise by the precomputed
   FWHT( L~ ) twiddle table, FWHT again, negate the log at erased
   positions (yielding 1/Pi' there, per the comment below), then
   exponentiate back to GF(2^8).  Three implementations are provided:
   a 16-bit-lane AVX variant (FD_REEDSOL_PI_USE_SHORT), an 8-bit-lane
   AVX variant, and a generic scalar fallback. */
597 : void
598 : fd_reedsol_private_gen_pi_64( uchar const * is_erased,
599 39 :                               uchar * output ) {
600 39 : #if FD_REEDSOL_ARITH_IMPL>0
601 : #if FD_REEDSOL_PI_USE_SHORT
/* Widen the 64 indicator bytes into four vectors of 16-bit lanes so
   all subsequent arithmetic has signed 16-bit headroom. */
602 : ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased ) );
603 : ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 16UL ) );
604 : ws_t erased_vec2 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 32UL ) );
605 : ws_t erased_vec3 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 48UL ) );
606 :
607 : ws_t transformed0 = erased_vec0;
608 : ws_t transformed1 = erased_vec1;
609 : ws_t transformed2 = erased_vec2;
610 : ws_t transformed3 = erased_vec3;
611 : FD_REEDSOL_FWHT_64( transformed0, transformed1, transformed2, transformed3 ); /* FWHT( R~ ) */
612 : /* |transformed| <= 64 */
613 :
614 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
615 : |product| <= 64*255, but definitely may be negative */
616 : ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_64 ) );
617 : ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_64 + 16UL ) );
618 : ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_64 + 32UL ) );
619 : ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_64 + 48UL ) );
620 :
621 : /* log_pi is congruent (mod 255) to what the paper calls
622 : R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
623 : |log_pi| <= 18918 using the same approach as above. */
624 : FD_REEDSOL_FWHT_64( product0, product1, product2, product3 );
625 : ws_t log_pi0 = product0;
626 : ws_t log_pi1 = product1;
627 : ws_t log_pi2 = product2;
628 : ws_t log_pi3 = product3;
629 :
630 : /* Negate the ones corresponding to erasures to compute 1/Pi' */
631 : log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
632 : log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );
633 : log_pi2 = ws_adjust_sign( log_pi2, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec2, 1 ) ) );
634 : log_pi3 = ws_adjust_sign( log_pi3, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec3, 1 ) ) );
635 :
/* Bias by a multiple of 255 (255*75 = 19125 >= 18918) so values are
   nonnegative before the unsigned mod-255 reduction below. */
636 : log_pi0 = ws_add( log_pi0, ws_bcast( (short)(255*75) ) );
637 : log_pi1 = ws_add( log_pi1, ws_bcast( (short)(255*75) ) );
638 : log_pi2 = ws_add( log_pi2, ws_bcast( (short)(255*75) ) );
639 : log_pi3 = ws_add( log_pi3, ws_bcast( (short)(255*75) ) );
640 : /* Now 0<= log_pi <= 38043 < 2^16 (okay, since the next step treats it as unsigned) */
641 :
642 : /* GCC informs me that for a ushort x,
643 : (x%255) == 0xFF & ( x + (x*0x8081)>>23).
644 : We need at least 31 bits of precision for the product, so
645 : mulh_epu16 is perfect. */
646 : log_pi0 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi0, ws_shru( wh_mulhi( log_pi0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
647 : log_pi1 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi1, ws_shru( wh_mulhi( log_pi1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
648 : log_pi2 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi2, ws_shru( wh_mulhi( log_pi2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
649 : log_pi3 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi3, ws_shru( wh_mulhi( log_pi3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
650 : /* Now 0<=log_pi < 255 */
651 :
652 : /* Since our FWHT implementation is unscaled, we've computed a value
653 : 64 times larger than what we'd like. 64^-1 == 4 (mod 255), but
654 : we're just going to use this in the exponentiation, so we can
655 : compute this implicitly.
656 : 2^(log_pi * 64^-1) = 2^(4*log_pi) = (2^4)^log_pi = 16^log_pi
657 : (where 2 is the primitive element we used for the logs, an element
658 : of GF(2^8) ). */
659 :
660 : wb_t compact_log_pi0 = compact_ws( log_pi0, log_pi1 );
661 : wb_t compact_log_pi1 = compact_ws( log_pi2, log_pi3 );
662 : wb_t pi0 = exp_16( compact_log_pi0 );
663 : wb_t pi1 = exp_16( compact_log_pi1 );
664 :
665 : wb_st( output, pi0 );
666 : wb_st( output+32UL, pi1 );
667 :
668 : #else
/* 8-bit-lane variant: run the first FWHT directly on byte lanes
   (values stay small enough per the |transformed| <= 64 bound noted in
   the 16-bit variant above), widen to 16-bit lanes only for the
   multiply and mod-255 reduction, then FWHT the recompacted bytes. */
669 39 : wb_t erased_vec0 = wb_ld( is_erased );
670 39 : wb_t erased_vec1 = wb_ld( is_erased + 32UL );
671 39 : wb_t to_transform0 = erased_vec0;
672 39 : wb_t to_transform1 = erased_vec1;
673 :
674 39 : FD_REEDSOL_FWHT_64( to_transform0, to_transform1 );
675 :
676 39 : ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 0 ) );
677 39 : ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 1 ) );
678 39 : ws_t transformed2 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 0 ) );
679 39 : ws_t transformed3 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 1 ) );
680 :
681 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
682 : 0<=product<256*255, so product is interpreted as unsigned. */
683 39 : ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_64 ) );
684 39 : ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_64 + 16UL ) );
685 39 : ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_64 + 32UL ) );
686 39 : ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_64 + 48UL ) );
687 :
688 : /* Compute mod 255, using the same approach as above. */
689 39 : product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( wh_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
690 39 : product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( wh_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
691 39 : product2 = ws_and( ws_bcast( 0xFF ), ws_add( product2, ws_shru( wh_mulhi( product2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
692 39 : product3 = ws_and( ws_bcast( 0xFF ), ws_add( product3, ws_shru( wh_mulhi( product3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
693 :
694 39 : wb_t compact_product0 = compact_ws( product0, product1 );
695 39 : wb_t compact_product1 = compact_ws( product2, product3 );
696 :
697 39 : FD_REEDSOL_FWHT_64( compact_product0, compact_product1 );
698 :
/* In byte lanes, negation mod 255 is done as 255 - x, selected only at
   erased positions. */
699 : /* Negate the ones corresponding to erasures */
700 39 : compact_product0 = wb_if( wb_eq( erased_vec0, wb_zero() ), compact_product0, wb_sub( wb_bcast( 255 ), compact_product0 ) );
701 39 : compact_product1 = wb_if( wb_eq( erased_vec1, wb_zero() ), compact_product1, wb_sub( wb_bcast( 255 ), compact_product1 ) );
702 :
703 39 : wb_t pi0 = exp_16( compact_product0 );
704 39 : wb_t pi1 = exp_16( compact_product1 );
705 39 : wb_st( output , pi0 );
706 39 : wb_st( output + 32UL, pi1 );
707 39 : #endif
708 : #else /* No AVX implementation */
709 :
710 : gen_pi_noavx_generic( is_erased, output, 64UL, fwht_l_twiddle_64 );
711 :
712 : #endif
713 39 : }
714 :
/* fd_reedsol_private_gen_pi_128: given a 128-byte 0/1 erasure
   indicator vector is_erased (1 marks an erased position), writes the
   128 corresponding Pi values (GF(2^8) bytes) to output.  Same
   log-domain FWHT pipeline as the 32/64 variants; the only structural
   differences are the extra mod-255 reduction between the two FWHTs
   (needed to keep the 16-bit lanes from overflowing at n=128) and the
   final exponentiation base: the unscaled FWHT leaves a factor of 128,
   and 128^-1 == 2 (mod 255), so exp_4 = exp_2^2 absorbs it (see the
   comment before the exp_4 calls). */
715 : void
716 : fd_reedsol_private_gen_pi_128( uchar const * is_erased,
717 6 :                                uchar * output ) {
718 6 : #if FD_REEDSOL_ARITH_IMPL>0
719 : #if FD_REEDSOL_PI_USE_SHORT
/* Widen the 128 indicator bytes into eight vectors of 16-bit lanes. */
720 : ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased ) );
721 : ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 16UL ) );
722 : ws_t erased_vec2 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 32UL ) );
723 : ws_t erased_vec3 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 48UL ) );
724 : ws_t erased_vec4 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 64UL ) );
725 : ws_t erased_vec5 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 80UL ) );
726 : ws_t erased_vec6 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 96UL ) );
727 : ws_t erased_vec7 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 112UL ) );
728 :
729 : ws_t transformed0 = erased_vec0;
730 : ws_t transformed1 = erased_vec1;
731 : ws_t transformed2 = erased_vec2;
732 : ws_t transformed3 = erased_vec3;
733 : ws_t transformed4 = erased_vec4;
734 : ws_t transformed5 = erased_vec5;
735 : ws_t transformed6 = erased_vec6;
736 : ws_t transformed7 = erased_vec7;
737 : FD_REEDSOL_FWHT_128( transformed0, transformed1, transformed2, transformed3, transformed4, transformed5, transformed6, transformed7 ); /* FWHT( R~ ) */
738 : /* |transformed| <= 128 */
739 :
740 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
741 : -16256 <= product <= 32512 */
742 : ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_128 ) );
743 : ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_128 + 16UL ) );
744 : ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_128 + 32UL ) );
745 : ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_128 + 48UL ) );
746 : ws_t product4 = ws_mullo( transformed4, ws_ld( fwht_l_twiddle_128 + 64UL ) );
747 : ws_t product5 = ws_mullo( transformed5, ws_ld( fwht_l_twiddle_128 + 80UL ) );
748 : ws_t product6 = ws_mullo( transformed6, ws_ld( fwht_l_twiddle_128 + 96UL ) );
749 : ws_t product7 = ws_mullo( transformed7, ws_ld( fwht_l_twiddle_128 + 112UL ) );
750 :
751 : /* We need to reduce these mod 255 to prevent overflow in the next
752 : step. 0 <= product+64*255 <= 48832 < 2^16. The mod operation
753 : treats the input as unsigned though, so this is okay. */
754 : product0 = ws_add( product0, ws_bcast( (short)64*255 ) );
755 : product1 = ws_add( product1, ws_bcast( (short)64*255 ) );
756 : product2 = ws_add( product2, ws_bcast( (short)64*255 ) );
757 : product3 = ws_add( product3, ws_bcast( (short)64*255 ) );
758 : product4 = ws_add( product4, ws_bcast( (short)64*255 ) );
759 : product5 = ws_add( product5, ws_bcast( (short)64*255 ) );
760 : product6 = ws_add( product6, ws_bcast( (short)64*255 ) );
761 : product7 = ws_add( product7, ws_bcast( (short)64*255 ) );
762 :
/* Mod 255 via the ushort identity noted further below:
   (x%255) == 0xFF & ( x + (x*0x8081)>>23 ). */
763 : product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( wh_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
764 : product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( wh_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
765 : product2 = ws_and( ws_bcast( 0xFF ), ws_add( product2, ws_shru( wh_mulhi( product2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
766 : product3 = ws_and( ws_bcast( 0xFF ), ws_add( product3, ws_shru( wh_mulhi( product3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
767 : product4 = ws_and( ws_bcast( 0xFF ), ws_add( product4, ws_shru( wh_mulhi( product4, ws_bcast( (short)0x8081 ) ), 7 ) ) );
768 : product5 = ws_and( ws_bcast( 0xFF ), ws_add( product5, ws_shru( wh_mulhi( product5, ws_bcast( (short)0x8081 ) ), 7 ) ) );
769 : product6 = ws_and( ws_bcast( 0xFF ), ws_add( product6, ws_shru( wh_mulhi( product6, ws_bcast( (short)0x8081 ) ), 7 ) ) );
770 : product7 = ws_and( ws_bcast( 0xFF ), ws_add( product7, ws_shru( wh_mulhi( product7, ws_bcast( (short)0x8081 ) ), 7 ) ) );
771 :
772 : /* Now 0 <= product < 255 */
773 :
774 : /* log_pi is congruent (mod 255) to what the paper calls
775 : R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
776 : |log_pi| <= 128*255 */
777 : FD_REEDSOL_FWHT_128( product0, product1, product2, product3, product4, product5, product6, product7 );
778 : ws_t log_pi0 = product0;
779 : ws_t log_pi1 = product1;
780 : ws_t log_pi2 = product2;
781 : ws_t log_pi3 = product3;
782 : ws_t log_pi4 = product4;
783 : ws_t log_pi5 = product5;
784 : ws_t log_pi6 = product6;
785 : ws_t log_pi7 = product7;
786 :
787 : /* Negate the ones corresponding to erasures to compute 1/Pi' */
788 : log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
789 : log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );
790 : log_pi2 = ws_adjust_sign( log_pi2, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec2, 1 ) ) );
791 : log_pi3 = ws_adjust_sign( log_pi3, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec3, 1 ) ) );
792 : log_pi4 = ws_adjust_sign( log_pi4, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec4, 1 ) ) );
793 : log_pi5 = ws_adjust_sign( log_pi5, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec5, 1 ) ) );
794 : log_pi6 = ws_adjust_sign( log_pi6, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec6, 1 ) ) );
795 : log_pi7 = ws_adjust_sign( log_pi7, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec7, 1 ) ) );
796 :
/* Bias by 255*128 = 32640 >= 128*255 so values are nonnegative before
   the unsigned mod-255 reduction below. */
797 : log_pi0 = ws_add( log_pi0, ws_bcast( (short)(255*128) ) );
798 : log_pi1 = ws_add( log_pi1, ws_bcast( (short)(255*128) ) );
799 : log_pi2 = ws_add( log_pi2, ws_bcast( (short)(255*128) ) );
800 : log_pi3 = ws_add( log_pi3, ws_bcast( (short)(255*128) ) );
801 : log_pi4 = ws_add( log_pi4, ws_bcast( (short)(255*128) ) );
802 : log_pi5 = ws_add( log_pi5, ws_bcast( (short)(255*128) ) );
803 : log_pi6 = ws_add( log_pi6, ws_bcast( (short)(255*128) ) );
804 : log_pi7 = ws_add( log_pi7, ws_bcast( (short)(255*128) ) ); /* Now 0<= log_pi <= 65152 < 2^16 */
805 :
806 : /* GCC informs me that for a ushort x,
807 : (x%255) == 0xFF & ( x + (x*0x8081)>>23).
808 : We need at least 31 bits of precision for the product, so
809 : mulh_epu16 is perfect. */
810 : log_pi0 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi0, ws_shru( wh_mulhi( log_pi0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
811 : log_pi1 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi1, ws_shru( wh_mulhi( log_pi1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
812 : log_pi2 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi2, ws_shru( wh_mulhi( log_pi2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
813 : log_pi3 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi3, ws_shru( wh_mulhi( log_pi3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
814 : log_pi4 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi4, ws_shru( wh_mulhi( log_pi4, ws_bcast( (short)0x8081 ) ), 7 ) ) );
815 : log_pi5 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi5, ws_shru( wh_mulhi( log_pi5, ws_bcast( (short)0x8081 ) ), 7 ) ) );
816 : log_pi6 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi6, ws_shru( wh_mulhi( log_pi6, ws_bcast( (short)0x8081 ) ), 7 ) ) );
817 : log_pi7 = ws_and( ws_bcast( 0xFF ), ws_add( log_pi7, ws_shru( wh_mulhi( log_pi7, ws_bcast( (short)0x8081 ) ), 7 ) ) );
818 : /* Now 0<=log_pi < 255 */
819 :
820 : /* Since our FWHT implementation is unscaled, we've computed a value
821 : 128 times larger than what we'd like. 128^-1 == 2 (mod 255), but
822 : we're just going to use this in the exponentiation, so we can
823 : compute this implicitly.
824 : 2^(log_pi * 128^-1) = 2^(2*log_pi) = (2^2)^log_pi = 4^log_pi
825 : (where 2 is the primitive element we used for the logs, an element
826 : of GF(2^8) ). */
827 :
828 : wb_t compact_log_pi0 = compact_ws( log_pi0, log_pi1 );
829 : wb_t compact_log_pi1 = compact_ws( log_pi2, log_pi3 );
830 : wb_t compact_log_pi2 = compact_ws( log_pi4, log_pi5 );
831 : wb_t compact_log_pi3 = compact_ws( log_pi6, log_pi7 );
832 : wb_t pi0 = exp_4( compact_log_pi0 );
833 : wb_t pi1 = exp_4( compact_log_pi1 );
834 : wb_t pi2 = exp_4( compact_log_pi2 );
835 : wb_t pi3 = exp_4( compact_log_pi3 );
836 :
837 : wb_st( output, pi0 );
838 : wb_st( output + 32UL, pi1 );
839 : wb_st( output + 64UL, pi2 );
840 : wb_st( output + 96UL, pi3 );
841 :
842 : #else
/* 8-bit-lane variant: run the first FWHT directly on byte lanes,
   widen to 16-bit lanes only for the multiply and mod-255 reduction,
   then FWHT the recompacted bytes. */
843 6 : wb_t erased_vec0 = wb_ld( is_erased );
844 6 : wb_t erased_vec1 = wb_ld( is_erased + 32UL );
845 6 : wb_t erased_vec2 = wb_ld( is_erased + 64UL );
846 6 : wb_t erased_vec3 = wb_ld( is_erased + 96UL );
847 6 : wb_t to_transform0 = erased_vec0;
848 6 : wb_t to_transform1 = erased_vec1;
849 6 : wb_t to_transform2 = erased_vec2;
850 6 : wb_t to_transform3 = erased_vec3;
851 :
852 6 : FD_REEDSOL_FWHT_128( to_transform0, to_transform1, to_transform2, to_transform3 );
853 :
854 6 : ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 0 ) );
855 6 : ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 1 ) );
856 6 : ws_t transformed2 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 0 ) );
857 6 : ws_t transformed3 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 1 ) );
858 6 : ws_t transformed4 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 0 ) );
859 6 : ws_t transformed5 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 1 ) );
860 6 : ws_t transformed6 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 0 ) );
861 6 : ws_t transformed7 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 1 ) );
862 :
863 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
864 : 0<=product<256*255, so product is interpreted as unsigned. */
865 6 : ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_128 ) );
866 6 : ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_128 + 16UL ) );
867 6 : ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_128 + 32UL ) );
868 6 : ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_128 + 48UL ) );
869 6 : ws_t product4 = ws_mullo( transformed4, ws_ld( fwht_l_twiddle_128 + 64UL ) );
870 6 : ws_t product5 = ws_mullo( transformed5, ws_ld( fwht_l_twiddle_128 + 80UL ) );
871 6 : ws_t product6 = ws_mullo( transformed6, ws_ld( fwht_l_twiddle_128 + 96UL ) );
872 6 : ws_t product7 = ws_mullo( transformed7, ws_ld( fwht_l_twiddle_128 + 112UL ) );
873 :
874 : /* Compute mod 255, using the same approach as above. */
875 6 : product0 = ws_and( ws_bcast( 0xFF ), ws_add( product0, ws_shru( wh_mulhi( product0, ws_bcast( (short)0x8081 ) ), 7 ) ) );
876 6 : product1 = ws_and( ws_bcast( 0xFF ), ws_add( product1, ws_shru( wh_mulhi( product1, ws_bcast( (short)0x8081 ) ), 7 ) ) );
877 6 : product2 = ws_and( ws_bcast( 0xFF ), ws_add( product2, ws_shru( wh_mulhi( product2, ws_bcast( (short)0x8081 ) ), 7 ) ) );
878 6 : product3 = ws_and( ws_bcast( 0xFF ), ws_add( product3, ws_shru( wh_mulhi( product3, ws_bcast( (short)0x8081 ) ), 7 ) ) );
879 6 : product4 = ws_and( ws_bcast( 0xFF ), ws_add( product4, ws_shru( wh_mulhi( product4, ws_bcast( (short)0x8081 ) ), 7 ) ) );
880 6 : product5 = ws_and( ws_bcast( 0xFF ), ws_add( product5, ws_shru( wh_mulhi( product5, ws_bcast( (short)0x8081 ) ), 7 ) ) );
881 6 : product6 = ws_and( ws_bcast( 0xFF ), ws_add( product6, ws_shru( wh_mulhi( product6, ws_bcast( (short)0x8081 ) ), 7 ) ) );
882 6 : product7 = ws_and( ws_bcast( 0xFF ), ws_add( product7, ws_shru( wh_mulhi( product7, ws_bcast( (short)0x8081 ) ), 7 ) ) );
883 6 : wb_t compact_product0 = compact_ws( product0, product1 );
884 6 : wb_t compact_product1 = compact_ws( product2, product3 );
885 6 : wb_t compact_product2 = compact_ws( product4, product5 );
886 6 : wb_t compact_product3 = compact_ws( product6, product7 );
887 :
888 6 : FD_REEDSOL_FWHT_128( compact_product0, compact_product1, compact_product2, compact_product3 );
889 :
/* In byte lanes, negation mod 255 is done as 255 - x, selected only at
   erased positions. */
890 : /* Negate the ones corresponding to erasures */
891 6 : compact_product0 = wb_if( wb_eq( erased_vec0, wb_zero() ), compact_product0, wb_sub( wb_bcast( 255 ), compact_product0 ) );
892 6 : compact_product1 = wb_if( wb_eq( erased_vec1, wb_zero() ), compact_product1, wb_sub( wb_bcast( 255 ), compact_product1 ) );
893 6 : compact_product2 = wb_if( wb_eq( erased_vec2, wb_zero() ), compact_product2, wb_sub( wb_bcast( 255 ), compact_product2 ) );
894 6 : compact_product3 = wb_if( wb_eq( erased_vec3, wb_zero() ), compact_product3, wb_sub( wb_bcast( 255 ), compact_product3 ) );
895 :
896 6 : wb_t pi0 = exp_4( compact_product0 );
897 6 : wb_t pi1 = exp_4( compact_product1 );
898 6 : wb_t pi2 = exp_4( compact_product2 );
899 6 : wb_t pi3 = exp_4( compact_product3 );
900 6 : wb_st( output, pi0 );
901 6 : wb_st( output + 32UL, pi1 );
902 6 : wb_st( output + 64UL, pi2 );
903 6 : wb_st( output + 96UL, pi3 );
904 6 : #endif
905 : #else /* No AVX implementation */
906 :
907 : gen_pi_noavx_generic( is_erased, output, 128UL, fwht_l_twiddle_128 );
908 :
909 : #endif
910 6 : }
911 :
912 : void
913 : fd_reedsol_private_gen_pi_256( uchar const * is_erased,
914 0 : uchar * output ) {
915 0 : #if FD_REEDSOL_ARITH_IMPL>0
916 : #if FD_REEDSOL_PI_USE_SHORT
917 : ws_t erased_vec0 = _mm256_cvtepu8_epi16( vb_ld( is_erased ) );
918 : ws_t erased_vec1 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 16UL ) );
919 : ws_t erased_vec2 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 32UL ) );
920 : ws_t erased_vec3 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 48UL ) );
921 : ws_t erased_vec4 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 64UL ) );
922 : ws_t erased_vec5 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 80UL ) );
923 : ws_t erased_vec6 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 96UL ) );
924 : ws_t erased_vec7 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 112UL ) );
925 : ws_t erased_vec8 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 128UL ) );
926 : ws_t erased_vec9 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 144UL ) );
927 : ws_t erased_vec10 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 160UL ) );
928 : ws_t erased_vec11 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 176UL ) );
929 : ws_t erased_vec12 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 192UL ) );
930 : ws_t erased_vec13 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 208UL ) );
931 : ws_t erased_vec14 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 224UL ) );
932 : ws_t erased_vec15 = _mm256_cvtepu8_epi16( vb_ld( is_erased + 240UL ) );
933 :
934 : ws_t transformed0 = erased_vec0;
935 : ws_t transformed1 = erased_vec1;
936 : ws_t transformed2 = erased_vec2;
937 : ws_t transformed3 = erased_vec3;
938 : ws_t transformed4 = erased_vec4;
939 : ws_t transformed5 = erased_vec5;
940 : ws_t transformed6 = erased_vec6;
941 : ws_t transformed7 = erased_vec7;
942 : ws_t transformed8 = erased_vec8;
943 : ws_t transformed9 = erased_vec9;
944 : ws_t transformed10 = erased_vec10;
945 : ws_t transformed11 = erased_vec11;
946 : ws_t transformed12 = erased_vec12;
947 : ws_t transformed13 = erased_vec13;
948 : ws_t transformed14 = erased_vec14;
949 : ws_t transformed15 = erased_vec15;
950 : FD_REEDSOL_FWHT_256( transformed0, transformed1, transformed2, transformed3, transformed4, transformed5, transformed6, transformed7,
951 : transformed8, transformed9, transformed10, transformed11, transformed12, transformed13, transformed14, transformed15 ); /* FWHT( R~ ) */
952 : /* |transformed| <= 256 */
953 :
954 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255 .
955 : -32512 <= product <= 32512 */
956 : ws_t product0 = ws_mullo( transformed0, ws_ld( fwht_l_twiddle_256 ) );
957 : ws_t product1 = ws_mullo( transformed1, ws_ld( fwht_l_twiddle_256 + 16UL ) );
958 : ws_t product2 = ws_mullo( transformed2, ws_ld( fwht_l_twiddle_256 + 32UL ) );
959 : ws_t product3 = ws_mullo( transformed3, ws_ld( fwht_l_twiddle_256 + 48UL ) );
960 : ws_t product4 = ws_mullo( transformed4, ws_ld( fwht_l_twiddle_256 + 64UL ) );
961 : ws_t product5 = ws_mullo( transformed5, ws_ld( fwht_l_twiddle_256 + 80UL ) );
962 : ws_t product6 = ws_mullo( transformed6, ws_ld( fwht_l_twiddle_256 + 96UL ) );
963 : ws_t product7 = ws_mullo( transformed7, ws_ld( fwht_l_twiddle_256 + 112UL ) );
964 : ws_t product8 = ws_mullo( transformed8, ws_ld( fwht_l_twiddle_256 + 128UL ) );
965 : ws_t product9 = ws_mullo( transformed9, ws_ld( fwht_l_twiddle_256 + 144UL ) );
966 : ws_t product10 = ws_mullo( transformed10, ws_ld( fwht_l_twiddle_256 + 160UL ) );
967 : ws_t product11 = ws_mullo( transformed11, ws_ld( fwht_l_twiddle_256 + 176UL ) );
968 : ws_t product12 = ws_mullo( transformed12, ws_ld( fwht_l_twiddle_256 + 192UL ) );
969 : ws_t product13 = ws_mullo( transformed13, ws_ld( fwht_l_twiddle_256 + 208UL ) );
970 : ws_t product14 = ws_mullo( transformed14, ws_ld( fwht_l_twiddle_256 + 224UL ) );
971 : ws_t product15 = ws_mullo( transformed15, ws_ld( fwht_l_twiddle_256 + 240UL ) );
972 :
973 : /* We need to reduce these mod 255 to prevent overflow in the next
974 : step. 0 <= product+128*255 <= 65152 < 2^16. The mod operation
975 : treats the input as unsigned though, so this is okay (but hanging
976 : in there by a thread!). */
977 : product0 = ws_mod255( ws_add( product0, ws_bcast( (short)128*255 ) ) );
978 : product1 = ws_mod255( ws_add( product1, ws_bcast( (short)128*255 ) ) );
979 : product2 = ws_mod255( ws_add( product2, ws_bcast( (short)128*255 ) ) );
980 : product3 = ws_mod255( ws_add( product3, ws_bcast( (short)128*255 ) ) );
981 : product4 = ws_mod255( ws_add( product4, ws_bcast( (short)128*255 ) ) );
982 : product5 = ws_mod255( ws_add( product5, ws_bcast( (short)128*255 ) ) );
983 : product6 = ws_mod255( ws_add( product6, ws_bcast( (short)128*255 ) ) );
984 : product7 = ws_mod255( ws_add( product7, ws_bcast( (short)128*255 ) ) );
985 : product8 = ws_mod255( ws_add( product8, ws_bcast( (short)128*255 ) ) );
986 : product9 = ws_mod255( ws_add( product9, ws_bcast( (short)128*255 ) ) );
987 : product10 = ws_mod255( ws_add( product10, ws_bcast( (short)128*255 ) ) );
988 : product11 = ws_mod255( ws_add( product11, ws_bcast( (short)128*255 ) ) );
989 : product12 = ws_mod255( ws_add( product12, ws_bcast( (short)128*255 ) ) );
990 : product13 = ws_mod255( ws_add( product13, ws_bcast( (short)128*255 ) ) );
991 : product14 = ws_mod255( ws_add( product14, ws_bcast( (short)128*255 ) ) );
992 : product15 = ws_mod255( ws_add( product15, ws_bcast( (short)128*255 ) ) );
993 :
994 : /* Now 0 <= product < 255 */
995 :
996 : /* log_pi is congruent (mod 255) to what the paper calls
997 : R_w = FWHT( FWHT( L~ ) * FWHT( R~ ) ).
998 : If we do the FWHT in the normal way, it might overflow, so we need to inline it and stick a mod in the middle */
999 : ws_t log_pi0 = ws_mod255( ws_add( product0, product8 ) ); ws_t log_pi1 = ws_mod255( ws_add( product1, product9 ) );
1000 : ws_t log_pi2 = ws_mod255( ws_add( product2, product10 ) ); ws_t log_pi3 = ws_mod255( ws_add( product3, product11 ) );
1001 : ws_t log_pi4 = ws_mod255( ws_add( product4, product12 ) ); ws_t log_pi5 = ws_mod255( ws_add( product5, product13 ) );
1002 : ws_t log_pi6 = ws_mod255( ws_add( product6, product14 ) ); ws_t log_pi7 = ws_mod255( ws_add( product7, product15 ) );
1003 : ws_t log_pi8 = ws_mod255( ws_add( ws_sub( product0, product8 ), ws_bcast( (short)255*2 ) ) );
1004 : ws_t log_pi9 = ws_mod255( ws_add( ws_sub( product1, product9 ), ws_bcast( (short)255*2 ) ) );
1005 : ws_t log_pi10 = ws_mod255( ws_add( ws_sub( product2, product10 ), ws_bcast( (short)255*2 ) ) );
1006 : ws_t log_pi11 = ws_mod255( ws_add( ws_sub( product3, product11 ), ws_bcast( (short)255*2 ) ) );
1007 : ws_t log_pi12 = ws_mod255( ws_add( ws_sub( product4, product12 ), ws_bcast( (short)255*2 ) ) );
1008 : ws_t log_pi13 = ws_mod255( ws_add( ws_sub( product5, product13 ), ws_bcast( (short)255*2 ) ) );
1009 : ws_t log_pi14 = ws_mod255( ws_add( ws_sub( product6, product14 ), ws_bcast( (short)255*2 ) ) );
1010 : ws_t log_pi15 = ws_mod255( ws_add( ws_sub( product7, product15 ), ws_bcast( (short)255*2 ) ) );
1011 :
1012 : FD_REEDSOL_FWHT_128( log_pi0, log_pi1, log_pi2, log_pi3, log_pi4, log_pi5, log_pi6, log_pi7 );
1013 : FD_REEDSOL_FWHT_128( log_pi8, log_pi9, log_pi10, log_pi11, log_pi12, log_pi13, log_pi14, log_pi15 );
1014 : /* Now |log_pi| <= 128*255 */
1015 :
1016 : /* Negate the ones corresponding to erasures to compute 1/Pi' */
1017 : log_pi0 = ws_adjust_sign( log_pi0, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec0, 1 ) ) );
1018 : log_pi1 = ws_adjust_sign( log_pi1, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec1, 1 ) ) );
1019 : log_pi2 = ws_adjust_sign( log_pi2, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec2, 1 ) ) );
1020 : log_pi3 = ws_adjust_sign( log_pi3, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec3, 1 ) ) );
1021 : log_pi4 = ws_adjust_sign( log_pi4, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec4, 1 ) ) );
1022 : log_pi5 = ws_adjust_sign( log_pi5, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec5, 1 ) ) );
1023 : log_pi6 = ws_adjust_sign( log_pi6, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec6, 1 ) ) );
1024 : log_pi7 = ws_adjust_sign( log_pi7, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec7, 1 ) ) );
1025 : log_pi8 = ws_adjust_sign( log_pi8, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec8, 1 ) ) );
1026 : log_pi9 = ws_adjust_sign( log_pi9, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec9, 1 ) ) );
1027 : log_pi10 = ws_adjust_sign( log_pi10, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec10, 1 ) ) );
1028 : log_pi11 = ws_adjust_sign( log_pi11, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec11, 1 ) ) );
1029 : log_pi12 = ws_adjust_sign( log_pi12, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec12, 1 ) ) );
1030 : log_pi13 = ws_adjust_sign( log_pi13, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec13, 1 ) ) );
1031 : log_pi14 = ws_adjust_sign( log_pi14, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec14, 1 ) ) );
1032 : log_pi15 = ws_adjust_sign( log_pi15, ws_sub( ws_bcast( 1 ), ws_shl( erased_vec15, 1 ) ) );
1033 :
1034 : /* After the addition below, 0<= log_pi <= 65152 < 2^16. The mod
1035 : brings it back to 0 <= log_pi < 255. */
1036 : log_pi0 = ws_mod255( ws_add( log_pi0, ws_bcast( (short)(255*128) ) ) );
1037 : log_pi1 = ws_mod255( ws_add( log_pi1, ws_bcast( (short)(255*128) ) ) );
1038 : log_pi2 = ws_mod255( ws_add( log_pi2, ws_bcast( (short)(255*128) ) ) );
1039 : log_pi3 = ws_mod255( ws_add( log_pi3, ws_bcast( (short)(255*128) ) ) );
1040 : log_pi4 = ws_mod255( ws_add( log_pi4, ws_bcast( (short)(255*128) ) ) );
1041 : log_pi5 = ws_mod255( ws_add( log_pi5, ws_bcast( (short)(255*128) ) ) );
1042 : log_pi6 = ws_mod255( ws_add( log_pi6, ws_bcast( (short)(255*128) ) ) );
1043 : log_pi7 = ws_mod255( ws_add( log_pi7, ws_bcast( (short)(255*128) ) ) );
1044 : log_pi8 = ws_mod255( ws_add( log_pi8, ws_bcast( (short)(255*128) ) ) );
1045 : log_pi9 = ws_mod255( ws_add( log_pi9, ws_bcast( (short)(255*128) ) ) );
1046 : log_pi10 = ws_mod255( ws_add( log_pi10, ws_bcast( (short)(255*128) ) ) );
1047 : log_pi11 = ws_mod255( ws_add( log_pi11, ws_bcast( (short)(255*128) ) ) );
1048 : log_pi12 = ws_mod255( ws_add( log_pi12, ws_bcast( (short)(255*128) ) ) );
1049 : log_pi13 = ws_mod255( ws_add( log_pi13, ws_bcast( (short)(255*128) ) ) );
1050 : log_pi14 = ws_mod255( ws_add( log_pi14, ws_bcast( (short)(255*128) ) ) );
1051 : log_pi15 = ws_mod255( ws_add( log_pi15, ws_bcast( (short)(255*128) ) ) );
1052 :
1053 : /* Since our FWHT implementation is unscaled, we've computed a value
1054 : 256 times larger than what we'd like. 256^-1==1^-1 == 1 (mod 255),
1055 : so we don't need to do anything special. */
1056 :
1057 : wb_t compact_log_pi0 = compact_ws( log_pi0, log_pi1 );
1058 : wb_t compact_log_pi1 = compact_ws( log_pi2, log_pi3 );
1059 : wb_t compact_log_pi2 = compact_ws( log_pi4, log_pi5 );
1060 : wb_t compact_log_pi3 = compact_ws( log_pi6, log_pi7 );
1061 : wb_t compact_log_pi4 = compact_ws( log_pi8, log_pi9 );
1062 : wb_t compact_log_pi5 = compact_ws( log_pi10, log_pi11 );
1063 : wb_t compact_log_pi6 = compact_ws( log_pi12, log_pi13 );
1064 : wb_t compact_log_pi7 = compact_ws( log_pi14, log_pi15 );
1065 : wb_t pi0 = exp_2( compact_log_pi0 );
1066 : wb_t pi1 = exp_2( compact_log_pi1 );
1067 : wb_t pi2 = exp_2( compact_log_pi2 );
1068 : wb_t pi3 = exp_2( compact_log_pi3 );
1069 : wb_t pi4 = exp_2( compact_log_pi4 );
1070 : wb_t pi5 = exp_2( compact_log_pi5 );
1071 : wb_t pi6 = exp_2( compact_log_pi6 );
1072 : wb_t pi7 = exp_2( compact_log_pi7 );
1073 :
1074 : wb_st( output, pi0 );
1075 : wb_st( output + 32UL, pi1 );
1076 : wb_st( output + 64UL, pi2 );
1077 : wb_st( output + 96UL, pi3 );
1078 : wb_st( output + 128UL, pi4 );
1079 : wb_st( output + 160UL, pi5 );
1080 : wb_st( output + 192UL, pi6 );
1081 : wb_st( output + 224UL, pi7 );
1082 :
1083 : #else
1084 0 : wb_t erased_vec0 = wb_ld( is_erased );
1085 0 : wb_t erased_vec1 = wb_ld( is_erased + 32UL );
1086 0 : wb_t erased_vec2 = wb_ld( is_erased + 64UL );
1087 0 : wb_t erased_vec3 = wb_ld( is_erased + 96UL );
1088 0 : wb_t erased_vec4 = wb_ld( is_erased + 128UL );
1089 0 : wb_t erased_vec5 = wb_ld( is_erased + 160UL );
1090 0 : wb_t erased_vec6 = wb_ld( is_erased + 192UL );
1091 0 : wb_t erased_vec7 = wb_ld( is_erased + 224UL );
1092 0 : wb_t to_transform0 = erased_vec0;
1093 0 : wb_t to_transform1 = erased_vec1;
1094 0 : wb_t to_transform2 = erased_vec2;
1095 0 : wb_t to_transform3 = erased_vec3;
1096 0 : wb_t to_transform4 = erased_vec4;
1097 0 : wb_t to_transform5 = erased_vec5;
1098 0 : wb_t to_transform6 = erased_vec6;
1099 0 : wb_t to_transform7 = erased_vec7;
1100 :
1101 0 : FD_REEDSOL_FWHT_256( to_transform0, to_transform1, to_transform2, to_transform3,
1102 0 : to_transform4, to_transform5, to_transform6, to_transform7 );
1103 :
1104 0 : ws_t transformed0 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 0 ) );
1105 0 : ws_t transformed1 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform0, 1 ) );
1106 0 : ws_t transformed2 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 0 ) );
1107 0 : ws_t transformed3 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform1, 1 ) );
1108 0 : ws_t transformed4 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 0 ) );
1109 0 : ws_t transformed5 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform2, 1 ) );
1110 0 : ws_t transformed6 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 0 ) );
1111 0 : ws_t transformed7 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform3, 1 ) );
1112 0 : ws_t transformed8 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform4, 0 ) );
1113 0 : ws_t transformed9 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform4, 1 ) );
1114 0 : ws_t transformed10 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform5, 0 ) );
1115 0 : ws_t transformed11 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform5, 1 ) );
1116 0 : ws_t transformed12 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform6, 0 ) );
1117 0 : ws_t transformed13 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform6, 1 ) );
1118 0 : ws_t transformed14 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform7, 0 ) );
1119 0 : ws_t transformed15 = _mm256_cvtepu8_epi16( _mm256_extracti128_si256( to_transform7, 1 ) );
1120 :
1121 : /* product is congruent to FWHT( R~) * FWHT( L~ ) mod 255.
1122 : 0<=product<256*255, so product is interpreted as unsigned. */
1123 0 : ws_t product0 = ws_mod255( ws_mullo( transformed0, ws_ld( fwht_l_twiddle_256 ) ) );
1124 0 : ws_t product1 = ws_mod255( ws_mullo( transformed1, ws_ld( fwht_l_twiddle_256 + 16UL ) ) );
1125 0 : ws_t product2 = ws_mod255( ws_mullo( transformed2, ws_ld( fwht_l_twiddle_256 + 32UL ) ) );
1126 0 : ws_t product3 = ws_mod255( ws_mullo( transformed3, ws_ld( fwht_l_twiddle_256 + 48UL ) ) );
1127 0 : ws_t product4 = ws_mod255( ws_mullo( transformed4, ws_ld( fwht_l_twiddle_256 + 64UL ) ) );
1128 0 : ws_t product5 = ws_mod255( ws_mullo( transformed5, ws_ld( fwht_l_twiddle_256 + 80UL ) ) );
1129 0 : ws_t product6 = ws_mod255( ws_mullo( transformed6, ws_ld( fwht_l_twiddle_256 + 96UL ) ) );
1130 0 : ws_t product7 = ws_mod255( ws_mullo( transformed7, ws_ld( fwht_l_twiddle_256 + 112UL ) ) );
1131 0 : ws_t product8 = ws_mod255( ws_mullo( transformed8, ws_ld( fwht_l_twiddle_256 + 128UL ) ) );
1132 0 : ws_t product9 = ws_mod255( ws_mullo( transformed9, ws_ld( fwht_l_twiddle_256 + 144UL ) ) );
1133 0 : ws_t product10 = ws_mod255( ws_mullo( transformed10, ws_ld( fwht_l_twiddle_256 + 160UL ) ) );
1134 0 : ws_t product11 = ws_mod255( ws_mullo( transformed11, ws_ld( fwht_l_twiddle_256 + 176UL ) ) );
1135 0 : ws_t product12 = ws_mod255( ws_mullo( transformed12, ws_ld( fwht_l_twiddle_256 + 192UL ) ) );
1136 0 : ws_t product13 = ws_mod255( ws_mullo( transformed13, ws_ld( fwht_l_twiddle_256 + 208UL ) ) );
1137 0 : ws_t product14 = ws_mod255( ws_mullo( transformed14, ws_ld( fwht_l_twiddle_256 + 224UL ) ) );
1138 0 : ws_t product15 = ws_mod255( ws_mullo( transformed15, ws_ld( fwht_l_twiddle_256 + 240UL ) ) );
1139 :
1140 0 : wb_t compact_product0 = compact_ws( product0, product1 );
1141 0 : wb_t compact_product1 = compact_ws( product2, product3 );
1142 0 : wb_t compact_product2 = compact_ws( product4, product5 );
1143 0 : wb_t compact_product3 = compact_ws( product6, product7 );
1144 0 : wb_t compact_product4 = compact_ws( product8, product9 );
1145 0 : wb_t compact_product5 = compact_ws( product10, product11 );
1146 0 : wb_t compact_product6 = compact_ws( product12, product13 );
1147 0 : wb_t compact_product7 = compact_ws( product14, product15 );
1148 :
1149 0 : FD_REEDSOL_FWHT_256( compact_product0, compact_product1, compact_product2, compact_product3,
1150 0 : compact_product4, compact_product5, compact_product6, compact_product7 );
1151 :
1152 : /* Negate the ones corresponding to erasures */
1153 0 : compact_product0 = wb_if( wb_eq( erased_vec0, wb_zero() ), compact_product0, wb_sub( wb_bcast( 255 ), compact_product0 ) );
1154 0 : compact_product1 = wb_if( wb_eq( erased_vec1, wb_zero() ), compact_product1, wb_sub( wb_bcast( 255 ), compact_product1 ) );
1155 0 : compact_product2 = wb_if( wb_eq( erased_vec2, wb_zero() ), compact_product2, wb_sub( wb_bcast( 255 ), compact_product2 ) );
1156 0 : compact_product3 = wb_if( wb_eq( erased_vec3, wb_zero() ), compact_product3, wb_sub( wb_bcast( 255 ), compact_product3 ) );
1157 0 : compact_product4 = wb_if( wb_eq( erased_vec4, wb_zero() ), compact_product4, wb_sub( wb_bcast( 255 ), compact_product4 ) );
1158 0 : compact_product5 = wb_if( wb_eq( erased_vec5, wb_zero() ), compact_product5, wb_sub( wb_bcast( 255 ), compact_product5 ) );
1159 0 : compact_product6 = wb_if( wb_eq( erased_vec6, wb_zero() ), compact_product6, wb_sub( wb_bcast( 255 ), compact_product6 ) );
1160 0 : compact_product7 = wb_if( wb_eq( erased_vec7, wb_zero() ), compact_product7, wb_sub( wb_bcast( 255 ), compact_product7 ) );
1161 :
1162 0 : wb_t pi0 = exp_2( compact_product0 );
1163 0 : wb_t pi1 = exp_2( compact_product1 );
1164 0 : wb_t pi2 = exp_2( compact_product2 );
1165 0 : wb_t pi3 = exp_2( compact_product3 );
1166 0 : wb_t pi4 = exp_2( compact_product4 );
1167 0 : wb_t pi5 = exp_2( compact_product5 );
1168 0 : wb_t pi6 = exp_2( compact_product6 );
1169 0 : wb_t pi7 = exp_2( compact_product7 );
1170 0 : wb_st( output, pi0 );
1171 0 : wb_st( output + 32UL, pi1 );
1172 0 : wb_st( output + 64UL, pi2 );
1173 0 : wb_st( output + 96UL, pi3 );
1174 0 : wb_st( output + 128UL, pi4 );
1175 0 : wb_st( output + 160UL, pi5 );
1176 0 : wb_st( output + 192UL, pi6 );
1177 0 : wb_st( output + 224UL, pi7 );
1178 0 : #endif
1179 : #else /* No AVX implementation */
1180 :
1181 : gen_pi_noavx_generic( is_erased, output, 256UL, fwht_l_twiddle_256 );
1182 :
1183 : #endif
1184 0 : }
|