Line data Source code
1 : #include "../../util/simd/fd_avx.h"
2 : #include <immintrin.h>
3 :
4 : /* This is not a proper header and so should not be included from
5 : anywhere besides fd_base58.c and test_base58_avx.c. As such, it has
6 : no include guard. */
7 :
8 6928197 : #define wuc_t __m256i
9 0 : static inline wuc_t wuc_ld( uchar const * p ) { return _mm256_load_si256( (__m256i const *)p ); }
10 277143 : static inline wuc_t wuc_ldu( uchar const * p ) { return _mm256_loadu_si256( (__m256i const *)p ); }
11 0 : static inline void wuc_st( uchar * p, wuc_t i ) { _mm256_store_si256( (__m256i *)p, i ); }
12 42 : static inline void wuc_stu( uchar * p, wuc_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
13 :
14 : /* Converts a vector with 4 terms in intermediate form ( digits in
15 : [0,58^5) ) into 20 digits of raw base58 (<58) stored in two groups of
16 : 10 in the lower 10 bytes of each 128-bit half of the vector. */
17 :
static inline wuc_t
intermediate_to_raw( wl_t intermediate ) {

  /* The computation we need to do here mathematically is
     y=(floor(x/58^k) % 58) for various values of k.  It seems that the
     best way to compute it (at least what the compiler generates in the
     scalar case) is by computing z = floor(x/58^k). y = z -
     58*floor(z/58).  Simplifying, gives, y = floor(x/58^k) -
     58*floor(x/58^(k+1)) (Note, to see that the floors simplify like
     that, represent x in its base58 expansion and then consider that
     dividing by 58^k is just shifting right by k places.) This means we
     can reuse a lot of values!

     We can do the divisions with "magic multiplication" (i.e. multiply
     and shift).  There's a tradeoff between ILP and register pressure
     to make here: we can load a constant for each value of k and just
     compute the division directly, or we could use one constant for
     division by 58 and apply it repeatedly.  I don't know if this is
     optimal, but I use two constants, one for /58 and the other for
     /58^2.  We need to take advantage of the fact the input is
     <58^5<2^32 to produce constants that fit in uints so that we can
     use mul_epu32 (which multiplies the low 32 bits of each 64-bit
     lane, producing the full 64-bit product). */

  wl_t cA = wl_bcast( (long)2369637129U ); /* =2^37/58 */
  wl_t cB = wl_bcast( (long)1307386003U ); /* =2^42/58^2 */
  wl_t _58 = wl_bcast( (long)58UL );

  /* Divide each ulong in r by {58, 58^2=3364}, taking the floor of the
     division.  I used gcc to convert the division to magic
     multiplication.  (The pre-shift by 2 in DIV3364 comes from gcc's
     magic-number sequence for this divisor.) */

# define DIV58(r) wl_shru( _mm256_mul_epu32( r, cA ), 37)
# define DIV3364(r) wl_shru( _mm256_mul_epu32( wl_shru( r, 2 ), cB ), 40)

  /* div(k) stores floor(x/58^k). rem(k) stores div(k) % 58, so every
     rem value is in [0,58) and occupies only the low byte of its
     8-byte lane. */
  wl_t div0 = intermediate;
  wl_t div1 = DIV58(div0);
  wl_t rem0 = wl_sub( div0, _mm256_mul_epu32( div1, _58 ) );

  wl_t div2 = DIV3364(div0);
  wl_t rem1 = wl_sub( div1, _mm256_mul_epu32( div2, _58 ) );

  wl_t div3 = DIV3364(div1);
  wl_t rem2 = wl_sub( div2, _mm256_mul_epu32( div3, _58 ) );

  wl_t div4 = DIV3364(div2);
  wl_t rem3 = wl_sub( div3, _mm256_mul_epu32( div4, _58 ) );

  wl_t rem4 = div4; /* We know the values are less than 58 at this point */

# undef DIV58
# undef DIV3364

  /* Okay, we have all 20 terms we need at this point, but they're
     spread out over 5 registers.  Each value is stored as an 8B long,
     even though it's less than 58, so 7 of those bytes are 0.  That
     means we're only taking up 4 bytes in each register.  We need to
     get them to a more compact form, but the correct order (in terms of
     place value and recalling where the input vector comes from) is:
     (letters in the right column correspond to diagram below)

       the first value in rem4  (a)
       the first value in rem3  (b)
       ...
       the first value in rem0  (e)
       the second value in rem4 (f)
       ...
       the fourth value in rem0 (t)

     The fact that moves that cross the 128 bit boundary are tricky in
     AVX makes this difficult, forcing an inconvenient output format.

     First, we'll use _mm256_shuffle_epi8 to move the second value in
     each half to byte 5:

       [ a 0 0 0 0 0 0 0 f 0 0 0 0 0 0 0 | k 0 0 0 0 0 0 0 p 0 0 0 0 0 0 0 ] ->
       [ a 0 0 0 0 f 0 0 0 0 0 0 0 0 0 0 | k 0 0 0 0 p 0 0 0 0 0 0 0 0 0 0 ]

     Then for the vectors other than rem4, we'll shuffle them the same
     way, but then shift them left (which corresponds to right in the
     picture...) and OR them together. */

  /* shuffle1: output byte 0 takes input byte 0 (first value in the
     128-bit half), output byte 5 takes input byte 8 (second value), and
     every other output byte takes input byte 1, which is always 0 since
     each remainder occupies only the lowest byte of its lane.  Recall
     _mm256_shuffle_epi8 shuffles each 128-bit lane independently. */
  __m256i shuffle1 = _mm256_setr_epi8( 0, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                       0, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 );

  wuc_t shift4 = _mm256_shuffle_epi8( rem4, shuffle1);
  wuc_t shift3 = _mm256_slli_si256( _mm256_shuffle_epi8( rem3, shuffle1), 1);
  wuc_t shift2 = _mm256_slli_si256( _mm256_shuffle_epi8( rem2, shuffle1), 2);
  wuc_t shift1 = _mm256_slli_si256( _mm256_shuffle_epi8( rem1, shuffle1), 3);
  wuc_t shift0 = _mm256_slli_si256( _mm256_shuffle_epi8( rem0, shuffle1), 4);
  wuc_t shift = _mm256_or_si256( _mm256_or_si256(
                                     _mm256_or_si256( shift4, shift3),
                                     _mm256_or_si256( shift2, shift1) ),
                                 shift0 );

  /* The final value is:
     [ a b c d e f g h i j 0 0 0 0 0 0 | k l m n o p q r s t 0 0 0 0 0 0 ]
  */

  return shift;
}
119 :
120 : /* Converts each byte in the AVX2 register from raw base58 [0,58) to
121 : base58 digits ('1'-'z', with some skips). Anything not in the range
122 : [0, 58) will be mapped arbitrarily, but won't affect other bytes. */
123 :
124 554244 : static inline wuc_t raw_to_base58( wuc_t in ) {
125 :
126 : /* <30 cycles for two vectors (64 conversions) */
127 : /* We'll perform the map as an arithmetic expression,
128 : b58ch(x) = '1' + x + 7*[x>8] + [x>16] + [x>21] + 6*[x>32] + [x>43]
129 : (using Knuth bracket notation, which maps true/false to 1/0).
130 :
131 : cmpgt uses 0xFF for true and 0x00 for false. This is very
132 : convenient, because most of the time we just want to skip one
133 : character, so we can add 1 by subtracting 0xFF (=-1). */
134 :
135 554244 : __m256i gt0 = _mm256_cmpgt_epi8( in, _mm256_set1_epi8( 8) ); /* skip 7 */
136 554244 : __m256i gt1 = _mm256_cmpgt_epi8( in, _mm256_set1_epi8(16) );
137 554244 : __m256i gt2 = _mm256_cmpgt_epi8( in, _mm256_set1_epi8(21) );
138 554244 : __m256i gt3 = _mm256_cmpgt_epi8( in, _mm256_set1_epi8(32) ); /* skip 6*/
139 554244 : __m256i gt4 = _mm256_cmpgt_epi8( in, _mm256_set1_epi8(43) );
140 :
141 : /* Intel doesn't give us an epi8 multiplication instruction, but since
142 : we know the input is all in {0, -1}, we can just AND both values
143 : with -7 to get {0, -7}. Similarly for 6. */
144 :
145 554244 : __m256i gt0_7 = _mm256_and_si256( gt0, _mm256_set1_epi8( -7 ) );
146 554244 : __m256i gt3_6 = _mm256_and_si256( gt3, _mm256_set1_epi8( -6 ) );
147 :
148 : /* Add up all the negative offsets. */
149 554244 : __m256i sum = _mm256_add_epi8(
150 554244 : _mm256_add_epi8(
151 554244 : _mm256_add_epi8( _mm256_set1_epi8( -'1' ), gt1 ), /* Yes, that's the negative character value of '1' */
152 554244 : _mm256_add_epi8( gt2, gt4 ) ),
153 554244 : _mm256_add_epi8( gt0_7, gt3_6 ) );
154 :
155 554244 : return _mm256_sub_epi8( in, sum );
156 554244 : }
157 :
158 : /* count_leading_zeros_{n} counts the number of zero bytes prior to the
159 : first non-zero byte in the first n bytes. If all n bytes are zero,
160 : returns n. Return value is in [0, n]. For the two-vector cases, in0
161 : contains the first 32 bytes and in1 contains the second 32 bytes. */
162 :
163 : static inline ulong
164 42 : count_leading_zeros_26( wuc_t in ) {
165 42 : ulong mask0 = (ulong)(uint)_mm256_movemask_epi8( _mm256_cmpeq_epi8( in, _mm256_setzero_si256() ));
166 42 : ulong mask = fd_ulong_mask_lsb( 27 ) ^ (mask0 & fd_ulong_mask_lsb( 26 )); /* Flips the low 26 bits and puts a 1 in bit 26 */
167 42 : return (ulong)fd_ulong_find_lsb( mask );
168 42 : }
169 :
170 : static inline ulong
171 277059 : count_leading_zeros_32( wuc_t in ) {
172 277059 : ulong mask = fd_ulong_mask_lsb( 33 ) ^ (ulong)(uint)_mm256_movemask_epi8( _mm256_cmpeq_epi8( in, _mm256_setzero_si256() ));
173 277059 : return (ulong)fd_ulong_find_lsb( mask );
174 277059 : }
175 :
176 : static inline ulong
177 : count_leading_zeros_45( wuc_t in0,
178 277059 : wuc_t in1 ) {
179 277059 : ulong mask0 = (ulong)(uint)_mm256_movemask_epi8( _mm256_cmpeq_epi8( in0, _mm256_setzero_si256() ));
180 277059 : ulong mask1 = (ulong)(uint)_mm256_movemask_epi8( _mm256_cmpeq_epi8( in1, _mm256_setzero_si256() ));
181 277059 : ulong mask = fd_ulong_mask_lsb( 46 ) ^ (((mask1 & fd_ulong_mask_lsb( 13 ))<<32) | mask0);
182 277059 : return (ulong)fd_ulong_find_lsb( mask );
183 277059 : }
184 :
185 : static inline ulong
186 : count_leading_zeros_64( wuc_t in0,
187 84 : wuc_t in1 ) {
188 84 : ulong mask0 = (ulong)(uint)_mm256_movemask_epi8( _mm256_cmpeq_epi8( in0, _mm256_setzero_si256() ));
189 84 : ulong mask1 = (ulong)(uint)_mm256_movemask_epi8( _mm256_cmpeq_epi8( in1, _mm256_setzero_si256() ));
190 84 : ulong mask = ~((mask1<<32) | mask0);
191 84 : return (ulong)fd_ulong_find_lsb_w_default( mask, 64 );
192 84 : }
193 :
194 : /* ten_per_slot_down_{32,64}: Packs {45,90} raw base58 digits stored in
195 : the bizarre groups of 10 format from intermediate_to_raw into {2,3}
196 : AVX2 registers with the digits stored contiguously. */
197 :
198 : /* In this diagram, one letter represents one byte.
199 : [ aaaaaaaaaa000000 bbbbbbbbbb000000 ]
200 : [ cccccccccc000000 dddddddddd000000 ]
201 : [ eeeee00000000000 0 ]
202 : [ aaaaaaaaaa000000 ]
203 : [ 0000000000bbbbbb ] ( >> 10B)
204 : [ bbbb000000000000 ] (<< 6B)
205 : [ 0000cccccccccc00 ] (>> 4B)
206 : [ 00000000000000dd ] (>> 14B)
207 : [ dddddddd00000000 ] (<< 2)
208 : [ 00000000eeeee000 ] (>> 8)
209 : 0 1 2
210 : In the diagram above, memory addresses increase from left to right.
211 : AVX instructions see the world from a little-endian perspective,
212 : where shifting left by one byte increases the numerical value, which
213 : is equivalent to moving the data one byte later in memory, which
214 : would show in the diagram as moving the values to the right. */
215 :
#define ten_per_slot_down_32( in0, in1, in2, out0, out1 )                            \
  do {                                                                               \
    /* Split each input into 128-bit halves; each half carries 10 raw               \
       digits in its low 10 bytes (in2's upper half is unused; its low              \
       half carries only 5 digits).  The _mm_{s,r}lli_si128 shifts below            \
       move whole bytes within a 128-bit register. */                                \
    __m128i lo0 = _mm256_extractf128_si256( in0, 0 );                                \
    __m128i hi0 = _mm256_extractf128_si256( in0, 1 );                                \
    __m128i lo1 = _mm256_extractf128_si256( in1, 0 );                                \
    __m128i hi1 = _mm256_extractf128_si256( in1, 1 );                                \
    __m128i lo2 = _mm256_extractf128_si256( in2, 0 );                                \
                                                                                     \
    /* o0 = digits  0-15: a[0-9] b[0-5] (see diagram above) */                       \
    __m128i o0 = _mm_or_si128( lo0, _mm_slli_si128( hi0, 10 ));                      \
    /* o1 = digits 16-31: b[6-9] c[0-9] d[0-1] */                                    \
    __m128i o1 = _mm_or_si128( _mm_or_si128(                                         \
        _mm_srli_si128( hi0, 6 ),                                                    \
        _mm_slli_si128( lo1, 4 )                                                     \
        ), _mm_slli_si128( hi1, 14 ));                                               \
    /* o2 = digits 32-44: d[2-9] e[0-4]; top 3 bytes stay zero */                    \
    __m128i o2 = _mm_or_si128( _mm_srli_si128( hi1, 2 ), _mm_slli_si128( lo2, 8 ));  \
    out0 = _mm256_set_m128i( o1, o0 );                                               \
    out1 = _mm256_set_m128i( _mm_setzero_si128( ), o2 );                             \
  } while( 0 )
233 :
234 : /* In this diagram, one letter represents one byte.
235 : (... snip (see diagram above) ... )
236 : [ eeeeeeeeee000000 ffffffffff000000 ]
237 : [ gggggggggg000000 hhhhhhhhhh000000 ]
238 : [ iiiiiiiiii000000 0 ]
239 : [ 00000000eeeeeeee ] (>> 8)
240 : [ ee00000000000000 ] (<< 8)
241 : [ 00ffffffffff0000 ] (>> 2)
242 : [ 000000000000gggg ] (>> 12)
243 : [ gggggg0000000000 ] (<< 4)
244 : [ 000000hhhhhhhhhh ] (>> 6)
245 : [ iiiiiiiiii000000 ]
246 : 2 3 4 5
247 : */
248 :
#define ten_per_slot_down_64( in0, in1, in2, in3, in4, out0, out1, out2 )            \
  do {                                                                               \
    /* Split each input into 128-bit halves; each half carries 10 raw               \
       digits in its low 10 bytes (in4's upper half is unused).  The                \
       _mm_{s,r}lli_si128 shifts below move whole bytes within a                    \
       128-bit register. */                                                          \
    __m128i lo0 = _mm256_extractf128_si256( in0, 0 );                                \
    __m128i hi0 = _mm256_extractf128_si256( in0, 1 );                                \
    __m128i lo1 = _mm256_extractf128_si256( in1, 0 );                                \
    __m128i hi1 = _mm256_extractf128_si256( in1, 1 );                                \
    __m128i lo2 = _mm256_extractf128_si256( in2, 0 );                                \
    __m128i hi2 = _mm256_extractf128_si256( in2, 1 );                                \
    __m128i lo3 = _mm256_extractf128_si256( in3, 0 );                                \
    __m128i hi3 = _mm256_extractf128_si256( in3, 1 );                                \
    __m128i lo4 = _mm256_extractf128_si256( in4, 0 );                                \
                                                                                     \
    /* o0 = digits  0-15: a[0-9] b[0-5] (see diagrams above) */                      \
    __m128i o0 = _mm_or_si128( lo0, _mm_slli_si128( hi0, 10 ));                      \
    /* o1 = digits 16-31: b[6-9] c[0-9] d[0-1] */                                    \
    __m128i o1 = _mm_or_si128( _mm_or_si128(                                         \
        _mm_srli_si128( hi0, 6 ),                                                    \
        _mm_slli_si128( lo1, 4 )                                                     \
        ), _mm_slli_si128( hi1, 14 ));                                               \
    /* o2 = digits 32-47: d[2-9] e[0-7] */                                           \
    __m128i o2 = _mm_or_si128( _mm_srli_si128( hi1, 2 ), _mm_slli_si128( lo2, 8 ));  \
    /* o3 = digits 48-63: e[8-9] f[0-9] g[0-3] */                                    \
    __m128i o3 = _mm_or_si128( _mm_or_si128(                                         \
        _mm_srli_si128( lo2, 8 ),                                                    \
        _mm_slli_si128( hi2, 2 )                                                     \
        ), _mm_slli_si128( lo3, 12 ));                                               \
    /* o4 = digits 64-79: g[4-9] h[0-9]; lo4 supplies digits 80-89 */                \
    __m128i o4 = _mm_or_si128( _mm_srli_si128( lo3, 4 ), _mm_slli_si128( hi3, 6 ));  \
    out0 = _mm256_set_m128i( o1, o0 );                                               \
    out1 = _mm256_set_m128i( o3, o2 );                                               \
    out2 = _mm256_set_m128i( lo4, o4 );                                              \
  } while( 0 )
276 :
|