LCOV - code coverage report
Current view: top level - ballet/base58 - fd_base58_avx.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 112 114 98.2 %
Date: 2024-11-13 11:58:15 Functions: 8 20 40.0 %

          Line data    Source code
       1             : #include "../../util/simd/fd_avx.h"
       2             : #include <immintrin.h>
       3             : 
       4             : /* This is not a proper header and so should not be included from
       5             :    anywhere besides fd_base58.c and test_base58_avx.c.  As such, it has
       6             :    no include guard. */
       7             : 
       8     6928197 : #define wuc_t __m256i
       9           0 : static inline wuc_t wuc_ld(  uchar const * p   ) { return _mm256_load_si256(  (__m256i const *)p ); }
      10      277143 : static inline wuc_t wuc_ldu( uchar const * p   ) { return _mm256_loadu_si256( (__m256i const *)p ); }
      11           0 : static inline void wuc_st(  uchar * p, wuc_t i ) { _mm256_store_si256(  (__m256i *)p, i ); }
      12          42 : static inline void wuc_stu( uchar * p, wuc_t i ) { _mm256_storeu_si256( (__m256i *)p, i ); }
      13             : 
/* Converts a vector with 4 terms in intermediate form ( digits in
   [0,58^5) ) into 20 digits of raw base58 (<58) stored in two groups of
   10 in the lower 10 bytes of each 128-bit half of the vector. */

static inline wuc_t
intermediate_to_raw( wl_t intermediate ) {

  /* The computation we need to do here mathematically is
     y=(floor(x/58^k) % 58) for various values of k.  It seems that the
     best way to compute it (at least what the compiler generates in the
     scalar case) is by computing z = floor(x/58^k). y = z -
     58*floor(z/58).  Simplifying, gives, y = floor(x/58^k) -
     58*floor(x/58^(k+1)) (Note, to see that the floors simplify like
     that, represent x in its base58 expansion and then consider that
     dividing by 58^k is just shifting right by k places.) This means we
     can reuse a lot of values!

     We can do the divisions with "magic multiplication" (i.e. multiply
     and shift).  There's a tradeoff between ILP and register pressure
     to make here: we can load a constant for each value of k and just
     compute the division directly, or we could use one constant for
     division by 58 and apply it repeatedly.  I don't know if this is
     optimal, but I use two constants, one for /58 and the other for
     /58^2.  We need to take advantage of the fact the input is
     <58^5<2^32 to produce constants that fit in uints so that we can
     use mul_epu32. */

  wl_t cA  = wl_bcast( (long)2369637129U ); /* =2^37/58 */
  wl_t cB  = wl_bcast( (long)1307386003U ); /* =2^42/58^2 */
  wl_t _58 = wl_bcast( (long)58UL );

  /* Divide each ulong in r by {58, 58^2=3364}, taking the floor of the
     division.  I used gcc to convert the division to magic
     multiplication.

     Note on DIV3364: 3364 = 4*841, so the initial wl_shru( r, 2 )
     performs the divide-by-4 exactly, and the magic multiply handles
     the remaining /841 (cB = ceil(2^40/841), which equals 2^42/3364
     rounded up and fits in a uint).  Also note _mm256_mul_epu32 only
     reads the low 32 bits of each 64-bit lane, which is fine here
     because every dividend is < 58^5 < 2^32. */

# define DIV58(r)    wl_shru( _mm256_mul_epu32( r,               cA ), 37)
# define DIV3364(r)  wl_shru( _mm256_mul_epu32( wl_shru( r, 2 ), cB ), 40)

  /* div(k) stores floor(x/58^k). rem(k) stores div(k) % 58 */
  wl_t div0 = intermediate;
  wl_t div1 = DIV58(div0);
  wl_t rem0 = wl_sub( div0, _mm256_mul_epu32( div1, _58 ) );

  wl_t div2 = DIV3364(div0);
  wl_t rem1 = wl_sub( div1, _mm256_mul_epu32( div2, _58 ) );

  wl_t div3 = DIV3364(div1);
  wl_t rem2 = wl_sub( div2, _mm256_mul_epu32( div3, _58 ) );

  wl_t div4 = DIV3364(div2);
  wl_t rem3 = wl_sub( div3, _mm256_mul_epu32( div4, _58 ) );

  wl_t rem4 = div4; /* We know the values are less than 58 at this point */

# undef DIV58
# undef DIV3364

  /* Okay, we have all 20 terms we need at this point, but they're
     spread out over 5 registers. Each value is stored as an 8B long,
     even though it's less than 58, so 7 of those bytes are 0.  That
     means we're only taking up 4 bytes in each register.  We need to
     get them to a more compact form, but the correct order (in terms of
     place value and recalling where the input vector comes from) is:
     (letters in the right column correspond to diagram below)

        the first value in rem4  (a)
        the first value in rem3  (b)
        ...
        the first value in rem0  (e)
        the second value in rem4 (f)
        ...
        the fourth value in rem0 (t)

     The fact that moves that cross the 128 bit boundary are tricky in
     AVX makes this difficult, forcing an inconvenient output format.

     First, we'll use _mm256_shuffle_epi8 to move the second value in
     each half to byte 5:

       [ a 0 0 0 0 0 0 0  f 0 0 0 0 0 0 0 | k 0 0 0 0 0 0 0  p 0 0 0 0 0 0 0 ] ->
       [ a 0 0 0 0 f 0 0  0 0 0 0 0 0 0 0 | k 0 0 0 0 p 0 0  0 0 0 0 0 0 0 0 ]

     Then for the vectors other than rem4, we'll shuffle them the same
     way, but then shift them left (which corresponds to right in the
     picture...) and OR them together.  */

  /* Shuffle index 1 selects byte 1 of each 128-bit lane, which is known
     to be zero (each digit occupies only byte 0 of its 8B slot), so it
     serves as the zero source for all the don't-care positions. */
  __m256i shuffle1 = _mm256_setr_epi8( 0, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                       0, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 );

  wuc_t shift4 =                    _mm256_shuffle_epi8( rem4, shuffle1);
  wuc_t shift3 = _mm256_slli_si256( _mm256_shuffle_epi8( rem3, shuffle1), 1);
  wuc_t shift2 = _mm256_slli_si256( _mm256_shuffle_epi8( rem2, shuffle1), 2);
  wuc_t shift1 = _mm256_slli_si256( _mm256_shuffle_epi8( rem1, shuffle1), 3);
  wuc_t shift0 = _mm256_slli_si256( _mm256_shuffle_epi8( rem0, shuffle1), 4);
  wuc_t shift  = _mm256_or_si256( _mm256_or_si256(
                                      _mm256_or_si256( shift4, shift3),
                                      _mm256_or_si256( shift2, shift1) ),
                                  shift0 );

  /* The final value is:
    [ a b c d e f g h i j 0 0 0 0 0 0 | k l m n o p q r s t 0 0 0 0 0 0 ]
    */

  return shift;
}
     119             : 
     120             : /* Converts each byte in the AVX2 register from raw base58 [0,58) to
     121             :    base58 digits ('1'-'z', with some skips).  Anything not in the range
     122             :    [0, 58) will be mapped arbitrarily, but won't affect other bytes. */
     123             : 
     124      554244 : static inline wuc_t raw_to_base58( wuc_t in ) {
     125             : 
     126             :   /* <30 cycles for two vectors (64 conversions) */
     127             :   /* We'll perform the map as an arithmetic expression,
     128             :      b58ch(x) = '1' + x + 7*[x>8] + [x>16] + [x>21] + 6*[x>32] + [x>43]
     129             :      (using Knuth bracket notation, which maps true/false to 1/0).
     130             : 
     131             :      cmpgt uses 0xFF for true and 0x00 for false.  This is very
     132             :      convenient, because most of the time we just want to skip one
     133             :      character, so we can add 1 by subtracting 0xFF (=-1). */
     134             : 
     135      554244 :   __m256i gt0 = _mm256_cmpgt_epi8( in, _mm256_set1_epi8( 8) ); /* skip 7 */
     136      554244 :   __m256i gt1 = _mm256_cmpgt_epi8( in, _mm256_set1_epi8(16) );
     137      554244 :   __m256i gt2 = _mm256_cmpgt_epi8( in, _mm256_set1_epi8(21) );
     138      554244 :   __m256i gt3 = _mm256_cmpgt_epi8( in, _mm256_set1_epi8(32) ); /* skip 6*/
     139      554244 :   __m256i gt4 = _mm256_cmpgt_epi8( in, _mm256_set1_epi8(43) );
     140             : 
     141             :   /* Intel doesn't give us an epi8 multiplication instruction, but since
     142             :      we know the input is all in {0, -1}, we can just AND both values
     143             :      with -7 to get {0, -7}. Similarly for 6. */
     144             : 
     145      554244 :   __m256i gt0_7 = _mm256_and_si256( gt0, _mm256_set1_epi8( -7 ) );
     146      554244 :   __m256i gt3_6 = _mm256_and_si256( gt3, _mm256_set1_epi8( -6 ) );
     147             : 
     148             :   /* Add up all the negative offsets. */
     149      554244 :   __m256i sum = _mm256_add_epi8(
     150      554244 :                   _mm256_add_epi8(
     151      554244 :                     _mm256_add_epi8( _mm256_set1_epi8( -'1' ), gt1 ), /* Yes, that's the negative character value of '1' */
     152      554244 :                     _mm256_add_epi8( gt2,                      gt4 ) ),
     153      554244 :                   _mm256_add_epi8(   gt0_7,                    gt3_6 ) );
     154             : 
     155      554244 :   return _mm256_sub_epi8( in, sum );
     156      554244 : }
     157             : 
     158             : /* count_leading_zeros_{n} counts the number of zero bytes prior to the
     159             :    first non-zero byte in the first n bytes.  If all n bytes are zero,
     160             :    returns n.  Return value is in [0, n].  For the two-vector cases, in0
     161             :    contains the first 32 bytes and in1 contains the second 32 bytes. */
     162             : 
     163             : static inline ulong
     164          42 : count_leading_zeros_26( wuc_t in ) {
     165          42 :   ulong mask0 = (ulong)(uint)_mm256_movemask_epi8( _mm256_cmpeq_epi8( in, _mm256_setzero_si256() ));
     166          42 :   ulong mask  = fd_ulong_mask_lsb( 27 ) ^ (mask0 & fd_ulong_mask_lsb( 26 )); /* Flips the low 26 bits and puts a 1 in bit 26 */
     167          42 :   return (ulong)fd_ulong_find_lsb( mask );
     168          42 : }
     169             : 
     170             : static inline ulong
     171      277059 : count_leading_zeros_32( wuc_t in ) {
     172      277059 :   ulong mask = fd_ulong_mask_lsb( 33 ) ^ (ulong)(uint)_mm256_movemask_epi8( _mm256_cmpeq_epi8( in, _mm256_setzero_si256() ));
     173      277059 :   return (ulong)fd_ulong_find_lsb( mask );
     174      277059 : }
     175             : 
     176             : static inline ulong
     177             : count_leading_zeros_45( wuc_t in0,
     178      277059 :                         wuc_t in1 ) {
     179      277059 :   ulong mask0 = (ulong)(uint)_mm256_movemask_epi8( _mm256_cmpeq_epi8( in0, _mm256_setzero_si256() ));
     180      277059 :   ulong mask1 = (ulong)(uint)_mm256_movemask_epi8( _mm256_cmpeq_epi8( in1, _mm256_setzero_si256() ));
     181      277059 :   ulong mask = fd_ulong_mask_lsb( 46 ) ^ (((mask1 & fd_ulong_mask_lsb( 13 ))<<32) | mask0);
     182      277059 :   return (ulong)fd_ulong_find_lsb( mask );
     183      277059 : }
     184             : 
     185             : static inline ulong
     186             : count_leading_zeros_64( wuc_t in0,
     187          84 :                         wuc_t in1 ) {
     188          84 :   ulong mask0 = (ulong)(uint)_mm256_movemask_epi8( _mm256_cmpeq_epi8( in0, _mm256_setzero_si256() ));
     189          84 :   ulong mask1 = (ulong)(uint)_mm256_movemask_epi8( _mm256_cmpeq_epi8( in1, _mm256_setzero_si256() ));
     190          84 :   ulong mask = ~((mask1<<32) | mask0);
     191          84 :   return (ulong)fd_ulong_find_lsb_w_default( mask, 64 );
     192          84 : }
     193             : 
     194             : /* ten_per_slot_down_{32,64}: Packs {45,90} raw base58 digits stored in
     195             :    the bizarre groups of 10 format from intermediate_to_raw into {2,3}
     196             :    AVX2 registers with the digits stored contiguously. */
     197             : 
     198             : /* In this diagram, one letter represents one byte.
     199             :     [ aaaaaaaaaa000000 bbbbbbbbbb000000 ]
     200             :                                         [ cccccccccc000000 dddddddddd000000 ]
     201             :                                                                             [ eeeee00000000000 0 ]
     202             :     [ aaaaaaaaaa000000 ]
     203             :     [ 0000000000bbbbbb ] ( >> 10B)
     204             :                      [ bbbb000000000000 ] (<< 6B)
     205             :                      [ 0000cccccccccc00 ] (>> 4B)
     206             :                      [ 00000000000000dd ] (>> 14B)
     207             :                                         [ dddddddd00000000 ] (<< 2)
     208             :                                         [ 00000000eeeee000 ] (>> 8)
     209             :         0                   1                   2
     210             :     In the diagram above, memory addresses increase from left to right.
     211             :     AVX instructions see the world from a little-endian perspective,
     212             :     where shifting left by one byte increases the numerical value, which
     213             :     is equivalent to moving the data one byte later in memory, which
     214             :     would show in the diagram as moving the values to the right. */
     215             : 
     216             : #define ten_per_slot_down_32( in0, in1, in2, out0, out1 )                           \
     217      277059 :   do {                                                                              \
     218      277059 :     __m128i lo0 = _mm256_extractf128_si256( in0, 0 );                               \
     219      277059 :     __m128i hi0 = _mm256_extractf128_si256( in0, 1 );                               \
     220      277059 :     __m128i lo1 = _mm256_extractf128_si256( in1, 0 );                               \
     221      277059 :     __m128i hi1 = _mm256_extractf128_si256( in1, 1 );                               \
     222      277059 :     __m128i lo2 = _mm256_extractf128_si256( in2, 0 );                               \
     223      277059 :                                                                                     \
     224      277059 :     __m128i o0 = _mm_or_si128( lo0, _mm_slli_si128( hi0, 10 ));                     \
     225      277059 :     __m128i o1 = _mm_or_si128( _mm_or_si128(                                        \
     226      277059 :                                     _mm_srli_si128( hi0, 6 ),                       \
     227      277059 :                                     _mm_slli_si128( lo1, 4 )                        \
     228      277059 :                                     ), _mm_slli_si128( hi1, 14 ));                  \
     229      277059 :     __m128i o2 = _mm_or_si128( _mm_srli_si128( hi1, 2 ), _mm_slli_si128( lo2, 8 )); \
     230      277059 :     out0 = _mm256_set_m128i( o1, o0 );                                              \
     231      277059 :     out1 = _mm256_set_m128i( _mm_setzero_si128( ), o2 );                            \
     232      277059 :   } while( 0 )
     233             : 
     234             : /* In this diagram, one letter represents one byte.
     235             :    (... snip (see diagram above) ... )
     236             :     [ eeeeeeeeee000000 ffffffffff000000 ]
     237             :                                         [ gggggggggg000000 hhhhhhhhhh000000 ]
     238             :                                                                             [ iiiiiiiiii000000 0 ]
     239             :     [ 00000000eeeeeeee ] (>> 8)
     240             :                      [ ee00000000000000 ] (<< 8)
     241             :                      [ 00ffffffffff0000 ] (>> 2)
     242             :                      [ 000000000000gggg ] (>> 12)
     243             :                                         [ gggggg0000000000 ] (<< 4)
     244             :                                         [ 000000hhhhhhhhhh ] (>> 6)
     245             :                                                            [ iiiiiiiiii000000 ]
     246             :           2               3                   4                   5
     247             : */
     248             : 
     249             : #define ten_per_slot_down_64( in0, in1, in2, in3, in4, out0, out1, out2 )           \
     250          42 :   do {                                                                              \
     251          42 :     __m128i lo0 = _mm256_extractf128_si256( in0, 0 );                               \
     252          42 :     __m128i hi0 = _mm256_extractf128_si256( in0, 1 );                               \
     253          42 :     __m128i lo1 = _mm256_extractf128_si256( in1, 0 );                               \
     254          42 :     __m128i hi1 = _mm256_extractf128_si256( in1, 1 );                               \
     255          42 :     __m128i lo2 = _mm256_extractf128_si256( in2, 0 );                               \
     256          42 :     __m128i hi2 = _mm256_extractf128_si256( in2, 1 );                               \
     257          42 :     __m128i lo3 = _mm256_extractf128_si256( in3, 0 );                               \
     258          42 :     __m128i hi3 = _mm256_extractf128_si256( in3, 1 );                               \
     259          42 :     __m128i lo4 = _mm256_extractf128_si256( in4, 0 );                               \
     260          42 :                                                                                     \
     261          42 :     __m128i o0 = _mm_or_si128( lo0, _mm_slli_si128( hi0, 10 ));                     \
     262          42 :     __m128i o1 = _mm_or_si128( _mm_or_si128(                                        \
     263          42 :                                     _mm_srli_si128( hi0, 6 ),                       \
     264          42 :                                     _mm_slli_si128( lo1, 4 )                        \
     265          42 :                                     ), _mm_slli_si128( hi1, 14 ));                  \
     266          42 :     __m128i o2 = _mm_or_si128( _mm_srli_si128( hi1, 2 ), _mm_slli_si128( lo2, 8 )); \
     267          42 :     __m128i o3 = _mm_or_si128( _mm_or_si128(                                        \
     268          42 :                                     _mm_srli_si128( lo2, 8 ),                       \
     269          42 :                                     _mm_slli_si128( hi2, 2 )                        \
     270          42 :                                     ), _mm_slli_si128( lo3, 12 ));                  \
     271          42 :     __m128i o4 = _mm_or_si128( _mm_srli_si128( lo3, 4 ), _mm_slli_si128( hi3, 6 )); \
     272          42 :     out0 = _mm256_set_m128i( o1, o0  );                                             \
     273          42 :     out1 = _mm256_set_m128i( o3, o2  );                                             \
     274          42 :     out2 = _mm256_set_m128i( lo4, o4 );                                             \
     275          42 :   } while( 0 )
     276             : 

Generated by: LCOV version 1.14