LCOV - code coverage report
Current view: top level - ballet/pack - fd_chkdup.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 181 183 98.9 %
Date: 2024-11-13 11:58:15 Functions: 11 18 61.1 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_ballet_pack_fd_chkdup_h
       2             : #define HEADER_fd_src_ballet_pack_fd_chkdup_h
       3             : #include "../fd_ballet_base.h"
       4             : #include "../txn/fd_txn.h"
       5             : 
       6             : /* fd_chkdup declares a set of functions for ultra-HPC checking if a
       7             :    list of account addresses contains any duplicates.  It's important
       8             :    that this be fast, because a transaction containing duplicate account
       9             :    addresses fails to sanitize and is not charged a fee.  Although this
      10             :    check can (and really ought) to be done in parallel, perhaps in the
      11             :    verify tiles, right now, it's done in pack, which means it's serial
      12             :    and on the critical path.
      13             : 
      14             :    On platforms with AVX, the current implementation uses a fast initial
      15             :    check which may have false positives (thinking there are duplicates
      16             :    when there aren't).  Any transaction that fails the initial check is
      17             :    then subjected to the full, precise check.  Without AVX, all
      18             :    transactions use the slow path. */
      19             : 
      20             : /* The functions are defined in the header to facilitate inlining since
      21             :    they take 10s of cycles in the good case, but should probably be
      22             :    treated as if they were defined in a .c file. */
      23             : 
      24             : #ifndef FD_CHKDUP_IMPL
      25             : # if FD_HAS_AVX512
      26       16251 : #   define FD_CHKDUP_IMPL 2
      27             : # elif FD_HAS_AVX
      28       16502 : #   define FD_CHKDUP_IMPL 1
      29             : # else
      30             : #   define FD_CHKDUP_IMPL 0
      31             : # endif
      32             : #endif
      33             : 
      34             : 
      35             : #if FD_CHKDUP_IMPL==2
      36             : # define FD_CHKDUP_ALIGN ( 64UL)
      37             : #elif FD_CHKDUP_IMPL==1
      38             : # define FD_CHKDUP_ALIGN ( 32UL)
      39             : #elif FD_CHKDUP_IMPL==0
      40             : # define FD_CHKDUP_ALIGN ( 32UL)
      41             : #else
      42             : # error "Unrecognized value of FD_CHKDUP_IMPL"
      43             : #endif
      44             : 
      45             : 
      46             : # define FD_CHKDUP_FOOTPRINT  FD_LAYOUT_FINI( FD_LAYOUT_APPEND(                                         \
      47             :                                                              FD_LAYOUT_APPEND( FD_LAYOUT_INIT,               \
      48             :                                                              FD_CHKDUP_ALIGN, 32*FD_CHKDUP_IMPL ), \
      49             :                                                              32UL,                 (1UL<<8)*32UL    ),       \
      50             :                                                    FD_CHKDUP_ALIGN )
      51             : 
      52             : FD_STATIC_ASSERT( (1UL<<8)==2*FD_TXN_ACCT_ADDR_MAX, "hash table size" );
      53             : 
      54             : /* Fixed size (just over 8kB) and safe for declaration on the stack or
      55             :    inclusion in a struct. */
      56             : struct fd_chkdup_private;
      57             : typedef struct fd_chkdup_private fd_chkdup_t;
      58             : 
      59             : FD_PROTOTYPES_BEGIN
      60             : /* fd_chkdup_{footprint, align} return the footprint and alignment of
      61             :    the scratch memory that duplicate detection requires. */
      62           0 : static inline ulong fd_chkdup_footprint( void ) { return FD_CHKDUP_FOOTPRINT; }
      63           0 : static inline ulong fd_chkdup_align    ( void ) { return FD_CHKDUP_ALIGN;     }
      64             : 
      65             : /* fd_chkdup_new formats an appropriately sized region of memory for use
      66             :    in duplicate address detection.  shmem must point to the first byte
      67             :    of a region of memory with the appropriate alignment and footprint.
      68             :    rng must be a pointer to a local join of an RNG.  Some slots of the
      69             :    RNG will be consumed, but no interest in the RNG will be retained
      70             :    after the function returns.  Returns shmem on success and NULL on
      71             :    failure (logs details).  The only failure cases are if shmem is NULL
      72             :    or not aligned.
      73             : 
      74             :    fd_chkdup_join joins the caller to the formatted region of memory.
      75             :    Returns shmem.
      76             : 
      77             :    fd_chkdup_leave unjoins the caller to chkdup.  Returns chkdup.
      78             :    fd_chkdup_delete unformats the region of memory.  Returns a pointer
      79             :    to the unformatted memory region. */
      80             : static inline void *        fd_chkdup_new   ( void * shmem, fd_rng_t * rng );
      81             : static inline fd_chkdup_t * fd_chkdup_join  ( void * shmem                 );
      82             : static inline void *        fd_chkdup_leave ( fd_chkdup_t * chkdup         );
      83             : static inline void *        fd_chkdup_delete( void * shmem                 );
      84             : 
      85             : 
      86             : /* fd_chkdup_check{,_slow,_fast} check a list of account addresses for
      87             :    any duplicate addresses, i.e. an account address that appears twice
      88             :    in the list.  The list does not need to be sorted or have any
      89             :    particular order.  The list may be decomposed into two sublists
      90             :    (list0 and list1) to facilitate 0-copy usage with address lookup
      91             :    tables, but list0 and list1 are logically concatenated prior to
      92             :    checking for duplicates.
      93             : 
      94             :    chkdup is a pointer to a valid local join of a chkdup object.
      95             : 
      96             :    list0 and list1 point to the first account address of the respective
      97             :    sublists.  The memory they point to need not have any particular
      98             :    alignment.  list0==NULL is okay only if list0_cnt==0, and similarly
      99             :    for list1.  list0 is accessed with indices [0, list0_cnt) and list1
     100             :    is accessed with indices [0, list1_cnt).  list0 and list1 must not
     101             :    overlap.  Requires list0_cnt+list1_cnt<=128, and the function is
     102             :    somewhat tuned for smaller values.
     103             : 
     104             :    fd_chkdup_check and the _slow version return 1 if the list of
     105             :    transactions contains at least one duplicated account address and 0
     106             :    otherwise (implying each account address in the provided list is
     107             :    unique).
     108             : 
     109             :    fd_chkdup_check_fast returns 1 if the list of transactions contains
     110             :    at least one duplicated account address and typically returns 0 if
     111             :    each account address in the provided list is unique, but may
     112             :    sometimes spuriiously return 1 even without duplicates.
     113             : 
     114             :    WARNING: the _fast version MAY HAVE FALSE POSITIVES.  You probably
     115             :    want the un-suffixed version, which is precise.  It uses the fast
     116             :    version as a fast-path and then does a slower full check if the
     117             :    fast-path suggests there may be a duplicate.
     118             : 
     119             :    However, it's also worth calling out again that the _fast version
     120             :    only makes errors in one direction.  If the list contains duplicates,
     121             :    it will definitely return 1.  If it returns 0, the list definitely
     122             :    does not contain duplicates. (Those two statements are equivalent).
     123             :    */
     124             : static inline int
     125             : fd_chkdup_check     ( fd_chkdup_t          * chkdup,
     126             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     127             :                       fd_acct_addr_t const * list1,     ulong list1_cnt );
     128             : static inline int
     129             : fd_chkdup_check_slow( fd_chkdup_t          * chkdup,
     130             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     131             :                       fd_acct_addr_t const * list1,     ulong list1_cnt );
     132             : static inline int
     133             : fd_chkdup_check_fast( fd_chkdup_t          * chkdup,
     134             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     135             :                       fd_acct_addr_t const * list1,     ulong list1_cnt );
     136             : 
     137             : 
     138             : /*   -----  Implementation details and discussion follow------
     139             : 
     140             :    The fast path implementation is somewhat interesting.  The normal way
     141             :    to do this is with a Bloom filter, but Bloom filters do lots of
     142             :    unpredictable reads and the pack tile is somewhat cache sensitive.
     143             :    Instead, this implements a variant on a Bloom filter that lives
     144             :    entirely in AVX registers.
     145             : 
     146             :    Basically, we use C W-bit words stored in (C*W)/256 AVX2 registers or
     147             :    (C*W)/512 AVX512 registers.  Each word is a modified Bloom filter
     148             :    with one associated hash function.  For each account address, we
     149             :    compute C hashes, giving (up to) C positions across the AVX
     150             :    registers.  If any of those positions have an unset bit, then we know
     151             :    we have not seen the account address before.  Finally, we then set
     152             :    all the positions.
     153             : 
     154             :    The only difference between this idea and a normal Bloom filter is
     155             :    that sometimes the hash function may not select a bit.  There's a
     156             :    tradeoff to be made here: suppose you insert R account addresses, and
     157             :    R is at least almost as large as W.  Then each word fills up, and
     158             :    false positives become increasingly likely.  Only testing and
     159             :    inserting a bit, say, half the time, effectively halves C, but
     160             :    prevents each word from getting saturated as quickly, and makes the
     161             :    algorithm effective for larger values of R.  We use Intel's quirky
     162             :    behavior to get this for free.
     163             : 
     164             :    (Note: The R and C notation is supposed to suggest a tabular layout
     165             :    in which the account addresses are rows and the words are columns.)
     166             : 
     167             :    The key insight into making this work quickly is that the vpsllv{d,q}
     168             :    variable left shift instructions are cheap (1 cycle, can execute on
     169             :    port 0 or 1 for AVX2, still 1 cycle on port 0 for AVX512).  If we can
     170             :    compute the hashes in parallel with SIMD, then we can variably shift
     171             :    bcast(0x1) quickly, and select several bits at a time.  The rest is
     172             :    just bitwise logic, which is extremely cheap with AVX.  This approach
     173             :    constrains W to be either 32 or 64, and C to be a multiple of the
     174             :    number of words in a vector, but those are both pretty acceptable
     175             :    constraints.
     176             : 
     177             :    The final ingredient is a cheap hash function that places the hashes
     178             :    in the appropriate position for vpsllv.  We just xor the account
     179             :    address with some validator-specific entropy and use a mask to select
     180             :    certain bits.
     181             : 
     182             :    The slow-path implementation uses a hash table to check for
     183             :    duplicates.  This is slower than sorting for transactions with only a
     184             :    few account addresses, but substantially faster than sorting for
     185             :    transactions with large numbers of account addresses, which is when
     186             :    the slow-path matters more anyway.
     187             : 
     188             : 
     189             :    You can see from above there are a variety of knobs to tune.  Should
     190             :    W be 32 or 64?  How big should C be?  How many bits should we mask
     191             :    off for the hash, which controls the frequency with which we skip a
     192             :    word, neither checking nor inserting a bit?  It would be good to have
     193             :    a rigorous understanding of the false positive rate as a function of
     194             :    these parameters so we can make a decision that minimizes the
     195             :    expected compute required.
     196             : 
     197             :    Unfortunately, the false positive computation is tricky.  The key
     198             :    difficulty is that whether processing account address r results in a
     199             :    false positive is not independent from whether processing account
     200             :    address r+1 results in a false positive.  This leads to an obnoxious
     201             :    inclusion-exclusion formula which quickly becomes more unwieldy than
     202             :    I (or Sage) can handle.
     203             : 
     204             :    A dynamic-programming-ish algorithm can compute the false positive
     205             :    rate in approximately O(R*2^R) time.  To start, we just want to
     206             :    understand a single word/column.  Suppose k bits in the word have
     207             :    been set.  Then there are (W-k) hash values that set a new bit, and
     208             :    (V+k) hash values that don't set a new, where V is the number of hash
     209             :    values that don't select a bit.  The hash values that set a bit are
     210             :    also exactly the ones that provide information that the account
     211             :    address is not a false positive.  I think about this as "spending a
     212             :    bit" to know it's not a false positive.
     213             : 
     214             :    Denoting the rows at which we spend a bit by a 1 and the rows at
     215             :    which we don't spend a bit by 0, we might get a column like:
     216             : 
     217             :                                 1
     218             :                                 0
     219             :                                 1
     220             :                                 1
     221             :                                 0.
     222             :    The number of ways this column can occur is x1 =
     223             :    W*(V+1)*(W-1)*(W-2)*(V+3), which means that the probability the
     224             :    column occurs is x1/(W+V)^5.  Expanding to multiple columns is easy,
     225             :    since the number of ways that two specific columns can occur is just
     226             :    the product of the number of ways each can occur.  For example,
     227             :                               1 0
     228             :                               0 1
     229             :                               1 1
     230             :                               1 0
     231             :                               0 0
     232             :    can occur x1 * x2 ways, where x2 = V*W*(W-1)*(V+2)*(V+2).
     233             :    A false positive happens when there's a row of all 0s, as in the last
     234             :    row of the example.
     235             : 
     236             :    It's cleaner to count the number of ways not to get a false positive.
     237             :    This gives us the inclusion-exclusion formula:
     238             :    (all ways)
     239             :         - (all ways where row 0 is all 0s)
     240             :         - (all ways where row 1 is all 0s)
     241             :         - ...
     242             :         + (all ways where rows 0 and 1 are both all 0s)
     243             :         + (all ways where rows 0 and 2 are both all 0s)
     244             :         + ...
     245             :         - (all ways where rows 0, 1, and 2 are all 0s)
     246             :         - (all ways where rows 0, 1, and 3 are all 0s)
     247             :         +, - ...
     248             :         + (-1)^R (all ways in which all rows are all 0s).
     249             : 
     250             :    There's a nice way to understand each of these terms.  For example,
     251             :    in the R=2, C=3 case, the term in which row 0 is all 0s has the
     252             :    following elements:
     253             :     0 0 0   0 0 0   0 0 0   0 0 0   0 0 0   0 0 0   0 0 0   0 0 0
     254             :     0 0 0   0 0 1   0 1 0   0 1 1   1 0 0   1 0 1   1 1 0   1 1 1
     255             :    Rather than enumerating all 2^( (R-1)*C ) elements, we'll represent
     256             :    it as
     257             :                       ( 0  + 0 )^3
     258             :                       ( 0    1 )
     259             : 
     260             :    Skipping some steps, the task boils down to counting the number of
     261             :    ways to get columns that match a mask, then raising that to the Cth
     262             :    power.
     263             : 
     264             :    Now, with all this behind us, our goal is to pick the optimal value
     265             :    of W,V, and C given a supposed distribution of transactions, and a
     266             :    performance model.  Based on some back of the envelope calculations
     267             :    based on instruction throughput and latency measurements and
     268             :    confirmed by some experiments, the fast path code takes about 3.5*R
     269             :    cycles if using one AVX2 vector (W*C==256) and
     270             :    2.5*R+2*ceil(W*C/256)*R cycles otherwise.  The slow path takes about
     271             :    133*J cycles.  Then the expected value of the number of cycles it
     272             :    takes to process a transaction with R accounts is
     273             :      R*(2.5+2*ceil(W*C/256) - [W*C<=256]) + FP_{W,V,R,C}*133*R
     274             : 
     275             :    Based on a sample of 100,000 slots containing about 100M
     276             :    transactions, the CDF looks like
     277             : 
     278             :      Fraction of transactions containing <=  3 account addresses     71%
     279             :                                          <= 13 account addresses     80%
     280             :                                          <= 24 account addresses     91%
     281             :                                          <= 31 account addresses     95%
     282             :                                          <= 44 account addresses     98%
     283             :                                          <= 50 account addresses     99%
     284             : 
     285             :    Basically, there's a peak at 3 (votes), and then a very long, very
     286             :    fat tail.  When using AVX2, it basically boils down into 2 regimes:
     287             :           0 <= R <  28            W=32, C=8,  V=0  (one AVX vector)
     288             :          28 <= R <= 64            W=32, C=32, V=32 (four AVX vectors)
     289             : 
     290             :    This combination has an expected value of about 54 cycles over all
     291             :    transactions.  For a typical transaction with 3 account addresses,
     292             :    this takes about 10 cycles and the false positive probability is
     293             :    about 2e-10.
     294             : 
     295             :    When using AVX512, the regimes are similar:
     296             :           0 <= R <  36            W=32, C=16, V=0  (one AVX vector)
     297             :          36 <= R <= 64            W=32, C=32, V=32 (two AVX vectors)
     298             :   This combination has an expected value of about 33 cycles over all
     299             :   transactions.  Again, the typical 3 account address account takes
     300             :   about 9 cycles and has a negligible false positive probability. */
     301             : 
     302             : 
     303             : struct fd_chkdup_waddr {
     304             :   fd_acct_addr_t key; /* account address */
     305             : };
     306             : typedef struct fd_chkdup_waddr fd_chkdup_waddr_t;
     307             : static const fd_acct_addr_t chkdup_null_addr = {{ 0 }};
     308             : 
     309             : #define MAP_NAME              fd_chkdup_pmap
     310  1603396854 : #define MAP_T                 fd_chkdup_waddr_t
     311  3456007446 : #define MAP_KEY_T             fd_acct_addr_t
     312  1603490565 : #define MAP_KEY_NULL          chkdup_null_addr
     313  5059406922 : #define MAP_KEY_INVAL(k)      MAP_KEY_EQUAL(k, chkdup_null_addr)
     314  5187856605 : #define MAP_KEY_EQUAL(k0,k1)  (!memcmp((k0).b,(k1).b, FD_TXN_ACCT_ADDR_SZ))
     315             : #define MAP_KEY_EQUAL_IS_SLOW 1
     316             : #define MAP_MEMOIZE           0
     317  1724356737 : #define MAP_KEY_HASH(key)     ((uint)fd_ulong_hash( fd_ulong_load_8( (key).b ) ))
     318  3577160829 : #define MAP_LG_SLOT_CNT 8
     319             : #include "../../util/tmpl/fd_map.c"
     320             : 
     321             : 
     322             : struct __attribute__((aligned(FD_CHKDUP_ALIGN))) fd_chkdup_private {
     323             : #if FD_CHKDUP_IMPL >= 1
     324             :   uchar  entropy[ 32*FD_CHKDUP_IMPL ];
     325             : #endif
     326             : 
     327             :   fd_chkdup_waddr_t hashmap[ 1UL<<8 ];
     328             : };
     329             : 
     330             : static inline void *
     331             : fd_chkdup_new( void     * shmem,
     332         750 :                fd_rng_t * rng   ) {
     333         750 :   fd_chkdup_t * chkdup = (fd_chkdup_t *)shmem;
     334         750 : #if FD_CHKDUP_IMPL >= 1
     335       32750 :   for( ulong i=0UL; i<32*FD_CHKDUP_IMPL; i++ ) chkdup->entropy[ i ] = fd_rng_uchar( rng );
     336             : #else
     337             :   (void)rng;
     338             : #endif
     339         750 :   FD_TEST( fd_chkdup_pmap_footprint()==sizeof(chkdup->hashmap) ); /* Known at compile time */
     340             : 
     341         750 :   fd_chkdup_pmap_new( chkdup->hashmap );
     342         750 :   return chkdup;
     343         750 : }
     344             : 
     345         228 : static inline fd_chkdup_t * fd_chkdup_join  ( void * shmem ) { return (fd_chkdup_t *)shmem; }
     346             : 
     347         228 : static inline void * fd_chkdup_leave ( fd_chkdup_t * chkdup ) { return (void *)chkdup; }
     348         228 : static inline void * fd_chkdup_delete( void        * shmem  ) { return         shmem;  }
     349             : 
     350             : 
     351             : static inline int
     352             : fd_chkdup_check     ( fd_chkdup_t          * chkdup,
     353             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     354    82622577 :                       fd_acct_addr_t const * list1,     ulong list1_cnt ) {
     355    82622577 :   if( FD_LIKELY( 0==fd_chkdup_check_fast( chkdup, list0, list0_cnt, list1, list1_cnt ) ) ) return 0;
     356      401587 :   return fd_chkdup_check_slow( chkdup, list0, list0_cnt, list1, list1_cnt );
     357    82622577 : }
     358             : 
     359             : static inline int
     360             : fd_chkdup_check_slow( fd_chkdup_t          * chkdup,
     361             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     362    69451504 :                       fd_acct_addr_t const * list1,     ulong list1_cnt ) {
     363    69451504 :   fd_chkdup_waddr_t * map = fd_chkdup_pmap_join( chkdup->hashmap );
     364    69451504 :   fd_chkdup_waddr_t * inserted[ FD_TXN_ACCT_ADDR_MAX ];
     365    69451504 :   ulong inserted_cnt = 0UL;
     366             : 
     367    69451504 :   int any_duplicates = 0;
     368    69451504 :   int skipped_inval = 0;
     369  1162727608 :   for( ulong i0=0UL; (i0<list0_cnt) & !any_duplicates; i0++ ) {
     370  1093276104 :     if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list0[ i0 ] ) ) ) {
     371             :       /* Okay if this is the 1st, but not if the 2nd */
     372        2676 :       any_duplicates |= skipped_inval;
     373        2676 :       skipped_inval   = 1;
     374        2676 :       continue;
     375        2676 :     }
     376  1093273428 :     fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list0[ i0 ] );
     377  1093273428 :     inserted[ inserted_cnt++ ] = ins;
     378  1093273428 :     any_duplicates |=        (NULL==ins);
     379  1093273428 :     inserted_cnt   -= (ulong)(NULL==ins); /* Correct inserted_cnt if we just stored a NULL */
     380  1093273428 :   }
     381   579574876 :   for( ulong i1=0UL; (i1<list1_cnt) & !any_duplicates; i1++ ) {
     382   510123372 :     if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list1[ i1 ] ) ) ) {
     383         696 :       any_duplicates |= skipped_inval;
     384         696 :       skipped_inval   = 1;
     385         696 :       continue;
     386         696 :     }
     387   510122676 :     fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list1[ i1 ] );
     388   510122676 :     inserted[ inserted_cnt++ ] = ins;
     389   510122676 :     any_duplicates |=        (NULL==ins);
     390   510122676 :     inserted_cnt   -= (ulong)(NULL==ins);
     391   510122676 :   }
     392             : 
     393             :   /* FIXME: This depends on undocumented map behavior for correctness.
     394             :      Deleting in the opposite order of insertion preserves previously
     395             :      inserted pointers. That behavior should be documented. */
     396  1672750069 :   for( ulong i=0UL; i<inserted_cnt; i++ ) fd_chkdup_pmap_remove( map, inserted[ inserted_cnt-1UL-i ] );
     397             : 
     398    69451504 :   fd_chkdup_pmap_leave( map );
     399             : 
     400    69451504 :   return any_duplicates;
     401    69451504 : }
     402             : 
     403             : 
     404             : #if FD_CHKDUP_IMPL==1
     405             : 
     406             : /* AVX2 implementation */
     407             : #include "../../util/simd/fd_avx.h"
     408             : static inline int
     409             : fd_chkdup_check_fast( fd_chkdup_t          * chkdup,
     410             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     411   103114996 :                       fd_acct_addr_t const * list1,     ulong list1_cnt ) {
     412   103114996 :   if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;
     413             : 
     414    83004916 :   int any_duplicates = 0;
     415             : 
     416    83004916 :   const wu_t entropy = wb_ld( chkdup->entropy );
     417    83004916 :   const wu_t one     = wu_bcast( 1U );
     418             : 
     419             : 
     420    83004916 :   if( FD_LIKELY( list0_cnt+list1_cnt<28UL ) ) {
     421             :     /* Single vector implementation */
     422    46940140 :     const wu_t mask = wu_bcast( 0x1FU );
     423             : 
     424    46940140 :     wu_t bloom = wu_zero();
     425   534097890 :     for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
     426   487157750 :       wu_t addr    = wb_ldu( list0+i0 );
     427   487157750 :       wu_t masked  = wu_and( wu_xor( addr, entropy ), mask );
     428   487157750 :       wu_t select  = wu_shl_vector( one, masked );
     429             :       /* testc: "Compute the bitwise NOT of a and then AND with b, and
     430             :          [return] 1 if the result is zero." */
     431   487157750 :       any_duplicates |= _mm256_testc_si256( bloom, select );
     432   487157750 :       bloom = wu_or( bloom, select );
     433   487157750 :     }
     434   222949704 :     for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
     435   176009564 :       wu_t addr    = wb_ldu( list1+i1 );
     436   176009564 :       wu_t masked  = wu_and( wu_xor( addr, entropy ), mask );
     437   176009564 :       wu_t select  = wu_shl_vector( one, masked );
     438   176009564 :       any_duplicates |= _mm256_testc_si256( bloom, select );
     439   176009564 :       bloom = wu_or( bloom, select );
     440   176009564 :     }
     441    46940140 :     return any_duplicates;
     442             : 
     443    46940140 :   } else {
     444             :     /* 4-vector implementation: slower but much better false positive
     445             :        rate so that we don't have to fall back to the slow path as
     446             :        frequently. */
     447    36064776 :     const wu_t mask = wu_bcast( 0x3FU );
     448             : 
     449    36064776 :     wu_t bloom0 = wu_zero();                                wu_t bloom1 = wu_zero();
     450    36064776 :     wu_t bloom2 = wu_zero();                                wu_t bloom3 = wu_zero();
     451  1014851650 :     for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
     452   978786874 :       wu_t addr    = wb_ldu( list0+i0 );
     453   978786874 :       wu_t blinded = wu_xor( addr, entropy );
     454   978786874 :       wu_t masked0 = wu_and( mask, blinded );               wu_t masked1 = wu_and( mask, wu_shr( blinded,  6 ) );
     455   978786874 :       wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
     456   978786874 :       wu_t select0 = wu_shl_vector( one, masked0 );         wu_t select1 = wu_shl_vector( one, masked1 );
     457   978786874 :       wu_t select2 = wu_shl_vector( one, masked2 );         wu_t select3 = wu_shl_vector( one, masked3 );
     458             : 
     459   978786874 :       wu_t any_differences = wu_or(
     460   978786874 :            wu_or( wu_andnot( bloom0, select0 ),             wu_andnot( bloom1, select1 ) ),
     461   978786874 :            wu_or( wu_andnot( bloom2, select2 ),             wu_andnot( bloom3, select3 ) ) );
     462             : 
     463   978786874 :       bloom0  = wu_or( bloom0, select0 );                   bloom1  = wu_or( bloom1, select1 );
     464   978786874 :       bloom2  = wu_or( bloom2, select2 );                   bloom3  = wu_or( bloom3, select3 );
     465             : 
     466   978786874 :       any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
     467   978786874 :       FD_COMPILER_FORGET( any_duplicates );
     468   978786874 :     }
     469   528234226 :     for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
     470   492169450 :       wu_t addr    = wb_ldu( list1+i1 );
     471   492169450 :       wu_t blinded = wu_xor( addr, entropy );
     472   492169450 :       wu_t masked0 = wu_and( mask, blinded );               wu_t masked1 = wu_and( mask, wu_shr( blinded,  6 ) );
     473   492169450 :       wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
     474   492169450 :       wu_t select0 = wu_shl_vector( one, masked0 );         wu_t select1 = wu_shl_vector( one, masked1 );
     475   492169450 :       wu_t select2 = wu_shl_vector( one, masked2 );         wu_t select3 = wu_shl_vector( one, masked3 );
     476             : 
     477   492169450 :       wu_t any_differences = wu_or(
     478   492169450 :            wu_or( wu_andnot( bloom0, select0 ),             wu_andnot( bloom1, select1 ) ),
     479   492169450 :            wu_or( wu_andnot( bloom2, select2 ),             wu_andnot( bloom3, select3 ) ) );
     480             : 
     481   492169450 :       bloom0  = wu_or( bloom0, select0 );                   bloom1  = wu_or( bloom1, select1 );
     482   492169450 :       bloom2  = wu_or( bloom2, select2 );                   bloom3  = wu_or( bloom3, select3 );
     483             : 
     484   492169450 :       any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
     485   492169450 :       FD_COMPILER_FORGET( any_duplicates );
     486   492169450 :     }
     487    36064776 :     return any_duplicates;
     488    36064776 :   }
     489    83004916 : }
     490             : 
     491             : #elif FD_CHKDUP_IMPL==2
     492             : 
     493             : /* AVX512 implementation */
     494             : #include "../../util/simd/fd_avx512.h"
     495             : static inline int
     496             : fd_chkdup_check_fast( fd_chkdup_t              * chkdup,
     497             :                       fd_acct_addr_t const     * list0,     ulong list0_cnt,
     498    51557498 :                       fd_acct_addr_t const     * list1,     ulong list1_cnt ) {
     499    51557498 :   if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;
     500             : 
     501    41502458 :   int any_duplicates = 0;
     502             : 
     503    41502458 :   const wwu_t entropy = wwu_ld( (uint const *)chkdup->entropy );
     504    41502458 :   const wwu_t one     = wwu_bcast( 1U );
     505             : 
     506    41502458 :   if( FD_LIKELY( list0_cnt+list1_cnt<36UL ) ) {
     507             :     /* One vector version */
     508             :     /* Our analysis assumed the 64 bytes of hash were all independent,
     509             :        but if we just xor and then use the low 5 bits of both parts of
     510             :        the vector, we get a lot more false positives than the math
     511             :        predicts. */
     512    31470204 :     const wwu_t mask = wwu_bcast( 0x1FU );
     513    31470204 :     wwu_t bloom = wwu_zero();
     514   493050727 :     for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
     515   461580523 :       wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
     516   461580523 :       wwu_t blinded = wwu_xor( addr, entropy );
     517   461580523 :       wwu_t masked  = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
     518   461580523 :       wwu_t select  = _mm512_rolv_epi32( one, masked );
     519   461580523 :       wwu_t next    = wwu_or( bloom, select );
     520   461580523 :       __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE ); /* if non-zero, not a duplicate */
     521   461580523 :       bloom = next;
     522             :       /* kortestz_mask8_u8: "Compute the bitwise OR of 8-bit masks a and
     523             :          b. If the result is all zeroes, [return] 1" */
     524   461580523 :       any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
     525   461580523 :       FD_COMPILER_FORGET( any_duplicates );
     526   461580523 :     }
     527   157477748 :     for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
     528   126007544 :       wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
     529   126007544 :       wwu_t blinded = wwu_xor( addr, entropy );
     530   126007544 :       wwu_t masked  = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
     531   126007544 :       wwu_t select  = _mm512_rolv_epi32( one, masked );
     532   126007544 :       wwu_t next    = wwu_or( bloom, select );
     533   126007544 :       __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE );
     534   126007544 :       bloom = next;
     535   126007544 :       any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
     536   126007544 :       FD_COMPILER_FORGET( any_duplicates );
     537   126007544 :     }
     538    31470204 :     return any_duplicates;
     539    31470204 :   } else {
     540             :     /* Two vector version */
     541    10032254 :     const wwu_t mask = wwu_bcast( 0x3FU );
     542    10032254 :     const wwu_t shift0 = wwu( 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,
     543    10032254 :                               6U, 6U, 6U, 6U, 6U, 6U, 6U, 6U );
     544    10032254 :     const wwu_t shift1 = wwu( 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U,
     545    10032254 :                               18U, 18U, 18U, 18U, 18U, 18U, 18U, 18U );
     546    10032254 :     wwu_t bloom0 = wwu_zero();            wwu_t bloom1 = wwu_zero();
     547   259426680 :     for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
     548   249394426 :       wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
     549   249394426 :       wwu_t blinded = wwu_xor( addr, entropy );
     550   249394426 :       wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) );  wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
     551   249394426 :       wwu_t select0 = wwu_shl_vector( one, masked0 );                      wwu_t select1 = wwu_shl_vector( one, masked1 );
     552   249394426 :       wwu_t next0   = wwu_or( bloom0, select0 );                           wwu_t next1   = wwu_or( bloom1, select1 );
     553   249394426 :       __mmask8 any_differences = _kor_mask8(
     554   249394426 :           _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ),           _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );
     555             : 
     556   249394426 :       bloom0 = next0;                                                      bloom1 = next1;
     557             : 
     558   249394426 :       any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
     559   249394426 :       FD_COMPILER_FORGET( any_duplicates );
     560   249394426 :     }
     561   240113566 :     for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
     562   230081312 :       wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
     563   230081312 :       wwu_t blinded = wwu_xor( addr, entropy );
     564   230081312 :       wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) );  wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
     565   230081312 :       wwu_t select0 = wwu_shl_vector( one, masked0 );                      wwu_t select1 = wwu_shl_vector( one, masked1 );
     566   230081312 :       wwu_t next0   = wwu_or( bloom0, select0 );                           wwu_t next1   = wwu_or( bloom1, select1 );
     567   230081312 :       __mmask8 any_differences = _kor_mask8(
     568   230081312 :           _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ),           _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );
     569             : 
     570   230081312 :       bloom0 = next0;                                                      bloom1 = next1;
     571             : 
     572   230081312 :       any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
     573   230081312 :       FD_COMPILER_FORGET( any_duplicates );
     574   230081312 :     }
     575    10032254 :     return any_duplicates;
     576    10032254 :   }
     577    41502458 : }
     578             : 
     579             : #else
     580             : 
     581             : static inline int
     582             : fd_chkdup_check_fast( fd_chkdup_t          * chkdup,
     583             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     584             :                       fd_acct_addr_t const * list1,     ulong list1_cnt ) {
     585             :   (void)chkdup;
     586             :   (void)list0;
     587             :   (void)list1;
     588             :   (void)list0_cnt;
     589             :   (void)list1_cnt;
     590             :   return 1;
     591             : }
     592             : 
     593             : #endif
     594             : 
     595             : 
     596             : FD_PROTOTYPES_END
     597             : 
     598             : #endif /* HEADER_fd_src_ballet_pack_fd_chkdup_h */

Generated by: LCOV version 1.14