LCOV - code coverage report
Current view: top level - disco/pack - fd_chkdup.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 181 183 98.9 %
Date: 2025-07-01 05:00:49 Functions: 16 27 59.3 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_ballet_pack_fd_chkdup_h
       2             : #define HEADER_fd_src_ballet_pack_fd_chkdup_h
       3             : #include "../../ballet/fd_ballet_base.h"
       4             : #include "../../ballet/txn/fd_txn.h"
       5             : 
       6             : /* fd_chkdup declares a set of functions for ultra-HPC checking if a
       7             :    list of account addresses contains any duplicates.  It's important
       8             :    that this be fast, because a transaction containing duplicate account
       9             :    addresses fails to sanitize and is not charged a fee.  Although this
      10             :    check can (and really ought) to be done in parallel, perhaps in the
      11             :    verify tiles, right now, it's done in pack, which means it's serial
      12             :    and on the critical path.
      13             : 
      14             :    On platforms with AVX, the current implementation uses a fast initial
      15             :    check which may have false positives (thinking there are duplicates
      16             :    when there aren't).  Any transaction that fails the initial check is
      17             :    then subjected to the full, precise check.  Without AVX, all
      18             :    transactions use the slow path. */
      19             : 
      20             : /* The functions are defined in the header to facilitate inlining since
      21             :    they take 10s of cycles in the good case, but should probably be
      22             :    treated as if they were defined in a .c file. */
      23             : 
      24             : #ifndef FD_CHKDUP_IMPL
      25             : # if FD_HAS_AVX512
      26             : #   include "../../util/simd/fd_avx.h"
      27       16641 : #   define FD_CHKDUP_IMPL 2
      28             : # elif FD_HAS_AVX
      29       16898 : #   define FD_CHKDUP_IMPL 1
      30             : # else
      31             : #   define FD_CHKDUP_IMPL 0
      32             : # endif
      33             : #endif
      34             : 
      35             : 
      36             : #if FD_CHKDUP_IMPL==2
      37             : # define FD_CHKDUP_ALIGN ( 64UL)
      38             : #elif FD_CHKDUP_IMPL==1
      39             : # define FD_CHKDUP_ALIGN ( 32UL)
      40             : #elif FD_CHKDUP_IMPL==0
      41             : # define FD_CHKDUP_ALIGN ( 32UL)
      42             : #else
      43             : # error "Unrecognized value of FD_CHKDUP_IMPL"
      44             : #endif
      45             : 
      46             : 
      47             : # define FD_CHKDUP_FOOTPRINT  FD_LAYOUT_FINI( FD_LAYOUT_APPEND(                                         \
      48             :                                                              FD_LAYOUT_APPEND( FD_LAYOUT_INIT,               \
      49             :                                                              FD_CHKDUP_ALIGN, 32*FD_CHKDUP_IMPL ), \
      50             :                                                              32UL,                 (1UL<<8)*32UL    ),       \
      51             :                                                    FD_CHKDUP_ALIGN )
      52             : 
      53             : FD_STATIC_ASSERT( (1UL<<8)==2*FD_TXN_ACCT_ADDR_MAX, "hash table size" );
      54             : 
      55             : /* Fixed size (just over 8kB) and safe for declaration on the stack or
      56             :    inclusion in a struct. */
      57             : struct fd_chkdup_private;
      58             : typedef struct fd_chkdup_private fd_chkdup_t;
      59             : 
      60             : FD_PROTOTYPES_BEGIN
      61             : /* fd_chkdup_{footprint, align} return the footprint and alignment of
      62             :    the scratch memory that duplicate detection requires. */
      63           0 : static inline ulong fd_chkdup_footprint( void ) { return FD_CHKDUP_FOOTPRINT; }
      64           0 : static inline ulong fd_chkdup_align    ( void ) { return FD_CHKDUP_ALIGN;     }
      65             : 
      66             : /* fd_chkdup_new formats an appropriately sized region of memory for use
      67             :    in duplicate address detection.  shmem must point to the first byte
      68             :    of a region of memory with the appropriate alignment and footprint.
      69             :    rng must be a pointer to a local join of an RNG.  Some slots of the
      70             :    RNG will be consumed, but no interest in the RNG will be retained
      71             :    after the function returns.  Returns shmem on success and NULL on
      72             :    failure (logs details).  The only failure cases are if shmem is NULL
      73             :    or not aligned.
      74             : 
      75             :    fd_chkdup_join joins the caller to the formatted region of memory.
      76             :    Returns shmem.
      77             : 
      78             :    fd_chkdup_leave unjoins the caller to chkdup.  Returns chkdup.
      79             :    fd_chkdup_delete unformats the region of memory.  Returns a pointer
      80             :    to the unformatted memory region. */
      81             : static inline void *        fd_chkdup_new   ( void * shmem, fd_rng_t * rng );
      82             : static inline fd_chkdup_t * fd_chkdup_join  ( void * shmem                 );
      83             : static inline void *        fd_chkdup_leave ( fd_chkdup_t * chkdup         );
      84             : static inline void *        fd_chkdup_delete( void * shmem                 );
      85             : 
      86             : 
      87             : /* fd_chkdup_check{,_slow,_fast} check a list of account addresses for
      88             :    any duplicate addresses, i.e. an account address that appears twice
      89             :    in the list.  The list does not need to be sorted or have any
      90             :    particular order.  The list may be decomposed into two sublists
      91             :    (list0 and list1) to facilitate 0-copy usage with address lookup
      92             :    tables, but list0 and list1 are logically concatenated prior to
      93             :    checking for duplicates.
      94             : 
      95             :    chkdup is a pointer to a valid local join of a chkdup object.
      96             : 
      97             :    list0 and list1 point to the first account address of the respective
      98             :    sublists.  The memory they point to need not have any particular
      99             :    alignment.  list0==NULL is okay only if list0_cnt==0, and similarly
     100             :    for list1.  list0 is accessed with indices [0, list0_cnt) and list1
     101             :    is accessed with indices [0, list1_cnt).  list0 and list1 must not
     102             :    overlap.  Requires list0_cnt+list1_cnt<=128, and the function is
     103             :    somewhat tuned for smaller values.
     104             : 
     105             :    fd_chkdup_check and the _slow version return 1 if the list of
     106             :    transactions contains at least one duplicated account address and 0
     107             :    otherwise (implying each account address in the provided list is
     108             :    unique).
     109             : 
     110             :    fd_chkdup_check_fast returns 1 if the list of transactions contains
     111             :    at least one duplicated account address and typically returns 0 if
     112             :    each account address in the provided list is unique, but may
     113             :    sometimes spuriiously return 1 even without duplicates.
     114             : 
     115             :    WARNING: the _fast version MAY HAVE FALSE POSITIVES.  You probably
     116             :    want the un-suffixed version, which is precise.  It uses the fast
     117             :    version as a fast-path and then does a slower full check if the
     118             :    fast-path suggests there may be a duplicate.
     119             : 
     120             :    However, it's also worth calling out again that the _fast version
     121             :    only makes errors in one direction.  If the list contains duplicates,
     122             :    it will definitely return 1.  If it returns 0, the list definitely
     123             :    does not contain duplicates. (Those two statements are equivalent).
     124             :    */
     125             : static inline int
     126             : fd_chkdup_check     ( fd_chkdup_t          * chkdup,
     127             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     128             :                       fd_acct_addr_t const * list1,     ulong list1_cnt );
     129             : static inline int
     130             : fd_chkdup_check_slow( fd_chkdup_t          * chkdup,
     131             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     132             :                       fd_acct_addr_t const * list1,     ulong list1_cnt );
     133             : static inline int
     134             : fd_chkdup_check_fast( fd_chkdup_t          * chkdup,
     135             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     136             :                       fd_acct_addr_t const * list1,     ulong list1_cnt );
     137             : 
     138             : 
     139             : /*   -----  Implementation details and discussion follow------
     140             : 
     141             :    The fast path implementation is somewhat interesting.  The normal way
     142             :    to do this is with a Bloom filter, but Bloom filters do lots of
     143             :    unpredictable reads and the pack tile is somewhat cache sensitive.
     144             :    Instead, this implements a variant on a Bloom filter that lives
     145             :    entirely in AVX registers.
     146             : 
     147             :    Basically, we use C W-bit words stored in (C*W)/256 AVX2 registers or
     148             :    (C*W)/512 AVX512 registers.  Each word is a modified Bloom filter
     149             :    with one associated hash function.  For each account address, we
     150             :    compute C hashes, giving (up to) C positions across the AVX
     151             :    registers.  If any of those positions have an unset bit, then we know
     152             :    we have not seen the account address before.  Finally, we then set
     153             :    all the positions.
     154             : 
     155             :    The only difference between this idea and a normal Bloom filter is
     156             :    that sometimes the hash function may not select a bit.  There's a
     157             :    tradeoff to be made here: suppose you insert R account addresses, and
     158             :    R is at least almost as large as W.  Then each word fills up, and
     159             :    false positives become increasingly likely.  Only testing and
     160             :    inserting a bit, say, half the time, effectively halves C, but
     161             :    prevents each word from getting saturated as quickly, and makes the
     162             :    algorithm effective for larger values of R.  We use Intel's quirky
     163             :    behavior to get this for free.
     164             : 
     165             :    (Note: The R and C notation is supposed to suggest a tabular layout
     166             :    in which the account addresses are rows and the words are columns.)
     167             : 
     168             :    The key insight into making this work quickly is that the vpsllv{d,q}
     169             :    variable left shift instructions are cheap (1 cycle, can execute on
     170             :    port 0 or 1 for AVX2, still 1 cycle on port 0 for AVX512).  If we can
     171             :    compute the hashes in parallel with SIMD, then we can variably shift
     172             :    bcast(0x1) quickly, and select several bits at a time.  The rest is
     173             :    just bitwise logic, which is extremely cheap with AVX.  This approach
     174             :    constrains W to be either 32 or 64, and C to be a multiple of the
     175             :    number of words in a vector, but those are both pretty acceptable
     176             :    constraints.
     177             : 
     178             :    The final ingredient is a cheap hash function that places the hashes
     179             :    in the appropriate position for vpsllv.  We just xor the account
     180             :    address with some validator-specific entropy and use a mask to select
     181             :    certain bits.
     182             : 
     183             :    The slow-path implementation uses a hash table to check for
     184             :    duplicates.  This is slower than sorting for transactions with only a
     185             :    few account addresses, but substantially faster than sorting for
     186             :    transactions with large numbers of account addresses, which is when
     187             :    the slow-path matters more anyway.
     188             : 
     189             : 
     190             :    You can see from above there are a variety of knobs to tune.  Should
     191             :    W be 32 or 64?  How big should C be?  How many bits should we mask
     192             :    off for the hash, which controls the frequency with which we skip a
     193             :    word, neither checking nor inserting a bit?  It would be good to have
     194             :    a rigorous understanding of the false positive rate as a function of
     195             :    these parameters so we can make a decision that minimizes the
     196             :    expected compute required.
     197             : 
     198             :    Unfortunately, the false positive computation is tricky.  The key
     199             :    difficulty is that whether processing account address r results in a
     200             :    false positive is not independent from whether processing account
     201             :    address r+1 results in a false positive.  This leads to an obnoxious
     202             :    inclusion-exclusion formula which quickly becomes more unwieldy than
     203             :    I (or Sage) can handle.
     204             : 
     205             :    A dynamic-programming-ish algorithm can compute the false positive
     206             :    rate in approximately O(R*2^R) time.  To start, we just want to
     207             :    understand a single word/column.  Suppose k bits in the word have
     208             :    been set.  Then there are (W-k) hash values that set a new bit, and
     209             :    (V+k) hash values that don't set a new, where V is the number of hash
     210             :    values that don't select a bit.  The hash values that set a bit are
     211             :    also exactly the ones that provide information that the account
     212             :    address is not a false positive.  I think about this as "spending a
     213             :    bit" to know it's not a false positive.
     214             : 
     215             :    Denoting the rows at which we spend a bit by a 1 and the rows at
     216             :    which we don't spend a bit by 0, we might get a column like:
     217             : 
     218             :                                 1
     219             :                                 0
     220             :                                 1
     221             :                                 1
     222             :                                 0.
     223             :    The number of ways this column can occur is x1 =
     224             :    W*(V+1)*(W-1)*(W-2)*(V+3), which means that the probability the
     225             :    column occurs is x1/(W+V)^5.  Expanding to multiple columns is easy,
     226             :    since the number of ways that two specific columns can occur is just
     227             :    the product of the number of ways each can occur.  For example,
     228             :                               1 0
     229             :                               0 1
     230             :                               1 1
     231             :                               1 0
     232             :                               0 0
     233             :    can occur x1 * x2 ways, where x2 = V*W*(W-1)*(V+2)*(V+2).
     234             :    A false positive happens when there's a row of all 0s, as in the last
     235             :    row of the example.
     236             : 
     237             :    It's cleaner to count the number of ways not to get a false positive.
     238             :    This gives us the inclusion-exclusion formula:
     239             :    (all ways)
     240             :         - (all ways where row 0 is all 0s)
     241             :         - (all ways where row 1 is all 0s)
     242             :         - ...
     243             :         + (all ways where rows 0 and 1 are both all 0s)
     244             :         + (all ways where rows 0 and 2 are both all 0s)
     245             :         + ...
     246             :         - (all ways where rows 0, 1, and 2 are all 0s)
     247             :         - (all ways where rows 0, 1, and 3 are all 0s)
     248             :         +, - ...
     249             :         + (-1)^R (all ways in which all rows are all 0s).
     250             : 
     251             :    There's a nice way to understand each of these terms.  For example,
     252             :    in the R=2, C=3 case, the term in which row 0 is all 0s has the
     253             :    following elements:
     254             :     0 0 0   0 0 0   0 0 0   0 0 0   0 0 0   0 0 0   0 0 0   0 0 0
     255             :     0 0 0   0 0 1   0 1 0   0 1 1   1 0 0   1 0 1   1 1 0   1 1 1
     256             :    Rather than enumerating all 2^( (R-1)*C ) elements, we'll represent
     257             :    it as
     258             :                       ( 0  + 0 )^3
     259             :                       ( 0    1 )
     260             : 
     261             :    Skipping some steps, the task boils down to counting the number of
     262             :    ways to get columns that match a mask, then raising that to the Cth
     263             :    power.
     264             : 
     265             :    Now, with all this behind us, our goal is to pick the optimal value
     266             :    of W,V, and C given a supposed distribution of transactions, and a
     267             :    performance model.  Based on some back of the envelope calculations
     268             :    based on instruction throughput and latency measurements and
     269             :    confirmed by some experiments, the fast path code takes about 3.5*R
     270             :    cycles if using one AVX2 vector (W*C==256) and
     271             :    2.5*R+2*ceil(W*C/256)*R cycles otherwise.  The slow path takes about
     272             :    133*J cycles.  Then the expected value of the number of cycles it
     273             :    takes to process a transaction with R accounts is
     274             :      R*(2.5+2*ceil(W*C/256) - [W*C<=256]) + FP_{W,V,R,C}*133*R
     275             : 
     276             :    Based on a sample of 100,000 slots containing about 100M
     277             :    transactions, the CDF looks like
     278             : 
     279             :      Fraction of transactions containing <=  3 account addresses     71%
     280             :                                          <= 13 account addresses     80%
     281             :                                          <= 24 account addresses     91%
     282             :                                          <= 31 account addresses     95%
     283             :                                          <= 44 account addresses     98%
     284             :                                          <= 50 account addresses     99%
     285             : 
     286             :    Basically, there's a peak at 3 (votes), and then a very long, very
     287             :    fat tail.  When using AVX2, it basically boils down into 2 regimes:
     288             :           0 <= R <  28            W=32, C=8,  V=0  (one AVX vector)
     289             :          28 <= R <= 64            W=32, C=32, V=32 (four AVX vectors)
     290             : 
     291             :    This combination has an expected value of about 54 cycles over all
     292             :    transactions.  For a typical transaction with 3 account addresses,
     293             :    this takes about 10 cycles and the false positive probability is
     294             :    about 2e-10.
     295             : 
     296             :    When using AVX512, the regimes are similar:
     297             :           0 <= R <  36            W=32, C=16, V=0  (one AVX vector)
     298             :          36 <= R <= 64            W=32, C=32, V=32 (two AVX vectors)
     299             :   This combination has an expected value of about 33 cycles over all
     300             :   transactions.  Again, the typical 3 account address account takes
     301             :   about 9 cycles and has a negligible false positive probability. */
     302             : 
     303             : 
     304             : struct fd_chkdup_waddr {
     305             :   fd_acct_addr_t key; /* account address */
     306             : };
     307             : typedef struct fd_chkdup_waddr fd_chkdup_waddr_t;
     308             : static const fd_acct_addr_t chkdup_null_addr = {{ 0 }};
     309             : 
     310             : #define MAP_NAME              fd_chkdup_pmap
     311  1603396980 : #define MAP_T                 fd_chkdup_waddr_t
     312  3456007665 : #define MAP_KEY_T             fd_acct_addr_t
     313  1603495281 : #define MAP_KEY_NULL          chkdup_null_addr
     314  5059407249 : #define MAP_KEY_INVAL(k)      MAP_KEY_EQUAL(k, chkdup_null_addr)
     315  5187856935 : #define MAP_KEY_EQUAL(k0,k1)  (!memcmp((k0).b,(k1).b, FD_TXN_ACCT_ADDR_SZ))
     316             : #define MAP_KEY_EQUAL_IS_SLOW 1
     317             : #define MAP_MEMOIZE           0
     318  1724356845 : #define MAP_KEY_HASH(key)     ((uint)fd_ulong_hash( fd_ulong_load_8( (key).b ) ))
     319  3577166442 : #define MAP_LG_SLOT_CNT 8
     320             : #include "../../util/tmpl/fd_map.c"
     321             : 
     322             : 
     323             : struct __attribute__((aligned(FD_CHKDUP_ALIGN))) fd_chkdup_private {
     324             : #if FD_CHKDUP_IMPL >= 1
     325             :   uchar  entropy[ 32*FD_CHKDUP_IMPL ];
     326             : #endif
     327             : 
     328             :   fd_chkdup_waddr_t hashmap[ 1UL<<8 ];
     329             : };
     330             : 
     331             : static inline void *
     332             : fd_chkdup_new( void     * shmem,
     333         768 :                fd_rng_t * rng   ) {
     334         768 :   fd_chkdup_t * chkdup = (fd_chkdup_t *)shmem;
     335         768 : #if FD_CHKDUP_IMPL >= 1
     336       33536 :   for( ulong i=0UL; i<32*FD_CHKDUP_IMPL; i++ ) chkdup->entropy[ i ] = fd_rng_uchar( rng );
     337             : #else
     338             :   (void)rng;
     339             : #endif
     340         768 :   FD_TEST( fd_chkdup_pmap_footprint()==sizeof(chkdup->hashmap) ); /* Known at compile time */
     341             : 
     342         768 :   fd_chkdup_pmap_new( chkdup->hashmap );
     343         768 :   return chkdup;
     344         768 : }
     345             : 
     346         243 : static inline fd_chkdup_t * fd_chkdup_join  ( void * shmem ) { return (fd_chkdup_t *)shmem; }
     347             : 
     348         228 : static inline void * fd_chkdup_leave ( fd_chkdup_t * chkdup ) { return (void *)chkdup; }
     349         228 : static inline void * fd_chkdup_delete( void        * shmem  ) { return         shmem;  }
     350             : 
     351             : 
     352             : static inline int
     353             : fd_chkdup_check     ( fd_chkdup_t          * chkdup,
     354             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     355    82631337 :                       fd_acct_addr_t const * list1,     ulong list1_cnt ) {
     356    82631337 :   if( FD_LIKELY( 0==fd_chkdup_check_fast( chkdup, list0, list0_cnt, list1, list1_cnt ) ) ) return 0;
     357      401593 :   return fd_chkdup_check_slow( chkdup, list0, list0_cnt, list1, list1_cnt );
     358    82631337 : }
     359             : 
     360             : static inline int
     361             : fd_chkdup_check_slow( fd_chkdup_t          * chkdup,
     362             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     363    69451510 :                       fd_acct_addr_t const * list1,     ulong list1_cnt ) {
     364    69451510 :   fd_chkdup_waddr_t * map = fd_chkdup_pmap_join( chkdup->hashmap );
     365    69451510 :   fd_chkdup_waddr_t * inserted[ FD_TXN_ACCT_ADDR_MAX ];
     366    69451510 :   ulong inserted_cnt = 0UL;
     367             : 
     368    69451510 :   int any_duplicates = 0;
     369    69451510 :   int skipped_inval = 0;
     370  1162727722 :   for( ulong i0=0UL; (i0<list0_cnt) & !any_duplicates; i0++ ) {
     371  1093276212 :     if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list0[ i0 ] ) ) ) {
     372             :       /* Okay if this is the 1st, but not if the 2nd */
     373        2676 :       any_duplicates |= skipped_inval;
     374        2676 :       skipped_inval   = 1;
     375        2676 :       continue;
     376        2676 :     }
     377  1093273536 :     fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list0[ i0 ] );
     378  1093273536 :     inserted[ inserted_cnt++ ] = ins;
     379  1093273536 :     any_duplicates |=        (NULL==ins);
     380  1093273536 :     inserted_cnt   -= (ulong)(NULL==ins); /* Correct inserted_cnt if we just stored a NULL */
     381  1093273536 :   }
     382   579574882 :   for( ulong i1=0UL; (i1<list1_cnt) & !any_duplicates; i1++ ) {
     383   510123372 :     if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list1[ i1 ] ) ) ) {
     384         696 :       any_duplicates |= skipped_inval;
     385         696 :       skipped_inval   = 1;
     386         696 :       continue;
     387         696 :     }
     388   510122676 :     fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list1[ i1 ] );
     389   510122676 :     inserted[ inserted_cnt++ ] = ins;
     390   510122676 :     any_duplicates |=        (NULL==ins);
     391   510122676 :     inserted_cnt   -= (ulong)(NULL==ins);
     392   510122676 :   }
     393             : 
     394             :   /* FIXME: This depends on undocumented map behavior for correctness.
     395             :      Deleting in the opposite order of insertion preserves previously
     396             :      inserted pointers. That behavior should be documented. */
     397  1672750183 :   for( ulong i=0UL; i<inserted_cnt; i++ ) fd_chkdup_pmap_remove( map, inserted[ inserted_cnt-1UL-i ] );
     398             : 
     399    69451510 :   fd_chkdup_pmap_leave( map );
     400             : 
     401    69451510 :   return any_duplicates;
     402    69451510 : }
     403             : 
     404             : 
     405             : #if FD_CHKDUP_IMPL==1
     406             : 
     407             : /* AVX2 implementation */
     408             : #include "../../util/simd/fd_avx.h"
     409             : static inline int
     410             : fd_chkdup_check_fast( fd_chkdup_t          * chkdup,
     411             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     412   103120836 :                       fd_acct_addr_t const * list1,     ulong list1_cnt ) {
     413   103120836 :   if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;
     414             : 
     415    83010756 :   int any_duplicates = 0;
     416             : 
     417    83010756 :   const wu_t entropy = wb_ld( chkdup->entropy );
     418    83010756 :   const wu_t one     = wu_bcast( 1U );
     419             : 
     420             : 
     421    83010756 :   if( FD_LIKELY( list0_cnt+list1_cnt<28UL ) ) {
     422             :     /* Single vector implementation */
     423    46945980 :     const wu_t mask = wu_bcast( 0x1FU );
     424             : 
     425    46945980 :     wu_t bloom = wu_zero();
     426   534133132 :     for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
     427   487187152 :       wu_t addr    = wb_ldu( list0+i0 );
     428   487187152 :       wu_t masked  = wu_and( wu_xor( addr, entropy ), mask );
     429   487187152 :       wu_t select  = wu_shl_vector( one, masked );
     430             :       /* testc: "Compute the bitwise NOT of a and then AND with b, and
     431             :          [return] 1 if the result is zero." */
     432   487187152 :       any_duplicates |= _mm256_testc_si256( bloom, select );
     433   487187152 :       bloom = wu_or( bloom, select );
     434   487187152 :     }
     435   222955544 :     for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
     436   176009564 :       wu_t addr    = wb_ldu( list1+i1 );
     437   176009564 :       wu_t masked  = wu_and( wu_xor( addr, entropy ), mask );
     438   176009564 :       wu_t select  = wu_shl_vector( one, masked );
     439   176009564 :       any_duplicates |= _mm256_testc_si256( bloom, select );
     440   176009564 :       bloom = wu_or( bloom, select );
     441   176009564 :     }
     442    46945980 :     return any_duplicates;
     443             : 
     444    46945980 :   } else {
     445             :     /* 4-vector implementation: slower but much better false positive
     446             :        rate so that we don't have to fall back to the slow path as
     447             :        frequently. */
     448    36064776 :     const wu_t mask = wu_bcast( 0x3FU );
     449             : 
     450    36064776 :     wu_t bloom0 = wu_zero();                                wu_t bloom1 = wu_zero();
     451    36064776 :     wu_t bloom2 = wu_zero();                                wu_t bloom3 = wu_zero();
     452  1014851650 :     for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
     453   978786874 :       wu_t addr    = wb_ldu( list0+i0 );
     454   978786874 :       wu_t blinded = wu_xor( addr, entropy );
     455   978786874 :       wu_t masked0 = wu_and( mask, blinded );               wu_t masked1 = wu_and( mask, wu_shr( blinded,  6 ) );
     456   978786874 :       wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
     457   978786874 :       wu_t select0 = wu_shl_vector( one, masked0 );         wu_t select1 = wu_shl_vector( one, masked1 );
     458   978786874 :       wu_t select2 = wu_shl_vector( one, masked2 );         wu_t select3 = wu_shl_vector( one, masked3 );
     459             : 
     460   978786874 :       wu_t any_differences = wu_or(
     461   978786874 :            wu_or( wu_andnot( bloom0, select0 ),             wu_andnot( bloom1, select1 ) ),
     462   978786874 :            wu_or( wu_andnot( bloom2, select2 ),             wu_andnot( bloom3, select3 ) ) );
     463             : 
     464   978786874 :       bloom0  = wu_or( bloom0, select0 );                   bloom1  = wu_or( bloom1, select1 );
     465   978786874 :       bloom2  = wu_or( bloom2, select2 );                   bloom3  = wu_or( bloom3, select3 );
     466             : 
     467   978786874 :       any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
     468   978786874 :       FD_COMPILER_FORGET( any_duplicates );
     469   978786874 :     }
     470   528234226 :     for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
     471   492169450 :       wu_t addr    = wb_ldu( list1+i1 );
     472   492169450 :       wu_t blinded = wu_xor( addr, entropy );
     473   492169450 :       wu_t masked0 = wu_and( mask, blinded );               wu_t masked1 = wu_and( mask, wu_shr( blinded,  6 ) );
     474   492169450 :       wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
     475   492169450 :       wu_t select0 = wu_shl_vector( one, masked0 );         wu_t select1 = wu_shl_vector( one, masked1 );
     476   492169450 :       wu_t select2 = wu_shl_vector( one, masked2 );         wu_t select3 = wu_shl_vector( one, masked3 );
     477             : 
     478   492169450 :       wu_t any_differences = wu_or(
     479   492169450 :            wu_or( wu_andnot( bloom0, select0 ),             wu_andnot( bloom1, select1 ) ),
     480   492169450 :            wu_or( wu_andnot( bloom2, select2 ),             wu_andnot( bloom3, select3 ) ) );
     481             : 
     482   492169450 :       bloom0  = wu_or( bloom0, select0 );                   bloom1  = wu_or( bloom1, select1 );
     483   492169450 :       bloom2  = wu_or( bloom2, select2 );                   bloom3  = wu_or( bloom3, select3 );
     484             : 
     485   492169450 :       any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
     486   492169450 :       FD_COMPILER_FORGET( any_duplicates );
     487   492169450 :     }
     488    36064776 :     return any_duplicates;
     489    36064776 :   }
     490    83010756 : }
     491             : 
     492             : #elif FD_CHKDUP_IMPL==2
     493             : 
     494             : /* AVX512 implementation */
     495             : #include "../../util/simd/fd_avx512.h"
     496             : static inline int
     497             : fd_chkdup_check_fast( fd_chkdup_t              * chkdup,
     498             :                       fd_acct_addr_t const     * list0,     ulong list0_cnt,
     499    51560418 :                       fd_acct_addr_t const     * list1,     ulong list1_cnt ) {
     500    51560418 :   if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;
     501             : 
     502    41505378 :   int any_duplicates = 0;
     503             : 
     504    41505378 :   const wwu_t entropy = wwu_ld( (uint const *)chkdup->entropy );
     505    41505378 :   const wwu_t one     = wwu_bcast( 1U );
     506             : 
     507    41505378 :   if( FD_LIKELY( list0_cnt+list1_cnt<36UL ) ) {
     508             :     /* One vector version */
     509             :     /* Our analysis assumed the 64 bytes of hash were all independent,
     510             :        but if we just xor and then use the low 5 bits of both parts of
     511             :        the vector, we get a lot more false positives than the math
     512             :        predicts. */
     513    31473124 :     const wwu_t mask = wwu_bcast( 0x1FU );
     514    31473124 :     wwu_t bloom = wwu_zero();
     515   493068348 :     for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
     516   461595224 :       wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
     517   461595224 :       wwu_t blinded = wwu_xor( addr, entropy );
     518   461595224 :       wwu_t masked  = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
     519   461595224 :       wwu_t select  = _mm512_rolv_epi32( one, masked );
     520   461595224 :       wwu_t next    = wwu_or( bloom, select );
     521   461595224 :       __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE ); /* if non-zero, not a duplicate */
     522   461595224 :       bloom = next;
     523             :       /* kortestz_mask8_u8: "Compute the bitwise OR of 8-bit masks a and
     524             :          b. If the result is all zeroes, [return] 1" */
     525   461595224 :       any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
     526   461595224 :       FD_COMPILER_FORGET( any_duplicates );
     527   461595224 :     }
     528   157480668 :     for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
     529   126007544 :       wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
     530   126007544 :       wwu_t blinded = wwu_xor( addr, entropy );
     531   126007544 :       wwu_t masked  = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
     532   126007544 :       wwu_t select  = _mm512_rolv_epi32( one, masked );
     533   126007544 :       wwu_t next    = wwu_or( bloom, select );
     534   126007544 :       __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE );
     535   126007544 :       bloom = next;
     536   126007544 :       any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
     537   126007544 :       FD_COMPILER_FORGET( any_duplicates );
     538   126007544 :     }
     539    31473124 :     return any_duplicates;
     540    31473124 :   } else {
     541             :     /* Two vector version */
     542    10032254 :     const wwu_t mask = wwu_bcast( 0x3FU );
     543    10032254 :     const wwu_t shift0 = wwu( 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,
     544    10032254 :                               6U, 6U, 6U, 6U, 6U, 6U, 6U, 6U );
     545    10032254 :     const wwu_t shift1 = wwu( 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U,
     546    10032254 :                               18U, 18U, 18U, 18U, 18U, 18U, 18U, 18U );
     547    10032254 :     wwu_t bloom0 = wwu_zero();            wwu_t bloom1 = wwu_zero();
     548   259426680 :     for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
     549   249394426 :       wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
     550   249394426 :       wwu_t blinded = wwu_xor( addr, entropy );
     551   249394426 :       wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) );  wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
     552   249394426 :       wwu_t select0 = wwu_shl_vector( one, masked0 );                      wwu_t select1 = wwu_shl_vector( one, masked1 );
     553   249394426 :       wwu_t next0   = wwu_or( bloom0, select0 );                           wwu_t next1   = wwu_or( bloom1, select1 );
     554   249394426 :       __mmask8 any_differences = _kor_mask8(
     555   249394426 :           _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ),           _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );
     556             : 
     557   249394426 :       bloom0 = next0;                                                      bloom1 = next1;
     558             : 
     559   249394426 :       any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
     560   249394426 :       FD_COMPILER_FORGET( any_duplicates );
     561   249394426 :     }
     562   240113566 :     for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
     563   230081312 :       wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
     564   230081312 :       wwu_t blinded = wwu_xor( addr, entropy );
     565   230081312 :       wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) );  wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
     566   230081312 :       wwu_t select0 = wwu_shl_vector( one, masked0 );                      wwu_t select1 = wwu_shl_vector( one, masked1 );
     567   230081312 :       wwu_t next0   = wwu_or( bloom0, select0 );                           wwu_t next1   = wwu_or( bloom1, select1 );
     568   230081312 :       __mmask8 any_differences = _kor_mask8(
     569   230081312 :           _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ),           _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );
     570             : 
     571   230081312 :       bloom0 = next0;                                                      bloom1 = next1;
     572             : 
     573   230081312 :       any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
     574   230081312 :       FD_COMPILER_FORGET( any_duplicates );
     575   230081312 :     }
     576    10032254 :     return any_duplicates;
     577    10032254 :   }
     578    41505378 : }
     579             : 
     580             : #else
     581             : 
     582             : static inline int
     583             : fd_chkdup_check_fast( fd_chkdup_t          * chkdup,
     584             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     585             :                       fd_acct_addr_t const * list1,     ulong list1_cnt ) {
     586             :   (void)chkdup;
     587             :   (void)list0;
     588             :   (void)list1;
     589             :   (void)list0_cnt;
     590             :   (void)list1_cnt;
     591             :   return 1;
     592             : }
     593             : 
     594             : #endif
     595             : 
     596             : 
     597             : FD_PROTOTYPES_END
     598             : 
     599             : #endif /* HEADER_fd_src_ballet_pack_fd_chkdup_h */

Generated by: LCOV version 1.14