LCOV - code coverage report
Current view: top level - disco/pack - fd_chkdup.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 181 183 98.9 %
Date: 2025-10-27 04:40:00 Functions: 20 45 44.4 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_disco_pack_fd_chkdup_h
       2             : #define HEADER_fd_src_disco_pack_fd_chkdup_h
       3             : 
       4             : #include "../../ballet/fd_ballet_base.h"
       5             : #include "../../ballet/txn/fd_txn.h"
       6             : 
       7             : /* fd_chkdup declares a set of functions for ultra-HPC checking if a
       8             :    list of account addresses contains any duplicates.  It's important
       9             :    that this be fast, because a transaction containing duplicate account
      10             :    addresses fails to sanitize and is not charged a fee.  Although this
      11             :    check can (and really ought) to be done in parallel, perhaps in the
      12             :    verify tiles, right now, it's done in pack, which means it's serial
      13             :    and on the critical path.
      14             : 
      15             :    On platforms with AVX, the current implementation uses a fast initial
      16             :    check which may have false positives (thinking there are duplicates
      17             :    when there aren't).  Any transaction that fails the initial check is
      18             :    then subjected to the full, precise check.  Without AVX, all
      19             :    transactions use the slow path. */
      20             : 
      21             : /* The functions are defined in the header to facilitate inlining since
      22             :    they take 10s of cycles in the good case, but should probably be
      23             :    treated as if they were defined in a .c file. */
      24             : 
      25             : #ifndef FD_CHKDUP_IMPL
      26             : # if FD_HAS_AVX512
      27             : #   include "../../util/simd/fd_avx.h"
      28       17356 : #   define FD_CHKDUP_IMPL 2
      29             : # elif FD_HAS_AVX
      30       17624 : #   define FD_CHKDUP_IMPL 1
      31             : # else
      32             : #   define FD_CHKDUP_IMPL 0
      33             : # endif
      34             : #endif
      35             : 
      36             : 
      37             : #if FD_CHKDUP_IMPL==2
      38             : # define FD_CHKDUP_ALIGN ( 64UL)
      39             : #elif FD_CHKDUP_IMPL==1
      40             : # define FD_CHKDUP_ALIGN ( 32UL)
      41             : #elif FD_CHKDUP_IMPL==0
      42             : # define FD_CHKDUP_ALIGN ( 32UL)
      43             : #else
      44             : # error "Unrecognized value of FD_CHKDUP_IMPL"
      45             : #endif
      46             : 
      47             : 
      48             : # define FD_CHKDUP_FOOTPRINT  FD_LAYOUT_FINI( FD_LAYOUT_APPEND(                                         \
      49             :                                                              FD_LAYOUT_APPEND( FD_LAYOUT_INIT,               \
      50             :                                                              FD_CHKDUP_ALIGN, 32*FD_CHKDUP_IMPL ), \
      51             :                                                              32UL,                 (1UL<<8)*32UL    ),       \
      52             :                                                    FD_CHKDUP_ALIGN )
      53             : 
      54             : FD_STATIC_ASSERT( (1UL<<8)==2*FD_TXN_ACCT_ADDR_MAX, "hash table size" );
      55             : 
      56             : /* Fixed size (just over 8kB) and safe for declaration on the stack or
      57             :    inclusion in a struct. */
      58             : struct fd_chkdup_private;
      59             : typedef struct fd_chkdup_private fd_chkdup_t;
      60             : 
      61             : FD_PROTOTYPES_BEGIN
      62             : /* fd_chkdup_{footprint, align} return the footprint and alignment of
      63             :    the scratch memory that duplicate detection requires. */
      64           0 : static inline ulong fd_chkdup_footprint( void ) { return FD_CHKDUP_FOOTPRINT; }
      65           0 : static inline ulong fd_chkdup_align    ( void ) { return FD_CHKDUP_ALIGN;     }
      66             : 
      67             : /* fd_chkdup_new formats an appropriately sized region of memory for use
      68             :    in duplicate address detection.  shmem must point to the first byte
      69             :    of a region of memory with the appropriate alignment and footprint.
      70             :    rng must be a pointer to a local join of an RNG.  Some slots of the
      71             :    RNG will be consumed, but no interest in the RNG will be retained
      72             :    after the function returns.  Returns shmem on success and NULL on
      73             :    failure (logs details).  The only failure cases are if shmem is NULL
      74             :    or not aligned.
      75             : 
      76             :    fd_chkdup_join joins the caller to the formatted region of memory.
      77             :    Returns shmem.
      78             : 
      79             :    fd_chkdup_leave unjoins the caller to chkdup.  Returns chkdup.
      80             :    fd_chkdup_delete unformats the region of memory.  Returns a pointer
      81             :    to the unformatted memory region. */
      82             : static inline void *        fd_chkdup_new   ( void * shmem, fd_rng_t * rng );
      83             : static inline fd_chkdup_t * fd_chkdup_join  ( void * shmem                 );
      84             : static inline void *        fd_chkdup_leave ( fd_chkdup_t * chkdup         );
      85             : static inline void *        fd_chkdup_delete( void * shmem                 );
      86             : 
      87             : 
      88             : /* fd_chkdup_check{,_slow,_fast} check a list of account addresses for
      89             :    any duplicate addresses, i.e. an account address that appears twice
      90             :    in the list.  The list does not need to be sorted or have any
      91             :    particular order.  The list may be decomposed into two sublists
      92             :    (list0 and list1) to facilitate 0-copy usage with address lookup
      93             :    tables, but list0 and list1 are logically concatenated prior to
      94             :    checking for duplicates.
      95             : 
      96             :    chkdup is a pointer to a valid local join of a chkdup object.
      97             : 
      98             :    list0 and list1 point to the first account address of the respective
      99             :    sublists.  The memory they point to need not have any particular
     100             :    alignment.  list0==NULL is okay only if list0_cnt==0, and similarly
     101             :    for list1.  list0 is accessed with indices [0, list0_cnt) and list1
     102             :    is accessed with indices [0, list1_cnt).  list0 and list1 must not
     103             :    overlap.  Requires list0_cnt+list1_cnt<=128, and the function is
     104             :    somewhat tuned for smaller values.
     105             : 
     106             :    fd_chkdup_check and the _slow version return 1 if the list of
     107             :    transactions contains at least one duplicated account address and 0
     108             :    otherwise (implying each account address in the provided list is
     109             :    unique).
     110             : 
     111             :    fd_chkdup_check_fast returns 1 if the list of transactions contains
     112             :    at least one duplicated account address and typically returns 0 if
     113             :    each account address in the provided list is unique, but may
     114             :    sometimes spuriiously return 1 even without duplicates.
     115             : 
     116             :    WARNING: the _fast version MAY HAVE FALSE POSITIVES.  You probably
     117             :    want the un-suffixed version, which is precise.  It uses the fast
     118             :    version as a fast-path and then does a slower full check if the
     119             :    fast-path suggests there may be a duplicate.
     120             : 
     121             :    However, it's also worth calling out again that the _fast version
     122             :    only makes errors in one direction.  If the list contains duplicates,
     123             :    it will definitely return 1.  If it returns 0, the list definitely
     124             :    does not contain duplicates. (Those two statements are equivalent).
     125             :    */
     126             : static inline int
     127             : fd_chkdup_check     ( fd_chkdup_t          * chkdup,
     128             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     129             :                       fd_acct_addr_t const * list1,     ulong list1_cnt );
     130             : static inline int
     131             : fd_chkdup_check_slow( fd_chkdup_t          * chkdup,
     132             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     133             :                       fd_acct_addr_t const * list1,     ulong list1_cnt );
     134             : static inline int
     135             : fd_chkdup_check_fast( fd_chkdup_t          * chkdup,
     136             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     137             :                       fd_acct_addr_t const * list1,     ulong list1_cnt );
     138             : 
     139             : 
     140             : /*   -----  Implementation details and discussion follow------
     141             : 
     142             :    The fast path implementation is somewhat interesting.  The normal way
     143             :    to do this is with a Bloom filter, but Bloom filters do lots of
     144             :    unpredictable reads and the pack tile is somewhat cache sensitive.
     145             :    Instead, this implements a variant on a Bloom filter that lives
     146             :    entirely in AVX registers.
     147             : 
     148             :    Basically, we use C W-bit words stored in (C*W)/256 AVX2 registers or
     149             :    (C*W)/512 AVX512 registers.  Each word is a modified Bloom filter
     150             :    with one associated hash function.  For each account address, we
     151             :    compute C hashes, giving (up to) C positions across the AVX
     152             :    registers.  If any of those positions have an unset bit, then we know
     153             :    we have not seen the account address before.  Finally, we then set
     154             :    all the positions.
     155             : 
     156             :    The only difference between this idea and a normal Bloom filter is
     157             :    that sometimes the hash function may not select a bit.  There's a
     158             :    tradeoff to be made here: suppose you insert R account addresses, and
     159             :    R is at least almost as large as W.  Then each word fills up, and
     160             :    false positives become increasingly likely.  Only testing and
     161             :    inserting a bit, say, half the time, effectively halves C, but
     162             :    prevents each word from getting saturated as quickly, and makes the
     163             :    algorithm effective for larger values of R.  We use Intel's quirky
     164             :    behavior to get this for free.
     165             : 
     166             :    (Note: The R and C notation is supposed to suggest a tabular layout
     167             :    in which the account addresses are rows and the words are columns.)
     168             : 
     169             :    The key insight into making this work quickly is that the vpsllv{d,q}
     170             :    variable left shift instructions are cheap (1 cycle, can execute on
     171             :    port 0 or 1 for AVX2, still 1 cycle on port 0 for AVX512).  If we can
     172             :    compute the hashes in parallel with SIMD, then we can variably shift
     173             :    bcast(0x1) quickly, and select several bits at a time.  The rest is
     174             :    just bitwise logic, which is extremely cheap with AVX.  This approach
     175             :    constrains W to be either 32 or 64, and C to be a multiple of the
     176             :    number of words in a vector, but those are both pretty acceptable
     177             :    constraints.
     178             : 
     179             :    The final ingredient is a cheap hash function that places the hashes
     180             :    in the appropriate position for vpsllv.  We just xor the account
     181             :    address with some validator-specific entropy and use a mask to select
     182             :    certain bits.
     183             : 
     184             :    The slow-path implementation uses a hash table to check for
     185             :    duplicates.  This is slower than sorting for transactions with only a
     186             :    few account addresses, but substantially faster than sorting for
     187             :    transactions with large numbers of account addresses, which is when
     188             :    the slow-path matters more anyway.
     189             : 
     190             : 
     191             :    You can see from above there are a variety of knobs to tune.  Should
     192             :    W be 32 or 64?  How big should C be?  How many bits should we mask
     193             :    off for the hash, which controls the frequency with which we skip a
     194             :    word, neither checking nor inserting a bit?  It would be good to have
     195             :    a rigorous understanding of the false positive rate as a function of
     196             :    these parameters so we can make a decision that minimizes the
     197             :    expected compute required.
     198             : 
     199             :    Unfortunately, the false positive computation is tricky.  The key
     200             :    difficulty is that whether processing account address r results in a
     201             :    false positive is not independent from whether processing account
     202             :    address r+1 results in a false positive.  This leads to an obnoxious
     203             :    inclusion-exclusion formula which quickly becomes more unwieldy than
     204             :    I (or Sage) can handle.
     205             : 
     206             :    A dynamic-programming-ish algorithm can compute the false positive
     207             :    rate in approximately O(R*2^R) time.  To start, we just want to
     208             :    understand a single word/column.  Suppose k bits in the word have
     209             :    been set.  Then there are (W-k) hash values that set a new bit, and
     210             :    (V+k) hash values that don't set a new, where V is the number of hash
     211             :    values that don't select a bit.  The hash values that set a bit are
     212             :    also exactly the ones that provide information that the account
     213             :    address is not a false positive.  I think about this as "spending a
     214             :    bit" to know it's not a false positive.
     215             : 
     216             :    Denoting the rows at which we spend a bit by a 1 and the rows at
     217             :    which we don't spend a bit by 0, we might get a column like:
     218             : 
     219             :                                 1
     220             :                                 0
     221             :                                 1
     222             :                                 1
     223             :                                 0.
     224             :    The number of ways this column can occur is x1 =
     225             :    W*(V+1)*(W-1)*(W-2)*(V+3), which means that the probability the
     226             :    column occurs is x1/(W+V)^5.  Expanding to multiple columns is easy,
     227             :    since the number of ways that two specific columns can occur is just
     228             :    the product of the number of ways each can occur.  For example,
     229             :                               1 0
     230             :                               0 1
     231             :                               1 1
     232             :                               1 0
     233             :                               0 0
     234             :    can occur x1 * x2 ways, where x2 = V*W*(W-1)*(V+2)*(V+2).
     235             :    A false positive happens when there's a row of all 0s, as in the last
     236             :    row of the example.
     237             : 
     238             :    It's cleaner to count the number of ways not to get a false positive.
     239             :    This gives us the inclusion-exclusion formula:
     240             :    (all ways)
     241             :         - (all ways where row 0 is all 0s)
     242             :         - (all ways where row 1 is all 0s)
     243             :         - ...
     244             :         + (all ways where rows 0 and 1 are both all 0s)
     245             :         + (all ways where rows 0 and 2 are both all 0s)
     246             :         + ...
     247             :         - (all ways where rows 0, 1, and 2 are all 0s)
     248             :         - (all ways where rows 0, 1, and 3 are all 0s)
     249             :         +, - ...
     250             :         + (-1)^R (all ways in which all rows are all 0s).
     251             : 
     252             :    There's a nice way to understand each of these terms.  For example,
     253             :    in the R=2, C=3 case, the term in which row 0 is all 0s has the
     254             :    following elements:
     255             :     0 0 0   0 0 0   0 0 0   0 0 0   0 0 0   0 0 0   0 0 0   0 0 0
     256             :     0 0 0   0 0 1   0 1 0   0 1 1   1 0 0   1 0 1   1 1 0   1 1 1
     257             :    Rather than enumerating all 2^( (R-1)*C ) elements, we'll represent
     258             :    it as
     259             :                       ( 0  + 0 )^3
     260             :                       ( 0    1 )
     261             : 
     262             :    Skipping some steps, the task boils down to counting the number of
     263             :    ways to get columns that match a mask, then raising that to the Cth
     264             :    power.
     265             : 
     266             :    Now, with all this behind us, our goal is to pick the optimal value
     267             :    of W,V, and C given a supposed distribution of transactions, and a
     268             :    performance model.  Based on some back of the envelope calculations
     269             :    based on instruction throughput and latency measurements and
     270             :    confirmed by some experiments, the fast path code takes about 3.5*R
     271             :    cycles if using one AVX2 vector (W*C==256) and
     272             :    2.5*R+2*ceil(W*C/256)*R cycles otherwise.  The slow path takes about
     273             :    133*J cycles.  Then the expected value of the number of cycles it
     274             :    takes to process a transaction with R accounts is
     275             :      R*(2.5+2*ceil(W*C/256) - [W*C<=256]) + FP_{W,V,R,C}*133*R
     276             : 
     277             :    Based on a sample of 100,000 slots containing about 100M
     278             :    transactions, the CDF looks like
     279             : 
     280             :      Fraction of transactions containing <=  3 account addresses     71%
     281             :                                          <= 13 account addresses     80%
     282             :                                          <= 24 account addresses     91%
     283             :                                          <= 31 account addresses     95%
     284             :                                          <= 44 account addresses     98%
     285             :                                          <= 50 account addresses     99%
     286             : 
     287             :    Basically, there's a peak at 3 (votes), and then a very long, very
     288             :    fat tail.  When using AVX2, it basically boils down into 2 regimes:
     289             :           0 <= R <  28            W=32, C=8,  V=0  (one AVX vector)
     290             :          28 <= R <= 64            W=32, C=32, V=32 (four AVX vectors)
     291             : 
     292             :    This combination has an expected value of about 54 cycles over all
     293             :    transactions.  For a typical transaction with 3 account addresses,
     294             :    this takes about 10 cycles and the false positive probability is
     295             :    about 2e-10.
     296             : 
     297             :    When using AVX512, the regimes are similar:
     298             :           0 <= R <  36            W=32, C=16, V=0  (one AVX vector)
     299             :          36 <= R <= 64            W=32, C=32, V=32 (two AVX vectors)
     300             :   This combination has an expected value of about 33 cycles over all
     301             :   transactions.  Again, the typical 3 account address account takes
     302             :   about 9 cycles and has a negligible false positive probability. */
     303             : 
     304             : 
     305             : struct fd_chkdup_waddr {
     306             :   fd_acct_addr_t key; /* account address */
     307             : };
     308             : typedef struct fd_chkdup_waddr fd_chkdup_waddr_t;
     309             : static const fd_acct_addr_t chkdup_null_addr = {{ 0 }};
     310             : 
     311             : #define MAP_NAME              fd_chkdup_pmap
     312   185236819 : #define MAP_T                 fd_chkdup_waddr_t
     313   401552927 : #define MAP_KEY_T             fd_acct_addr_t
     314   185343535 : #define MAP_KEY_NULL          chkdup_null_addr
     315   586792326 : #define MAP_KEY_INVAL(k)      MAP_KEY_EQUAL(k, chkdup_null_addr)
     316   602794584 : #define MAP_KEY_EQUAL(k0,k1)  (!memcmp((k0).b,(k1).b, FD_TXN_ACCT_ADDR_SZ))
     317             : #define MAP_KEY_EQUAL_IS_SLOW 1
     318             : #define MAP_MEMOIZE           0
     319   200509729 : #define MAP_KEY_HASH(key)     ((uint)fd_ulong_hash( fd_ulong_load_8( (key).b ) ))
     320   417033296 : #define MAP_LG_SLOT_CNT 8
     321             : #include "../../util/tmpl/fd_map.c"
     322             : 
     323             : 
     324             : struct __attribute__((aligned(FD_CHKDUP_ALIGN))) fd_chkdup_private {
     325             : #if FD_CHKDUP_IMPL >= 1
     326             :   uchar  entropy[ 32*FD_CHKDUP_IMPL ];
     327             : #endif
     328             : 
     329             :   fd_chkdup_waddr_t hashmap[ 1UL<<8 ];
     330             : };
     331             : 
     332             : static inline void *
     333             : fd_chkdup_new( void     * shmem,
     334         801 :                fd_rng_t * rng   ) {
     335         801 :   fd_chkdup_t * chkdup = (fd_chkdup_t *)shmem;
     336         801 : #if FD_CHKDUP_IMPL >= 1
     337       34977 :   for( ulong i=0UL; i<32*FD_CHKDUP_IMPL; i++ ) chkdup->entropy[ i ] = fd_rng_uchar( rng );
     338             : #else
     339             :   (void)rng;
     340             : #endif
     341         801 :   FD_TEST( fd_chkdup_pmap_footprint()==sizeof(chkdup->hashmap) ); /* Known at compile time */
     342             : 
     343         801 :   fd_chkdup_pmap_new( chkdup->hashmap );
     344         801 :   return chkdup;
     345         801 : }
     346             : 
     347         243 : static inline fd_chkdup_t * fd_chkdup_join  ( void * shmem ) { return (fd_chkdup_t *)shmem; }
     348             : 
     349         228 : static inline void * fd_chkdup_leave ( fd_chkdup_t * chkdup ) { return (void *)chkdup; }
     350         228 : static inline void * fd_chkdup_delete( void        * shmem  ) { return         shmem;  }
     351             : 
     352             : 
     353             : static inline int
     354             : fd_chkdup_check     ( fd_chkdup_t          * chkdup,
     355             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     356    23236242 :                       fd_acct_addr_t const * list1,     ulong list1_cnt ) {
     357    23236242 :   if( FD_LIKELY( 0==fd_chkdup_check_fast( chkdup, list0, list0_cnt, list1, list1_cnt ) ) ) return 0;
     358       64895 :   return fd_chkdup_check_slow( chkdup, list0, list0_cnt, list1, list1_cnt );
     359    23236242 : }
     360             : 
     361             : static inline int
     362             : fd_chkdup_check_slow( fd_chkdup_t          * chkdup,
     363             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     364     9714812 :                       fd_acct_addr_t const * list1,     ulong list1_cnt ) {
     365     9714812 :   fd_chkdup_waddr_t * map = fd_chkdup_pmap_join( chkdup->hashmap );
     366     9714812 :   fd_chkdup_waddr_t * inserted[ FD_TXN_ACCT_ADDR_MAX ];
     367     9714812 :   ulong inserted_cnt = 0UL;
     368             : 
     369     9714812 :   int any_duplicates = 0;
     370     9714812 :   int skipped_inval = 0;
     371   136858723 :   for( ulong i0=0UL; (i0<list0_cnt) & !any_duplicates; i0++ ) {
     372   127143911 :     if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list0[ i0 ] ) ) ) {
     373             :       /* Okay if this is the 1st, but not if the 2nd */
     374        2685 :       any_duplicates |= skipped_inval;
     375        2685 :       skipped_inval   = 1;
     376        2685 :       continue;
     377        2685 :     }
     378   127141226 :     fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list0[ i0 ] );
     379   127141226 :     inserted[ inserted_cnt++ ] = ins;
     380   127141226 :     any_duplicates |=        (NULL==ins);
     381   127141226 :     inserted_cnt   -= (ulong)(NULL==ins); /* Correct inserted_cnt if we just stored a NULL */
     382   127141226 :   }
     383    67810300 :   for( ulong i1=0UL; (i1<list1_cnt) & !any_duplicates; i1++ ) {
     384    58095488 :     if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list1[ i1 ] ) ) ) {
     385         696 :       any_duplicates |= skipped_inval;
     386         696 :       skipped_inval   = 1;
     387         696 :       continue;
     388         696 :     }
     389    58094792 :     fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list1[ i1 ] );
     390    58094792 :     inserted[ inserted_cnt++ ] = ins;
     391    58094792 :     any_duplicates |=        (NULL==ins);
     392    58094792 :     inserted_cnt   -= (ulong)(NULL==ins);
     393    58094792 :   }
     394             : 
     395             :   /* FIXME: This depends on undocumented map behavior for correctness.
     396             :      Deleting in the opposite order of insertion preserves previously
     397             :      inserted pointers. That behavior should be documented. */
     398   194853291 :   for( ulong i=0UL; i<inserted_cnt; i++ ) fd_chkdup_pmap_remove( map, inserted[ inserted_cnt-1UL-i ] );
     399             : 
     400     9714812 :   fd_chkdup_pmap_leave( map );
     401             : 
     402     9714812 :   return any_duplicates;
     403     9714812 : }
     404             : 
     405             : 
     406             : #if FD_CHKDUP_IMPL==1
     407             : 
     408             : /* AVX2 implementation */
     409             : #include "../../util/simd/fd_avx.h"
     410             : static inline int
     411             : fd_chkdup_check_fast( fd_chkdup_t          * chkdup,
     412             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     413    22124106 :                       fd_acct_addr_t const * list1,     ulong list1_cnt ) {
     414    22124106 :   if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;
     415             : 
     416    12814026 :   int any_duplicates = 0;
     417             : 
     418    12814026 :   const wu_t entropy = wb_ld( chkdup->entropy );
     419    12814026 :   const wu_t one     = wu_bcast( 1U );
     420             : 
     421             : 
     422    12814026 :   if( FD_LIKELY( list0_cnt+list1_cnt<28UL ) ) {
     423             :     /* Single vector implementation */
     424     9149256 :     const wu_t mask = wu_bcast( 0x1FU );
     425             : 
     426     9149256 :     wu_t bloom = wu_zero();
     427    82755554 :     for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
     428    73606298 :       wu_t addr    = wb_ldu( list0+i0 );
     429    73606298 :       wu_t masked  = wu_and( wu_xor( addr, entropy ), mask );
     430    73606298 :       wu_t select  = wu_shl_vector( one, masked );
     431             :       /* testc: "Compute the bitwise NOT of a and then AND with b, and
     432             :          [return] 1 if the result is zero." */
     433    73606298 :       any_duplicates |= _mm256_testc_si256( bloom, select );
     434    73606298 :       bloom = wu_or( bloom, select );
     435    73606298 :     }
     436    29958936 :     for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
     437    20809680 :       wu_t addr    = wb_ldu( list1+i1 );
     438    20809680 :       wu_t masked  = wu_and( wu_xor( addr, entropy ), mask );
     439    20809680 :       wu_t select  = wu_shl_vector( one, masked );
     440    20809680 :       any_duplicates |= _mm256_testc_si256( bloom, select );
     441    20809680 :       bloom = wu_or( bloom, select );
     442    20809680 :     }
     443     9149256 :     return any_duplicates;
     444             : 
     445     9149256 :   } else {
     446             :     /* 4-vector implementation: slower but much better false positive
     447             :        rate so that we don't have to fall back to the slow path as
     448             :        frequently. */
     449     3664770 :     const wu_t mask = wu_bcast( 0x3FU );
     450             : 
     451     3664770 :     wu_t bloom0 = wu_zero();                                wu_t bloom1 = wu_zero();
     452     3664770 :     wu_t bloom2 = wu_zero();                                wu_t bloom3 = wu_zero();
     453    94854176 :     for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
     454    91189406 :       wu_t addr    = wb_ldu( list0+i0 );
     455    91189406 :       wu_t blinded = wu_xor( addr, entropy );
     456    91189406 :       wu_t masked0 = wu_and( mask, blinded );               wu_t masked1 = wu_and( mask, wu_shr( blinded,  6 ) );
     457    91189406 :       wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
     458    91189406 :       wu_t select0 = wu_shl_vector( one, masked0 );         wu_t select1 = wu_shl_vector( one, masked1 );
     459    91189406 :       wu_t select2 = wu_shl_vector( one, masked2 );         wu_t select3 = wu_shl_vector( one, masked3 );
     460             : 
     461    91189406 :       wu_t any_differences = wu_or(
     462    91189406 :            wu_or( wu_andnot( bloom0, select0 ),             wu_andnot( bloom1, select1 ) ),
     463    91189406 :            wu_or( wu_andnot( bloom2, select2 ),             wu_andnot( bloom3, select3 ) ) );
     464             : 
     465    91189406 :       bloom0  = wu_or( bloom0, select0 );                   bloom1  = wu_or( bloom1, select1 );
     466    91189406 :       bloom2  = wu_or( bloom2, select2 );                   bloom3  = wu_or( bloom3, select3 );
     467             : 
     468    91189406 :       any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
     469    91189406 :       FD_COMPILER_FORGET( any_duplicates );
     470    91189406 :     }
     471    65821520 :     for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
     472    62156750 :       wu_t addr    = wb_ldu( list1+i1 );
     473    62156750 :       wu_t blinded = wu_xor( addr, entropy );
     474    62156750 :       wu_t masked0 = wu_and( mask, blinded );               wu_t masked1 = wu_and( mask, wu_shr( blinded,  6 ) );
     475    62156750 :       wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
     476    62156750 :       wu_t select0 = wu_shl_vector( one, masked0 );         wu_t select1 = wu_shl_vector( one, masked1 );
     477    62156750 :       wu_t select2 = wu_shl_vector( one, masked2 );         wu_t select3 = wu_shl_vector( one, masked3 );
     478             : 
     479    62156750 :       wu_t any_differences = wu_or(
     480    62156750 :            wu_or( wu_andnot( bloom0, select0 ),             wu_andnot( bloom1, select1 ) ),
     481    62156750 :            wu_or( wu_andnot( bloom2, select2 ),             wu_andnot( bloom3, select3 ) ) );
     482             : 
     483    62156750 :       bloom0  = wu_or( bloom0, select0 );                   bloom1  = wu_or( bloom1, select1 );
     484    62156750 :       bloom2  = wu_or( bloom2, select2 );                   bloom3  = wu_or( bloom3, select3 );
     485             : 
     486    62156750 :       any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
     487    62156750 :       FD_COMPILER_FORGET( any_duplicates );
     488    62156750 :     }
     489     3664770 :     return any_duplicates;
     490     3664770 :   }
     491    12814026 : }
     492             : 
     493             : #elif FD_CHKDUP_IMPL==2
     494             : 
     495             : /* AVX512 implementation */
     496             : #include "../../util/simd/fd_avx512.h"
     497             : static inline int
     498             : fd_chkdup_check_fast( fd_chkdup_t              * chkdup,
     499             :                       fd_acct_addr_t const     * list0,     ulong list0_cnt,
     500    11062053 :                       fd_acct_addr_t const     * list1,     ulong list1_cnt ) {
     501    11062053 :   if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;
     502             : 
     503     6407013 :   int any_duplicates = 0;
     504             : 
     505     6407013 :   const wwu_t entropy = wwu_ld( (uint const *)chkdup->entropy );
     506     6407013 :   const wwu_t one     = wwu_bcast( 1U );
     507             : 
     508     6407013 :   if( FD_LIKELY( list0_cnt+list1_cnt<36UL ) ) {
     509             :     /* One vector version */
     510             :     /* Our analysis assumed the 64 bytes of hash were all independent,
     511             :        but if we just xor and then use the low 5 bits of both parts of
     512             :        the vector, we get a lot more false positives than the math
     513             :        predicts. */
     514     5374748 :     const wwu_t mask = wwu_bcast( 0x1FU );
     515     5374748 :     wwu_t bloom = wwu_zero();
     516    63579440 :     for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
     517    58204692 :       wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
     518    58204692 :       wwu_t blinded = wwu_xor( addr, entropy );
     519    58204692 :       wwu_t masked  = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
     520    58204692 :       wwu_t select  = _mm512_rolv_epi32( one, masked );
     521    58204692 :       wwu_t next    = wwu_or( bloom, select );
     522    58204692 :       __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE ); /* if non-zero, not a duplicate */
     523    58204692 :       bloom = next;
     524             :       /* kortestz_mask8_u8: "Compute the bitwise OR of 8-bit masks a and
     525             :          b. If the result is all zeroes, [return] 1" */
     526    58204692 :       any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
     527    58204692 :       FD_COMPILER_FORGET( any_duplicates );
     528    58204692 :     }
     529    19981904 :     for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
     530    14607156 :       wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
     531    14607156 :       wwu_t blinded = wwu_xor( addr, entropy );
     532    14607156 :       wwu_t masked  = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
     533    14607156 :       wwu_t select  = _mm512_rolv_epi32( one, masked );
     534    14607156 :       wwu_t next    = wwu_or( bloom, select );
     535    14607156 :       __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE );
     536    14607156 :       bloom = next;
     537    14607156 :       any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
     538    14607156 :       FD_COMPILER_FORGET( any_duplicates );
     539    14607156 :     }
     540     5374748 :     return any_duplicates;
     541     5374748 :   } else {
     542             :     /* Two vector version */
     543     1032265 :     const wwu_t mask = wwu_bcast( 0x3FU );
     544     1032265 :     const wwu_t shift0 = wwu( 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,
     545     1032265 :                               6U, 6U, 6U, 6U, 6U, 6U, 6U, 6U );
     546     1032265 :     const wwu_t shift1 = wwu( 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U,
     547     1032265 :                               18U, 18U, 18U, 18U, 18U, 18U, 18U, 18U );
     548     1032265 :     wwu_t bloom0 = wwu_zero();            wwu_t bloom1 = wwu_zero();
     549    35034403 :     for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
     550    34002138 :       wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
     551    34002138 :       wwu_t blinded = wwu_xor( addr, entropy );
     552    34002138 :       wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) );  wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
     553    34002138 :       wwu_t select0 = wwu_shl_vector( one, masked0 );                      wwu_t select1 = wwu_shl_vector( one, masked1 );
     554    34002138 :       wwu_t next0   = wwu_or( bloom0, select0 );                           wwu_t next1   = wwu_or( bloom1, select1 );
     555    34002138 :       __mmask8 any_differences = _kor_mask8(
     556    34002138 :           _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ),           _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );
     557             : 
     558    34002138 :       bloom0 = next0;                                                      bloom1 = next1;
     559             : 
     560    34002138 :       any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
     561    34002138 :       FD_COMPILER_FORGET( any_duplicates );
     562    34002138 :     }
     563    18101163 :     for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
     564    17068898 :       wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
     565    17068898 :       wwu_t blinded = wwu_xor( addr, entropy );
     566    17068898 :       wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) );  wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
     567    17068898 :       wwu_t select0 = wwu_shl_vector( one, masked0 );                      wwu_t select1 = wwu_shl_vector( one, masked1 );
     568    17068898 :       wwu_t next0   = wwu_or( bloom0, select0 );                           wwu_t next1   = wwu_or( bloom1, select1 );
     569    17068898 :       __mmask8 any_differences = _kor_mask8(
     570    17068898 :           _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ),           _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );
     571             : 
     572    17068898 :       bloom0 = next0;                                                      bloom1 = next1;
     573             : 
     574    17068898 :       any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
     575    17068898 :       FD_COMPILER_FORGET( any_duplicates );
     576    17068898 :     }
     577     1032265 :     return any_duplicates;
     578     1032265 :   }
     579     6407013 : }
     580             : 
     581             : #else
     582             : 
     583             : static inline int
     584             : fd_chkdup_check_fast( fd_chkdup_t          * chkdup,
     585             :                       fd_acct_addr_t const * list0,     ulong list0_cnt,
     586             :                       fd_acct_addr_t const * list1,     ulong list1_cnt ) {
     587             :   (void)chkdup;
     588             :   (void)list0;
     589             :   (void)list1;
     590             :   (void)list0_cnt;
     591             :   (void)list1_cnt;
     592             :   return 1;
     593             : }
     594             : 
     595             : #endif
     596             : 
     597             : 
     598             : FD_PROTOTYPES_END
     599             : 
     600             : #endif /* HEADER_fd_src_disco_pack_fd_chkdup_h */

Generated by: LCOV version 1.14