Line data Source code
1 : #ifndef HEADER_fd_src_ballet_pack_fd_chkdup_h
2 : #define HEADER_fd_src_ballet_pack_fd_chkdup_h
3 : #include "../fd_ballet_base.h"
4 : #include "../txn/fd_txn.h"
5 :
6 : /* fd_chkdup declares a set of functions for ultra-HPC checking if a
7 : list of account addresses contains any duplicates. It's important
8 : that this be fast, because a transaction containing duplicate account
9 : addresses fails to sanitize and is not charged a fee. Although this
10 : check can (and really ought) to be done in parallel, perhaps in the
11 : verify tiles, right now, it's done in pack, which means it's serial
12 : and on the critical path.
13 :
14 : On platforms with AVX, the current implementation uses a fast initial
15 : check which may have false positives (thinking there are duplicates
16 : when there aren't). Any transaction that fails the initial check is
17 : then subjected to the full, precise check. Without AVX, all
18 : transactions use the slow path. */
19 :
20 : /* The functions are defined in the header to facilitate inlining since
21 : they take 10s of cycles in the good case, but should probably be
22 : treated as if they were defined in a .c file. */
23 :
24 : #ifndef FD_CHKDUP_IMPL
25 : # if FD_HAS_AVX512
26 16186 : # define FD_CHKDUP_IMPL 2
27 : # elif FD_HAS_AVX
28 16436 : # define FD_CHKDUP_IMPL 1
29 : # else
30 : # define FD_CHKDUP_IMPL 0
31 : # endif
32 : #endif
33 :
34 :
35 : #if FD_CHKDUP_IMPL==2
36 : # define FD_CHKDUP_ALIGN ( 64UL)
37 : #elif FD_CHKDUP_IMPL==1
38 : # define FD_CHKDUP_ALIGN ( 32UL)
39 : #elif FD_CHKDUP_IMPL==0
40 : # define FD_CHKDUP_ALIGN ( 32UL)
41 : #else
42 : # error "Unrecognized value of FD_CHKDUP_IMPL"
43 : #endif
44 :
45 :
46 : # define FD_CHKDUP_FOOTPRINT FD_LAYOUT_FINI( FD_LAYOUT_APPEND( \
47 : FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \
48 : FD_CHKDUP_ALIGN, 32*FD_CHKDUP_IMPL ), \
49 : 32UL, (1UL<<8)*32UL ), \
50 : FD_CHKDUP_ALIGN )
51 :
52 : FD_STATIC_ASSERT( (1UL<<8)==2*FD_TXN_ACCT_ADDR_MAX, "hash table size" );
53 :
54 : /* Fixed size (just over 8kB) and safe for declaration on the stack or
55 : inclusion in a struct. */
56 : struct fd_chkdup_private;
57 : typedef struct fd_chkdup_private fd_chkdup_t;
58 :
59 : FD_PROTOTYPES_BEGIN
60 : /* fd_chkdup_{footprint, align} return the footprint and alignment of
61 : the scratch memory that duplicate detection requires. */
62 0 : static inline ulong fd_chkdup_footprint( void ) { return FD_CHKDUP_FOOTPRINT; }
63 0 : static inline ulong fd_chkdup_align ( void ) { return FD_CHKDUP_ALIGN; }
64 :
65 : /* fd_chkdup_new formats an appropriately sized region of memory for use
66 : in duplicate address detection. shmem must point to the first byte
67 : of a region of memory with the appropriate alignment and footprint.
68 : rng must be a pointer to a local join of an RNG. Some slots of the
69 : RNG will be consumed, but no interest in the RNG will be retained
70 : after the function returns. Returns shmem on success and NULL on
71 : failure (logs details). The only failure cases are if shmem is NULL
72 : or not aligned.
73 :
74 : fd_chkdup_join joins the caller to the formatted region of memory.
75 : Returns shmem.
76 :
77 : fd_chkdup_leave unjoins the caller to chkdup. Returns chkdup.
78 : fd_chkdup_delete unformats the region of memory. Returns a pointer
79 : to the unformatted memory region. */
80 : static inline void * fd_chkdup_new ( void * shmem, fd_rng_t * rng );
81 : static inline fd_chkdup_t * fd_chkdup_join ( void * shmem );
82 : static inline void * fd_chkdup_leave ( fd_chkdup_t * chkdup );
83 : static inline void * fd_chkdup_delete( void * shmem );
84 :
85 :
86 : /* fd_chkdup_check{,_slow,_fast} check a list of account addresses for
87 : any duplicate addresses, i.e. an account address that appears twice
88 : in the list. The list does not need to be sorted or have any
89 : particular order. The list may be decomposed into two sublists
90 : (list0 and list1) to facilitate 0-copy usage with address lookup
91 : tables, but list0 and list1 are logically concatenated prior to
92 : checking for duplicates.
93 :
94 : chkdup is a pointer to a valid local join of a chkdup object.
95 :
96 : list0 and list1 point to the first account address of the respective
97 : sublists. The memory they point to need not have any particular
98 : alignment. list0==NULL is okay only if list0_cnt==0, and similarly
99 : for list1. list0 is accessed with indices [0, list0_cnt) and list1
100 : is accessed with indices [0, list1_cnt). list0 and list1 must not
101 : overlap. Requires list0_cnt+list1_cnt<=128, and the function is
102 : somewhat tuned for smaller values.
103 :
104 : fd_chkdup_check and the _slow version return 1 if the list of
105 : transactions contains at least one duplicated account address and 0
106 : otherwise (implying each account address in the provided list is
107 : unique).
108 :
109 : fd_chkdup_check_fast returns 1 if the list of transactions contains
110 : at least one duplicated account address and typically returns 0 if
111 : each account address in the provided list is unique, but may
112 : sometimes spuriiously return 1 even without duplicates.
113 :
114 : WARNING: the _fast version MAY HAVE FALSE POSITIVES. You probably
115 : want the un-suffixed version, which is precise. It uses the fast
116 : version as a fast-path and then does a slower full check if the
117 : fast-path suggests there may be a duplicate.
118 :
119 : However, it's also worth calling out again that the _fast version
120 : only makes errors in one direction. If the list contains duplicates,
121 : it will definitely return 1. If it returns 0, the list definitely
122 : does not contain duplicates. (Those two statements are equivalent).
123 : */
124 : static inline int
125 : fd_chkdup_check ( fd_chkdup_t * chkdup,
126 : fd_acct_addr_t const * list0, ulong list0_cnt,
127 : fd_acct_addr_t const * list1, ulong list1_cnt );
128 : static inline int
129 : fd_chkdup_check_slow( fd_chkdup_t * chkdup,
130 : fd_acct_addr_t const * list0, ulong list0_cnt,
131 : fd_acct_addr_t const * list1, ulong list1_cnt );
132 : static inline int
133 : fd_chkdup_check_fast( fd_chkdup_t * chkdup,
134 : fd_acct_addr_t const * list0, ulong list0_cnt,
135 : fd_acct_addr_t const * list1, ulong list1_cnt );
136 :
137 :
138 : /* ----- Implementation details and discussion follow------
139 :
140 : The fast path implementation is somewhat interesting. The normal way
141 : to do this is with a Bloom filter, but Bloom filters do lots of
142 : unpredictable reads and the pack tile is somewhat cache sensitive.
143 : Instead, this implements a variant on a Bloom filter that lives
144 : entirely in AVX registers.
145 :
146 : Basically, we use C W-bit words stored in (C*W)/256 AVX2 registers or
147 : (C*W)/512 AVX512 registers. Each word is a modified Bloom filter
148 : with one associated hash function. For each account address, we
149 : compute C hashes, giving (up to) C positions across the AVX
150 : registers. If any of those positions have an unset bit, then we know
151 : we have not seen the account address before. Finally, we then set
152 : all the positions.
153 :
154 : The only difference between this idea and a normal Bloom filter is
155 : that sometimes the hash function may not select a bit. There's a
156 : tradeoff to be made here: suppose you insert R account addresses, and
157 : R is at least almost as large as W. Then each word fills up, and
158 : false positives become increasingly likely. Only testing and
159 : inserting a bit, say, half the time, effectively halves C, but
160 : prevents each word from getting saturated as quickly, and makes the
161 : algorithm effective for larger values of R. We use Intel's quirky
162 : behavior to get this for free.
163 :
164 : (Note: The R and C notation is supposed to suggest a tabular layout
165 : in which the account addresses are rows and the words are columns.)
166 :
167 : The key insight into making this work quickly is that the vpsllv{d,q}
168 : variable left shift instructions are cheap (1 cycle, can execute on
169 : port 0 or 1 for AVX2, still 1 cycle on port 0 for AVX512). If we can
170 : compute the hashes in parallel with SIMD, then we can variably shift
171 : bcast(0x1) quickly, and select several bits at a time. The rest is
172 : just bitwise logic, which is extremely cheap with AVX. This approach
173 : constrains W to be either 32 or 64, and C to be a multiple of the
174 : number of words in a vector, but those are both pretty acceptable
175 : constraints.
176 :
177 : The final ingredient is a cheap hash function that places the hashes
178 : in the appropriate position for vpsllv. We just xor the account
179 : address with some validator-specific entropy and use a mask to select
180 : certain bits.
181 :
182 : The slow-path implementation uses a hash table to check for
183 : duplicates. This is slower than sorting for transactions with only a
184 : few account addresses, but substantially faster than sorting for
185 : transactions with large numbers of account addresses, which is when
186 : the slow-path matters more anyway.
187 :
188 :
189 : You can see from above there are a variety of knobs to tune. Should
190 : W be 32 or 64? How big should C be? How many bits should we mask
191 : off for the hash, which controls the frequency with which we skip a
192 : word, neither checking nor inserting a bit? It would be good to have
193 : a rigorous understanding of the false positive rate as a function of
194 : these parameters so we can make a decision that minimizes the
195 : expected compute required.
196 :
197 : Unfortunately, the false positive computation is tricky. The key
198 : difficulty is that whether processing account address r results in a
199 : false positive is not independent from whether processing account
200 : address r+1 results in a false positive. This leads to an obnoxious
201 : inclusion-exclusion formula which quickly becomes more unwieldy than
202 : I (or Sage) can handle.
203 :
204 : A dynamic-programming-ish algorithm can compute the false positive
205 : rate in approximately O(R*2^R) time. To start, we just want to
206 : understand a single word/column. Suppose k bits in the word have
207 : been set. Then there are (W-k) hash values that set a new bit, and
208 : (V+k) hash values that don't set a new, where V is the number of hash
209 : values that don't select a bit. The hash values that set a bit are
210 : also exactly the ones that provide information that the account
211 : address is not a false positive. I think about this as "spending a
212 : bit" to know it's not a false positive.
213 :
214 : Denoting the rows at which we spend a bit by a 1 and the rows at
215 : which we don't spend a bit by 0, we might get a column like:
216 :
217 : 1
218 : 0
219 : 1
220 : 1
221 : 0.
222 : The number of ways this column can occur is x1 =
223 : W*(V+1)*(W-1)*(W-2)*(V+3), which means that the probability the
224 : column occurs is x1/(W+V)^5. Expanding to multiple columns is easy,
225 : since the number of ways that two specific columns can occur is just
226 : the product of the number of ways each can occur. For example,
227 : 1 0
228 : 0 1
229 : 1 1
230 : 1 0
231 : 0 0
232 : can occur x1 * x2 ways, where x2 = V*W*(W-1)*(V+2)*(V+2).
233 : A false positive happens when there's a row of all 0s, as in the last
234 : row of the example.
235 :
236 : It's cleaner to count the number of ways not to get a false positive.
237 : This gives us the inclusion-exclusion formula:
238 : (all ways)
239 : - (all ways where row 0 is all 0s)
240 : - (all ways where row 1 is all 0s)
241 : - ...
242 : + (all ways where rows 0 and 1 are both all 0s)
243 : + (all ways where rows 0 and 2 are both all 0s)
244 : + ...
245 : - (all ways where rows 0, 1, and 2 are all 0s)
246 : - (all ways where rows 0, 1, and 3 are all 0s)
247 : +, - ...
248 : + (-1)^R (all ways in which all rows are all 0s).
249 :
250 : There's a nice way to understand each of these terms. For example,
251 : in the R=2, C=3 case, the term in which row 0 is all 0s has the
252 : following elements:
253 : 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
254 : 0 0 0 0 0 1 0 1 0 0 1 1 1 0 0 1 0 1 1 1 0 1 1 1
255 : Rather than enumerating all 2^( (R-1)*C ) elements, we'll represent
256 : it as
257 : ( 0 + 0 )^3
258 : ( 0 1 )
259 :
260 : Skipping some steps, the task boils down to counting the number of
261 : ways to get columns that match a mask, then raising that to the Cth
262 : power.
263 :
264 : Now, with all this behind us, our goal is to pick the optimal value
265 : of W,V, and C given a supposed distribution of transactions, and a
266 : performance model. Based on some back of the envelope calculations
267 : based on instruction throughput and latency measurements and
268 : confirmed by some experiments, the fast path code takes about 3.5*R
269 : cycles if using one AVX2 vector (W*C==256) and
270 : 2.5*R+2*ceil(W*C/256)*R cycles otherwise. The slow path takes about
271 : 133*J cycles. Then the expected value of the number of cycles it
272 : takes to process a transaction with R accounts is
273 : R*(2.5+2*ceil(W*C/256) - [W*C<=256]) + FP_{W,V,R,C}*133*R
274 :
275 : Based on a sample of 100,000 slots containing about 100M
276 : transactions, the CDF looks like
277 :
278 : Fraction of transactions containing <= 3 account addresses 71%
279 : <= 13 account addresses 80%
280 : <= 24 account addresses 91%
281 : <= 31 account addresses 95%
282 : <= 44 account addresses 98%
283 : <= 50 account addresses 99%
284 :
285 : Basically, there's a peak at 3 (votes), and then a very long, very
286 : fat tail. When using AVX2, it basically boils down into 2 regimes:
287 : 0 <= R < 28 W=32, C=8, V=0 (one AVX vector)
288 : 28 <= R <= 64 W=32, C=32, V=32 (four AVX vectors)
289 :
290 : This combination has an expected value of about 54 cycles over all
291 : transactions. For a typical transaction with 3 account addresses,
292 : this takes about 10 cycles and the false positive probability is
293 : about 2e-10.
294 :
295 : When using AVX512, the regimes are similar:
296 : 0 <= R < 36 W=32, C=16, V=0 (one AVX vector)
297 : 36 <= R <= 64 W=32, C=32, V=32 (two AVX vectors)
298 : This combination has an expected value of about 33 cycles over all
299 : transactions. Again, the typical 3 account address account takes
300 : about 9 cycles and has a negligible false positive probability. */
301 :
302 :
303 : struct fd_chkdup_waddr {
304 : fd_acct_addr_t key; /* account address */
305 : };
306 : typedef struct fd_chkdup_waddr fd_chkdup_waddr_t;
307 : static const fd_acct_addr_t chkdup_null_addr = {{ 0 }};
308 :
309 : #define MAP_NAME fd_chkdup_pmap
310 1603396851 : #define MAP_T fd_chkdup_waddr_t
311 3456007446 : #define MAP_KEY_T fd_acct_addr_t
312 1603489797 : #define MAP_KEY_NULL chkdup_null_addr
313 5059406922 : #define MAP_KEY_INVAL(k) MAP_KEY_EQUAL(k, chkdup_null_addr)
314 5187856605 : #define MAP_KEY_EQUAL(k0,k1) (!memcmp((k0).b,(k1).b, FD_TXN_ACCT_ADDR_SZ))
315 : #define MAP_KEY_EQUAL_IS_SLOW 1
316 : #define MAP_MEMOIZE 0
317 1724356737 : #define MAP_KEY_HASH(key) ((uint)fd_ulong_hash( fd_ulong_load_8( (key).b ) ))
318 3577160058 : #define MAP_LG_SLOT_CNT 8
319 : #include "../../util/tmpl/fd_map.c"
320 :
321 :
322 : struct __attribute__((aligned(FD_CHKDUP_ALIGN))) fd_chkdup_private {
323 : #if FD_CHKDUP_IMPL >= 1
324 : uchar entropy[ 32*FD_CHKDUP_IMPL ];
325 : #endif
326 :
327 : fd_chkdup_waddr_t hashmap[ 1UL<<8 ];
328 : };
329 :
330 : static inline void *
331 : fd_chkdup_new( void * shmem,
332 747 : fd_rng_t * rng ) {
333 747 : fd_chkdup_t * chkdup = (fd_chkdup_t *)shmem;
334 747 : #if FD_CHKDUP_IMPL >= 1
335 32619 : for( ulong i=0UL; i<32*FD_CHKDUP_IMPL; i++ ) chkdup->entropy[ i ] = fd_rng_uchar( rng );
336 : #else
337 : (void)rng;
338 : #endif
339 747 : FD_TEST( fd_chkdup_pmap_footprint()==sizeof(chkdup->hashmap) ); /* Known at compile time */
340 :
341 747 : fd_chkdup_pmap_new( chkdup->hashmap );
342 747 : return chkdup;
343 747 : }
344 :
345 228 : static inline fd_chkdup_t * fd_chkdup_join ( void * shmem ) { return (fd_chkdup_t *)shmem; }
346 :
347 228 : static inline void * fd_chkdup_leave ( fd_chkdup_t * chkdup ) { return (void *)chkdup; }
348 228 : static inline void * fd_chkdup_delete( void * shmem ) { return shmem; }
349 :
350 :
351 : static inline int
352 : fd_chkdup_check ( fd_chkdup_t * chkdup,
353 : fd_acct_addr_t const * list0, ulong list0_cnt,
354 82622472 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
355 82622472 : if( FD_LIKELY( 0==fd_chkdup_check_fast( chkdup, list0, list0_cnt, list1, list1_cnt ) ) ) return 0;
356 401587 : return fd_chkdup_check_slow( chkdup, list0, list0_cnt, list1, list1_cnt );
357 82622472 : }
358 :
359 : static inline int
360 : fd_chkdup_check_slow( fd_chkdup_t * chkdup,
361 : fd_acct_addr_t const * list0, ulong list0_cnt,
362 69451504 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
363 69451504 : fd_chkdup_waddr_t * map = fd_chkdup_pmap_join( chkdup->hashmap );
364 69451504 : fd_chkdup_waddr_t * inserted[ FD_TXN_ACCT_ADDR_MAX ];
365 69451504 : ulong inserted_cnt = 0UL;
366 :
367 69451504 : int any_duplicates = 0;
368 69451504 : int skipped_inval = 0;
369 1162727608 : for( ulong i0=0UL; (i0<list0_cnt) & !any_duplicates; i0++ ) {
370 1093276104 : if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list0[ i0 ] ) ) ) {
371 : /* Okay if this is the 1st, but not if the 2nd */
372 2676 : any_duplicates |= skipped_inval;
373 2676 : skipped_inval = 1;
374 2676 : continue;
375 2676 : }
376 1093273428 : fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list0[ i0 ] );
377 1093273428 : inserted[ inserted_cnt++ ] = ins;
378 1093273428 : any_duplicates |= (NULL==ins);
379 1093273428 : inserted_cnt -= (ulong)(NULL==ins); /* Correct inserted_cnt if we just stored a NULL */
380 1093273428 : }
381 579574876 : for( ulong i1=0UL; (i1<list1_cnt) & !any_duplicates; i1++ ) {
382 510123372 : if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list1[ i1 ] ) ) ) {
383 696 : any_duplicates |= skipped_inval;
384 696 : skipped_inval = 1;
385 696 : continue;
386 696 : }
387 510122676 : fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list1[ i1 ] );
388 510122676 : inserted[ inserted_cnt++ ] = ins;
389 510122676 : any_duplicates |= (NULL==ins);
390 510122676 : inserted_cnt -= (ulong)(NULL==ins);
391 510122676 : }
392 :
393 : /* FIXME: This depends on undocumented map behavior for correctness.
394 : Deleting in the opposite order of insertion preserves previously
395 : inserted pointers. That behavior should be documented. */
396 1672750069 : for( ulong i=0UL; i<inserted_cnt; i++ ) fd_chkdup_pmap_remove( map, inserted[ inserted_cnt-1UL-i ] );
397 :
398 69451504 : fd_chkdup_pmap_leave( map );
399 :
400 69451504 : return any_duplicates;
401 69451504 : }
402 :
403 :
404 : #if FD_CHKDUP_IMPL==1
405 :
406 : /* AVX2 implementation */
407 : #include "../../util/simd/fd_avx.h"
408 : static inline int
409 : fd_chkdup_check_fast( fd_chkdup_t * chkdup,
410 : fd_acct_addr_t const * list0, ulong list0_cnt,
411 103114926 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
412 103114926 : if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;
413 :
414 83004846 : int any_duplicates = 0;
415 :
416 83004846 : const wu_t entropy = wb_ld( chkdup->entropy );
417 83004846 : const wu_t one = wu_bcast( 1U );
418 :
419 :
420 83004846 : if( FD_LIKELY( list0_cnt+list1_cnt<28UL ) ) {
421 : /* Single vector implementation */
422 46940070 : const wu_t mask = wu_bcast( 0x1FU );
423 :
424 46940070 : wu_t bloom = wu_zero();
425 534097470 : for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
426 487157400 : wu_t addr = wb_ldu( list0+i0 );
427 487157400 : wu_t masked = wu_and( wu_xor( addr, entropy ), mask );
428 487157400 : wu_t select = wu_shl_vector( one, masked );
429 : /* testc: "Compute the bitwise NOT of a and then AND with b, and
430 : [return] 1 if the result is zero." */
431 487157400 : any_duplicates |= _mm256_testc_si256( bloom, select );
432 487157400 : bloom = wu_or( bloom, select );
433 487157400 : }
434 222949634 : for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
435 176009564 : wu_t addr = wb_ldu( list1+i1 );
436 176009564 : wu_t masked = wu_and( wu_xor( addr, entropy ), mask );
437 176009564 : wu_t select = wu_shl_vector( one, masked );
438 176009564 : any_duplicates |= _mm256_testc_si256( bloom, select );
439 176009564 : bloom = wu_or( bloom, select );
440 176009564 : }
441 46940070 : return any_duplicates;
442 :
443 46940070 : } else {
444 : /* 4-vector implementation: slower but much better false positive
445 : rate so that we don't have to fall back to the slow path as
446 : frequently. */
447 36064776 : const wu_t mask = wu_bcast( 0x3FU );
448 :
449 36064776 : wu_t bloom0 = wu_zero(); wu_t bloom1 = wu_zero();
450 36064776 : wu_t bloom2 = wu_zero(); wu_t bloom3 = wu_zero();
451 1014851650 : for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
452 978786874 : wu_t addr = wb_ldu( list0+i0 );
453 978786874 : wu_t blinded = wu_xor( addr, entropy );
454 978786874 : wu_t masked0 = wu_and( mask, blinded ); wu_t masked1 = wu_and( mask, wu_shr( blinded, 6 ) );
455 978786874 : wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
456 978786874 : wu_t select0 = wu_shl_vector( one, masked0 ); wu_t select1 = wu_shl_vector( one, masked1 );
457 978786874 : wu_t select2 = wu_shl_vector( one, masked2 ); wu_t select3 = wu_shl_vector( one, masked3 );
458 :
459 978786874 : wu_t any_differences = wu_or(
460 978786874 : wu_or( wu_andnot( bloom0, select0 ), wu_andnot( bloom1, select1 ) ),
461 978786874 : wu_or( wu_andnot( bloom2, select2 ), wu_andnot( bloom3, select3 ) ) );
462 :
463 978786874 : bloom0 = wu_or( bloom0, select0 ); bloom1 = wu_or( bloom1, select1 );
464 978786874 : bloom2 = wu_or( bloom2, select2 ); bloom3 = wu_or( bloom3, select3 );
465 :
466 978786874 : any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
467 978786874 : FD_COMPILER_FORGET( any_duplicates );
468 978786874 : }
469 528234226 : for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
470 492169450 : wu_t addr = wb_ldu( list1+i1 );
471 492169450 : wu_t blinded = wu_xor( addr, entropy );
472 492169450 : wu_t masked0 = wu_and( mask, blinded ); wu_t masked1 = wu_and( mask, wu_shr( blinded, 6 ) );
473 492169450 : wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
474 492169450 : wu_t select0 = wu_shl_vector( one, masked0 ); wu_t select1 = wu_shl_vector( one, masked1 );
475 492169450 : wu_t select2 = wu_shl_vector( one, masked2 ); wu_t select3 = wu_shl_vector( one, masked3 );
476 :
477 492169450 : wu_t any_differences = wu_or(
478 492169450 : wu_or( wu_andnot( bloom0, select0 ), wu_andnot( bloom1, select1 ) ),
479 492169450 : wu_or( wu_andnot( bloom2, select2 ), wu_andnot( bloom3, select3 ) ) );
480 :
481 492169450 : bloom0 = wu_or( bloom0, select0 ); bloom1 = wu_or( bloom1, select1 );
482 492169450 : bloom2 = wu_or( bloom2, select2 ); bloom3 = wu_or( bloom3, select3 );
483 :
484 492169450 : any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
485 492169450 : FD_COMPILER_FORGET( any_duplicates );
486 492169450 : }
487 36064776 : return any_duplicates;
488 36064776 : }
489 83004846 : }
490 :
491 : #elif FD_CHKDUP_IMPL==2
492 :
493 : /* AVX512 implementation */
494 : #include "../../util/simd/fd_avx512.h"
495 : static inline int
496 : fd_chkdup_check_fast( fd_chkdup_t * chkdup,
497 : fd_acct_addr_t const * list0, ulong list0_cnt,
498 51557463 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
499 51557463 : if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;
500 :
501 41502423 : int any_duplicates = 0;
502 :
503 41502423 : const wwu_t entropy = wwu_ld( (uint const *)chkdup->entropy );
504 41502423 : const wwu_t one = wwu_bcast( 1U );
505 :
506 41502423 : if( FD_LIKELY( list0_cnt+list1_cnt<36UL ) ) {
507 : /* One vector version */
508 : /* Our analysis assumed the 64 bytes of hash were all independent,
509 : but if we just xor and then use the low 5 bits of both parts of
510 : the vector, we get a lot more false positives than the math
511 : predicts. */
512 31470169 : const wwu_t mask = wwu_bcast( 0x1FU );
513 31470169 : wwu_t bloom = wwu_zero();
514 493050517 : for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
515 461580348 : wwu_t addr = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
516 461580348 : wwu_t blinded = wwu_xor( addr, entropy );
517 461580348 : wwu_t masked = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
518 461580348 : wwu_t select = _mm512_rolv_epi32( one, masked );
519 461580348 : wwu_t next = wwu_or( bloom, select );
520 461580348 : __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE ); /* if non-zero, not a duplicate */
521 461580348 : bloom = next;
522 : /* kortestz_mask8_u8: "Compute the bitwise OR of 8-bit masks a and
523 : b. If the result is all zeroes, [return] 1" */
524 461580348 : any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
525 461580348 : FD_COMPILER_FORGET( any_duplicates );
526 461580348 : }
527 157477713 : for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
528 126007544 : wwu_t addr = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
529 126007544 : wwu_t blinded = wwu_xor( addr, entropy );
530 126007544 : wwu_t masked = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
531 126007544 : wwu_t select = _mm512_rolv_epi32( one, masked );
532 126007544 : wwu_t next = wwu_or( bloom, select );
533 126007544 : __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE );
534 126007544 : bloom = next;
535 126007544 : any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
536 126007544 : FD_COMPILER_FORGET( any_duplicates );
537 126007544 : }
538 31470169 : return any_duplicates;
539 31470169 : } else {
540 : /* Two vector version */
541 10032254 : const wwu_t mask = wwu_bcast( 0x3FU );
542 10032254 : const wwu_t shift0 = wwu( 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,
543 10032254 : 6U, 6U, 6U, 6U, 6U, 6U, 6U, 6U );
544 10032254 : const wwu_t shift1 = wwu( 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U,
545 10032254 : 18U, 18U, 18U, 18U, 18U, 18U, 18U, 18U );
546 10032254 : wwu_t bloom0 = wwu_zero(); wwu_t bloom1 = wwu_zero();
547 259426680 : for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
548 249394426 : wwu_t addr = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
549 249394426 : wwu_t blinded = wwu_xor( addr, entropy );
550 249394426 : wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) ); wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
551 249394426 : wwu_t select0 = wwu_shl_vector( one, masked0 ); wwu_t select1 = wwu_shl_vector( one, masked1 );
552 249394426 : wwu_t next0 = wwu_or( bloom0, select0 ); wwu_t next1 = wwu_or( bloom1, select1 );
553 249394426 : __mmask8 any_differences = _kor_mask8(
554 249394426 : _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ), _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );
555 :
556 249394426 : bloom0 = next0; bloom1 = next1;
557 :
558 249394426 : any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
559 249394426 : FD_COMPILER_FORGET( any_duplicates );
560 249394426 : }
561 240113566 : for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
562 230081312 : wwu_t addr = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
563 230081312 : wwu_t blinded = wwu_xor( addr, entropy );
564 230081312 : wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) ); wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
565 230081312 : wwu_t select0 = wwu_shl_vector( one, masked0 ); wwu_t select1 = wwu_shl_vector( one, masked1 );
566 230081312 : wwu_t next0 = wwu_or( bloom0, select0 ); wwu_t next1 = wwu_or( bloom1, select1 );
567 230081312 : __mmask8 any_differences = _kor_mask8(
568 230081312 : _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ), _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );
569 :
570 230081312 : bloom0 = next0; bloom1 = next1;
571 :
572 230081312 : any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
573 230081312 : FD_COMPILER_FORGET( any_duplicates );
574 230081312 : }
575 10032254 : return any_duplicates;
576 10032254 : }
577 41502423 : }
578 :
579 : #else
580 :
581 : static inline int
582 : fd_chkdup_check_fast( fd_chkdup_t * chkdup,
583 : fd_acct_addr_t const * list0, ulong list0_cnt,
584 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
585 : (void)chkdup;
586 : (void)list0;
587 : (void)list1;
588 : (void)list0_cnt;
589 : (void)list1_cnt;
590 : return 1;
591 : }
592 :
593 : #endif
594 :
595 :
596 : FD_PROTOTYPES_END
597 :
598 : #endif /* HEADER_fd_src_ballet_pack_fd_chkdup_h */
|