Line data Source code
1 : #ifndef HEADER_fd_src_ballet_pack_fd_chkdup_h
2 : #define HEADER_fd_src_ballet_pack_fd_chkdup_h
3 : #include "../../ballet/fd_ballet_base.h"
4 : #include "../../ballet/txn/fd_txn.h"
5 :
6 : /* fd_chkdup declares a set of functions for ultra-HPC checking if a
7 : list of account addresses contains any duplicates. It's important
8 : that this be fast, because a transaction containing duplicate account
9 : addresses fails to sanitize and is not charged a fee. Although this
   check can (and really ought to) be done in parallel, perhaps in the
11 : verify tiles, right now, it's done in pack, which means it's serial
12 : and on the critical path.
13 :
14 : On platforms with AVX, the current implementation uses a fast initial
15 : check which may have false positives (thinking there are duplicates
16 : when there aren't). Any transaction that fails the initial check is
17 : then subjected to the full, precise check. Without AVX, all
18 : transactions use the slow path. */
19 :
20 : /* The functions are defined in the header to facilitate inlining since
21 : they take 10s of cycles in the good case, but should probably be
22 : treated as if they were defined in a .c file. */
23 :
24 : #ifndef FD_CHKDUP_IMPL
25 : # if FD_HAS_AVX512
26 : # include "../../util/simd/fd_avx.h"
27 16641 : # define FD_CHKDUP_IMPL 2
28 : # elif FD_HAS_AVX
29 16898 : # define FD_CHKDUP_IMPL 1
30 : # else
31 : # define FD_CHKDUP_IMPL 0
32 : # endif
33 : #endif
34 :
35 :
36 : #if FD_CHKDUP_IMPL==2
37 : # define FD_CHKDUP_ALIGN ( 64UL)
38 : #elif FD_CHKDUP_IMPL==1
39 : # define FD_CHKDUP_ALIGN ( 32UL)
40 : #elif FD_CHKDUP_IMPL==0
41 : # define FD_CHKDUP_ALIGN ( 32UL)
42 : #else
43 : # error "Unrecognized value of FD_CHKDUP_IMPL"
44 : #endif
45 :
46 :
47 : # define FD_CHKDUP_FOOTPRINT FD_LAYOUT_FINI( FD_LAYOUT_APPEND( \
48 : FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \
49 : FD_CHKDUP_ALIGN, 32*FD_CHKDUP_IMPL ), \
50 : 32UL, (1UL<<8)*32UL ), \
51 : FD_CHKDUP_ALIGN )
52 :
53 : FD_STATIC_ASSERT( (1UL<<8)==2*FD_TXN_ACCT_ADDR_MAX, "hash table size" );
54 :
55 : /* Fixed size (just over 8kB) and safe for declaration on the stack or
56 : inclusion in a struct. */
57 : struct fd_chkdup_private;
58 : typedef struct fd_chkdup_private fd_chkdup_t;
59 :
60 : FD_PROTOTYPES_BEGIN
61 : /* fd_chkdup_{footprint, align} return the footprint and alignment of
62 : the scratch memory that duplicate detection requires. */
63 0 : static inline ulong fd_chkdup_footprint( void ) { return FD_CHKDUP_FOOTPRINT; }
64 0 : static inline ulong fd_chkdup_align ( void ) { return FD_CHKDUP_ALIGN; }
65 :
66 : /* fd_chkdup_new formats an appropriately sized region of memory for use
67 : in duplicate address detection. shmem must point to the first byte
68 : of a region of memory with the appropriate alignment and footprint.
69 : rng must be a pointer to a local join of an RNG. Some slots of the
70 : RNG will be consumed, but no interest in the RNG will be retained
71 : after the function returns. Returns shmem on success and NULL on
72 : failure (logs details). The only failure cases are if shmem is NULL
73 : or not aligned.
74 :
75 : fd_chkdup_join joins the caller to the formatted region of memory.
76 : Returns shmem.
77 :
78 : fd_chkdup_leave unjoins the caller to chkdup. Returns chkdup.
79 : fd_chkdup_delete unformats the region of memory. Returns a pointer
80 : to the unformatted memory region. */
81 : static inline void * fd_chkdup_new ( void * shmem, fd_rng_t * rng );
82 : static inline fd_chkdup_t * fd_chkdup_join ( void * shmem );
83 : static inline void * fd_chkdup_leave ( fd_chkdup_t * chkdup );
84 : static inline void * fd_chkdup_delete( void * shmem );
85 :
86 :
87 : /* fd_chkdup_check{,_slow,_fast} check a list of account addresses for
88 : any duplicate addresses, i.e. an account address that appears twice
89 : in the list. The list does not need to be sorted or have any
90 : particular order. The list may be decomposed into two sublists
91 : (list0 and list1) to facilitate 0-copy usage with address lookup
92 : tables, but list0 and list1 are logically concatenated prior to
93 : checking for duplicates.
94 :
95 : chkdup is a pointer to a valid local join of a chkdup object.
96 :
97 : list0 and list1 point to the first account address of the respective
98 : sublists. The memory they point to need not have any particular
99 : alignment. list0==NULL is okay only if list0_cnt==0, and similarly
100 : for list1. list0 is accessed with indices [0, list0_cnt) and list1
101 : is accessed with indices [0, list1_cnt). list0 and list1 must not
102 : overlap. Requires list0_cnt+list1_cnt<=128, and the function is
103 : somewhat tuned for smaller values.
104 :
105 : fd_chkdup_check and the _slow version return 1 if the list of
106 : transactions contains at least one duplicated account address and 0
107 : otherwise (implying each account address in the provided list is
108 : unique).
109 :
110 : fd_chkdup_check_fast returns 1 if the list of transactions contains
111 : at least one duplicated account address and typically returns 0 if
112 : each account address in the provided list is unique, but may
   sometimes spuriously return 1 even without duplicates.
114 :
115 : WARNING: the _fast version MAY HAVE FALSE POSITIVES. You probably
116 : want the un-suffixed version, which is precise. It uses the fast
117 : version as a fast-path and then does a slower full check if the
118 : fast-path suggests there may be a duplicate.
119 :
120 : However, it's also worth calling out again that the _fast version
121 : only makes errors in one direction. If the list contains duplicates,
122 : it will definitely return 1. If it returns 0, the list definitely
123 : does not contain duplicates. (Those two statements are equivalent).
124 : */
125 : static inline int
126 : fd_chkdup_check ( fd_chkdup_t * chkdup,
127 : fd_acct_addr_t const * list0, ulong list0_cnt,
128 : fd_acct_addr_t const * list1, ulong list1_cnt );
129 : static inline int
130 : fd_chkdup_check_slow( fd_chkdup_t * chkdup,
131 : fd_acct_addr_t const * list0, ulong list0_cnt,
132 : fd_acct_addr_t const * list1, ulong list1_cnt );
133 : static inline int
134 : fd_chkdup_check_fast( fd_chkdup_t * chkdup,
135 : fd_acct_addr_t const * list0, ulong list0_cnt,
136 : fd_acct_addr_t const * list1, ulong list1_cnt );
137 :
138 :
139 : /* ----- Implementation details and discussion follow------
140 :
141 : The fast path implementation is somewhat interesting. The normal way
142 : to do this is with a Bloom filter, but Bloom filters do lots of
143 : unpredictable reads and the pack tile is somewhat cache sensitive.
144 : Instead, this implements a variant on a Bloom filter that lives
145 : entirely in AVX registers.
146 :
147 : Basically, we use C W-bit words stored in (C*W)/256 AVX2 registers or
148 : (C*W)/512 AVX512 registers. Each word is a modified Bloom filter
149 : with one associated hash function. For each account address, we
150 : compute C hashes, giving (up to) C positions across the AVX
151 : registers. If any of those positions have an unset bit, then we know
152 : we have not seen the account address before. Finally, we then set
153 : all the positions.
154 :
155 : The only difference between this idea and a normal Bloom filter is
156 : that sometimes the hash function may not select a bit. There's a
157 : tradeoff to be made here: suppose you insert R account addresses, and
158 : R is at least almost as large as W. Then each word fills up, and
159 : false positives become increasingly likely. Only testing and
160 : inserting a bit, say, half the time, effectively halves C, but
161 : prevents each word from getting saturated as quickly, and makes the
162 : algorithm effective for larger values of R. We use Intel's quirky
163 : behavior to get this for free.
164 :
165 : (Note: The R and C notation is supposed to suggest a tabular layout
166 : in which the account addresses are rows and the words are columns.)
167 :
168 : The key insight into making this work quickly is that the vpsllv{d,q}
169 : variable left shift instructions are cheap (1 cycle, can execute on
170 : port 0 or 1 for AVX2, still 1 cycle on port 0 for AVX512). If we can
171 : compute the hashes in parallel with SIMD, then we can variably shift
172 : bcast(0x1) quickly, and select several bits at a time. The rest is
173 : just bitwise logic, which is extremely cheap with AVX. This approach
174 : constrains W to be either 32 or 64, and C to be a multiple of the
175 : number of words in a vector, but those are both pretty acceptable
176 : constraints.
177 :
178 : The final ingredient is a cheap hash function that places the hashes
179 : in the appropriate position for vpsllv. We just xor the account
180 : address with some validator-specific entropy and use a mask to select
181 : certain bits.
182 :
183 : The slow-path implementation uses a hash table to check for
184 : duplicates. This is slower than sorting for transactions with only a
185 : few account addresses, but substantially faster than sorting for
186 : transactions with large numbers of account addresses, which is when
187 : the slow-path matters more anyway.
188 :
189 :
190 : You can see from above there are a variety of knobs to tune. Should
191 : W be 32 or 64? How big should C be? How many bits should we mask
192 : off for the hash, which controls the frequency with which we skip a
193 : word, neither checking nor inserting a bit? It would be good to have
194 : a rigorous understanding of the false positive rate as a function of
195 : these parameters so we can make a decision that minimizes the
196 : expected compute required.
197 :
198 : Unfortunately, the false positive computation is tricky. The key
199 : difficulty is that whether processing account address r results in a
200 : false positive is not independent from whether processing account
201 : address r+1 results in a false positive. This leads to an obnoxious
202 : inclusion-exclusion formula which quickly becomes more unwieldy than
203 : I (or Sage) can handle.
204 :
205 : A dynamic-programming-ish algorithm can compute the false positive
206 : rate in approximately O(R*2^R) time. To start, we just want to
207 : understand a single word/column. Suppose k bits in the word have
208 : been set. Then there are (W-k) hash values that set a new bit, and
   (V+k) hash values that don't set a new bit, where V is the number of hash
210 : values that don't select a bit. The hash values that set a bit are
211 : also exactly the ones that provide information that the account
212 : address is not a false positive. I think about this as "spending a
213 : bit" to know it's not a false positive.
214 :
215 : Denoting the rows at which we spend a bit by a 1 and the rows at
216 : which we don't spend a bit by 0, we might get a column like:
217 :
218 : 1
219 : 0
220 : 1
221 : 1
222 : 0.
223 : The number of ways this column can occur is x1 =
224 : W*(V+1)*(W-1)*(W-2)*(V+3), which means that the probability the
225 : column occurs is x1/(W+V)^5. Expanding to multiple columns is easy,
226 : since the number of ways that two specific columns can occur is just
227 : the product of the number of ways each can occur. For example,
228 : 1 0
229 : 0 1
230 : 1 1
231 : 1 0
232 : 0 0
233 : can occur x1 * x2 ways, where x2 = V*W*(W-1)*(V+2)*(V+2).
234 : A false positive happens when there's a row of all 0s, as in the last
235 : row of the example.
236 :
237 : It's cleaner to count the number of ways not to get a false positive.
238 : This gives us the inclusion-exclusion formula:
239 : (all ways)
240 : - (all ways where row 0 is all 0s)
241 : - (all ways where row 1 is all 0s)
242 : - ...
243 : + (all ways where rows 0 and 1 are both all 0s)
244 : + (all ways where rows 0 and 2 are both all 0s)
245 : + ...
246 : - (all ways where rows 0, 1, and 2 are all 0s)
247 : - (all ways where rows 0, 1, and 3 are all 0s)
248 : +, - ...
249 : + (-1)^R (all ways in which all rows are all 0s).
250 :
251 : There's a nice way to understand each of these terms. For example,
252 : in the R=2, C=3 case, the term in which row 0 is all 0s has the
253 : following elements:
254 : 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
255 : 0 0 0 0 0 1 0 1 0 0 1 1 1 0 0 1 0 1 1 1 0 1 1 1
256 : Rather than enumerating all 2^( (R-1)*C ) elements, we'll represent
257 : it as
258 : ( 0 + 0 )^3
259 : ( 0 1 )
260 :
261 : Skipping some steps, the task boils down to counting the number of
262 : ways to get columns that match a mask, then raising that to the Cth
263 : power.
264 :
265 : Now, with all this behind us, our goal is to pick the optimal value
266 : of W,V, and C given a supposed distribution of transactions, and a
267 : performance model. Based on some back of the envelope calculations
268 : based on instruction throughput and latency measurements and
269 : confirmed by some experiments, the fast path code takes about 3.5*R
270 : cycles if using one AVX2 vector (W*C==256) and
271 : 2.5*R+2*ceil(W*C/256)*R cycles otherwise. The slow path takes about
272 : 133*J cycles. Then the expected value of the number of cycles it
273 : takes to process a transaction with R accounts is
274 : R*(2.5+2*ceil(W*C/256) - [W*C<=256]) + FP_{W,V,R,C}*133*R
275 :
276 : Based on a sample of 100,000 slots containing about 100M
277 : transactions, the CDF looks like
278 :
279 : Fraction of transactions containing <= 3 account addresses 71%
280 : <= 13 account addresses 80%
281 : <= 24 account addresses 91%
282 : <= 31 account addresses 95%
283 : <= 44 account addresses 98%
284 : <= 50 account addresses 99%
285 :
286 : Basically, there's a peak at 3 (votes), and then a very long, very
287 : fat tail. When using AVX2, it basically boils down into 2 regimes:
288 : 0 <= R < 28 W=32, C=8, V=0 (one AVX vector)
289 : 28 <= R <= 64 W=32, C=32, V=32 (four AVX vectors)
290 :
291 : This combination has an expected value of about 54 cycles over all
292 : transactions. For a typical transaction with 3 account addresses,
293 : this takes about 10 cycles and the false positive probability is
294 : about 2e-10.
295 :
296 : When using AVX512, the regimes are similar:
297 : 0 <= R < 36 W=32, C=16, V=0 (one AVX vector)
298 : 36 <= R <= 64 W=32, C=32, V=32 (two AVX vectors)
299 : This combination has an expected value of about 33 cycles over all
300 : transactions. Again, the typical 3 account address account takes
301 : about 9 cycles and has a negligible false positive probability. */
302 :
303 :
304 : struct fd_chkdup_waddr {
305 : fd_acct_addr_t key; /* account address */
306 : };
307 : typedef struct fd_chkdup_waddr fd_chkdup_waddr_t;
308 : static const fd_acct_addr_t chkdup_null_addr = {{ 0 }};
309 :
310 : #define MAP_NAME fd_chkdup_pmap
311 1603396980 : #define MAP_T fd_chkdup_waddr_t
312 3456007665 : #define MAP_KEY_T fd_acct_addr_t
313 1603495281 : #define MAP_KEY_NULL chkdup_null_addr
314 5059407249 : #define MAP_KEY_INVAL(k) MAP_KEY_EQUAL(k, chkdup_null_addr)
315 5187856935 : #define MAP_KEY_EQUAL(k0,k1) (!memcmp((k0).b,(k1).b, FD_TXN_ACCT_ADDR_SZ))
316 : #define MAP_KEY_EQUAL_IS_SLOW 1
317 : #define MAP_MEMOIZE 0
318 1724356845 : #define MAP_KEY_HASH(key) ((uint)fd_ulong_hash( fd_ulong_load_8( (key).b ) ))
319 3577166442 : #define MAP_LG_SLOT_CNT 8
320 : #include "../../util/tmpl/fd_map.c"
321 :
322 :
323 : struct __attribute__((aligned(FD_CHKDUP_ALIGN))) fd_chkdup_private {
324 : #if FD_CHKDUP_IMPL >= 1
325 : uchar entropy[ 32*FD_CHKDUP_IMPL ];
326 : #endif
327 :
328 : fd_chkdup_waddr_t hashmap[ 1UL<<8 ];
329 : };
330 :
331 : static inline void *
332 : fd_chkdup_new( void * shmem,
333 768 : fd_rng_t * rng ) {
334 768 : fd_chkdup_t * chkdup = (fd_chkdup_t *)shmem;
335 768 : #if FD_CHKDUP_IMPL >= 1
336 33536 : for( ulong i=0UL; i<32*FD_CHKDUP_IMPL; i++ ) chkdup->entropy[ i ] = fd_rng_uchar( rng );
337 : #else
338 : (void)rng;
339 : #endif
340 768 : FD_TEST( fd_chkdup_pmap_footprint()==sizeof(chkdup->hashmap) ); /* Known at compile time */
341 :
342 768 : fd_chkdup_pmap_new( chkdup->hashmap );
343 768 : return chkdup;
344 768 : }
345 :
346 243 : static inline fd_chkdup_t * fd_chkdup_join ( void * shmem ) { return (fd_chkdup_t *)shmem; }
347 :
348 228 : static inline void * fd_chkdup_leave ( fd_chkdup_t * chkdup ) { return (void *)chkdup; }
349 228 : static inline void * fd_chkdup_delete( void * shmem ) { return shmem; }
350 :
351 :
352 : static inline int
353 : fd_chkdup_check ( fd_chkdup_t * chkdup,
354 : fd_acct_addr_t const * list0, ulong list0_cnt,
355 82631337 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
356 82631337 : if( FD_LIKELY( 0==fd_chkdup_check_fast( chkdup, list0, list0_cnt, list1, list1_cnt ) ) ) return 0;
357 401593 : return fd_chkdup_check_slow( chkdup, list0, list0_cnt, list1, list1_cnt );
358 82631337 : }
359 :
360 : static inline int
361 : fd_chkdup_check_slow( fd_chkdup_t * chkdup,
362 : fd_acct_addr_t const * list0, ulong list0_cnt,
363 69451510 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
364 69451510 : fd_chkdup_waddr_t * map = fd_chkdup_pmap_join( chkdup->hashmap );
365 69451510 : fd_chkdup_waddr_t * inserted[ FD_TXN_ACCT_ADDR_MAX ];
366 69451510 : ulong inserted_cnt = 0UL;
367 :
368 69451510 : int any_duplicates = 0;
369 69451510 : int skipped_inval = 0;
370 1162727722 : for( ulong i0=0UL; (i0<list0_cnt) & !any_duplicates; i0++ ) {
371 1093276212 : if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list0[ i0 ] ) ) ) {
372 : /* Okay if this is the 1st, but not if the 2nd */
373 2676 : any_duplicates |= skipped_inval;
374 2676 : skipped_inval = 1;
375 2676 : continue;
376 2676 : }
377 1093273536 : fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list0[ i0 ] );
378 1093273536 : inserted[ inserted_cnt++ ] = ins;
379 1093273536 : any_duplicates |= (NULL==ins);
380 1093273536 : inserted_cnt -= (ulong)(NULL==ins); /* Correct inserted_cnt if we just stored a NULL */
381 1093273536 : }
382 579574882 : for( ulong i1=0UL; (i1<list1_cnt) & !any_duplicates; i1++ ) {
383 510123372 : if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list1[ i1 ] ) ) ) {
384 696 : any_duplicates |= skipped_inval;
385 696 : skipped_inval = 1;
386 696 : continue;
387 696 : }
388 510122676 : fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list1[ i1 ] );
389 510122676 : inserted[ inserted_cnt++ ] = ins;
390 510122676 : any_duplicates |= (NULL==ins);
391 510122676 : inserted_cnt -= (ulong)(NULL==ins);
392 510122676 : }
393 :
394 : /* FIXME: This depends on undocumented map behavior for correctness.
395 : Deleting in the opposite order of insertion preserves previously
396 : inserted pointers. That behavior should be documented. */
397 1672750183 : for( ulong i=0UL; i<inserted_cnt; i++ ) fd_chkdup_pmap_remove( map, inserted[ inserted_cnt-1UL-i ] );
398 :
399 69451510 : fd_chkdup_pmap_leave( map );
400 :
401 69451510 : return any_duplicates;
402 69451510 : }
403 :
404 :
405 : #if FD_CHKDUP_IMPL==1
406 :
407 : /* AVX2 implementation */
408 : #include "../../util/simd/fd_avx.h"
409 : static inline int
410 : fd_chkdup_check_fast( fd_chkdup_t * chkdup,
411 : fd_acct_addr_t const * list0, ulong list0_cnt,
412 103120836 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
413 103120836 : if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;
414 :
415 83010756 : int any_duplicates = 0;
416 :
417 83010756 : const wu_t entropy = wb_ld( chkdup->entropy );
418 83010756 : const wu_t one = wu_bcast( 1U );
419 :
420 :
421 83010756 : if( FD_LIKELY( list0_cnt+list1_cnt<28UL ) ) {
422 : /* Single vector implementation */
423 46945980 : const wu_t mask = wu_bcast( 0x1FU );
424 :
425 46945980 : wu_t bloom = wu_zero();
426 534133132 : for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
427 487187152 : wu_t addr = wb_ldu( list0+i0 );
428 487187152 : wu_t masked = wu_and( wu_xor( addr, entropy ), mask );
429 487187152 : wu_t select = wu_shl_vector( one, masked );
430 : /* testc: "Compute the bitwise NOT of a and then AND with b, and
431 : [return] 1 if the result is zero." */
432 487187152 : any_duplicates |= _mm256_testc_si256( bloom, select );
433 487187152 : bloom = wu_or( bloom, select );
434 487187152 : }
435 222955544 : for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
436 176009564 : wu_t addr = wb_ldu( list1+i1 );
437 176009564 : wu_t masked = wu_and( wu_xor( addr, entropy ), mask );
438 176009564 : wu_t select = wu_shl_vector( one, masked );
439 176009564 : any_duplicates |= _mm256_testc_si256( bloom, select );
440 176009564 : bloom = wu_or( bloom, select );
441 176009564 : }
442 46945980 : return any_duplicates;
443 :
444 46945980 : } else {
445 : /* 4-vector implementation: slower but much better false positive
446 : rate so that we don't have to fall back to the slow path as
447 : frequently. */
448 36064776 : const wu_t mask = wu_bcast( 0x3FU );
449 :
450 36064776 : wu_t bloom0 = wu_zero(); wu_t bloom1 = wu_zero();
451 36064776 : wu_t bloom2 = wu_zero(); wu_t bloom3 = wu_zero();
452 1014851650 : for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
453 978786874 : wu_t addr = wb_ldu( list0+i0 );
454 978786874 : wu_t blinded = wu_xor( addr, entropy );
455 978786874 : wu_t masked0 = wu_and( mask, blinded ); wu_t masked1 = wu_and( mask, wu_shr( blinded, 6 ) );
456 978786874 : wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
457 978786874 : wu_t select0 = wu_shl_vector( one, masked0 ); wu_t select1 = wu_shl_vector( one, masked1 );
458 978786874 : wu_t select2 = wu_shl_vector( one, masked2 ); wu_t select3 = wu_shl_vector( one, masked3 );
459 :
460 978786874 : wu_t any_differences = wu_or(
461 978786874 : wu_or( wu_andnot( bloom0, select0 ), wu_andnot( bloom1, select1 ) ),
462 978786874 : wu_or( wu_andnot( bloom2, select2 ), wu_andnot( bloom3, select3 ) ) );
463 :
464 978786874 : bloom0 = wu_or( bloom0, select0 ); bloom1 = wu_or( bloom1, select1 );
465 978786874 : bloom2 = wu_or( bloom2, select2 ); bloom3 = wu_or( bloom3, select3 );
466 :
467 978786874 : any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
468 978786874 : FD_COMPILER_FORGET( any_duplicates );
469 978786874 : }
470 528234226 : for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
471 492169450 : wu_t addr = wb_ldu( list1+i1 );
472 492169450 : wu_t blinded = wu_xor( addr, entropy );
473 492169450 : wu_t masked0 = wu_and( mask, blinded ); wu_t masked1 = wu_and( mask, wu_shr( blinded, 6 ) );
474 492169450 : wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
475 492169450 : wu_t select0 = wu_shl_vector( one, masked0 ); wu_t select1 = wu_shl_vector( one, masked1 );
476 492169450 : wu_t select2 = wu_shl_vector( one, masked2 ); wu_t select3 = wu_shl_vector( one, masked3 );
477 :
478 492169450 : wu_t any_differences = wu_or(
479 492169450 : wu_or( wu_andnot( bloom0, select0 ), wu_andnot( bloom1, select1 ) ),
480 492169450 : wu_or( wu_andnot( bloom2, select2 ), wu_andnot( bloom3, select3 ) ) );
481 :
482 492169450 : bloom0 = wu_or( bloom0, select0 ); bloom1 = wu_or( bloom1, select1 );
483 492169450 : bloom2 = wu_or( bloom2, select2 ); bloom3 = wu_or( bloom3, select3 );
484 :
485 492169450 : any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
486 492169450 : FD_COMPILER_FORGET( any_duplicates );
487 492169450 : }
488 36064776 : return any_duplicates;
489 36064776 : }
490 83010756 : }
491 :
492 : #elif FD_CHKDUP_IMPL==2
493 :
494 : /* AVX512 implementation */
495 : #include "../../util/simd/fd_avx512.h"
496 : static inline int
497 : fd_chkdup_check_fast( fd_chkdup_t * chkdup,
498 : fd_acct_addr_t const * list0, ulong list0_cnt,
499 51560418 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
500 51560418 : if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0UL;
501 :
502 41505378 : int any_duplicates = 0;
503 :
504 41505378 : const wwu_t entropy = wwu_ld( (uint const *)chkdup->entropy );
505 41505378 : const wwu_t one = wwu_bcast( 1U );
506 :
507 41505378 : if( FD_LIKELY( list0_cnt+list1_cnt<36UL ) ) {
508 : /* One vector version */
509 : /* Our analysis assumed the 64 bytes of hash were all independent,
510 : but if we just xor and then use the low 5 bits of both parts of
511 : the vector, we get a lot more false positives than the math
512 : predicts. */
513 31473124 : const wwu_t mask = wwu_bcast( 0x1FU );
514 31473124 : wwu_t bloom = wwu_zero();
515 493068348 : for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
516 461595224 : wwu_t addr = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
517 461595224 : wwu_t blinded = wwu_xor( addr, entropy );
518 461595224 : wwu_t masked = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
519 461595224 : wwu_t select = _mm512_rolv_epi32( one, masked );
520 461595224 : wwu_t next = wwu_or( bloom, select );
521 461595224 : __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE ); /* if non-zero, not a duplicate */
522 461595224 : bloom = next;
523 : /* kortestz_mask8_u8: "Compute the bitwise OR of 8-bit masks a and
524 : b. If the result is all zeroes, [return] 1" */
525 461595224 : any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
526 461595224 : FD_COMPILER_FORGET( any_duplicates );
527 461595224 : }
528 157480668 : for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
529 126007544 : wwu_t addr = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
530 126007544 : wwu_t blinded = wwu_xor( addr, entropy );
531 126007544 : wwu_t masked = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
532 126007544 : wwu_t select = _mm512_rolv_epi32( one, masked );
533 126007544 : wwu_t next = wwu_or( bloom, select );
534 126007544 : __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE );
535 126007544 : bloom = next;
536 126007544 : any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
537 126007544 : FD_COMPILER_FORGET( any_duplicates );
538 126007544 : }
539 31473124 : return any_duplicates;
540 31473124 : } else {
541 : /* Two vector version */
542 10032254 : const wwu_t mask = wwu_bcast( 0x3FU );
543 10032254 : const wwu_t shift0 = wwu( 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,
544 10032254 : 6U, 6U, 6U, 6U, 6U, 6U, 6U, 6U );
545 10032254 : const wwu_t shift1 = wwu( 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U,
546 10032254 : 18U, 18U, 18U, 18U, 18U, 18U, 18U, 18U );
547 10032254 : wwu_t bloom0 = wwu_zero(); wwu_t bloom1 = wwu_zero();
548 259426680 : for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
549 249394426 : wwu_t addr = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
550 249394426 : wwu_t blinded = wwu_xor( addr, entropy );
551 249394426 : wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) ); wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
552 249394426 : wwu_t select0 = wwu_shl_vector( one, masked0 ); wwu_t select1 = wwu_shl_vector( one, masked1 );
553 249394426 : wwu_t next0 = wwu_or( bloom0, select0 ); wwu_t next1 = wwu_or( bloom1, select1 );
554 249394426 : __mmask8 any_differences = _kor_mask8(
555 249394426 : _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ), _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );
556 :
557 249394426 : bloom0 = next0; bloom1 = next1;
558 :
559 249394426 : any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
560 249394426 : FD_COMPILER_FORGET( any_duplicates );
561 249394426 : }
562 240113566 : for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
563 230081312 : wwu_t addr = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
564 230081312 : wwu_t blinded = wwu_xor( addr, entropy );
565 230081312 : wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) ); wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
566 230081312 : wwu_t select0 = wwu_shl_vector( one, masked0 ); wwu_t select1 = wwu_shl_vector( one, masked1 );
567 230081312 : wwu_t next0 = wwu_or( bloom0, select0 ); wwu_t next1 = wwu_or( bloom1, select1 );
568 230081312 : __mmask8 any_differences = _kor_mask8(
569 230081312 : _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ), _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );
570 :
571 230081312 : bloom0 = next0; bloom1 = next1;
572 :
573 230081312 : any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
574 230081312 : FD_COMPILER_FORGET( any_duplicates );
575 230081312 : }
576 10032254 : return any_duplicates;
577 10032254 : }
578 41505378 : }
579 :
580 : #else
581 :
582 : static inline int
583 : fd_chkdup_check_fast( fd_chkdup_t * chkdup,
584 : fd_acct_addr_t const * list0, ulong list0_cnt,
585 : fd_acct_addr_t const * list1, ulong list1_cnt ) {
586 : (void)chkdup;
587 : (void)list0;
588 : (void)list1;
589 : (void)list0_cnt;
590 : (void)list1_cnt;
591 : return 1;
592 : }
593 :
594 : #endif
595 :
596 :
597 : FD_PROTOTYPES_END
598 :
599 : #endif /* HEADER_fd_src_ballet_pack_fd_chkdup_h */
|