#ifndef HEADER_fd_src_disco_pack_fd_chkdup_h
#define HEADER_fd_src_disco_pack_fd_chkdup_h

#include "../../ballet/fd_ballet_base.h"
#include "../../ballet/txn/fd_txn.h"

/* fd_chkdup declares a set of functions for ultra-HPC checking if a
   list of account addresses contains any duplicates.  It's important
   that this be fast, because a transaction containing duplicate
   account addresses fails to sanitize and is not charged a fee.
   Although this check can (and really ought to) be done in parallel,
   perhaps in the verify tiles, right now it's done in pack, which
   means it's serial and on the critical path.

   On platforms with AVX, the current implementation uses a fast
   initial check which may have false positives (thinking there are
   duplicates when there aren't).  Any transaction that fails the
   initial check is then subjected to the full, precise check.
   Without AVX, all transactions use the slow path. */

/* The functions are defined in the header to facilitate inlining since
   they take 10s of cycles in the good case, but should probably be
   treated as if they were defined in a .c file. */

#ifndef FD_CHKDUP_IMPL
# if FD_HAS_AVX512
#  include "../../util/simd/fd_avx.h"
#  define FD_CHKDUP_IMPL 2
# elif FD_HAS_AVX
#  define FD_CHKDUP_IMPL 1
# else
#  define FD_CHKDUP_IMPL 0
# endif
#endif


#if FD_CHKDUP_IMPL==2
# define FD_CHKDUP_ALIGN ( 64UL)
#elif FD_CHKDUP_IMPL==1
# define FD_CHKDUP_ALIGN ( 32UL)
#elif FD_CHKDUP_IMPL==0
# define FD_CHKDUP_ALIGN ( 32UL)
#else
# error "Unrecognized value of FD_CHKDUP_IMPL"
#endif

# define FD_CHKDUP_FOOTPRINT FD_LAYOUT_FINI( FD_LAYOUT_APPEND(        \
           FD_LAYOUT_APPEND( FD_LAYOUT_INIT,                          \
             FD_CHKDUP_ALIGN, 32*FD_CHKDUP_IMPL ),                    \
           32UL, (1UL<<8)*32UL ),                                     \
           FD_CHKDUP_ALIGN )

FD_STATIC_ASSERT( (1UL<<8)==2*FD_TXN_ACCT_ADDR_MAX, "hash table size" );

/* Fixed size (just over 8kB) and safe for declaration on the stack or
   inclusion in a struct. */
struct fd_chkdup_private;
typedef struct fd_chkdup_private fd_chkdup_t;

FD_PROTOTYPES_BEGIN
/* fd_chkdup_{footprint, align} return the footprint and alignment of
   the scratch memory that duplicate detection requires. */
static inline ulong fd_chkdup_footprint( void ) { return FD_CHKDUP_FOOTPRINT; }
static inline ulong fd_chkdup_align    ( void ) { return FD_CHKDUP_ALIGN;     }

/* fd_chkdup_new formats an appropriately sized region of memory for
   use in duplicate address detection.  shmem must point to the first
   byte of a region of memory with the appropriate alignment and
   footprint.  rng must be a pointer to a local join of an RNG.  Some
   slots of the RNG will be consumed, but no interest in the RNG will
   be retained after the function returns.  Returns shmem on success
   and NULL on failure (logs details).  The only failure cases are if
   shmem is NULL or not aligned.

   fd_chkdup_join joins the caller to the formatted region of memory.
   Returns shmem.

   fd_chkdup_leave unjoins the caller from the chkdup object.  Returns
   chkdup.  fd_chkdup_delete unformats the region of memory.  Returns
   a pointer to the unformatted memory region. */
static inline void *        fd_chkdup_new   ( void * shmem, fd_rng_t * rng );
static inline fd_chkdup_t * fd_chkdup_join  ( void * shmem );
static inline void *        fd_chkdup_leave ( fd_chkdup_t * chkdup );
static inline void *        fd_chkdup_delete( void * shmem );

/* fd_chkdup_check{,_slow,_fast} check a list of account addresses for
   any duplicate addresses, i.e. an account address that appears twice
   in the list.  The list does not need to be sorted or have any
   particular order.  The list may be decomposed into two sublists
   (list0 and list1) to facilitate 0-copy usage with address lookup
   tables, but list0 and list1 are logically concatenated prior to
   checking for duplicates.

   chkdup is a pointer to a valid local join of a chkdup object.

   list0 and list1 point to the first account address of the
   respective sublists.  The memory they point to need not have any
   particular alignment.  list0==NULL is okay only if list0_cnt==0,
   and similarly for list1.  list0 is accessed with indices
   [0, list0_cnt) and list1 is accessed with indices [0, list1_cnt).
   list0 and list1 must not overlap.  Requires
   list0_cnt+list1_cnt<=128, and the function is somewhat tuned for
   smaller values.

   fd_chkdup_check and the _slow version return 1 if the list of
   account addresses contains at least one duplicated account address
   and 0 otherwise (implying each account address in the provided list
   is unique).

   fd_chkdup_check_fast returns 1 if the list of account addresses
   contains at least one duplicated account address and typically
   returns 0 if each account address in the provided list is unique,
   but may sometimes spuriously return 1 even without duplicates.

   WARNING: the _fast version MAY HAVE FALSE POSITIVES.  You probably
   want the un-suffixed version, which is precise.  It uses the fast
   version as a fast-path and then does a slower full check if the
   fast-path suggests there may be a duplicate.

   However, it's also worth calling out again that the _fast version
   only makes errors in one direction.  If the list contains
   duplicates, it will definitely return 1.  If it returns 0, the list
   definitely does not contain duplicates.  (Those two statements are
   equivalent.) */
static inline int
fd_chkdup_check     ( fd_chkdup_t *          chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt );
static inline int
fd_chkdup_check_slow( fd_chkdup_t *          chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt );
static inline int
fd_chkdup_check_fast( fd_chkdup_t *          chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt );
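
/* For example, a minimal usage sketch (illustrative only; `rng` is
   assumed to be a valid local join of an fd_rng_t, and accts/acct_cnt
   a parsed transaction's account address list):

     fd_chkdup_t   _chkdup[ 1 ];
     fd_chkdup_t * chkdup = fd_chkdup_join( fd_chkdup_new( _chkdup, rng ) );

     int has_dup = fd_chkdup_check( chkdup, accts, acct_cnt, NULL, 0UL );

   When a transaction references an address lookup table, the
   addresses loaded from the table can be passed as list1 so they
   don't need to be copied to be contiguous with the static list. */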


/* ----- Implementation details and discussion follow -----

   The fast path implementation is somewhat interesting.  The normal
   way to do this is with a Bloom filter, but Bloom filters do lots of
   unpredictable reads and the pack tile is somewhat cache sensitive.
   Instead, this implements a variant on a Bloom filter that lives
   entirely in AVX registers.

   Basically, we use C W-bit words stored in (C*W)/256 AVX2 registers
   or (C*W)/512 AVX512 registers.  Each word is a modified Bloom
   filter with one associated hash function.  For each account
   address, we compute C hashes, giving (up to) C positions across the
   AVX registers.  If any of those positions have an unset bit, then
   we know we have not seen the account address before.  Finally, we
   then set all the positions.

   The only difference between this idea and a normal Bloom filter is
   that sometimes the hash function may not select a bit.  There's a
   tradeoff to be made here: suppose you insert R account addresses,
   and R is at least almost as large as W.  Then each word fills up,
   and false positives become increasingly likely.  Only testing and
   inserting a bit, say, half the time, effectively halves C, but
   prevents each word from getting saturated as quickly, and makes the
   algorithm effective for larger values of R.  We use Intel's quirky
   behavior to get this for free: the vpsllv variable shifts produce 0
   (selecting no bit) when the shift amount is at least the element
   width.

   (Note: The R and C notation is supposed to suggest a tabular layout
   in which the account addresses are rows and the words are columns.)

   The key insight into making this work quickly is that the
   vpsllv{d,q} variable left shift instructions are cheap (1 cycle,
   can execute on port 0 or 1 for AVX2, still 1 cycle on port 0 for
   AVX512).  If we can compute the hashes in parallel with SIMD, then
   we can variably shift bcast(0x1) quickly, and select several bits
   at a time.  The rest is just bitwise logic, which is extremely
   cheap with AVX.  This approach constrains W to be either 32 or 64,
   and C to be a multiple of the number of words in a vector, but
   those are both pretty acceptable constraints.

   The final ingredient is a cheap hash function that places the
   hashes in the appropriate position for vpsllv.  We just xor the
   account address with some validator-specific entropy and use a mask
   to select certain bits.
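
   In scalar terms, one W=32 lane of the fast path works roughly like
   the following sketch (illustrative only, with hypothetical variable
   names; the real code below does all C lanes at once in SIMD):

     uint h      = addr_word ^ entropy_word;  (blind with entropy)
     uint idx    = h & 0x1FU;                 (5 hash bits -> bit index in [0,32))
     uint select = 1U << idx;
     int  hit    = (bloom & select)!=0U;      (bit already set in this word?)
     bloom      |= select;

   An address is flagged as a possible duplicate only if every one of
   the C words reports a hit.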

   The slow-path implementation uses a hash table to check for
   duplicates.  This is slower than sorting for transactions with only
   a few account addresses, but substantially faster than sorting for
   transactions with large numbers of account addresses, which is when
   the slow-path matters more anyway.


   You can see from above there are a variety of knobs to tune.
   Should W be 32 or 64?  How big should C be?  How many bits should
   we mask off for the hash, which controls the frequency with which
   we skip a word, neither checking nor inserting a bit?  It would be
   good to have a rigorous understanding of the false positive rate as
   a function of these parameters so we can make a decision that
   minimizes the expected compute required.

   Unfortunately, the false positive computation is tricky.  The key
   difficulty is that whether processing account address r results in
   a false positive is not independent from whether processing account
   address r+1 results in a false positive.  This leads to an
   obnoxious inclusion-exclusion formula which quickly becomes more
   unwieldy than I (or Sage) can handle.

   A dynamic-programming-ish algorithm can compute the false positive
   rate in approximately O(R*2^R) time.  To start, we just want to
   understand a single word/column.  Suppose k bits in the word have
   been set.  Then there are (W-k) hash values that set a new bit, and
   (V+k) hash values that don't set a new bit, where V is the number
   of hash values that don't select a bit at all.  The hash values
   that set a bit are also exactly the ones that provide information
   that the account address is not a false positive.  I think about
   this as "spending a bit" to know it's not a false positive.

   Denoting the rows at which we spend a bit by a 1 and the rows at
   which we don't spend a bit by 0, we might get a column like:

      1
      0
      1
      1
      0

   The number of ways this column can occur is
   x1 = W*(V+1)*(W-1)*(W-2)*(V+3): reading down the column, the first
   spend can set any of W bits; with k=1 bit then set, not spending
   has V+1 possibilities; the next two spends have W-1 and W-2
   possibilities; and the final non-spend, with k=3 bits set, has V+3.
   This means that the probability the column occurs is x1/(W+V)^5.
   Expanding to multiple columns is easy, since the number of ways
   that two specific columns can occur is just the product of the
   number of ways each can occur.  For example,

      1 0
      0 1
      1 1
      1 0
      0 0

   can occur x1 * x2 ways, where x2 = V*W*(W-1)*(V+2)*(V+2).  A false
   positive happens when there's a row of all 0s, as in the last row
   of the example.

   It's cleaner to count the number of ways not to get a false
   positive.  This gives us the inclusion-exclusion formula:

        (all ways)
      - (all ways where row 0 is all 0s)
      - (all ways where row 1 is all 0s)
      - ...
      + (all ways where rows 0 and 1 are both all 0s)
      + (all ways where rows 0 and 2 are both all 0s)
      + ...
      - (all ways where rows 0, 1, and 2 are all 0s)
      - (all ways where rows 0, 1, and 3 are all 0s)
      +, - ...
      + (-1)^R (all ways in which all rows are all 0s).
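
   Equivalently, writing S for a subset of the R rows:

      ways(no false positive) = sum over S of (-1)^|S| * n(S),

   where n(S) counts the ways in which every row in S is all 0s.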

   There's a nice way to understand each of these terms.  For example,
   in the R=2, C=3 case, the term in which row 0 is all 0s has the
   following elements:

      0 0 0   0 0 0   0 0 0   0 0 0   0 0 0   0 0 0   0 0 0   0 0 0
      0 0 0   0 0 1   0 1 0   0 1 1   1 0 0   1 0 1   1 1 0   1 1 1

   Rather than enumerating all 2^( (R-1)*C ) elements, we'll represent
   it as

      ( 0 + 0 )^3
      ( 0   1 )

   Skipping some steps, the task boils down to counting the number of
   ways to get columns that match a mask, then raising that to the Cth
   power.

   Now, with all this behind us, our goal is to pick the optimal
   values of W, V, and C given a supposed distribution of transactions
   and a performance model.  Based on some back-of-the-envelope
   calculations using instruction throughput and latency measurements,
   confirmed by some experiments, the fast path code takes about 3.5*R
   cycles if using one AVX2 vector (W*C==256) and
   2.5*R+2*ceil(W*C/256)*R cycles otherwise.  The slow path takes
   about 133*R cycles.  Then the expected number of cycles it takes to
   process a transaction with R account addresses is

      R*(2.5 + 2*ceil(W*C/256) - [W*C<=256]) + FP_{W,V,R,C}*133*R
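
   (As a concrete check of the model: one AVX2 vector has W=32, C=8,
   V=0, so W*C==256 and a vote transaction with R=3 costs about
   3*(2.5 + 2 - 1) = 10.5 cycles plus a negligible false-positive
   term, in line with the roughly 10 cycles quoted below.)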

   Based on a sample of 100,000 slots containing about 100M
   transactions, the CDF looks like:

      Fraction of transactions containing <=  3 account addresses  71%
                                          <= 13 account addresses  80%
                                          <= 24 account addresses  91%
                                          <= 31 account addresses  95%
                                          <= 44 account addresses  98%
                                          <= 50 account addresses  99%

   Basically, there's a peak at 3 (votes), and then a very long, very
   fat tail.  When using AVX2, it basically boils down to 2 regimes:

       0 <= R < 28    W=32, C=8,  V=0   (one AVX vector)
      28 <= R <= 64   W=32, C=32, V=32  (four AVX vectors)

   This combination has an expected value of about 54 cycles over all
   transactions.  For a typical transaction with 3 account addresses,
   this takes about 10 cycles and the false positive probability is
   about 2e-10.

   When using AVX512, the regimes are similar:

       0 <= R < 36    W=32, C=16, V=0   (one AVX vector)
      36 <= R <= 64   W=32, C=32, V=32  (two AVX vectors)

   This combination has an expected value of about 33 cycles over all
   transactions.  Again, the typical transaction with 3 account
   addresses takes about 9 cycles and has a negligible false positive
   probability. */
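
/* fd_chkdup_est_fp_cnt is an illustrative Monte Carlo check of the
   false positive analysis above (an addition for exposition, not part
   of the original API; assumes fd_rng_ulong_roll from fd_rng.h).  It
   plays out the idealized model -- C words of W bits, V hash values
   per word that select no bit, R distinct addresses with independent
   uniform hashes -- and returns how many of the trials flagged a
   false positive (divide by trials to estimate FP_{W,V,R,C}).
   Assumes C<=64 and W<=64. */
static inline ulong
fd_chkdup_est_fp_cnt( fd_rng_t * rng,
                      ulong W, ulong V, ulong C, ulong R,
                      ulong trials ) {
  ulong fp_cnt = 0UL;
  for( ulong t=0UL; t<trials; t++ ) {
    ulong word[ 64 ] = { 0UL };
    int false_positive = 0;
    for( ulong r=0UL; r<R; r++ ) {                 /* addresses are distinct,  */
      int spent = 0;                               /* so any hit is spurious   */
      for( ulong c=0UL; c<C; c++ ) {
        ulong h = fd_rng_ulong_roll( rng, W+V );   /* uniform in [0, W+V)      */
        if( h>=W ) continue;                       /* V values select no bit   */
        ulong bit = 1UL<<h;
        spent |= !(word[ c ] & bit);               /* new bit: provably unique */
        word[ c ] |= bit;
      }
      false_positive |= !spent;                    /* every word hit -> flag   */
    }
    fp_cnt += (ulong)false_positive;
  }
  return fp_cnt;
}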


struct fd_chkdup_waddr {
  fd_acct_addr_t key;  /* account address */
};
typedef struct fd_chkdup_waddr fd_chkdup_waddr_t;
static const fd_acct_addr_t chkdup_null_addr = {{ 0 }};
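
/* Instantiate fd_chkdup_pmap, a (1UL<<8) slot hash map generated from
   util/tmpl/fd_map.c, keyed by account address and used as scratch by
   the precise slow path.  The all-zero address is the null/sentinel
   key, which is why fd_chkdup_check_slow treats it specially. */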
#define MAP_NAME              fd_chkdup_pmap
#define MAP_T                 fd_chkdup_waddr_t
#define MAP_KEY_T             fd_acct_addr_t
#define MAP_KEY_NULL          chkdup_null_addr
#define MAP_KEY_INVAL(k)      MAP_KEY_EQUAL(k, chkdup_null_addr)
#define MAP_KEY_EQUAL(k0,k1)  (!memcmp((k0).b,(k1).b, FD_TXN_ACCT_ADDR_SZ))
#define MAP_KEY_EQUAL_IS_SLOW 1
#define MAP_MEMOIZE           0
#define MAP_KEY_HASH(key)     ((uint)fd_ulong_hash( fd_ulong_load_8( (key).b ) ))
#define MAP_LG_SLOT_CNT       8
#include "../../util/tmpl/fd_map.c"


struct __attribute__((aligned(FD_CHKDUP_ALIGN))) fd_chkdup_private {
#if FD_CHKDUP_IMPL >= 1
  uchar entropy[ 32*FD_CHKDUP_IMPL ];
#endif

  fd_chkdup_waddr_t hashmap[ 1UL<<8 ];
};

static inline void *
fd_chkdup_new( void     * shmem,
               fd_rng_t * rng ) {
  fd_chkdup_t * chkdup = (fd_chkdup_t *)shmem;
#if FD_CHKDUP_IMPL >= 1
  for( ulong i=0UL; i<32*FD_CHKDUP_IMPL; i++ ) chkdup->entropy[ i ] = fd_rng_uchar( rng );
#else
  (void)rng;
#endif
  FD_TEST( fd_chkdup_pmap_footprint()==sizeof(chkdup->hashmap) ); /* Known at compile time */

  fd_chkdup_pmap_new( chkdup->hashmap );
  return chkdup;
}

static inline fd_chkdup_t * fd_chkdup_join  ( void * shmem ) { return (fd_chkdup_t *)shmem; }

static inline void *        fd_chkdup_leave ( fd_chkdup_t * chkdup ) { return (void *)chkdup; }
static inline void *        fd_chkdup_delete( void * shmem ) { return shmem; }


static inline int
fd_chkdup_check     ( fd_chkdup_t *          chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt ) {
  if( FD_LIKELY( 0==fd_chkdup_check_fast( chkdup, list0, list0_cnt, list1, list1_cnt ) ) ) return 0;
  return fd_chkdup_check_slow( chkdup, list0, list0_cnt, list1, list1_cnt );
}

static inline int
fd_chkdup_check_slow( fd_chkdup_t *          chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt ) {
  fd_chkdup_waddr_t * map = fd_chkdup_pmap_join( chkdup->hashmap );
  fd_chkdup_waddr_t * inserted[ FD_TXN_ACCT_ADDR_MAX ];
  ulong inserted_cnt = 0UL;

  int any_duplicates = 0;
  int skipped_inval  = 0; /* The all-zero address is the map's sentinel, so track it separately */
  for( ulong i0=0UL; (i0<list0_cnt) & !any_duplicates; i0++ ) {
    if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list0[ i0 ] ) ) ) {
      /* Okay if this is the 1st, but not if the 2nd */
      any_duplicates |= skipped_inval;
      skipped_inval   = 1;
      continue;
    }
    fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list0[ i0 ] );
    inserted[ inserted_cnt++ ] = ins;
    any_duplicates |= (NULL==ins);        /* NULL means the key was already present (a duplicate) */
    inserted_cnt   -= (ulong)(NULL==ins); /* Correct inserted_cnt if we just stored a NULL        */
  }
  for( ulong i1=0UL; (i1<list1_cnt) & !any_duplicates; i1++ ) {
    if( FD_UNLIKELY( fd_chkdup_pmap_key_inval( list1[ i1 ] ) ) ) {
      any_duplicates |= skipped_inval;
      skipped_inval   = 1;
      continue;
    }
    fd_chkdup_waddr_t * ins = fd_chkdup_pmap_insert( map, list1[ i1 ] );
    inserted[ inserted_cnt++ ] = ins;
    any_duplicates |= (NULL==ins);
    inserted_cnt   -= (ulong)(NULL==ins);
  }

  /* FIXME: This depends on undocumented map behavior for correctness.
     Deleting in the opposite order of insertion preserves previously
     inserted pointers.  That behavior should be documented. */
  for( ulong i=0UL; i<inserted_cnt; i++ ) fd_chkdup_pmap_remove( map, inserted[ inserted_cnt-1UL-i ] );

  fd_chkdup_pmap_leave( map );

  return any_duplicates;
}


#if FD_CHKDUP_IMPL==1

/* AVX2 implementation */
#include "../../util/simd/fd_avx.h"
static inline int
fd_chkdup_check_fast( fd_chkdup_t *          chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt ) {
  if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0;

  int any_duplicates = 0;

  const wu_t entropy = wb_ld( chkdup->entropy );
  const wu_t one     = wu_bcast( 1U );


  if( FD_LIKELY( list0_cnt+list1_cnt<28UL ) ) {
    /* Single vector implementation: W=32, C=8, V=0 (the mask selects 5
       bits, so every hash selects a bit) */
    const wu_t mask = wu_bcast( 0x1FU );

    wu_t bloom = wu_zero();
    for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
      wu_t addr   = wb_ldu( list0+i0 );
      wu_t masked = wu_and( wu_xor( addr, entropy ), mask );
      wu_t select = wu_shl_vector( one, masked );
      /* testc: "Compute the bitwise NOT of a and then AND with b, and
         [return] 1 if the result is zero." */
      any_duplicates |= _mm256_testc_si256( bloom, select );
      bloom = wu_or( bloom, select );
    }
    for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
      wu_t addr   = wb_ldu( list1+i1 );
      wu_t masked = wu_and( wu_xor( addr, entropy ), mask );
      wu_t select = wu_shl_vector( one, masked );
      any_duplicates |= _mm256_testc_si256( bloom, select );
      bloom = wu_or( bloom, select );
    }
    return any_duplicates;

  } else {
    /* 4-vector implementation: W=32, C=32, V=32.  Slower, but a much
       better false positive rate so that we don't have to fall back to
       the slow path as frequently.  The mask selects 6 bits, and
       vpsllvd shift counts in [32,64) produce 0, selecting no bit,
       which is where V=32 comes from. */
    const wu_t mask = wu_bcast( 0x3FU );

    wu_t bloom0 = wu_zero();  wu_t bloom1 = wu_zero();
    wu_t bloom2 = wu_zero();  wu_t bloom3 = wu_zero();
    for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
      wu_t addr    = wb_ldu( list0+i0 );
      wu_t blinded = wu_xor( addr, entropy );
      wu_t masked0 = wu_and( mask, blinded );               wu_t masked1 = wu_and( mask, wu_shr( blinded,  6 ) );
      wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
      wu_t select0 = wu_shl_vector( one, masked0 );         wu_t select1 = wu_shl_vector( one, masked1 );
      wu_t select2 = wu_shl_vector( one, masked2 );         wu_t select3 = wu_shl_vector( one, masked3 );

      /* andnot( bloom, select ) is select & ~bloom: newly set bits */
      wu_t any_differences = wu_or(
          wu_or( wu_andnot( bloom0, select0 ), wu_andnot( bloom1, select1 ) ),
          wu_or( wu_andnot( bloom2, select2 ), wu_andnot( bloom3, select3 ) ) );

      bloom0 = wu_or( bloom0, select0 );  bloom1 = wu_or( bloom1, select1 );
      bloom2 = wu_or( bloom2, select2 );  bloom3 = wu_or( bloom3, select3 );

      any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
      wu_t addr    = wb_ldu( list1+i1 );
      wu_t blinded = wu_xor( addr, entropy );
      wu_t masked0 = wu_and( mask, blinded );               wu_t masked1 = wu_and( mask, wu_shr( blinded,  6 ) );
      wu_t masked2 = wu_and( mask, wu_shr( blinded, 12 ) ); wu_t masked3 = wu_and( mask, wu_shr( blinded, 18 ) );
      wu_t select0 = wu_shl_vector( one, masked0 );         wu_t select1 = wu_shl_vector( one, masked1 );
      wu_t select2 = wu_shl_vector( one, masked2 );         wu_t select3 = wu_shl_vector( one, masked3 );

      wu_t any_differences = wu_or(
          wu_or( wu_andnot( bloom0, select0 ), wu_andnot( bloom1, select1 ) ),
          wu_or( wu_andnot( bloom2, select2 ), wu_andnot( bloom3, select3 ) ) );

      bloom0 = wu_or( bloom0, select0 );  bloom1 = wu_or( bloom1, select1 );
      bloom2 = wu_or( bloom2, select2 );  bloom3 = wu_or( bloom3, select3 );

      any_duplicates |= _mm256_testz_si256( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    return any_duplicates;
  }
}

#elif FD_CHKDUP_IMPL==2

/* AVX512 implementation */
#include "../../util/simd/fd_avx512.h"
static inline int
fd_chkdup_check_fast( fd_chkdup_t *          chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt ) {
  if( FD_UNLIKELY( list0_cnt+list1_cnt<=1UL ) ) return 0;

  int any_duplicates = 0;

  const wwu_t entropy = wwu_ld( (uint const *)chkdup->entropy );
  const wwu_t one     = wwu_bcast( 1U );

  if( FD_LIKELY( list0_cnt+list1_cnt<36UL ) ) {
    /* One vector version: W=32, C=16, V=0 */
    /* Our analysis assumed the 64 bytes of hash were all independent,
       but if we just xor and then use the low 5 bits of both parts of
       the vector, we get a lot more false positives than the math
       predicts.  The masked srli below shifts the upper 8 lanes right
       by 6 first to decorrelate the two halves. */
    const wwu_t mask = wwu_bcast( 0x1FU );
    wwu_t bloom = wwu_zero();
    for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
      wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
      wwu_t blinded = wwu_xor( addr, entropy );
      wwu_t masked  = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
      wwu_t select  = _mm512_rolv_epi32( one, masked ); /* rotate==shift here since masked<32 */
      wwu_t next    = wwu_or( bloom, select );
      __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE ); /* if non-zero, not a duplicate */
      bloom = next;
      /* kortestz_mask8_u8: "Compute the bitwise OR of 8-bit masks a
         and b.  If the result is all zeroes, [return] 1" */
      any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
      wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
      wwu_t blinded = wwu_xor( addr, entropy );
      wwu_t masked  = wwu_and( mask, _mm512_mask_srli_epi32( blinded, 0xFF00, blinded, 6 ) );
      wwu_t select  = _mm512_rolv_epi32( one, masked );
      wwu_t next    = wwu_or( bloom, select );
      __mmask8 any_differences = _mm512_cmp_epi64_mask( bloom, next, _MM_CMPINT_NE );
      bloom = next;
      any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    return any_duplicates;
  } else {
    /* Two vector version: W=32, C=32, V=32 */
    const wwu_t mask   = wwu_bcast( 0x3FU );
    const wwu_t shift0 = wwu(  0U,  0U,  0U,  0U,  0U,  0U,  0U,  0U,
                               6U,  6U,  6U,  6U,  6U,  6U,  6U,  6U );
    const wwu_t shift1 = wwu( 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U,
                              18U, 18U, 18U, 18U, 18U, 18U, 18U, 18U );
    wwu_t bloom0 = wwu_zero();  wwu_t bloom1 = wwu_zero();
    for( ulong i0=0UL; i0<list0_cnt; i0++ ) {
      wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list0+i0 ) );
      wwu_t blinded = wwu_xor( addr, entropy );
      wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) );  wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
      wwu_t select0 = wwu_shl_vector( one, masked0 );                      wwu_t select1 = wwu_shl_vector( one, masked1 );
      wwu_t next0   = wwu_or( bloom0, select0 );                           wwu_t next1   = wwu_or( bloom1, select1 );
      __mmask8 any_differences = _kor_mask8(
          _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ), _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );

      bloom0 = next0;  bloom1 = next1;

      any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    for( ulong i1=0UL; i1<list1_cnt; i1++ ) {
      wwu_t addr    = _mm512_broadcast_i64x4( wu_ldu( list1+i1 ) );
      wwu_t blinded = wwu_xor( addr, entropy );
      wwu_t masked0 = wwu_and( mask, wwu_shr_vector( blinded, shift0 ) );  wwu_t masked1 = wwu_and( mask, wwu_shr_vector( blinded, shift1 ) );
      wwu_t select0 = wwu_shl_vector( one, masked0 );                      wwu_t select1 = wwu_shl_vector( one, masked1 );
      wwu_t next0   = wwu_or( bloom0, select0 );                           wwu_t next1   = wwu_or( bloom1, select1 );
      __mmask8 any_differences = _kor_mask8(
          _mm512_cmp_epi64_mask( bloom0, next0, _MM_CMPINT_NE ), _mm512_cmp_epi64_mask( bloom1, next1, _MM_CMPINT_NE ) );

      bloom0 = next0;  bloom1 = next1;

      any_duplicates |= _kortestz_mask8_u8( any_differences, any_differences );
      FD_COMPILER_FORGET( any_duplicates );
    }
    return any_duplicates;
  }
}

#else

/* Without AVX there is no fast path: claim a possible duplicate for
   every list so that fd_chkdup_check always falls through to the
   precise slow path. */
static inline int
fd_chkdup_check_fast( fd_chkdup_t *          chkdup,
                      fd_acct_addr_t const * list0, ulong list0_cnt,
                      fd_acct_addr_t const * list1, ulong list1_cnt ) {
  (void)chkdup;
  (void)list0;
  (void)list1;
  (void)list0_cnt;
  (void)list1_cnt;
  return 1;
}

#endif


FD_PROTOTYPES_END

#endif /* HEADER_fd_src_disco_pack_fd_chkdup_h */