Line data Source code
1 : /* Generate prototypes, inlines and/or implementations for concurrent
2 : persistent shared maps based on linear probing with some insights
3 : from cuckoo hashing for improved concurrent performance. Notably:
4 :
5 : - Supports an arbitrary number of concurrent operations with a
6 : comparable performance to a single threaded HPC linear probed map
7 : for non-conflicting operations (likely hardware NOC limited for
8 : conflicting operations).
9 :
10 : - Concurrent queries do not interfere with other concurrent queries.
11 :
12 : - All map operations can be serialized.
13 :
14 : - Does not require a key sentinel (but can support them if useful for
15 : the application).
16 :
17 : - Does not require a guarantee there's at least one free element in
18 : the element store (but like a serial linear probed map, it is a
19 : good idea to keep utilization below the absolute theoretical
20 : capacity for strong algorithmic performance guarantees).
21 :
22 : - Insert/modify/query have a run-time configurable worst case O(1)
23 : cost, regardless of map fill ratio. (Remove's worst case cost is
24 : not configurable due to cases involving maps filled to or near
25 : capacity. For reasonable fill ratios, remove is also comparable.)
26 :
27 : - Stable in the sense that all keys with the same hash (or more
28 : generally same initial probe element) are ordered in the element
29 : store by insertion order. (This allows the user to use the key
30 : hash to group related keys contiguously in the element store and
31 : then stably iterate over them with fast streaming.)
32 :
33 : - Query requires no atomic operations at all. (Usual target memory
34 : subsystem requirements that writes to memory become visible to
35 : other threads in the order in which they were issued in the machine
36 : code.)
37 :
38 : - Insert/modify/remove only require atomic fetch-and-or (typically
39 : just one). There's no need for an atomic compare-and-swap /
40 : underlying pool of free elements / etc. (Can also be used as a
41 : non-concurrent map on targets without FD_HAS_ATOMIC.)
42 :
43 : - Map concurrency metadata and the actual map element store can be
44 : located in separate memory regions (can also split the element
45 : store over multiple memory regions ... e.g. keys here / values
46 : there) and can back any of these memory regions by a file system to
47 : scale beyond billions of elements with no code change.
48 :
49 : - Map metadata easily fits in CPU caches with a fixed O(1) overhead
50 : regardless of element store capacity. Access patterns naturally
51 : exploit CPU and storage caching, streaming and prefetching
52 : behaviors.
53 :
54 : - Supports non-plain-old-data keys and non-plain-old-data values
55 : (both of which are gross on real world computers but commonly done
56 : nevertheless).
57 :
58 : A map can be persisted beyond the lifetime of the creating process,
59 : be used inter-process, relocated in memory, be naively
60 : serialized/deserialized, be moved between hosts, etc. Massive
61 : concurrency, high algorithmic and implementation performance for
62 : normal usage and friendly cache / file system streaming access
63 : patterns for heavily loaded / heavily concurrent usage are
64 : prioritized. In particular, unlike fd_map_para, this takes ownership
65 : of the underlying element store for the lifetime of the map in order
66 : to speed up operations and increase concurrency.
67 :
68 : Typical usage:
69 :
70 : struct myele {
71 : ulong key; // Technically "MAP_KEY_T MAP_KEY" (default is ulong key)
72 :
73 : ... key can be located arbitrarily in the element. The mapping
74 : ... of a key to an element in the element store is arbitrary and
75 : ... can move while the key is in the map.
76 : };
77 :
78 : typedef struct myele myele_t;
79 :
80 : #define MAP_NAME mymap
81 : #define MAP_ELE_T myele_t
82 : #include "tmpl/fd_map_slot_para.c"
83 :
84 : will declare the following APIs as a header only style library in the
85 : compilation unit:
86 :
87 : // A mymap_t is a stack declaration friendly quasi-opaque local
88 : // object used to hold the state of a local join to a mymap.
89 : // Similarly, a mymap_query_t holds the local state of an ongoing
90 : // operation. E.g. it is fine to do mymap_t join[1];" to allocate
91 : // a mymap_t but the contents should not be used directly.
92 :
93 : typedef struct mymap_private mymap_t;
94 : typedef struct mymap_query_private mymap_query_t;
95 :
96 : // mymap_lock_max returns the maximum number of version locks that
97 : // can be used by a mymap. Will be a positive integer
98 : // power-of-two.
99 :
100 : ulong mymap_lock_max();
101 :
102 : // mymap_lock_cnt_est returns a reasonable number of locks to use
103 : // for a map backed by an ele_max capacity element store. Assumes
104 : // ele_max is an integer power-of-two. Returns an integer
105 : // power-of-two in [1,mymap_lock_max()].
106 :
107 : ulong mymap_lock_cnt_est( ulong ele_max );
108 :
109 : // mymap_probe_max_est returns a reasonable maximum probe sequence
110 : // length for a map backed by an ele_max capacity element store.
111 : // Assumes ele_max is an integer power-of-two. Returns an integer
112 : // power-of-two in [1,ele_max].
113 :
114 : ulong mymap_probe_max_est( ulong ele_max );
115 :
116 : // mymap_{align,footprint} returns the alignment and footprint
117 : // needed for a memory region to be used as a mymap. align will be
118 : // an integer power-of-two and footprint will be a multiple of
119 : // align. ele_max / lock_cnt / probe_max specify the capacity of
120 : // the element store / number of version locks / maximum probe
121 : // sequence length for the map. footprint returns 0 for invalid
122 : // configurations. In a valid configuration, ele_max is an integer
123 : // power-of-two, lock_cnt is an integer power-of-two, lock_cnt is
124 : // at most min( lock_max, ele_max ) and probe_max is in
125 : // [1,ele_max].
126 : //
127 : // mymap_new formats a memory region with the required alignment
128 : // and footprint into a mymap. shmem points in the caller's
129 : // address space to the memory region to use. Returns shmem on
130 : // success (mymap has ownership of the memory region) and NULL on
131 : // failure (no changes, logs details). The caller is not joined on
132 : // return. All map versions will be at version 0 / unlocked. The
133 : // map contents will be in whatever state the backing element store
134 : // is in. IMPORTANT SAFETY TIP! THE ELEMENT STORE SHOULD BE IN A
135 : // CONSISTENT STATE BEFORE USING MYMAP_NEW. For example, the
136 : // caller could mark all elements as free before calling this and
137 : // the caller could use verify immediately after creation to verify
138 : // integrity.
139 : //
140 : // mymap_join joins the caller to an existing mymap. ljoin points
141 : // to a mymap_t compatible memory region in the caller's address
142 : // space, shmap points in the caller's address space to the memory
143 : // region containing the mymap, and shele points in the caller's
144 : // address space to mymap's element store. Returns a handle to the
145 : // caller's local join on success (join has ownership of the ljoin
146 : // region) and NULL on failure (no changes, logs details).
147 : //
148 : // mymap_leave leaves a mymap join. join points to a current local
149 : // join. Returns the memory region used for the join on success
150 : // (caller has ownership on return and the caller is no longer
151 : // joined) and NULL on failure (no changes, logs details). Use the
152 : // join accessors before leaving to get shmap and shele used by the
153 : // join if needed.
154 : //
155 : // mymap_delete unformats a memory region used as a mymap. Assumes
156 : // shmap points in the caller's address space to a memory region
157 : // containing the mymap and that there are no joins. Returns shmap
158 : // on success (caller has ownership of the memory region, any
159 : // remaining elements still in the mymap are released to the caller
160 : // implicitly) and NULL on failure (no changes, logs details).
161 :
162 : ulong mymap_align ( void );
163 : ulong mymap_footprint( ulong ele_max, ulong lock_cnt, ulong probe_max );
164 : void * mymap_new ( void * shmem, ulong ele_max, ulong lock_cnt, ulong probe_max, ulong seed );
165 : mymap_t * mymap_join ( void * ljoin, void * shmap, void * shele );
166 : void * mymap_leave ( mymap_t * join );
167 : void * mymap_delete ( void * shmap );
168 :
169 : // mymap_{ele_max,lock_cnt,probe_max,seed} return the mymap
170 : // configuration. Assumes join is a current local join. The
171 : // values will be valid for the mymap lifetime.
172 :
173 : ulong mymap_ele_max ( mymap_t const * join );
174 : ulong mymap_lock_cnt ( mymap_t const * join );
175 : ulong mymap_probe_max( mymap_t const * join );
176 : ulong mymap_seed ( mymap_t const * join );
177 :
178 : // mymap_{shmap,shele} return join details. Assumes join is a
179 : // current local join. The values will be valid for the join
180 : // lifetime. mymap_{shmap_const,shele_const} are const correct
181 : // versions.
182 :
183 : void const * mymap_shmap_const( mymap_t const * join );
184 : void const * mymap_shele_const( mymap_t const * join );
185 :
186 : void * mymap_shmap( mymap_t * join );
187 : void * mymap_shele( mymap_t * join );
188 :
189 : // mymap_key_{eq,hash} expose the provided MAP_KEY_{EQ,HASH} macros
190 : // as inlines with strict semantics. They assume that the provided
191 : // pointers are in the caller's address space to keys that will not
192 : // be changed during the call. They retain no interest in any keys
193 : // on return.
194 : //
195 : // mymap_key_eq returns 1 if *k0 and *k1 are equal and 0 otherwise.
196 : //
197 : // mymap_key_hash returns the hash of *key using the hash function
198 : // seed. Should ideally be a random mapping from a MAP_KEY_T to a
199 : // ulong but this depends on what the user actually used for
200 : // MAP_KEY_HASH. The seed used by a particular mymap instance can
201 : // be obtained above.
202 :
203 : int mymap_key_eq ( ulong * k0, ulong * k1 );
204 : ulong mymap_key_hash( ulong * key, ulong seed );
205 :
206 : // mymap_backoff does FD_SPIN_PAUSE a random number of times. The
207 : // number of pauses is an approximately uniform IID random number
208 : // in [0,scale/2^16] where scale is in [0,2^32). Specifically, the
209 : // number of pauses is:
210 : //
211 : // floor( scale r / 2^48 )
212 : //
213 : // where r is a non-deterministic 32-bit uniform IID random number.
214 : // Under the hood, r is generated by hashing the user provided seed
215 : // and the least significant 32-bits of the CPU tickcounter.
216 : // Ideally, seed is a 32-bit globally unique identifier for the
217 : // logical thread of execution but this is up to the application to
218 : // specify and rarely matters in practice. This is a useful
219 : // building block for random exponential backoffs.
220 :
221 : void mymap_backoff( ulong scale, ulong seed );
222 :
223 : // mymap_query_memo returns the key_hash of the query associated
224 : // with the query's key to allow users to minimize potentially
225 : // expensive key hash computations in various operations.
226 : //
227 : // mymap_query_ele returns a pointer in the caller's address space
228 : // to the element store element associated with the query or a
229 : // sentinel value. The sentinel value is application dependent and
230 : // thus arbitrary (e.g. not necessarily in the element store,
231 : // including NULL, a local temporary used as a bit bucket, etc).
232 : // Assumes query is valid. The lifetime of the returned pointer
233 : // depends on the query. mymap_query_ele_const is a const correct
234 : // version.
235 :
236 : ulong mymap_query_memo ( mymap_query_t const * query );
237 : myele_t const * mymap_query_ele_const( mymap_query_t const * query );
238 : myele_t * mymap_query_ele ( mymap_query_t * query );
239 :
240 : // mymap_prepare tries to start an insert/modify/blocking query
241 : // operation for key. Assumes join is a current local join, key
242 : // points to valid key in the caller's address space for the
243 : // duration of the call and query points to a local scratch to hold
244 : // the info about the prepare. Retains no interest in key.
245 : // Returns FD_MAP_SUCCESS (0) on success and a FD_MAP_ERR (negative) on
246 : // failure. This is a non-blocking fast O(1) (O(probe_max) worst
247 : // case) and supports highly concurrent operation.
248 : //
249 : // On success, the caller is in a prepare for key and query is
250 : // initialized with info about prepare. ele=mymap_query_ele(query)
251 : // gives the location in the caller's address space to an element
252 : // store element for the prepare that will be stable for the
253 : // duration of the prepare and memo=mymap_query_memo(query) gives
254 : // the key hash.
255 : //
256 : // If the element is marked as free, key is not in the map and ele
257 : // is where key could be inserted. If the caller is inserting key,
258 : // the caller should populate element's key with key, element's
259 : // memo (if any) with memo (avoids having to rehash the key), mark
260 : // ele as used and then do a mymap_publish to complete the insert.
261 : // If not, the caller should keep ele marked as free and do a
262 : // mymap_cancel to complete the prepare (doesn't matter from the
263 : // map's point-of-view if anything else was clobbered /
264 : // mymap_publish would also work here).
265 : //
266 : // If the element is marked as used, key is in the map at ele. If
267 : // the caller is modifying key's value, the caller should do the
268 : // modification and then mymap_publish to complete the modify. If
269 : // not (e.g. blocking query), the caller can inspect ele contents
270 : // and then mymap_cancel to complete the blocking query
271 : // (mymap_publish would also work here). In both cases, the caller
272 : // should not modify ele's key, modify ele's memo, or mark ele as
273 : // free. Note that mymap_publish must be used even if the
274 : // modifications were only temporary.
275 : //
276 : // On failure, the caller is not in a prepare for key, query
277 : // ele==sentinel and query memo will still be the key hash.
278 : // Reasons for failure:
279 : //
280 : // - FD_MAP_ERR_AGAIN: A potentially conflicting operation was in
281 : // progress at some point during the call. Try again later (e.g.
282 : // after a random exponential backoff).
283 : //
284 : // - FD_MAP_ERR_FULL: key was not in the map but inserting ele
285 : // would require making a probe sequence longer than probe_max.
286 : // Try again when the map is less full (e.g. after removing some
287 : // elements).
288 : //
289 : // mymap_publish ends the prepare described by query, updating the
290 : // map version to reflect changes made during the prepare. Assumes
291 : // query is valid and describes an active prepare. Cannot fail and
292 : // the caller will no longer be in the prepare on return. This is a
293 : // generally safe way to end a prepare even if the caller did not
294 : // modify the map during the prepare.
295 : //
296 : // mymap_cancel ends the prepare described by query, reverting the
297 : // map version to reflect that the caller did not change the map
298 : // during the prepare. Assumes query is valid and describes an
299 : // active prepare and that the caller did not make any meaningful
300 : // modifications to the map during the prepare (note that temporary
301 : // changes during the prepare can be considered modifications as
302 : // per the above). Cannot fail and the caller will no longer be in
303 : // the prepare on return. This is a safe way to end a prepare only if
304 : // the caller did not modify the map during the prepare.
305 : //
306 : // IMPORTANT SAFETY TIP! Do not nest or interleave prepares,
307 : // remove or queries for the same map on the same thread.
308 : //
309 : // IMPORTANT SAFETY TIP! A successful prepare must have a
310 : // matching publish or cancel (and then ideally as soon as
311 : // possible).
312 : //
313 : // IMPORTANT SAFETY TIP! The order in which keys that hash to the
314 : // same slot were inserted into the map is preserved for the
315 : // lifetime of the keys. Thus the hash function used can be
316 : // constructed to create ordered iterators over groups of keys.
317 :
318 : int mymap_prepare( mymap_t * join, ulong const * key, myele_t * sentinel, mymap_query_t * query );
319 : void mymap_publish( mymap_query_t * query );
320 : void mymap_cancel ( mymap_query_t * query );
321 :
322 : // mymap_remove removes key (if key) from the mymap. Assumes join
323 : // is a current local join and key is valid for the duration of the
324 : // call. Retains no interest in key. This is non-blocking fast
325 : // typically O(1) and supports highly concurrent operation.
326 : //
327 : // Returns FD_MAP_SUCCESS (0) on success and a FD_MAP_ERR
328 : // (negative) on failure. On success, key's mapping was removed at
329 : // some point during the call. On failure, no changes were made by
330 : // this call and:
331 : //
332 : // - FD_MAP_ERR_KEY: Key was not found in the mymap at some point
333 : // during the call.
334 : //
335 : // - FD_MAP_ERR_AGAIN: A potentially conflicting operation was in
336 : // progress at some point during the call. Same considerations
337 : // as prepare above.
338 : //
339 : // IMPORTANT SAFETY TIP! Do not nest or interleave prepares,
340 : // remove or queries for the same map on the same thread.
341 :
342 : int mymap_remove( mymap_t * join, ulong const * key );
343 :
344 : // mymap_query_try tries to speculatively query a mymap for key.
345 : // On return, query will hold information about the try. sentinel
346 : // gives the query element pointer value (arbitrary) to pass
347 : // through when it is not safe to try the query. Assumes join is a
348 : // current local join and key is valid for the duration of the
349 : // call. Does not modify the mymap and retains no interest in key,
350 : // sentinel or query. This is a non-blocking fast O(1)
351 : // (O(probe_max) worst case) and supports highly concurrent
352 : // operation.
353 : //
354 : // Returns FD_MAP_SUCCESS (0) on success and a FD_MAP_ERR
355 : // (negative) on failure. On success, key mapped to the element
356 : // store element mymap_query_ele( query ) in the caller's address
357 : // space at some point during the call. The mymap retains
358 : // ownership of this element but the caller can zero copy
359 : // speculatively process the element's contents. On failure,
360 : // mymap_query_ele( query ) will be sentinel and returns:
361 : //
362 : // - FD_MAP_ERR_KEY: Key was not found in the mymap at some point
363 : // during the call.
364 : //
365 : // - FD_MAP_ERR_AGAIN: A potentially conflicting operation was in
366 : // progress at some point during the call. Try again later (e.g.
367 : // after a random exponential backoff). Unlike prepare and
368 : // remove, this call does _not_ require any locks for key's probe
369 : // sequence. As such, AGAIN can only be caused by concurrent
370 : // prepare/remove operations and this will never interfere with
371 : // any other concurrent operation. Among the many implications,
372 : // a query will never delay a concurrent query and AGAIN will
373 : // never be returned if only concurrent speculative queries are
374 : // in progress.
375 : //
376 : // IMPORTANT SAFETY TIP! THE CALLER SHOULD BE PREPARED TO HANDLE
377 : // ARBITRARY AND/OR INCONSISTENT VALUES FOR ELEMENT FIELDS DURING
378 : // SPECULATIVE PROCESSING. CALLERS SHOULD NOT COMMIT ANY RESULTS
379 : // OF SPECULATIVE PROCESSING UNTIL IT TESTS THE QUERY WAS
380 : // SUCCESSFUL.
381 : //
382 : // The simplest form of speculative processing is to copy the
383 : // element store element into a local temporary, test that the
384 : // speculation was valid, and then process the local temporary copy
385 : // at its leisure. Zero copy, more selective copying and/or
386 : // writing speculative results into local temporaries are more
387 : // advanced examples of speculative processing.
388 : //
389 : // Use mymap_prepare to do a blocking (non-speculative) query.
390 : //
391 : // mymap_query_test tests if an in-progress query is still valid.
392 : // Assumes query is valid, we are still in a query try and lock
393 : // version numbers have not wrapped since we started the try.
394 : // Returns FD_MAP_SUCCESS (0) if the query is still valid and
395 : // FD_MAP_ERR_AGAIN (negative) if a potentially conflicting
396 : // operation was in progress at some point during the try.
397 : //
398 : // IMPORTANT SAFETY TIP! Do not nest or interleave prepares,
399 : // remove or queries for the same map on the same thread.
400 :
401 : int
402 : mymap_query_try( mymap_t const * join,
403 : ulong const * key,
404 : myele_t const * sentinel,
405 : mymap_query_t * query );
406 :
407 : int mymap_query_test( mymap_query_t const * query );
408 :
409 : // mymap_verify returns FD_MAP_SUCCESS (0) if the join, underlying
410 : // map and underlying element store give a mapping of unique keys
411 : // to unique elements in the element store with a bounded maximum
412 : // probe length. Returns FD_MAP_ERR_INVAL (negative) otherwise (no
413 : // changes by this call, logs details). Assumes that caller has
414 : // all the map locks and/or the map is otherwise known to be idle.
415 :
416 : int mymap_verify( mymap_t const * join );
417 :
418 : Do this as often as desired in a compilation unit to get different
419 : types of concurrent maps. Options exist for generating library
420 : header prototypes and/or library implementations for concurrent maps
421 : usable across multiple compilation units. Additional options exist
422 : to use different hashing functions, key comparison functions, etc as
423 : detailed below.
424 :
425 : IMPORTANT SAFETY TIP! If using a key sentinel, prepare/remove/query
426 : operations assume the input key is not the key sentinel (i.e. a
427 : // sentinel is not considered a "valid" key). Sentinel keys are not
428 : necessary if MAP_ELE_IS_FREE, MAP_ELE_FREE and MAP_ELE_MOVE are set
429 : appropriately.
430 :
431 : To better understand prepare / publish / cancel semantics:
432 :
433 : mykey_t * key = ... key to insert / modify / blocking query
434 :
435 : mymap_query_t query[1];
436 : int err = mymap_prepare( map, key, sentinel, query );
437 : myele_t * ele = mymap_query_ele ( query );
438 : ulong memo = mymap_query_memo( query );
439 :
440 : ... At this point, memo == mymap_key_hash( key, seed )
441 :
442 : if( FD_UNLIKELY( err ) ) {
443 :
444 : ... At this point, we are not in a prepare for key and
445 : ... ele==sentinel.
446 :
447 : ... If err is FD_MAP_ERR_AGAIN, there was a potentially
448 : ... conflicting prepare or remove in progress at some point
449 : ... during the call. We can try again later (e.g. after a
450 : ... random backoff or doing other non-conflicting work).
451 :
452 : ... If err is FD_MAP_ERR_FULL, key was not in the map but
453 : ... inserting it would have created a key probe sequence longer
454 : ... than probe_max at some point during the call. We can try
455 : ... again later when it is less full (e.g. after removing keys
456 : ... from the map).
457 :
458 : } else if( ... ele is marked as free ... ) {
459 :
460 : ... At this point, we are in a prepare for key, key is not in
461 : ... the map and ele points in the caller's address space to free
462 : ... element in the element store suitable for holding key.
463 :
464 : ... initialize ele here, including populating ele's key with key
465 : ... and (if memoized) populating ele's memo with memo.
466 :
467 : if( ... we decided not to insert key ... ) mymap_cancel( query ); // "failed insert"
468 : else {
469 : ... mark ele as used
470 : mymap_publish( query ); // "insert"
471 : }
472 :
473 : } else {
474 :
475 : ... At this point, we are in a prepare for key, key is in the map
476 : ... and ele points in the caller's address space to the element
477 : ... store element that currently contains key. We are free to
478 : ... modify ele's value. We should not modify ele's key, modify
479 : ... ele's memo (if memoized) or mark ele as free.
480 :
481 : ... process ele here
482 :
483 : if( ... we didn't modify ele ... ) mymap_cancel ( query ); // "blocking query"
484 : else mymap_publish( query ); // "modify"
485 :
486 : }
487 :
488 : To better understand remove semantics:
489 :
490 : mykey_t * key = ... key to remove
491 :
492 : int err = mymap_remove( map, key );
493 :
494 : if( FD_UNLIKELY( err ) ) {
495 :
496 : ... If err is FD_MAP_ERR_AGAIN, there was a potentially
497 : ... conflicting prepare or remove in progress at some point
498 : ... during the call. We can try again later (e.g. after a random
499 : ... backoff or doing other non-conflicting work).
500 :
501 : ... If err is FD_MAP_ERR_KEY, key was not in the map at some
502 : ... point during the call (so remove did not do anything).
503 :
504 : } else {
505 :
506 : ... key was removed from the map at some point during the call.
507 : ... The remove might have shuffled other keys. This shuffling
508 : ... can only decrease probe sequence length for any remaining
509 : ... keys and preserves insertion ordering for keys with the same
510 : ... hash (or initial probe element more generally).
511 :
512 : }
513 :
514 : To better understand query semantics:
515 :
516 : mykey_t * key = ... key to query
517 :
518 : mymap_query_t query[1];
519 : int err = mymap_query_try( join, key, sentinel, query );
520 : myele_t const * ele = mymap_query_ele_const( query );
521 : ulong memo = mymap_query_memo ( query );
522 :
523 : ... At this point, memo==mymap_key_hash( key, seed )
524 :
525 : if( FD_UNLIKELY( err ) ) {
526 :
527 : ... At this point, ele==sentinel
528 : ...
529 : ... If err is FD_MAP_ERR_KEY, key was not in the mymap at some
530 : ... point during the try.
531 : ...
532 : ... If err is FD_MAP_ERR_AGAIN, there was a potentially
533 : ... conflicting operation in progress during the try and we can
534 : ... try again later (e.g. after a random backoff or doing other
535 : ... non-conflicting work).
536 :
537 : } else {
538 :
539 : ... At this point, ele points in our address space to an element
540 : ... in the element store (non-NULL) and ele->key matched key at
541 : ... some point during the try.
542 :
543 : ... Speculatively process ele here.
544 : ...
545 : ... DO NOT TRUST ANY RESULTS OF THIS SPECULATIVE PROCESSING YET.
546 : ... THERE IS NO GUARANTEE YET THAT A CONCURRENT USER HASN'T
547 : ... CHANGED THE MYMAP IN A WAY THAT COULD YIELD ARBITRARY AND
548 : ... INCONSISTENT RESULTS.
549 : ...
550 : ... The simplest and most common form of speculative processing
551 : ... is to copy the needed portions of ele into a local stack
552 : ... temp.
553 : ...
554 : ... Note: concurrent operations include removing key from the
555 : ... mymap (and maybe multiple cycles of inserting and removing it
556 : ... and then at potentially different element store locations) or
557 : ... unrelated removes potentially shuffling the location of this
558 : ... key. That's not an issue practically as the ele pointer here
559 : ... will be to an element compatible memory region that will
560 : ... continue to exist regardless and we shouldn't be trusting any
561 : ... query reads yet (the query test will detect if these can
562 : ... be trusted). See rant in fd_map_para.c for more details.
563 :
564 : ... At this point, we are done with speculative processing (or we
565 : ... don't want to do any more speculative processing if the try
566 : ... has already failed).
567 :
568 : err = mymap_query_test( query );
569 : if( FD_UNLIKELY( err ) ) {
570 :
571 : ... At this point, err will be FD_MAP_ERR_AGAIN and a
572 : ... potentially conflicting operation in the try was detected
573 : ... by the test.
574 :
575 : ... Application specific handling here (e.g. try again after a
576 : ... random backoff or doing other non-conflicting work).
577 :
578 : } else {
579 :
580 : ... At this point, the results of the speculation thus far can
581 : ... be trusted and can be considered to have been computed at
582 : ... some point in time between try and test.
583 :
584 : }
585 : }
586 :
587 : Implementation overview:
588 :
589 : A map is basically a persistent shared array of version numbers named
590 : lock. lock[ lock_idx ] contains a version number that covers map
591 : slots [lock_idx*(ele_max/lock_cnt),(lock_idx+1)*(ele_max/lock_cnt))
592 :
593 : When trying an operation that could impact probe sequences passing
594 : through a lock's range of slots, the version number is atomically
595 : incremented. It is incremented again at completion. It may also
596 : be decremented if the operation didn't end up modifying any of the
597 : covered slots.
598 :
599 : Thus, an {odd,even} version number indicates that there is {a
600 : potential,not any} operation in progress that could impact probe
601 : sequences passing through the corresponding slots. The most
602 : significant bits of the version number can be used for lockfree
603 : style operations to detect changes to any probe sequences they use.
604 :
605 : When the map is not overloaded, key probe sequences are typically
606 : O(1) long and, in general, at most a (user configured) probe_max
607 : long. Since a version number covers many slots typically, this
608 : implies that the typical "read" operation (e.g.
609 : query_try/query_test) looks like:
610 :
611 : - try: observe lock version numbers covering all slots in
612 : key's probe sequence, fail if any locked (typically 1
613 : normal read that hits L1/L2 cache, especially in the
614 : common case of reads more frequent than writes)
615 : - spec: speculatively process the element containing key
616 : - test: check version numbers haven't changed (typically 1
617 : normal read that even more likely L1/L2 cache hit),
618 : fail if any changed
619 :
620 : And the typical "write" operation (e.g. prepare/publish) looks
621 : like:
622 :
623 : - prepare: increment lock version numbers covering all slots in
624 : key's probe sequence, fail if any locked (typically 1
625 : atomic fetch-and-or done test-and-test-and-set style to
626 : minimize hardware NOC contention)
627 : - exec: (non-speculatively) process the element containing key
628 : - publish: increment version numbers (typically 1 normal read/write
629 : that hits L1/L2 cache)
630 :
631 : Readers never block concurrent readers or writers. Writers can
632 : block other readers and other writers. If there are many more
633 : version locks than concurrent writers though, writers are unlikely
634 : to interfere with concurrent readers or writers. In all cases, all
635 : map operations are serializable.
636 :
637 : For maps that are loaded to their capacity, probe sequences could
638 : be up to probe_max long and probe_max might be quite large. This
639 : implies that more than one version lock might be needed. Since
640 : this range is cyclic contiguous in memory, the locking operations
641 : are nice compact streaming access patterns. And similarly for the
642 : element store access patterns. */
643 :
644 : /* MAP_NAME gives the API prefix to use for map */
645 :
646 : #ifndef MAP_NAME
647 : #error "Define MAP_NAME"
648 : #endif
649 :
650 : /* MAP_ELE_T is the map element type */
651 :
652 : #ifndef MAP_ELE_T
653 : #error "Define MAP_ELE_T"
654 : #endif
655 :
656 : /* MAP_KEY_T is the map key type */
657 :
658 : #ifndef MAP_KEY_T
659 : #define MAP_KEY_T ulong
660 : #endif
661 :
662 : /* MAP_KEY is the MAP_ELE_T key field */
663 :
664 : #ifndef MAP_KEY
665 : #define MAP_KEY key
666 : #endif
667 :
668 : /* MAP_KEY_EQ returns 1 if *k0 is the same as *k1 and 0 otherwise */
669 :
670 : #ifndef MAP_KEY_EQ
671 : #define MAP_KEY_EQ(k0,k1) ((*(k0))==(*(k1)))
672 : #endif
673 :
674 : /* MAP_KEY_HASH returns a random mapping of *key into ulong. The
675 : mapping is parameterized by the 64-bit ulong seed. */
676 :
677 : #ifndef MAP_KEY_HASH
678 : #define MAP_KEY_HASH(key,seed) fd_ulong_hash( (*(key)) ^ (seed) )
679 : #endif
680 :
681 : /* If MAP_MEMOIZE is defined to non-zero, elements have a field that
682 : can be used while in the map to hold the MAP_KEY_HASH for an
683 : element's key. This is useful for accelerating user code that might
684 : need a hash and accelerating various operations. */
685 :
686 : #ifndef MAP_MEMOIZE
687 : #define MAP_MEMOIZE 0
688 : #endif
689 :
690 : /* If MAP_MEMOIZE is non-zero, MAP_MEMO is the memo element field.
691 : Should be a ulong. Like MAP_KEY and MAP_NEXT, when an element is in
692 : the map, this value is managed by the map and will contain the
693 : MAP_KEY_HASH of the element's key and the map's seed. */
694 :
695 : #ifndef MAP_MEMO
696 : #define MAP_MEMO memo
697 : #endif
698 :
699 : /* If MAP_MEMOIZE is defined to non-zero, a non-zero MAP_KEY_EQ_IS_SLOW
700 : indicates the MAP_MEMO field should be used to accelerate MAP_KEY_EQ
701 : operations. This is useful when MAP_KEY_EQ is non-trivial (e.g.
702 : variable length string compare, large buffer compares, etc). */
703 :
704 : #ifndef MAP_KEY_EQ_IS_SLOW
705 : #define MAP_KEY_EQ_IS_SLOW 0
706 : #endif
707 :
708 : /* MAP_ELE_IS_FREE returns 0/1 if the slot pointed to by ele in the
709 : caller's address space contains / does not contain a key-value pair.
710 : The implementation can assume ele is valid and that it is safe to
711 : speculate on ele. The default implementation tests if key is not 0.
712 : If using a different key sentinel or not using a key sentinel, update
713 : this appropriately. */
714 :
715 : #ifndef MAP_ELE_IS_FREE
716 : #define MAP_ELE_IS_FREE(ctx,ele) (!((ele)->MAP_KEY))
717 : #endif
718 :
719 : /* MAP_ELE_FREE frees the key-value pair in the slot pointed to by ele
720 : in the caller's address space. The implementation can assume ele is
   valid, ele contains a key-value pair on entry and there will be no
722 : concurrent operations on ele during the free. The default
723 : implementation sets key to 0. If using a different key sentinel or
724 : not using a key sentinel, update this appropriately. Likewise, if
725 : not using plain-old-data keys and values, this should do the
726 : appropriate resource management. The join ctx is provided to
   facilitate this. */
728 :
729 : #ifndef MAP_ELE_FREE
730 : #define MAP_ELE_FREE(ctx,ele) do (ele)->MAP_KEY = (MAP_KEY_T)0; while(0)
731 : #endif
732 :
733 : /* MAP_ELE_MOVE moves the key-value pair in slot src to slot dst.
734 : src and dst are in the caller's address space. The implementation
735 : can assume src and dst are valid, dst does not contain a key-value
   pair on entry, src contains a key-value pair on entry, and there
737 : will be no concurrent operations on src and dst during the move. The
738 : default implementation shallow copies src to dst and sets src key to
739 : 0. If using a different key sentinel or not using a key sentinel,
740 : update this appropriately. Likewise, if elements do not use
741 : plain-old-data keys and/or values, this should do the appropriate key
742 : and/or value resource management. The join ctx is provided to
743 : facilitate this. */
744 :
745 : #ifndef MAP_ELE_MOVE
746 : #define MAP_ELE_MOVE(ctx,dst,src) do { MAP_ELE_T * _src = (src); (*(dst)) = *_src; _src->MAP_KEY = (MAP_KEY_T)0; } while(0)
747 : #endif
748 :
749 : /* MAP_CTX_MAX specifies the maximum number of bytes of user context
750 : for use in MAP_ELE above (e.g. custom allocators / workspaces / local
751 : pointers to additional value arrays / etc). This context will be
752 : ulong aligned. Default is up to 72 bytes. */
753 :
754 : #ifndef MAP_CTX_MAX
755 0 : #define MAP_CTX_MAX (72UL)
756 : #endif
757 :
758 : /* MAP_VERSION_T gives the map version index type. Should be a
759 : primitive unsigned integer type. The least significant bit is used
   to indicate whether or not a slot could be impacted by an in
761 : progress map operation. The remaining bits indicate the version
762 : number. The default is ulong, yielding effectively infinite ABA
   protection (e.g. a lockfree query operation would need to be
764 : stalled for over ~2^63 concurrent insert/modify/remove map operations
765 : before risk of getting confused by version number reuse ... which
766 : would take millenia for modern hardware practically). Narrow types
   yield less metadata footprint overhead for the map and lower ABA
768 : protection. (For human hard real-time applications, uint is probably
   fine and, for computer hard real-time applications,
770 : ushort and/or uchar could be fine). */
771 :
772 : #ifndef MAP_VERSION_T
773 163650804 : #define MAP_VERSION_T ulong
774 : #endif
775 :
776 : /* MAP_LOCK_MAX gives the maximum number of version locks the map can
777 : support. This should be positive and an integer power-of-two. This
   is essentially a limit on the maximum number of concurrent operations
779 : and thus should be much greater than the number of concurrent
780 : insert/modify/remove operations in expected map usage. Default is
781 : 1024.
782 :
783 : Note that this is not theoretically required for the below
784 : implementation. This exists to compile time bound stack utilization
785 : of prepare/remove/query_try. That is,
786 : sizeof(MAP_VERSION_T)*MAP_LOCK_MAX should be a L1D cache / L2D cache
787 : / stack allocation friendly footprint (defaults yield 8 KiB).
   MAP_LOCK_MAX could be removed by using a dynamic stack allocation
789 : but that would limit this to targets with FD_HAS_ALLOCA. Could also
790 : be eliminated by using a dynamic footprint lock cache in the query
791 : structure, join structures and/or combining the query and join
792 : structures. These are cumbersome for the user and the last two add
793 : restrictions to intra-process multithreaded usage not seen in other
794 : FD persistent inter-process datastructures. (Consider using a
795 : massive/reasonable MAP_LOCK_MAX when FD_HAS_ALLOCA is set/clear and
796 : then using alloca in prepare/remove/query_try when FD_HAS_ALLOCA is
797 : set? Almost the best of both worlds but does imply some subtle
798 : restrictions if trying to interoperate between targets compiled with
799 : different features ... avoiding for now.) */
800 :
801 : #ifndef MAP_LOCK_MAX
802 3000039 : #define MAP_LOCK_MAX (1024)
803 : #endif
804 :
805 : /* MAP_ALIGN gives the alignment required for the map shared memory.
806 : Default is 128 for double cache line alignment. Should be at least
807 : ulong alignment. */
808 :
809 : #ifndef MAP_ALIGN
810 : #define MAP_ALIGN (128UL)
811 : #endif
812 :
813 : /* MAP_MAGIC gives the shared memory magic number to aid in persistent
814 : and/or interprocess usage. */
815 :
816 : #ifndef MAP_MAGIC
817 3 : #define MAP_MAGIC (0xf17eda2c37c5107UL) /* firedancer cslot version 0 */
818 : #endif
819 :
820 : /* MAP_IMPL_STYLE controls what to generate:
821 : 0 - header only library
822 : 1 - library header declaration
823 : 2 - library implementation */
824 :
825 : #ifndef MAP_IMPL_STYLE
826 : #define MAP_IMPL_STYLE 0
827 : #endif
828 :
/* Common map error codes (FIXME: probably should get around to making
830 : unified error codes, error strings and/or flags across util at least
831 : so we don't have to do this in the generator itself) */
832 :
833 29988837 : #define FD_MAP_SUCCESS (0)
834 3 : #define FD_MAP_ERR_INVAL (-1)
835 7510113 : #define FD_MAP_ERR_AGAIN (-2)
836 : //#define FD_MAP_ERR_CORRUPT (-3)
837 : //#define FD_MAP_ERR_EMPTY (-4)
838 3 : #define FD_MAP_ERR_FULL (-5)
839 7510113 : #define FD_MAP_ERR_KEY (-6)
840 :
841 : /* Implementation *****************************************************/
842 :
843 : #if MAP_IMPL_STYLE==0 /* local use only */
844 : #define MAP_STATIC FD_FN_UNUSED static
845 : #else /* library header and/or implementation */
846 : #define MAP_STATIC
847 : #endif
848 :
849 107381055 : #define MAP_(n) FD_EXPAND_THEN_CONCAT3(MAP_NAME,_,n)
850 :
851 : #if MAP_IMPL_STYLE!=2 /* need header */
852 :
853 : #include "../bits/fd_bits.h"
854 :
/* MAP_(shmem_t) is the persistent shared memory header for the map.
   The MAP_VERSION_T lock array immediately follows the header in the
   same region (join computes it as header+1); the element store lives
   in a separate caller-provided region (shele). */

struct __attribute__((aligned(MAP_ALIGN))) MAP_(shmem_private) {

  /* This point is MAP_ALIGN aligned */

  ulong magic;      /* ==MAP_MAGIC */
  ulong ele_max;    /* Element store capacity, positive and an integer power-of-two */
  ulong lock_cnt;   /* Number of locks, positive and an integer power-of-two <= min( ele_max, MAP_LOCK_MAX ) */
  ulong probe_max;  /* Maximum length probe sequence, in [1,ele_max] */
  ulong seed;       /* Key hash seed, arbitrary */
  int   lock_shift; /* log2( ele_max / lock_cnt ), non-negative */

  /* Padding to MAP_ALIGN alignment here */

  /* MAP_VERSION_T lock[ lock_cnt ] here (obligatory sigh about lagging
     C++ support for 0 sized structure array footers). */

  /* Padding to MAP_ALIGN alignment here */
};

typedef struct MAP_(shmem_private) MAP_(shmem_t);
875 :
/* MAP_(t) is a local join handle.  It caches read-only copies of the
   shared header parameters (so hot paths need not touch the shared
   header) plus the locations of the lock array and element store in
   the joiner's local address space. */

struct MAP_(private) {
  MAP_ELE_T *     ele;        /* Location of the element store in the local address space, indexed [0,ele_max) */
  MAP_VERSION_T * lock;       /* Location of the lock versions in the local address space, indexed [0,lock_cnt) */
  ulong           ele_max;    /* ==shmem->ele_max */
  ulong           lock_cnt;   /* ==shmem->lock_cnt */
  ulong           probe_max;  /* ==shmem->probe_max */
  ulong           seed;       /* ==shmem->seed */
  int             lock_shift; /* ==shmem->lock_shift */
  int             _pad;       /* padding to ulong alignment */
  uchar           ctx[ MAP_CTX_MAX ]; /* User context for MAP_ELE_IS_FREE/MAP_ELE_FREE/MAP_ELE_MOVE */
};

typedef struct MAP_(private) MAP_(t);
889 :
/* MAP_(query_t) holds the state of an in-progress prepare / query_try
   needed by publish / cancel / query_test.  Treat as opaque. */

struct MAP_(query_private) {
  ulong           memo; /* Query key memo */
  MAP_ELE_T *     ele;  /* Query element in the local address space */
  MAP_VERSION_T * l;    /* Lock needed for this query in the local address space */
  MAP_VERSION_T   v;    /* Version of lock at query start */
};

typedef struct MAP_(query_private) MAP_(query_t);
898 :
899 : FD_PROTOTYPES_BEGIN
900 :
901 : /* map_private_try returns the version of the lock observed at some
902 : point during the call. Assumes lock is valid. If the least
903 : significant bit of the returned value is set (i.e. is odd), an
904 : operation was in progress on a key whose probe sequence includes a
905 : map slot covered by this lock (i.e. it is not a good time to try the
906 : operation). If the LSB is clear (i.e. is even), no operation was in
907 : progress (i.e. it is a good time to try). This is a compiler memory
908 : fence. */
909 :
static inline MAP_VERSION_T
MAP_(private_try)( MAP_VERSION_T volatile const * l ) {
  MAP_VERSION_T v;
  FD_COMPILER_MFENCE();
  v = *l; /* plain fenced volatile read ... readers need no atomics */
  FD_COMPILER_MFENCE();
  return v;
}
918 :
/* map_private_test tests that a range of lock versions matched their locally
920 : cached versions at some point during the call. Specifically, tests
921 : lock[lock_idx]==version[lock_idx] for all lock_idx in
922 : [version_lock0,version_lock0+version_cnt) (cyclic). lock_cnt is the
923 : number of locks and assumed to be positive and an integer
924 : power-of-two. Returns SUCCESS (zero) if all match (i.e. no probe
925 : sequences through slots covered by the locks between when the last
926 : lock in the range was observed and this was called changed) and AGAIN
927 : (negative) otherwise. This is a compiler memory fence. */
928 :
static inline int
MAP_(private_test)( MAP_VERSION_T volatile const * lock,
                    ulong                          lock_cnt,
                    MAP_VERSION_T const *          version,
                    ulong                          lock_idx,    /* version_lock0 */
                    ulong                          version_cnt ) {
  FD_COMPILER_MFENCE();
  /* Note: version is indexed by absolute lock index (not probe order),
     matching how prepare caches observed versions (version[lock_idx]).
     On mismatch, version_cnt is left non-zero, yielding AGAIN below. */
  for( ; version_cnt; version_cnt-- ) {
    if( FD_UNLIKELY( lock[ lock_idx ]!=version[ lock_idx ] ) ) break; /* opt for low contention */
    lock_idx = (lock_idx+1UL) & (lock_cnt-1UL);
  }
  FD_COMPILER_MFENCE();
  return version_cnt ? FD_MAP_ERR_AGAIN : FD_MAP_SUCCESS; /* cmov */
}
943 :
944 : /* map_private_lock returns the version of the lock observed at some
945 : point during the call. Assumes lock is valid. If the least
946 : significant bit of the returned version is set (i.e. is odd), the
947 : caller did not get the lock (i.e. an operation was already in
948 : progress on a key whose probe sequence includes a map slot covered by
949 : this lock). If the LSB is clear (i.e. is even), the caller got the
950 : lock (i.e. is free to do an operation involving probe sequences that
951 : pass through the range covered by the lock) and *lock LSB was set.
952 : This is a compiler memory fence. When the target does not have
953 : FD_HAS_ATOMIC, this operation is emulated. When emulated, the map
954 : will not be safe to use concurrently but will still work with
955 : comparable performance to a serial implementation. */
956 :
static inline MAP_VERSION_T
MAP_(private_lock)( MAP_VERSION_T volatile * l ) {
  MAP_VERSION_T v;
  FD_COMPILER_MFENCE();
# if FD_HAS_ATOMIC /* test-and-test-and-set style */
  /* Cheap plain read first: if the lock is already held (LSB set),
     skip the expensive atomic entirely to minimize NOC traffic.  Even
     when the plain read looked free, the fetch-and-or can still lose
     the race, in which case the returned v has its LSB set. */
  v = *l;
  if( FD_LIKELY( !((ulong)v & 1UL) ) ) v = FD_ATOMIC_FETCH_AND_OR( l, (MAP_VERSION_T)1 ); /* opt for low contention */
# else
  /* Emulated (single threaded use only): always "succeeds" */
  v = *l;
  *l = (MAP_VERSION_T)((ulong)v | 1UL);
# endif
  FD_COMPILER_MFENCE();
  return v;
}
971 :
972 : /* map_private_unlock unlocks lock[lock_idx] for lock_idx in
973 : [version_lock0,version_lock0+version_cnt) (cyclic). Assumes
974 : version[lock_idx] is the version the caller wants post unlock (which
975 : implies that, on entry, version[lock_idx] = lock[lock_idx] + delta
976 : where delta is odd and >=-1 (the -1 case corresponds "unlock with no
977 : changes made to the covered elements"). This cannot fail. This is a
978 : compiler memory fence. */
979 :
static inline void
MAP_(private_unlock)( MAP_VERSION_T volatile * lock,
                      ulong                    lock_cnt,
                      MAP_VERSION_T const *    version,
                      ulong                    lock_idx, /* version_lock0 */
                      ulong                    version_cnt ) {
  FD_COMPILER_MFENCE();
  /* Plain stores suffice here: while a lock's LSB is set, the holder
     is presumably the only writer of that slot (see private_lock); the
     fences order these stores after any covered element updates. */
  for( ; version_cnt; version_cnt-- ) {
    lock[ lock_idx ] = version[ lock_idx ];
    lock_idx = (lock_idx+1UL) & (lock_cnt-1UL);
  }
  FD_COMPILER_MFENCE();
}
993 :
994 : /* map_private_ele_{is_free,free,move} expose the
995 : MAP_ELE_{IS_FREE,FREE,MOVE} macros as inlines with strict semantics.
996 :
   map_private_ele_is_free returns 1 if ele does not contain a key-val
998 : pair and 0 otherwise. ctx will be the join's user context, ele will
999 : be a valid pointer to an element store element in the caller's
1000 : address space that is safe to speculate on. Retains no interest in
1001 : ele.
1002 :
1003 : map_private_ele_free frees any key and/or val resources used by ele
1004 : and marks ele as free. ctx will be the join's user context, ele will
1005 : be a valid pointer to an element store element in the caller's
1006 : address space that is marked as used. Retains no interest in ele.
1007 :
1008 : map_private_ele_move moves the key-val pair from element src to
   element dst and marks src as free.  ctx will be the join's
   user context, src/dst will be valid pointers to an element store
1011 : element in the caller's address space. dst/src will be marked as
1012 : free/used on entry and should be marked as used/free on return.
1013 : Retains no interest in dst or src. */
1014 :
1015 : FD_FN_PURE static inline int
1016 : MAP_(private_ele_is_free)( void const * ctx,
1017 59081700 : MAP_ELE_T const * ele ) {
1018 59081700 : (void)ctx;
1019 59081700 : return !!(MAP_ELE_IS_FREE( (ctx), (ele) ));
1020 59081700 : }
1021 :
/* Wrapper exposing MAP_ELE_FREE with strict semantics (see above). */

static inline void
MAP_(private_ele_free)( void *      ctx,
                        MAP_ELE_T * ele ) {
  (void)ctx;
  MAP_ELE_FREE( (ctx), (ele) );
}
1028 :
/* Wrapper exposing MAP_ELE_MOVE with strict semantics (see above). */

static inline void
MAP_(private_ele_move)( void *      ctx,
                        MAP_ELE_T * dst,
                        MAP_ELE_T * src ) {
  (void)ctx;
  MAP_ELE_MOVE( (ctx), (dst), (src) );
}
1036 :
/* map_lock_max returns the compile-time lock count limit.
   map_lock_cnt_est / map_probe_max_est give reasonable constructor
   arguments for a map with an ele_max-capacity element store.
   map_align returns the required alignment of the map's shared memory
   region. */

FD_FN_CONST static inline ulong MAP_(lock_max)( void ) { return MAP_LOCK_MAX; }

FD_FN_CONST static inline ulong MAP_(lock_cnt_est) ( ulong ele_max ) { return fd_ulong_min( ele_max, MAP_LOCK_MAX ); }
FD_FN_CONST static inline ulong MAP_(probe_max_est)( ulong ele_max ) { return ele_max; }

FD_FN_CONST static inline ulong MAP_(align)( void ) { return alignof(MAP_(shmem_t)); }
1043 :
/* map_footprint returns the size in bytes of the map's shared memory
   region (header plus lock array, padded to header alignment) for the
   given parameters or 0 if the parameters are invalid.  The validation
   deliberately uses non-short-circuit '&' to evaluate branchlessly. */

FD_FN_CONST static inline ulong
MAP_(footprint)( ulong ele_max,
                 ulong lock_cnt,
                 ulong probe_max ) {
  if( !( fd_ulong_is_pow2( ele_max ) &
         fd_ulong_is_pow2( lock_cnt ) & (lock_cnt<=fd_ulong_min( ele_max, MAP_LOCK_MAX )) &
         (1UL<=probe_max) & (probe_max<=ele_max) ) ) return 0UL;
  return fd_ulong_align_up( sizeof(MAP_(shmem_t)) + lock_cnt*sizeof(MAP_VERSION_T), alignof(MAP_(shmem_t)) ); /* no overflow */
}
1053 :
/* map_{ele_max,lock_cnt,probe_max,seed} return the corresponding
   construction parameter cached in the join.  map_shmap_const /
   map_shele_const return the local address of the shared map header /
   element store region backing the join (const versions).  Assume
   join is a current local join. */

FD_FN_PURE static inline ulong MAP_(ele_max)  ( MAP_(t) const * join ) { return join->ele_max; }
FD_FN_PURE static inline ulong MAP_(lock_cnt) ( MAP_(t) const * join ) { return join->lock_cnt; }
FD_FN_PURE static inline ulong MAP_(probe_max)( MAP_(t) const * join ) { return join->probe_max; }
FD_FN_PURE static inline ulong MAP_(seed)     ( MAP_(t) const * join ) { return join->seed; }

FD_FN_PURE static inline void const * MAP_(shmap_const)( MAP_(t) const * join ) { return ((MAP_(shmem_t) const *)join->lock)-1; }
FD_FN_PURE static inline void const * MAP_(shele_const)( MAP_(t) const * join ) { return join->ele; }
1061 :
1062 3 : FD_FN_CONST void * MAP_(ctx) ( MAP_(t) * join ) { return join->ctx; }
1063 3 : FD_FN_CONST void const * MAP_(ctx_const)( MAP_(t) const * join ) { return join->ctx; }
1064 0 : FD_FN_CONST ulong MAP_(ctx_max) ( MAP_(t) const * join ) { (void)join; return MAP_CTX_MAX; }
1065 :
/* map_shmap / map_shele return the local address of the shared map
   header / element store backing the join (non-const versions).  The
   header is recovered by backing up one header from the lock array,
   which immediately follows it in memory. */

FD_FN_PURE static inline void * MAP_(shmap)( MAP_(t) * join ) { return ((MAP_(shmem_t) *)join->lock)-1; }
FD_FN_PURE static inline void * MAP_(shele)( MAP_(t) * join ) { return join->ele; }
1068 :
/* map_key_eq / map_key_hash expose MAP_KEY_EQ / MAP_KEY_HASH as
   type-strict inlines (key_eq normalizes the result to exactly 0/1). */

FD_FN_PURE static inline int
MAP_(key_eq)( MAP_KEY_T const * k0,
              MAP_KEY_T const * k1 ) {
  return !!(MAP_KEY_EQ( (k0), (k1) ));
}

FD_FN_PURE static inline ulong
MAP_(key_hash)( MAP_KEY_T const * key,
                ulong             seed ) {
  return (MAP_KEY_HASH( (key), (seed) ));
}

/* map_backoff spins the caller a pseudo-random number of pauses.  r is
   a 32-bit random derived from seed and the high bits of the
   tickcounter, so the pause count (scale*r)>>48 is roughly uniform in
   [0,scale>>16).  Scale up on repeated contention to spread retries. */

static inline void
MAP_(backoff)( ulong scale,
               ulong seed ) {
  ulong r = (ulong)(uint)fd_ulong_hash( seed ^ (((ulong)fd_tickcount())<<32) );
  for( ulong rem=(scale*r)>>48; rem; rem-- ) FD_SPIN_PAUSE();
}

/* map_query_{memo,ele_const,ele} return the fields of a query filled
   in by a successful prepare / query_try. */

FD_FN_PURE static inline ulong             MAP_(query_memo )( MAP_(query_t) const * query ) { return query->memo; }
FD_FN_PURE static inline MAP_ELE_T const * MAP_(query_ele_const)( MAP_(query_t) const * query ) { return query->ele; }
FD_FN_PURE static inline MAP_ELE_T *       MAP_(query_ele )( MAP_(query_t) * query ) { return query->ele; }
1091 :
1092 : MAP_STATIC void * MAP_(new) ( void * shmem, ulong ele_max, ulong lock_cnt, ulong probe_max, ulong seed );
1093 : MAP_STATIC MAP_(t) * MAP_(join) ( void * ljoin, void * shmap, void * shele );
1094 : MAP_STATIC void * MAP_(leave) ( MAP_(t) * join );
1095 : MAP_STATIC void * MAP_(delete)( void * shmap );
1096 :
1097 : MAP_STATIC int
1098 : MAP_(prepare)( MAP_(t) * join,
1099 : MAP_KEY_T const * key,
1100 : MAP_ELE_T * sentinel,
1101 : MAP_(query_t) * query );
1102 :
/* map_publish completes a successful prepare, committing changes made
   to the prepared slot.  query->v is the even pre-lock version (the
   lock currently holds v|1), so storing v+2 both clears the LSB
   (unlocking) and advances the version (invalidating speculative
   readers of covered slots).  Fenced so slot writes are issued before
   the unlocking store. */

static inline void
MAP_(publish)( MAP_(query_t) * query ) {
  MAP_VERSION_T volatile * l = query->l;
  MAP_VERSION_T            v = (MAP_VERSION_T)((ulong)query->v + 2UL);
  FD_COMPILER_MFENCE();
  *l = v;
  FD_COMPILER_MFENCE();
}
1111 :
/* map_cancel completes a successful prepare, discarding it ("unlock
   with no changes made"): restoring the pre-lock even version clears
   the lock LSB without advancing the version, so concurrent
   speculative readers are not invalidated.  Only appropriate when the
   covered slots were truly left unmodified. */

static inline void
MAP_(cancel)( MAP_(query_t) * query ) {
  MAP_VERSION_T volatile * l = query->l;
  MAP_VERSION_T            v = query->v;
  FD_COMPILER_MFENCE();
  *l = v;
  FD_COMPILER_MFENCE();
}
1120 :
1121 : MAP_STATIC int
1122 : MAP_(remove)( MAP_(t) * join,
1123 : MAP_KEY_T const * key );
1124 :
1125 : MAP_STATIC int
1126 : MAP_(query_try)( MAP_(t) const * join,
1127 : MAP_KEY_T const * key,
1128 : MAP_ELE_T const * sentinel,
1129 : MAP_(query_t) * query );
1130 :
/* map_query_test returns SUCCESS if the lock version covering a
   query_try is unchanged since the try began (i.e. the speculatively
   read results were not clobbered by a concurrent insert/modify/remove)
   and AGAIN otherwise.  Fenced volatile read, no atomics needed. */

static inline int
MAP_(query_test)( MAP_(query_t) const * query ) {
  MAP_VERSION_T volatile const * l = query->l;
  ulong                          v = query->v;
  FD_COMPILER_MFENCE();
  ulong _v = *l;
  FD_COMPILER_MFENCE();
  return _v==v ? FD_MAP_SUCCESS : FD_MAP_ERR_AGAIN;
}
1140 :
1141 : /* FIXME: Consider adding txn API too? Would work recording the start
1142 : of probe sequences for keys in the transaction and then the txn_try
1143 : would use a bitfield to lock all contiguous regions covered by the
1144 : set of probe sequences. */
1145 :
1146 : /* FIXME: Consider adding an iterator API. Probably would allow
1147 : ordered iteration over all keys with the same hash value to support
1148 : key groups (similar to map_para.c / funk). */
1149 :
1150 : MAP_STATIC FD_FN_PURE int MAP_(verify)( MAP_(t) const * join );
1151 :
1152 : MAP_STATIC FD_FN_CONST char const * MAP_(strerror)( int err );
1153 :
1154 : FD_PROTOTYPES_END
1155 :
1156 : #endif
1157 :
1158 : #if MAP_IMPL_STYLE!=1 /* need implementations (assumes header already included) */
1159 :
1160 : #include "../log/fd_log.h" /* Used by constructors and verify (FIXME: Consider making a compile time option) */
1161 :
/* map_new formats a shared memory region with the appropriate
   alignment and footprint for use as a map.  Returns shmem on success
   and NULL on failure (logs details). */

MAP_STATIC void *
MAP_(new)( void * shmem,
           ulong  ele_max,
           ulong  lock_cnt,
           ulong  probe_max,
           ulong  seed ) {

  if( FD_UNLIKELY( !shmem ) ) {
    FD_LOG_WARNING(( "NULL shmem" ));
    return NULL;
  }

  if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, MAP_(align)() ) ) ) {
    FD_LOG_WARNING(( "misaligned shmem" ));
    return NULL;
  }

  /* footprint doubles as parameter validation (0 on bad parameters) */

  ulong footprint = MAP_(footprint)( ele_max, lock_cnt, probe_max );
  if( FD_UNLIKELY( !footprint ) ) {
    FD_LOG_WARNING(( "ele_max, lock_cnt and/or probe_max" ));
    return NULL;
  }

  /* seed arbitrary */

  /* Init the metadata */

  MAP_(shmem_t) * map = (MAP_(shmem_t) *)shmem;

  memset( map, 0, footprint );

  map->ele_max    = ele_max;
  map->lock_cnt   = lock_cnt;
  map->probe_max  = probe_max;
  map->seed       = seed;
  map->lock_shift = fd_ulong_find_msb( ele_max ) - fd_ulong_find_msb( lock_cnt );

  /* Note: memset set all the locks to version 0/unlocked */

  /* Note: caller set all elements in underlying element store set to
     free (or, more pedantically, to a key-val pair configuration
     consistent with ele_max and probe_max). */

  /* Set magic last, behind fences, so a concurrent joiner cannot
     observe a valid magic on a partially initialized header */

  FD_COMPILER_MFENCE();
  map->magic = MAP_MAGIC;
  FD_COMPILER_MFENCE();

  return shmem;
}
1211 :
/* map_join joins the caller to a map.  ljoin points to a MAP_(t)
   compatible memory region in the caller's address space to hold the
   join state, shmap points to the map's formatted shared header region
   and shele to its element store region.  Returns a handle to the join
   (==ljoin) on success and NULL on failure (logs details).  Note: the
   size of the element store region is not validated here; the caller
   is responsible for providing one consistent with the map's ele_max. */

MAP_STATIC MAP_(t) *
MAP_(join)( void * ljoin,
            void * shmap,
            void * shele ) {
  MAP_(t) *       join = (MAP_(t) *)ljoin;
  MAP_(shmem_t) * map  = (MAP_(shmem_t) *)shmap;
  MAP_ELE_T *     ele  = (MAP_ELE_T *)shele;

  if( FD_UNLIKELY( !join ) ) {
    FD_LOG_WARNING(( "NULL ljoin" ));
    return NULL;
  }

  if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)join, alignof(MAP_(t)) ) ) ) {
    FD_LOG_WARNING(( "misaligned ljoin" ));
    return NULL;
  }

  if( FD_UNLIKELY( !map ) ) {
    FD_LOG_WARNING(( "NULL shmap" ));
    return NULL;
  }

  if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)map, MAP_(align)() ) ) ) {
    FD_LOG_WARNING(( "misaligned shmap" ));
    return NULL;
  }

  if( FD_UNLIKELY( map->magic!=MAP_MAGIC ) ) {
    FD_LOG_WARNING(( "bad magic" ));
    return NULL;
  }

  if( FD_UNLIKELY( !ele ) ) {
    FD_LOG_WARNING(( "NULL shele" ));
    return NULL;
  }

  if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)ele, alignof(MAP_ELE_T) ) ) ) {
    FD_LOG_WARNING(( "misaligned shele" ));
    return NULL;
  }

  join->lock       = (MAP_VERSION_T *)(map+1); /* lock array immediately follows the header */
  join->ele        = ele;
  join->ele_max    = map->ele_max;
  join->lock_cnt   = map->lock_cnt;
  join->probe_max  = map->probe_max;
  join->seed       = map->seed;
  join->lock_shift = map->lock_shift;

  return join;
}
1265 :
1266 : MAP_STATIC void *
1267 6 : MAP_(leave)( MAP_(t) * join ) {
1268 :
1269 6 : if( FD_UNLIKELY( !join ) ) {
1270 3 : FD_LOG_WARNING(( "NULL join" ));
1271 3 : return NULL;
1272 3 : }
1273 :
1274 3 : return (void *)join;
1275 6 : }
1276 :
/* map_delete unformats a map's shared header region.  Returns shmap on
   success and NULL on failure (logs details).  Assumes nobody is
   joined to the map.  The magic is cleared behind fences so that any
   stale join attempt afterward will fail the magic check. */

MAP_STATIC void *
MAP_(delete)( void * shmap ) {
  MAP_(shmem_t) * map = (MAP_(shmem_t) *)shmap;

  if( FD_UNLIKELY( !map ) ) {
    FD_LOG_WARNING(( "NULL shmap" ));
    return NULL;
  }

  if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)map, MAP_(align)() ) ) ) {
    FD_LOG_WARNING(( "misaligned shmap" ));
    return NULL;
  }

  if( FD_UNLIKELY( map->magic!=MAP_MAGIC ) ) {
    FD_LOG_WARNING(( "bad magic" ));
    return NULL;
  }

  FD_COMPILER_MFENCE();
  map->magic = 0UL;
  FD_COMPILER_MFENCE();

  return (void *)map;
}
1302 :
/* map_prepare locks key's probe sequence and returns the slot holding
   key (marked used) or, if key is not mapped, the slot where key can
   be inserted (marked free).  On SUCCESS, query is filled in for a
   subsequent publish / cancel.  On failure, returns AGAIN (a
   conflicting operation held a needed lock) or FULL (probe_max slots
   tested without finding key or a free slot), no locks are held and
   query->ele is set to sentinel.  Details in the comments below. */

int
MAP_(prepare)( MAP_(t) *         join,
               MAP_KEY_T const * key,
               MAP_ELE_T *       sentinel,
               MAP_(query_t) *   query ) {
  MAP_ELE_T *     ele0       = join->ele;
  MAP_VERSION_T * lock       = join->lock;
  ulong           ele_max    = join->ele_max;
  ulong           lock_cnt   = join->lock_cnt;
  ulong           probe_max  = join->probe_max;
  ulong           seed       = join->seed;
  int             lock_shift = join->lock_shift;
  void *          ctx        = join->ctx;

  ulong key_hash      = MAP_(key_hash)( key, seed );
  ulong ele_idx       = key_hash & (ele_max-1UL); /* start of key's probe sequence */
  ulong version_lock0 = ele_idx >> lock_shift;    /* first lock covering the probe sequence */
  ulong version_cnt   = 0UL;                      /* number of locks acquired so far */

  MAP_VERSION_T version[ MAP_LOCK_MAX ];          /* observed pre-lock versions, indexed by absolute lock idx */

  ulong lock_idx = version_lock0;

  int err;

  /* At this point, finding any key in the map requires testing at most
     probe_max contiguous slots. */

  MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
  if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail; } /* opt for low contention */
  version[ lock_idx ] = v;
  version_cnt++;

  for( ulong probe_rem=probe_max; probe_rem; probe_rem-- ) {

    /* At this point, we've acquired the locks from the start of key's
       probe sequence to ele_idx inclusive and have tested fewer than
       probe_max slots for key.

       If slot ele_idx is empty, we know that the key is currently not
       in the map and we can insert it here without creating a probe
       sequence longer than probe_max.  This does not lengthen the probe
       sequence for any currently mapped keys, preserving the maximum
       probe sequence length invariant.  Further, this is at the end of
       all keys that map to the same probe sequence start.  So, we have
       preserved the key group ordering invariant.

       On return, ele will be marked as free.  To insert key into the
       map, the caller should initialize the slot's key (and memo if
       necessary), mark the slot as used, and publish to complete the
       insert.

       If the caller doesn't want to insert anything (e.g. caller only
       wants to modify an existing value), the caller should keep the
       slot marked as free (doesn't matter how the caller modified any
       other fields) and return the slot as free, and cancel to complete
       the failed insert (publish would also work ... cancel has
       theoretically lower risk of false contention).

       Likewise, if slot ele_idx contains key, we return that slot to
       the caller.  The caller can tell the difference between the
       previous case because the slot will be marked as used.

       On return, the caller can modify the slot's value arbitrarily.
       IMPORTANT SAFETY TIP!  THE CALLER MUST NOT MODIFY THE SLOT'S KEY
       OR MARK THE SLOT AS FREE.  USE REMOVE BELOW TO REMOVE KEYS.  When
       done modifying the slot's value, the caller should either publish
       or cancel depending on what the caller did to the slot's value
       and how the application manages access to values (publish is
       always safe but cancel when appropriate has theoretically lower
       risk of false contention).  Note that cancel is not appropriate
       for temporary modifications to value (because it can confuse
       query ABA protection).

       In both cases, since we have the lock that covers slot ele_idx,
       we can unlock any other locks (typically the leading
       version_cnt-1 but possibly the trailing version_cnt-1 in cases
       with maps near capacity) locks already acquired to reduce
       contention with other unrelated operations.  That is, at this
       point, lock lock_idx is sufficient to prevent any operation for
       any key breaking key's probe sequence (because it would need to
       acquire the lock covering ele_idx first). */

    MAP_ELE_T * ele = ele0 + ele_idx;

    if( FD_LIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) || /* opt for low collision */
        (
#       if MAP_MEMOIZE && MAP_KEY_EQ_IS_SLOW
          FD_LIKELY( ele->MAP_MEMO==key_hash ) &&
#       endif
          FD_LIKELY( MAP_(key_eq)( &ele->MAP_KEY, key ) ) /* opt for already in map */
        ) ) {

      /* Keep the lock covering ele_idx held; unlock the other
         version_cnt-1 locks (skipping past the kept lock when it is
         the first in the acquired range). */

      lock_idx = ele_idx >> lock_shift;
      version_lock0 = (version_lock0 + (ulong)(version_lock0==lock_idx)) & (lock_cnt-1UL);
      MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt-1UL );

      query->memo = key_hash;
      query->ele  = ele;
      query->l    = lock + lock_idx;
      query->v    = version[ lock_idx ];
      return FD_MAP_SUCCESS;
    }

    /* At this point, slot ele_idx is used by something other than key.
       If we still have probes remaining, continue probing for key,
       locking as necessary.  If we can't acquire a lock, we fail. */

    ele_idx = (ele_idx+1UL) & (ele_max-1UL);

    ulong lock_next = ele_idx >> lock_shift;
    if( FD_UNLIKELY( (lock_next!=lock_idx) & (lock_next!=version_lock0) ) ) { /* opt for locks that cover many contiguous slots */
      lock_idx = lock_next;

      MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
      if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail; } /* opt for low contention */
      version[ lock_idx ] = v;
      version_cnt++;
    }
  }

  /* At this point, we've done probe_max probes without encountering key
     and we have all the locks.  So we know key is not in the map and
     that, even if we have space, inserting this key will create a probe
     sequence longer than probe_max.  That is, map is loaded enough that
     we consider it full.

     If probe_max==ele_max, this meaning of full is the traditional
     non-concurrent meaning of full (literally every slot is known to be
     used).  Even if probe_max << ele_max, it is possible to fill every
     slot (e.g. at probe_max==1, a perfect hash of ele_max keys to slots
     would fill every slot). */

  err = FD_MAP_ERR_FULL;

fail:

  /* Release all acquired locks at their pre-lock versions ("unlock with
     no changes made") and return the failure to the caller. */

  MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt );

  query->memo = key_hash;
  query->ele  = sentinel;
  query->l    = NULL;
  query->v    = (MAP_VERSION_T)0;
  return err;
}
1448 :
1449 : int
1450 : MAP_(remove)( MAP_(t) * join,
1451 7505412 : MAP_KEY_T const * key ) {
1452 :
1453 7505412 : MAP_VERSION_T * lock = join->lock;
1454 7505412 : ulong lock_cnt = join->lock_cnt;
1455 7505412 : ulong seed = join->seed;
1456 7505412 : ulong probe_max = join->probe_max;
1457 7505412 : MAP_ELE_T * ele0 = join->ele;
1458 7505412 : ulong ele_max = join->ele_max;
1459 7505412 : int lock_shift = join->lock_shift;
1460 7505412 : void * ctx = join->ctx;
1461 :
1462 7505412 : ulong key_hash = MAP_(key_hash)( key, seed );
1463 7505412 : ulong start_idx = key_hash & (ele_max-1UL);
1464 7505412 : ulong version_lock0 = start_idx >> lock_shift;
1465 7505412 : ulong version_cnt = 0UL;
1466 :
1467 7505412 : ulong lock_idx = version_lock0;
1468 :
1469 7505412 : MAP_VERSION_T version[ MAP_LOCK_MAX ];
1470 :
1471 : /* At this point, we need to acquire locks covering the start of the
1472 : probe sequence through up to all contiguously used slots (and, if
1473 : the map is not completely full, the trailing empty slot). */
1474 :
1475 7505412 : MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
1476 7505412 : if( FD_UNLIKELY( (ulong)v & 1UL ) ) return FD_MAP_ERR_AGAIN; /* opt for low contention */
1477 7505412 : version[ lock_idx ] = v;
1478 7505412 : version_cnt++;
1479 :
1480 7505412 : ulong ele_idx = start_idx;
1481 7505412 : ulong hole_idx = start_idx;
1482 7505412 : int found = 0;
1483 :
1484 7505412 : ulong contig_cnt;
1485 18384198 : for( contig_cnt=0UL; contig_cnt<ele_max; contig_cnt++ ) {
1486 :
1487 : /* At this point, we've acquired the locks covering slots
1488 : [start_idx,ele_idx] (cyclic) and have confirmed that the
1489 : contig_cnt slots [start_idx,ele_idx) (cyclic) are used.
1490 :
1491 : If slot ele_idx is empty, we are done probing.
1492 :
1493 : Otherwise, if we haven't found key yet, we test if slot ele_idx
1494 : contains key.
1495 :
1496 : We can optimize this further by noting that the key can only be
1497 : in the first probe_max probes and that when we don't find the
1498 : key, remove has nothing to do (such that we don't have to keep
1499 : probing for contiguous slots). */
1500 :
1501 18384198 : MAP_ELE_T const * ele = ele0 + ele_idx;
1502 :
1503 18384198 : if( FD_UNLIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) break; /* opt for first pass low collision */
1504 :
1505 10878786 : if( FD_LIKELY( !found ) ) { /* opt for first pass low collision */
1506 7294467 : if( FD_UNLIKELY( contig_cnt>=probe_max ) ) break; /* opt for first pass low collision */
1507 7294467 : found =
1508 : # if MAP_MEMOMIZE && MAP_KEY_EQ_IS_SLOW
1509 : FD_LIKELY( ele->MAP_MEMO==key_hash ) &&
1510 : # endif
1511 7294467 : MAP_(key_eq)( &ele->MAP_KEY, key );
1512 7294467 : if( found ) hole_idx = ele_idx; /* cmov */
1513 7294467 : }
1514 :
1515 : /* Continue probing, locking as necessary. If we can't acquire a
1516 : lock, fail. */
1517 :
1518 10878786 : ele_idx = (ele_idx+1UL) & (ele_max-1UL);
1519 :
1520 10878786 : ulong lock_next = ele_idx >> lock_shift;
1521 10878786 : if( FD_UNLIKELY( (lock_next!=lock_idx) & (lock_next!=version_lock0) ) ) { /* opt for locks covering many contiguous slots */
1522 2721789 : lock_idx = lock_next;
1523 :
1524 2721789 : MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
1525 2721789 : if( FD_UNLIKELY( (ulong)v & 1UL ) ) { /* opt for low contention */
1526 0 : MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt );
1527 0 : return FD_MAP_ERR_AGAIN;
1528 0 : }
1529 2721789 : version[ lock_idx ] = v;
1530 2721789 : version_cnt++;
1531 2721789 : }
1532 10878786 : }
1533 :
1534 : /* At this point, if we haven't found the key, key did not exist in
1535 : the map at some point during the call. Release the locks and tell
1536 : the user the key was already removed. */
1537 :
1538 7505412 : if( FD_UNLIKELY( !found ) ) {
1539 3751992 : MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt );
1540 3751992 : return FD_MAP_ERR_KEY;
1541 3751992 : }
1542 :
1543 : /* At this point, we have locks covering the contig_cnt used slots
1544 : starting from start_idx cyclic (and, if contig_cnt<ele_max, any
1545 : trailing empty slot). The key to remove is in this range at
1546 : hole_idx. Further, all probe sequences are intact. Make a hole at
1547 : hole_idx by freeing the key. Also update the cached lock version
1548 : to indicate "modified" when we unlock below. */
1549 :
1550 3753420 : MAP_(private_ele_free)( ctx, ele0 + hole_idx );
1551 :
1552 3753420 : lock_idx = hole_idx >> lock_shift;
1553 3753420 : version[ lock_idx ] = (MAP_VERSION_T)((ulong)version[ lock_idx ] + 2UL);
1554 :
1555 : /* When contig_cnt<ele_max, the trailing empty slot guarantees that
1556 : the just made hole didn't break any probe sequences for keys not in
1557 : the contig_cnt slots and that it didn't break any probe sequences
1558 : in [start_idx,hole_idx). Probe sequences for keys in
1559 : (hole_idx,start_idx+contig_cnt) (cyclic) might have been broken
1560 : though.
1561 :
1562 : We fix the first key with a broken probe sequence by moving it to
1563 : the hole just made. This fills the hole but makes a new hole (and
1564 : one closer to the empty trailing slot) in the process. As this
1565 : shortens the probe sequence for that key, this doesn't break any
1566 : probe length invariants. We repeating this process until we've
1567 : fixed all the contiguous slots after hole_idx. (As an additional
1568 : optimization to reduce remove costs when map is nearly full but
1569 : probe_max << ele_max, we could exploit that only the leading
1570 : probe_max-1 slots after any created hole might have broken probe
1571 : sequences.)
1572 :
1573 : Unfortunately, when contig_cnt==ele_max, we no longer have this
1574 : guarantee. But we do have the entire map locked at this point.
1575 : And we know that probe sequences are intact starting from the most
1576 : recently created hole. If we verify enough to eventually wrap back
1577 : to most recently created hole, we know all probe sequences are
1578 : intact. Since fixing broken probe sequences in this fashion always
1579 : shortens them and there always will be one hole in this process,
1580 : verifying until we hit the most recently made hole is guaranteed to
1581 : terminate. Since there is only one hole, it is sufficient to just
1582 : test if the next slot to verify is a hole.
1583 :
1584 : This test works just as well for the more common contig_cnt<ele_max
1585 : case (it will terminate at the the preexisting trailing empty slot
1586 : instead of the most recently created hole). So, for code
1587 : simplicity, we just do that.
1588 :
1589 : A nice side effect is this removal process is that implicitly
1590 : improves probing for remaining keys in the map and does not require
1591 : tombstones.
1592 :
1593 : TL;DR It's a bad idea on many levels to fill up linearly probed
1594 : maps to their absolute limits ... but this will still work if you
1595 : do.
1596 :
1597 : Note also that this process preserves the ordering of keys that
1598 : hash to the same slot (such that key group ordering is preserved). */
1599 :
1600 3753420 : ele_idx = hole_idx;
1601 7337739 : for(;;) {
1602 7337739 : ele_idx = (ele_idx+1UL) & (ele_max-1UL);
1603 :
1604 : /* At this point, slots (hole_idx,ele_idx) (cyclic) are used with
1605 : verified probe sequences. As per the above, we are guaranteed to
1606 : eventually hit an empty slot (typically very quickly in practice)
1607 : and hitting an empty slot guarantees all probe sequences are
1608 : intact (such that we are done). */
1609 :
1610 7337739 : MAP_ELE_T * ele = ele0 + ele_idx;
1611 7337739 : if( FD_LIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) break;
1612 :
1613 : /* Otherwise, if ele_idx's key probe sequence doesn't start in
1614 : (hole_idx,ele_idx] (cyclic), its probe sequence is currently
1615 : broken by the hole at hole_idx. We fix it by moving ele_idx to
1616 : hole_idx. This shortens that key's probe sequence (preserving
1617 : the invariant) and makes a new hole at ele_idx. We mark the lock
1618 : version covering the new hole idx as modified for the unlock
1619 : below. Note that the version for the existing hole was already
1620 : marked as modified when the hole was created so we only bump if
1621 : ele_idx is covered by a different lock than hole_idx to reduce
1622 : version churn to near theoretical minimum. */
1623 :
1624 : # if MAP_MEMOIZE
1625 : key_hash = ele->MAP_MEMO;
1626 : # else
1627 3584319 : key_hash = MAP_(key_hash)( &ele->MAP_KEY, seed );
1628 3584319 : # endif
1629 3584319 : start_idx = key_hash & (ele_max-1UL);
1630 :
1631 3584319 : if( !( ((hole_idx<start_idx) & (start_idx<=ele_idx) ) |
1632 3584319 : ((hole_idx>ele_idx) & ((hole_idx<start_idx) | (start_idx<=ele_idx))) ) ) {
1633 :
1634 1028307 : MAP_(private_ele_move)( ctx, ele0 + hole_idx, ele );
1635 :
1636 1028307 : ulong lock_next = ele_idx >> lock_shift;
1637 1028307 : version[ lock_next ] = (MAP_VERSION_T)((ulong)version[ lock_next ] + ((lock_next!=lock_idx) ? 2UL : 0UL) /* cmov */);
1638 1028307 : lock_idx = lock_next;
1639 :
1640 1028307 : hole_idx = ele_idx;
1641 1028307 : }
1642 :
1643 3584319 : }
1644 :
1645 : /* At this point, key is removed and all remaining keys have intact
1646 : and ordered probe sequences and we have updated the necessary
1647 : version cache entries. Unlock and return success. */
1648 :
1649 3753420 : MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt );
1650 3753420 : return FD_MAP_SUCCESS;
1651 7505412 : }
1652 :
/* query_try: speculatively look up key without taking any locks (no
   atomic operations at all on this path).  On success, query is
   populated with the element, its covering lock and the lock version
   observed; the caller may then read the element and confirm the
   speculation with the matching query_test.  On failure, query->ele is
   set to sentinel and one of FD_MAP_ERR_KEY (key probably not in map)
   or FD_MAP_ERR_AGAIN (a concurrent operation was in flight; retry
   later) is returned.  KEY results are themselves speculative and must
   be confirmed by the caller via the version cache / query_test
   protocol (NOTE(review): confirmation protocol is defined outside
   this window — verify against the generator's query_test docs). */

int
MAP_(query_try)( MAP_(t) const *   join,
                 MAP_KEY_T const * key,
                 MAP_ELE_T const * sentinel,
                 MAP_(query_t) *   query ) {

  MAP_ELE_T *     ele0       = join->ele;
  MAP_VERSION_T * lock       = join->lock;
  ulong           ele_max    = join->ele_max;
  ulong           lock_cnt   = join->lock_cnt;
  ulong           probe_max  = join->probe_max;
  ulong           seed       = join->seed;
  int             lock_shift = join->lock_shift;
  void const *    ctx        = join->ctx;

  ulong key_hash      = MAP_(key_hash)( key, seed );
  ulong ele_idx       = key_hash & (ele_max-1UL);  /* ele_max is a power of 2 */
  ulong version_lock0 = ele_idx >> lock_shift;     /* lock covering the first probe */
  ulong version_cnt   = 0UL;                       /* number of lock versions observed so far */

  MAP_VERSION_T version[ MAP_LOCK_MAX ];           /* observed versions, indexed by lock_idx */

  ulong lock_idx = version_lock0;

  int err;

  /* At this point, finding any key in the map requires probing at most
     probe_max contiguous slots. */

  MAP_VERSION_T v = MAP_(private_try)( lock + lock_idx );
  if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail_fast; } /* opt for low contention */
  version[ lock_idx ] = v;
  version_cnt++;

  for( ulong probe_rem=probe_max; probe_rem; probe_rem-- ) {

    /* At this point, we've observed the locks covering the start of
       key's probe sequence to ele_idx inclusive, they were unlocked
       when observed and we have done fewer than probe_max probes.

       If slot ele_idx is empty, we speculate that key was not in the
       map at some point during the call.

       If slot ele_idx holds key, we let the caller continue speculating
       about key's value.  We only need to observe the lock covering key
       after we've found it (if key gets moved or removed, the version
       of the lock covering it will change). */

    MAP_ELE_T const * ele = ele0 + ele_idx;

    if( FD_UNLIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) { err = FD_MAP_ERR_KEY; goto fail; } /* opt for low collision */

    if(
#     if MAP_MEMOIZE && MAP_KEY_EQ_IS_SLOW
        FD_LIKELY( ele->MAP_MEMO==key_hash ) &&
#     endif
        FD_LIKELY( MAP_(key_eq)( &ele->MAP_KEY, key ) ) /* opt for found */
      ) {

      lock_idx = ele_idx >> lock_shift;

      /* Hand the caller the element plus the lock/version pair needed
         to later detect whether this speculation was invalidated. */

      query->memo = key_hash;
      query->ele  = (MAP_ELE_T *)ele;
      query->l    = lock + lock_idx;
      query->v    = version[ lock_idx ];
      return FD_MAP_SUCCESS;
    }

    /* At this point, we speculate slot ele_idx was used by something
       other than key when observed.  Continue probing slot for key,
       observing locks as necessary. */

    ele_idx = (ele_idx+1UL) & (ele_max-1UL);

    ulong lock_next = ele_idx >> lock_shift;
    if( FD_UNLIKELY( (lock_next!=lock_idx) & (lock_next!=version_lock0) ) ) { /* opt for locks cover many contiguous slots */
      lock_idx = lock_next;

      v = MAP_(private_try)( lock + lock_idx );
      if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail_fast; } /* opt for low contention */
      version[ lock_idx ] = v;
      version_cnt++;
    }
  }

  /* At this point, we did probe_max probes without finding key.  We
     speculate key was not in the map at some point during the call. */

  err = FD_MAP_ERR_KEY;

  fail:

  /* If we didn't encounter any contention (i.e. no version numbers
     changed), we can trust our speculated error.  Otherwise, we tell
     the user to try again. */

  err = MAP_(private_test)( lock, lock_cnt, version, version_lock0, version_cnt ) ? FD_MAP_ERR_AGAIN : err; /* cmov */

  fail_fast: /* Used when the err is already AGAIN */

  query->memo = key_hash;
  query->ele  = (MAP_ELE_T *)sentinel;
  query->l    = NULL;
  query->v    = (MAP_VERSION_T)0;
  return err;
}
1759 :
1760 : MAP_STATIC int
1761 33 : MAP_(verify)( MAP_(t) const * join ) {
1762 :
1763 86835 : # define MAP_TEST(c) do { \
1764 86835 : if( FD_UNLIKELY( !(c) ) ) { FD_LOG_WARNING(( "FAIL: %s", #c )); return FD_MAP_ERR_INVAL; } \
1765 86835 : } while(0)
1766 :
1767 : /* Validate join */
1768 :
1769 33 : MAP_TEST( join );
1770 33 : MAP_TEST( fd_ulong_is_aligned( (ulong)join, alignof(MAP_(t)) ) );
1771 :
1772 33 : MAP_ELE_T const * ele0 = join->ele;
1773 33 : MAP_VERSION_T const * lock = join->lock;
1774 33 : ulong ele_max = join->ele_max;
1775 33 : ulong lock_cnt = join->lock_cnt;
1776 33 : ulong probe_max = join->probe_max;
1777 33 : ulong seed = join->seed;
1778 33 : int lock_shift = join->lock_shift;
1779 33 : void const * ctx = join->ctx;
1780 :
1781 33 : MAP_TEST( ele0 );
1782 33 : MAP_TEST( fd_ulong_is_aligned( (ulong)ele0, alignof(MAP_ELE_T) ) );
1783 33 : MAP_TEST( lock );
1784 33 : MAP_TEST( fd_ulong_is_aligned( (ulong)lock, MAP_(align)() ) );
1785 33 : MAP_TEST( fd_ulong_is_pow2( ele_max ) );
1786 33 : MAP_TEST( fd_ulong_is_pow2( lock_cnt ) );
1787 33 : MAP_TEST( lock_cnt <= fd_ulong_min( ele_max, MAP_LOCK_MAX ) );
1788 33 : MAP_TEST( (1UL<=probe_max) & (probe_max<=ele_max) );
1789 : /* seed is arbitrary */
1790 33 : MAP_TEST( (1UL<<lock_shift) == (ele_max/lock_cnt) );
1791 :
1792 : /* Validate map metadata */
1793 :
1794 33 : MAP_(shmem_t) const * map = ((MAP_(shmem_t) const *)lock)-1;
1795 :
1796 33 : MAP_TEST( map );
1797 33 : MAP_TEST( fd_ulong_is_aligned( (ulong)map, MAP_(align)() ) );
1798 33 : MAP_TEST( map->magic == MAP_MAGIC );
1799 33 : MAP_TEST( map->ele_max == ele_max );
1800 33 : MAP_TEST( map->lock_cnt == lock_cnt );
1801 33 : MAP_TEST( map->probe_max == probe_max );
1802 33 : MAP_TEST( map->seed == seed );
1803 33 : MAP_TEST( map->lock_shift == lock_shift );
1804 :
1805 : /* Validate map elements */
1806 :
1807 135201 : for( ulong ele_idx=0UL; ele_idx<ele_max; ele_idx++ ) {
1808 135168 : MAP_ELE_T const * ele = ele0 + ele_idx;
1809 135168 : if( FD_LIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) continue; /* opt for sparse */
1810 :
1811 24084 : ulong key_hash = MAP_(key_hash)( &ele->MAP_KEY, seed );
1812 :
1813 : # if MAP_MEMOIZE
1814 : MAP_TEST( ele->MAP_MEMO==key_hash );
1815 : # endif
1816 :
1817 24084 : ulong probe_idx = key_hash & (ele_max-1UL);
1818 24084 : ulong probe_cnt = fd_ulong_if( ele_idx>=probe_idx, ele_idx - probe_idx, ele_max + ele_idx - probe_idx ) + 1UL;
1819 24084 : MAP_TEST( probe_cnt<=probe_max );
1820 :
1821 55146 : for( ulong probe_rem=probe_cnt; probe_rem; probe_rem-- ) {
1822 31062 : MAP_ELE_T const * probe = ele0 + probe_idx;
1823 31062 : MAP_TEST( !MAP_(private_ele_is_free)( ctx, probe ) );
1824 :
1825 31062 : int found =
1826 : # if MAP_MEMOIZE && MAP_KEY_EQ_IS_SLOW
1827 : FD_LIKELY( probe->MAP_MEMO == ele->MAP_MEMO ) &&
1828 : # endif
1829 31062 : MAP_(key_eq)( &probe->MAP_KEY, &ele->MAP_KEY );
1830 :
1831 31062 : MAP_TEST( (probe_rem==1UL) ? found : !found );
1832 :
1833 31062 : probe_idx = (probe_idx+1UL) & (ele_max-1UL);
1834 31062 : }
1835 24084 : }
1836 :
1837 : /* At this point, every key in the map is reachable via it's probe
1838 : sequence and every probe sequence is at most probe_max probes long.
1839 : By extension, if a key is in the map, it will be found in at most
1840 : probe_max probes. */
1841 :
1842 33 : # undef MAP_TEST
1843 :
1844 33 : return FD_MAP_SUCCESS;
1845 33 : }
1846 :
1847 : MAP_STATIC char const *
1848 18 : MAP_(strerror)( int err ) {
1849 18 : switch( err ) {
1850 3 : case FD_MAP_SUCCESS: return "success";
1851 3 : case FD_MAP_ERR_INVAL: return "bad input";
1852 3 : case FD_MAP_ERR_AGAIN: return "try again later";
1853 3 : case FD_MAP_ERR_FULL: return "map too full";
1854 3 : case FD_MAP_ERR_KEY: return "key not found";
1855 3 : default: break;
1856 18 : }
1857 3 : return "unknown";
1858 18 : }
1859 :
1860 : #endif
1861 :
1862 : #undef MAP_
1863 : #undef MAP_STATIC
1864 :
1865 : #undef MAP_IMPL_STYLE
1866 : #undef MAP_MAGIC
1867 : #undef MAP_ALIGN
1868 : #undef MAP_LOCK_MAX
1869 : #undef MAP_VERSION_T
1870 : #undef MAP_CTX_MAX
1871 : #undef MAP_ELE_MOVE
1872 : #undef MAP_ELE_FREE
1873 : #undef MAP_ELE_IS_FREE
1874 : #undef MAP_KEY_EQ_IS_SLOW
1875 : #undef MAP_MEMO
1876 : #undef MAP_MEMOIZE
1877 : #undef MAP_KEY_HASH
1878 : #undef MAP_KEY_EQ
1879 : #undef MAP_KEY
1880 : #undef MAP_KEY_T
1881 : #undef MAP_ELE_T
1882 : #undef MAP_NAME
|