Line data Source code
1 : /* Generate prototypes, inlines and/or implementations for concurrent
2 : persistent shared maps based on linear probing with some insights
3 : from cuckoo hashing for improved concurrent performance. Notably:
4 :
5 : - Supports an arbitrary number of concurrent operations with a
6 : comparable performance to a single threaded HPC linear probed map
7 : for non-conflicting operations (likely hardware NOC limited for
8 : conflicting operations).
9 :
10 : - Concurrent queries do not interfere with other concurrent queries.
11 :
12 : - All map operations can be serialized.
13 :
14 : - Does not require a key sentinel (but can support them if useful
15 : for the application).
16 :
17 : - Does not require a guarantee there's at least one free element in
18 : the element store (but like a serial linear probed map, it is a
19 : good idea to keep utilization below the absolute theoretical
20 : capacity for strong algorithmic performance guarantees).
21 :
22 : - Insert/modify/query have a run-time configurable worst case O(1)
23 : cost, regardless of map fill ratio. (Remove's worst case cost is
24 : not configurable due to cases involving maps filled to or near
25 : capacity. For reasonable fill ratios, remove is also comparable.)
26 :
27 : - Stable in the sense that all keys with the same hash (or more
28 : generally same initial probe element) are ordered in the element
29 : store by insertion order. (This allows the user to use the key
30 : hash to group related keys contiguously in the element store and
31 : then stably iterate over them with fast streaming.)
32 :
33 : - Query requires no atomic operations at all. (Usual target memory
34 : subsystem requirements that writes to memory become visible to
35 : other threads in the order in which they were issued in the machine
36 : code.)
37 :
38 : - Insert/modify/remove only require atomic fetch-and-or (typically
39 : just one). There's no need for an atomic compare-and-swap /
40 : underlying pool of free elements / etc. (Can also be used as a
41 : non-concurrent map on targets without FD_HAS_ATOMIC.)
42 :
43 : - Map concurrency metadata and the actual map element store can be
44 : located in separate memory regions (can also split the element
45 : store over multiple memory regions ... e.g. keys here / values
46 : there) and can back any of these memory regions by a file system to
47 : scale beyond billions of elements with no code change.
48 :
49 : - Map metadata easily fits in CPU caches with a fixed O(1) overhead
50 : regardless of element store capacity. Access patterns naturally
51 : exploit CPU and storage caching, streaming and prefetching
52 : behaviors.
53 :
54 : - Supports asynchronous execution (e.g. issue hints for keys that
55 : will be accessed soon, do unrelated work while hints are
56 : // prefetching needed info into local cache in the background, then do
57 : key operations ... now all fast and local cache hits).
58 :
59 : - Supports non-plain-old-data keys and non-plain-old-data values
60 : (both of which are gross on real world computers but commonly done
61 : nevertheless).
62 :
63 : A map can be persisted beyond the lifetime of the creating process,
64 : be used inter-process, relocated in memory, be naively
65 : serialized/deserialized, be moved between hosts, etc. Massive
66 : concurrency, high algorithmic and implementation performance for
67 : normal usage and friendly cache / file system streaming access
68 : patterns for heavily loaded / heavily concurrent usage are
69 : prioritized. In particular, unlike fd_map_para, this takes ownership
70 : of the underlying element store for the lifetime of the map in order
71 : to speed up operations and increase concurrency.
72 :
73 : Typical usage:
74 :
75 : struct myele {
76 : ulong key; // Technically "MAP_KEY_T MAP_KEY" (default is ulong key)
77 :
78 : ... key can be located arbitrarily in the element. The mapping
79 : ... of a key to an element in the element store is arbitrary and
80 : ... can move while the key is in the map.
81 : };
82 :
83 : typedef struct myele myele_t;
84 :
85 : #define MAP_NAME mymap
86 : #define MAP_ELE_T myele_t
87 : #include "tmpl/fd_map_slot_para.c"
88 :
89 : will declare the following APIs as a header only style library in the
90 : compilation unit:
91 :
92 : // A mymap_t is a stack declaration friendly quasi-opaque local
93 : // object used to hold the state of a local join to a mymap.
94 : // Similarly, a mymap_query_t / mymap_iter_t holds the local state
95 : // of an ongoing operation / iteration. E.g. it is fine to do
96 : // mymap_t join[1];" to allocate a mymap_t but the contents should
97 : // not be used directly.
98 :
99 : typedef struct mymap_private mymap_t;
100 : typedef struct mymap_query_private mymap_query_t;
101 : typedef struct mymap_iter_private mymap_iter_t;
102 :
103 : // mymap_lock_max returns the maximum number of version locks that
104 : // can be used by a mymap. Will be a positive integer
105 : // power-of-two.
106 :
107 : ulong mymap_lock_max( void );
108 :
109 : // mymap_lock_cnt_est returns a reasonable number of locks to use
110 : // for a map backed by an ele_max capacity element store. Assumes
111 : // ele_max is an integer power-of-two. Returns an integer
112 : // power-of-two in [1,mymap_lock_max()].
113 :
114 : ulong mymap_lock_cnt_est( ulong ele_max );
115 :
116 : // mymap_probe_max_est returns a reasonable maximum probe sequence
117 : // length for a map backed by an ele_max capacity element store.
118 : // Assumes ele_max is an integer power-of-two. Returns an integer
119 : // power-of-two in [1,ele_max].
120 :
121 : ulong mymap_probe_max_est( ulong ele_max );
122 :
123 : // mymap_{align,footprint} returns the alignment and footprint
124 : // needed for a memory region to be used as a mymap. align will be
125 : // an integer power-of-two and footprint will be a multiple of
126 : // align. ele_max / lock_cnt / probe_max specify the capacity of
127 : // the element store / number of version locks / maximum probe
128 : // sequence length for the map. footprint returns 0 for invalid
129 : // configurations. In a valid configuration, ele_max is an integer
130 : // power-of-two, lock_cnt is an integer power-of-two, lock_cnt is
131 : // at most min( lock_max, ele_max ) and probe_max is in
132 : // [1,ele_max].
133 : //
134 : // mymap_new formats a memory region with the required alignment
135 : // and footprint into a mymap. shmem points in the caller's
136 : // address space to the memory region to use. Returns shmem on
137 : // success (mymap has ownership of the memory region) and NULL on
138 : // failure (no changes, logs details). The caller is not joined on
139 : // return. All map versions will be at version 0 / unlocked. The
140 : // map contents will be in whatever state the backing element store
141 : // is in. IMPORTANT SAFETY TIP! THE ELEMENT STORE SHOULD BE IN A
142 : // CONSISTENT STATE BEFORE USING MYMAP_NEW. For example, the
143 : // caller could mark all elements as free before calling this and
144 : // the caller could use verify immediately after creation to verify
145 : // integrity.
146 : //
147 : // mymap_join joins the caller to an existing mymap. ljoin points
148 : // to a mymap_t compatible memory region in the caller's address
149 : // space, shmap points in the caller's address space to the memory
150 : // region containing the mymap, and shele points in the caller's
151 : // address space to mymap's element store. Returns a handle to the
152 : // caller's local join on success (join has ownership of the ljoin
153 : // region) and NULL on failure (no changes, logs details).
154 : //
155 : // mymap_leave leaves a mymap join. join points to a current local
156 : // join. Returns the memory region used for the join on success
157 : // (caller has ownership on return and the caller is no longer
158 : // joined) and NULL on failure (no changes, logs details). Use the
159 : // join accessors before leaving to get shmap and shele used by the
160 : // join if needed.
161 : //
162 : // mymap_delete unformats a memory region used as a mymap. Assumes
163 : // shmap points in the caller's address space to a memory region
164 : // containing the mymap and that there are no joins. Returns shmem
165 : // on success (caller has ownership of the memory region, any
166 : // remaining elements still in the mymap are released to the caller
167 : // implicitly) and NULL on failure (no changes, logs details).
168 :
169 : ulong mymap_align ( void );
170 : ulong mymap_footprint( ulong ele_max, ulong lock_cnt, ulong probe_max );
171 : void * mymap_new ( void * shmem, ulong ele_max, ulong lock_cnt, ulong probe_max, ulong seed );
172 : mymap_t * mymap_join ( void * ljoin, void * shmap, void * shele );
173 : void * mymap_leave ( mymap_t * join );
174 : void * mymap_delete ( void * shmap );
175 :
176 : // mymap_{ele_max,lock_cnt,probe_max,seed} return the mymap
177 : // configuration. Assumes join is a current local join. The
178 : // values will be valid for the mymap lifetime.
179 :
180 : ulong mymap_ele_max ( mymap_t const * join );
181 : ulong mymap_lock_cnt ( mymap_t const * join );
182 : ulong mymap_probe_max( mymap_t const * join );
183 : ulong mymap_seed ( mymap_t const * join );
184 :
185 : // mymap_{shmap,shele} return join details. Assumes join is a
186 : // current local join. The values will be valid for the join
187 : // lifetime. mymap_{shmap_const,shele_const} are const correct
188 : // versions.
189 :
190 : void const * mymap_shmap_const( mymap_t const * join );
191 : void const * mymap_shele_const( mymap_t const * join );
192 :
193 : void * mymap_shmap( mymap_t * join );
194 : void * mymap_shele( mymap_t * join );
195 :
196 : // mymap_lock_{idx,ele0,ele1} specify the mapping between map
197 : // version lock indices and element store element indices. Assumes
198 : // join is a current local join and ele_idx / lock_idx is in
199 : // [0,ele_max) / is in [0,lock_cnt). mymap_lock_idx is the index
200 : // of the version lock that protects element store element ele_idx,
201 : // in [0,lock_cnt). [mymap_lock_ele0,mymap_lock_ele1) is the
202 : // contiguous range of elements protected by lock lock_idx. ele0
203 : // is in [0,ele_max), ele1 is in (0,ele_max], and ele0<ele1.
204 :
205 : ulong mymap_lock_idx ( mymap_t const * join, ulong ele_idx );
206 : ulong mymap_lock_ele0( mymap_t const * join, ulong lock_idx );
207 : ulong mymap_lock_ele1( mymap_t const * join, ulong lock_idx );
208 :
209 : // mymap_key_{eq,hash} expose the provided MAP_KEY_{EQ,HASH} macros
210 : // as inlines with strict semantics. They assume that the provided
211 : // pointers are in the caller's address space to keys that will not
212 : // be changed during the call. They retain no interest in any keys
213 : // on return.
214 : //
215 : // mymap_key_eq returns 1 if *k0 and *k1 are equal and 0 otherwise.
216 : //
217 : // mymap_key_hash returns the hash of *key using the hash function
218 : // seed. Should ideally be a random mapping from a MAP_KEY_T to a
219 : // ulong but this depends on what the user actually used for
220 : // MAP_KEY_HASH. The seed used by a particular mymap instance can
221 : // be obtained above.
222 :
223 : int mymap_key_eq ( ulong * k0, ulong * k1 );
224 : ulong mymap_key_hash( ulong * key, ulong seed );
225 :
226 : // mymap_backoff does FD_SPIN_PAUSE a random number of times. The
227 : // number of pauses is an approximately uniform IID random number
228 : // in [0,scale/2^16] where scale is in [0,2^32). Specifically, the
229 : // number of pauses is:
230 : //
231 : // floor( scale r / 2^48 )
232 : //
233 : // where r is a non-deterministic 32-bit uniform IID random number.
234 : // Under the hood, r is generated by hashing the user provided seed
235 : // and the least significant 32-bits of the CPU tickcounter.
236 : // Ideally, seed is a 32-bit globally unique identifier for the
237 : // logical thread of execution but this is up to the application to
238 : // specify and rarely matters in practice. This is a useful
239 : // building block for random exponential backoffs.
240 :
241 : void mymap_backoff( ulong scale, ulong seed );
242 :
243 : // mymap_query_memo returns the key_hash of the query associated
244 : // with the query's key to allow users to minimize potentially
245 : // expensive key hash computations in various operations.
246 : //
247 : // mymap_query_ele returns a pointer in the caller's address space
248 : // to the element store element associated with the query or a
249 : // sentinel value. The sentinel value is application dependent and
250 : // thus arbitrary (e.g. not necessarily in the element store,
251 : // including NULL, a local temporary used as a bit bucket, etc).
252 : // Assumes query is valid. The lifetime of the returned pointer
253 : // depends on the query. mymap_query_ele_const is a const correct
254 : // version.
255 :
256 : ulong mymap_query_memo ( mymap_query_t const * query );
257 : myele_t const * mymap_query_ele_const( mymap_query_t const * query );
258 : myele_t * mymap_query_ele ( mymap_query_t * query );
259 :
260 : // mymap_hint hints that the caller plans to do an operation
261 : // involving key soon. Assumes join is a current local join, key
262 : // points to a valid key in the caller's address space for the
263 : // duration of the call and query points to a local scratch to hold
264 : // info about the hint. Returns no interest in key. On return,
265 : // the query memo will be initialized.
266 : //
267 : // flags is a bit-or of FD_MAP_FLAG flags. If
268 : // FD_MAP_FLAG_PREFETCH_META / FD_MAP_FLAG_PREFETCH_DATA is set,
269 : // this will issue a prefetch for key's mymap metadata (i.e. lock
270 : // version) / the element at the start of key's probe sequence
271 : // (i.e. the location of key or contiguously shortly before it)
272 : // FD_MAP_FLAG_PREFETCH combines both for convenience. This can be
273 : // used to overlap key access latency with unrelated operations.
274 : // All other flags are ignored.
275 :
276 : void
277 : mymap_hint( mymap_t const * join,
278 : ulong const * key,
279 : mymap_query_t * query,
280 : int flags );
281 :
282 : // mymap_prepare tries to start a insert/modify/blocking query
283 : // operation for key. Assumes join is a current local join, key
284 : // points to valid key in the caller's address space for the
285 : // duration of the call and query points to a local scratch to hold
286 : // the info about the prepare. Retains no interest in key.
287 : // Returns FD_MAP_SUCCESS (0) on success and a FD_MAP_ERR (negative) on
288 : // failure. This is a non-blocking fast O(1) (O(probe_max) worst
289 : // case) and supports highly concurrent operation.
290 : //
291 : // flags is a bit-or of FD_MAP_FLAG flags. If FD_MAP_FLAG_BLOCKING
292 : // is set / clear in flags, this is allowed / not allowed to block
293 : // the caller. If FD_MAP_FLAG_USE_HINT is set, this assumes
294 : // query's memo is already initialized for key. This can be used
295 : // to avoid redundant expensive key hashing when prefetching. All
296 : // other flags are ignored (the upper 26-bits of flags can be used
297 : // to provide a local seed for random backoffs but this is up to
298 : // the application and rarely matters in practice).
299 : //
300 : // On success, the caller is in a prepare for key and query is
301 : // initialized with info about the prepare. ele=mymap_query_ele(query)
302 : // gives the location in the caller's address space to an element
303 : // store element for the prepare that will be stable for the
304 : // duration of the prepare and memo=mymap_query_memo(query) gives
305 : // the key hash.
306 : //
307 : // If the element is marked as free, key is not in the map and ele
308 : // is where key could be inserted. If the caller is inserting key,
309 : // the caller should populate element's key with key, element's
310 : // memo (if any) with memo (avoids having to rehash the key), mark
311 : // ele as used and then do a mymap_publish to complete the insert.
312 : // If not, the caller should keep ele marked as free and do a
313 : // mymap_cancel to complete the prepare (doesn't matter from the
314 : // map's point-of-view if anything else was clobbered /
315 : // mymap_publish would also work here).
316 : //
317 : // If the element is marked as used, key is in the map at ele. If
318 : // the caller is modifying key's value, the caller should do the
319 : // modification and then mymap_publish to complete the modify. If
320 : // not (e.g. blocking query), the caller can inspect ele contents
321 : // and the mymap_cancel to complete the blocking query
322 : // (mymap_publish would also work here). In both cases, the caller
323 : // should not modify ele's key, modify ele's memo, or mark ele as
324 : // free. Note that mymap_publish must be used even if the
325 : // modifications were only temporary.
326 : //
327 : // On failure, the caller is not in a prepare for key, query
328 : // ele==sentinel and query memo will still be the key hash.
329 : // Reasons for failure:
330 : //
331 : // - FD_MAP_ERR_AGAIN: A potentially conflicting operation was in
332 : // progress at some point during the call. Try again later (e.g.
333 : // after a random exponential backoff). Never returned on a
334 : // blocking call.
335 : //
336 : // - FD_MAP_ERR_FULL: key was not in the map but inserting ele
337 : // would require making a probe sequence longer than probe_max.
338 : // Try again when the map is less full (e.g. after removing some
339 : // elements).
340 : //
341 : // mymap_publish ends the prepare described by query, updating the
342 : // map version to reflect changes made during the prepare. Assumes
343 : // query is valid and describes an active prepare. Cannot fail and
344 : // the caller will no longer be in the prepare on return. This is a
345 : // generally safe way to end a prepare even if the caller did not
346 : // modify the map during the prepare.
347 : //
348 : // mymap_cancel ends the prepare described by query, reverting the
349 : // map version to reflect that the caller did not change the map
350 : // during the prepare. Assumes query is valid and describes an
351 : // active prepare and that the caller did not make any meaningful
352 : // modifications to the map during the prepare (note that temporary
353 : // changes during the prepare can be considered modifications as
354 : // per the above). Cannot fail and the caller will no longer be in
355 : // the prepare on return. This is a safe way to end a prepare only if
356 : // the caller did not modify the map during the prepare.
357 : //
358 : // IMPORTANT SAFETY TIP! Do not nest or interleave prepares,
359 : // remove or queries for the same map on the same thread.
360 : //
361 : // IMPORTANT SAFETY TIP! A successful prepare must have a
362 : // matching publish or cancel (and then ideally as soon as
363 : // possible).
364 : //
365 : // IMPORTANT SAFETY TIP! The order in which keys that hash to the
366 : // same slot were inserted into the map is preserved for the
367 : // lifetime of the keys. Thus the hash function used can be
368 : // constructed to create ordered iterators over groups of keys.
369 :
370 : int mymap_prepare( mymap_t * join, ulong const * key, myele_t * sentinel, mymap_query_t * query, int flags );
371 : void mymap_publish( mymap_query_t * query );
372 : void mymap_cancel ( mymap_query_t * query );
373 :
374 : // mymap_remove removes key from the mymap. Assumes join is a
375 : // current local join and key is valid for the duration of the
376 : // call. Retains no interest in key. This is non-blocking fast
377 : // typically O(1) and supports highly concurrent operation.
378 : //
379 : // flags is a bit-or of FD_MAP_FLAG flags. If FD_MAP_FLAG_BLOCKING
380 : // is set / clear in flags, this is allowed / not allowed to block
381 : // the caller. If FD_MAP_FLAG_USE_HINT is set, this assumes
382 : // query's memo is already initialized for key. This can be used
383 : // to avoid redundant expensive key hashing when prefetching. If
384 : // clear, query is ignored and can be set NULL. All other flags
385 : // are ignored (the upper 26-bits of flags can be used to provide a
386 : // local seed for random backoffs but this is up to the application
387 : // and rarely matters in practice).
388 : //
389 : // Returns FD_MAP_SUCCESS (0) on success and a FD_MAP_ERR
390 : // (negative) on failure. On success, key's mapping was removed at
391 : // some point during the call. On failure, no changes were made by
392 : // this call and:
393 : //
394 : // - FD_MAP_ERR_KEY: Key was not found in the mymap at some point
395 : // during the call.
396 : //
397 : // - FD_MAP_ERR_AGAIN: A potentially conflicting operation was in
398 : // progress at some point during the call. Same considerations
399 : // as prepare above. Never returned on a blocking call.
400 : //
401 : // IMPORTANT SAFETY TIP! Do not nest or interleave prepares,
402 : // remove or queries for the same map on the same thread.
403 :
404 : int mymap_remove( mymap_t * join, ulong const * key, mymap_query_t const * query, int flags );
405 :
406 : // mymap_query_try tries to speculatively query a mymap for key.
407 : // On return, query will hold information about the try. sentinel
408 : // gives the query element pointer value (arbitrary) to pass
409 : // through when it is not safe to try the query. Assumes join is a
410 : // current local join and key is valid for the duration of the
411 : // call. Does not modify the mymap and retains no interest in key,
412 : // sentinel or query. This is a non-blocking fast O(1)
413 : // (O(probe_max) worst case) and supports highly concurrent
414 : // operation.
415 : //
416 : // flags is a bit-or of FD_MAP_FLAG flags. If FD_MAP_FLAG_BLOCKING
417 : // is set / clear in flags, this is allowed / not allowed to block
418 : // the caller. If FD_MAP_FLAG_USE_HINT is set, this assumes
419 : // query's memo is already initialized for key. This can be used
420 : // to avoid redundant expensive key hashing when prefetching. All
421 : // other flags are ignored (the upper 26-bits of flags can be used
422 : // to provide a local seed for random backoffs but this is up to
423 : // the application and rarely matters in practice).
424 : //
425 : // Returns FD_MAP_SUCCESS (0) on success and a FD_MAP_ERR
426 : // (negative) on failure. On success, key mapped to the element
427 : // store element mymap_query_ele( query ) in the caller's address
428 : // space at some point during the call. The mymap retains
429 : // ownership of this element but the caller can zero copy
430 : // speculatively process the element's contents. On failure,
431 : // mymap_query_ele( query ) will be sentinel and returns:
432 : //
433 : // - FD_MAP_ERR_KEY: Key was not found in the mymap at some point
434 : // during the call.
435 : //
436 : // - FD_MAP_ERR_AGAIN: A potentially conflicting operation was in
437 : // progress at some point during the call. Try again later (e.g.
438 : // after a random exponential backoff). Unlike prepare and
439 : // remove, this call does _not_ require any locks for key's probe
440 : // sequence. As such, AGAIN can only be caused by concurrent
441 : // prepare/remove operations and this will never interfere with
442 : // any other concurrent operation. Among the many implications,
443 : // a query will never delay a concurrent query and AGAIN will
444 : // never be returned if only concurrent speculative queries are
445 : // in progress. Never returned on a blocking call.
446 : //
447 : // IMPORTANT SAFETY TIP! THE CALLER SHOULD BE PREPARED TO HANDLE
448 : // ARBITRARY AND/OR INCONSISTENT VALUES FOR ELEMENT FIELDS DURING
449 : // SPECULATIVE PROCESSING. CALLERS SHOULD NOT COMMIT ANY RESULTS
450 : // OF SPECULATIVE PROCESSING UNTIL IT TESTS THE QUERY WAS
451 : // SUCCESSFUL.
452 : //
453 : // The simplest form of speculative processing is to copy the
454 : // element store element into a local temporary, test that the
455 : // speculation was valid, and then process the local temporary copy
456 : // at its leisure. Zero copy, more selective copying and/or
457 : // writing speculative results into local temporaries are more
458 : // advanced examples of speculative processing.
459 : //
460 : // Use mymap_prepare to do a blocking (non-speculative) query.
461 : //
462 : // mymap_query_test tests if an in-progress query is still valid.
463 : // Assumes query is valid, we are still in a query try and lock
464 : // version numbers have not wrapped since we started the try.
465 : // Returns FD_MAP_SUCCESS (0) if the query is still valid and
466 : // FD_MAP_ERR_AGAIN (negative) if a potentially conflicting
467 : // operation was in progress at some point during the try.
468 : //
469 : // IMPORTANT SAFETY TIP! Do not nest or interleave prepares,
470 : // remove or queries for the same map on the same thread.
471 :
472 : int
473 : mymap_query_try( mymap_t const * join,
474 : ulong const * key,
475 : myele_t const * sentinel,
476 : mymap_query_t * query,
477 : int flags );
478 :
479 : int mymap_query_test( mymap_query_t const * query );
480 :
481 : // mymap_lock_range tries to acquire locks lock_idx for lock_idx in
482 : // [range_start,range_start+range_cnt) (cyclic).
483 : //
484 : // flags is a bit-or of FD_MAP_FLAG flags. If FD_MAP_FLAG_BLOCKING
485 : // is set / clear in flags, this is allowed / not allowed to block
486 : // the caller. If FD_MAP_FLAG_RDONLY is set, the caller promises
487 : // to only read the elements covered by the range while holding the
488 : // locks. All other flags are ignored (the upper 26-bits of flags
489 : // can be used to provide a local seed for random backoffs but this
490 : // is up to the application and rarely matters in practice).
491 : //
492 : // Returns FD_MAP_SUCCESS (0) on success and FD_MAP_ERR_AGAIN
493 : // (negative) if there was a potentially conflicting operation in
494 : // progress at some point during the call. On success,
495 : // version[lock_idx] will hold the version to use when releasing
496 : // that lock. On failure, version may have been clobbered. AGAIN
497 : // is never returned if BLOCKING is set.
498 : //
499 : // mymap_unlock_range unlocks a similarly specified range. Assumes
500 : // caller has the locks and version[lock_idx] is the value set when
501 : // locked was obtained.
502 : //
503 : // These both assume join is a current local join, range_start is
504 : // in [0,lock_cnt), range_cnt is in [0,lock_cnt] and version is
505 : // valid with space for lock_cnt entries (YES ... LOCK_CNT, NOT
506 : // RANGE_CNT ... this is trivial with a compile time stack
507 : // temporary as lock_cnt<=MAP_LOCK_MAX).
508 :
509 : int
510 : mymap_lock_range( mymap_t * join,
511 : ulong range_start,
512 : ulong range_cnt,
513 : int flags,
514 : ulong * version );
515 :
516 : void
517 : mymap_unlock_range( mymap_t * join,
518 : ulong range_start,
519 : ulong range_cnt,
520 : ulong const * version );
521 :
522 : // The mymap_iter_* APIs are used to iterate over all keys inserted
523 : // into the map with the same memo (to support grouping of keys by
524 : // key hash value). The iteration order will be from the least
525 : // recently inserted to most recently inserted. flags has similar
526 : // meaning as other APIs. Example usage:
527 : //
528 : // ulong memo = ... hash of keys to iterate over ...
529 : //
530 : // mymap_iter_t iter[1];
531 : // int err = mymap_iter_init( join, memo, 0, iter );
532 : //
533 : // if( FD_UNLIKELY( err ) ) {
534 : //
535 : // ... At this point, err is FD_MAP_ERR_AGAIN and caller has
536 : // ... ownership of iter. There was a potentially conflicting
537 : // ... prepare or remove in progress at some point during the
538 : // ... call. We can try again later (e.g. after a random
539 : // ... backoff or doing other non-conflicting work).
540 : // ... mymap_iter_done will be 1, mymap_iter_fini will be a
541 : // ... no-op. Never returned if mymap_iter_init flags has
542 : // ... FD_MAP_FLAG_BLOCKING set.
543 : //
544 : // } else {
545 : //
546 : // ... At this point, we are in an iteration and iteration has
547 : // ... ownership of iter.
548 : //
549 : // while( !mymap_iter_done( iter ) ) {
550 : // myele_t * ele = mymap_iter_ele( iter );
551 : //
552 : // ... At this point, mymap_key_hash( ele->key, seed ) == memo (==ele's memo if memoized)
553 : //
554 : // ... process ele here.
555 : //
556 : // ... IMPORTANT! Generally speaking, it is not okay to
557 : // ... insert, remove, modify, blocking read, non-blocking
558 : // ... read here. It is okay to read ele and modify any
559 : // ... value fields though. If mymap_iter_init flags had
560 : // ... FD_MAP_FLAG_RDONLY set, caller promises it is only
561 : // ... reading ele here.
562 : //
563 : // mymap_iter_next( iter );
564 : // }
565 : //
566 : // mymap_iter_fini( iter );
567 : //
568 : // ... At this point, we are not in an iteration and caller has
569 : // ... ownership of iter.
570 : //
571 : // }
572 :
573 : int mymap_iter_init( mymap_t * join, ulong memo, int flags, mymap_iter_t * lmem );
574 : int mymap_iter_done( mymap_iter_t * iter );
575 : myele_t * mymap_iter_ele ( mymap_iter_t * iter );
576 : mymap_iter_t * mymap_iter_next( mymap_iter_t * iter );
577 : mymap_iter_t * mymap_iter_fini( mymap_iter_t * iter );
578 :
579 : // mymap_verify returns FD_MAP_SUCCESS (0) if the join, underlying
580 : // map and underlying element store give a mapping of unique keys
581 : // to unique elements in the element store with a bounded maximum
582 : // probe length. Returns FD_MAP_ERR_INVAL (negative) otherwise (no
583 : // changes by this call, logs details). Assumes that caller has
584 : // all the map locks and/or the map is otherwise known to be idle.
585 :
586 : int mymap_verify( mymap_t const * join );
587 :
588 : // mymap_strerror converts an FD_MAP_SUCCESS / FD_MAP_ERR code into
589 : // a human readable cstr. The lifetime of the returned pointer is
590 : // infinite. The returned pointer is always to a non-NULL cstr.
591 :
592 : char const * mymap_strerror( int err );
593 :
594 : Do this as often as desired in a compilation unit to get different
595 : types of concurrent maps. Options exist for generating library
596 : header prototypes and/or library implementations for concurrent maps
597 : usable across multiple compilation units. Additional options exist
598 : to use different hashing functions, key comparison functions, etc as
599 : detailed below.
600 :
601 : IMPORTANT SAFETY TIP! If using a key sentinel, prepare/remove/query
602 : operations assume the input key is not the key sentinel (i.e. a
603 : sentinel is not considered a "valid" key). Sentinel keys are not
604 : necessary if MAP_ELE_IS_FREE, MAP_ELE_FREE and MAP_ELE_MOVE are set
605 : appropriately.
606 :
607 : To better understand prepare / publish / cancel semantics:
608 :
609 : mykey_t * key = ... key to insert / modify / blocking query
610 :
611 : mymap_query_t query[1];
612 : int err = mymap_prepare( map, key, sentinel, query, 0 );
613 : myele_t * ele = mymap_query_ele ( query );
614 : ulong memo = mymap_query_memo( query );
615 :
616 : ... At this point, memo == mymap_key_hash( key, seed )
617 :
618 : if( FD_UNLIKELY( err ) ) {
619 :
620 : ... At this point, we are not in a prepare for key and
621 : ... ele==sentinel.
622 :
623 : ... If err is FD_MAP_ERR_AGAIN, there was a potentially
624 : ... conflicting prepare or remove in progress at some point
625 : ... during the call. We can try again later (e.g. after a
626 : ... random backoff or doing other non-conflicting work).
627 : ... Never returned for a blocking call.
628 :
629 : ... If err is FD_MAP_ERR_FULL, key was not in the map but
630 : ... inserting it would have created a key probe sequence longer
631 : ... than probe_max at some point during the call. We can try
632 : ... again later when it is less full (e.g. after removing keys
633 : ... from the map).
634 :
635 : } else if( ... ele is marked as free ... ) {
636 :
637 : ... At this point, we are in a prepare for key, key is not in
638 : ... the map and ele points in the caller's address space to free
639 : ... element in the element store suitable for holding key.
640 :
641 : ... initialize ele here, including populating ele's key with key
642 : ... and (if memoized) populating ele's memo with memo.
643 :
644 : if( ... we decided not to insert key ... ) mymap_cancel( query ); // "failed insert"
645 : else {
646 : ... mark ele as used
647 : mymap_publish( query ); // "insert"
648 : }
649 :
650 : } else {
651 :
652 : ... At this point, we are in a prepare for key, key is in the map
653 : ... and ele points in the caller's address space to the element
654 : ... store element that currently contains key. We are free to
655 : ... modify ele's value. We should not modify ele's key, modify
656 : ... ele's memo (if memoized) or mark ele as free.
657 :
658 : ... process ele here
659 :
660 : if( ... we didn't modify ele ... ) mymap_cancel ( query ); // "blocking query"
661 : else mymap_publish( query ); // "modify"
662 :
663 : }
664 :
665 : To better understand remove semantics:
666 :
667 : mykey_t * key = ... key to remove
668 :
669 : int err = mymap_remove( map, key, NULL, 0 );
670 :
671 : if( FD_UNLIKELY( err ) ) {
672 :
673 : ... If err is FD_MAP_ERR_AGAIN, there was a potentially
674 : ... conflicting prepare or remove in progress at some point
675 : ... during the call. We can try again later (e.g. after a random
676 : ... backoff or doing other non-conflicting work).
677 : ... Never returned for a blocking call.
678 :
679 : ... If err is FD_MAP_ERR_KEY, key was not in the map at some
680 : ... point during the call (so remove did not do anything).
681 :
682 : } else {
683 :
684 : ... key was removed from the map at some point during the call.
685 : ... The remove might have shuffled other keys. This shuffling
686 : ... can only decrease probe sequence length for any remaining
687 : ... keys and preserves insertion ordering for keys with the same
688 : ... hash (or initial probe element more generally).
689 :
690 : }
691 :
692 : To better understand query semantics:
693 :
694 : mykey_t * key = ... key to query
695 :
696 : mymap_query_t query[1];
697 : int err = mymap_query_try( join, key, sentinel, query, 0 );
698 : myele_t const * ele = mymap_query_ele_const( query );
699 : ulong memo = mymap_query_memo ( query );
700 :
701 : ... At this point, memo==mymap_key_hash( key, seed )
702 :
703 : if( FD_UNLIKELY( err ) ) {
704 :
705 : ... At this point, ele==sentinel
706 : ...
707 : ... If err is FD_MAP_ERR_KEY, key was not in the mymap at some
708 : ... point during the try.
709 : ...
710 : ... If err is FD_MAP_ERR_AGAIN, there was a potentially
711 : ... conflicting operation in progress during the try and we can
712 : ... try again later (e.g. after a random backoff or doing other
713 : ... non-conflicting work).
714 :
715 : } else {
716 :
717 : ... At this point, ele points in our address space to an element
718 : ... in the element store (non-NULL) and ele->key matched key at
719 : ... some point during the try.
720 :
721 : ... Speculatively process ele here.
722 : ...
723 : ... DO NOT TRUST ANY RESULTS OF THIS SPECULATIVE PROCESSING YET.
724 : ... THERE IS NO GUARANTEE YET THAT A CONCURRENT USER HASN'T
725 : ... CHANGED THE MYMAP IN A WAY THAT COULD YIELD ARBITRARY AND
726 : ... INCONSISTENT RESULTS.
727 : ...
728 : ... The simplest and most common form of speculative processing
729 : ... is to copy the needed portions of ele into a local stack
730 : ... temp.
731 : ...
732 : ... Note: concurrent operations include removing key from the
733 : ... mymap (and maybe multiple cycles of inserting and removing it
734 : ... and then at potentially different element store locations) or
735 : ... unrelated removes potentially shuffling the location of this
736 : ... key. That's not an issue practically as the ele pointer here
737 : ... will be to an element compatible memory region that will
738 : ... continue to exist regardless and we shouldn't be trusting any
       ... query reads yet (the query test will detect if these can
740 : ... be trusted). See rant in fd_map_para.c for more details.
741 :
742 : ... At this point, we are done with speculative processing (or we
743 : ... don't want to do any more speculative processing if the try
744 : ... has already failed).
745 :
746 : err = mymap_query_test( query );
       if( FD_UNLIKELY( err ) ) {
748 :
749 : ... At this point, err will be FD_MAP_ERR_AGAIN and a
750 : ... potentially conflicting operation in the try was detected
751 : ... by the test.
752 :
753 : ... Application specific handling here (e.g. try again after a
754 : ... random backoff or doing other non-conflicting work).
755 :
756 : } else {
757 :
758 : ... At this point, the results of the speculation thus far can
759 : ... be trusted and can be considered to have been computed at
760 : ... some point in time between try and test.
761 :
762 : }
763 : }
764 :
765 : Example use of lock_range / unlock (do a parallel snapshot of an
766 : entire map at a globally well defined point in time with minimal
767 : interference to ongoing concurrent modifications):
768 :
769 : ulong version[ mymap_lock_max() ];
770 :
771 : ulong lock_cnt = mymap_lock_cnt( join );
772 :
773 : mymap_lock_range( join, 0, lock_cnt, FD_MAP_FLAGS_BLOCKING | FD_MAP_FLAGS_RDONLY, version );
774 :
775 : for( ulong lock_idx=0UL; lock_idx<lock_cnt; lock_idx++ ) { ... parallelize this loop over snapshot threads as desired
776 : ulong ele0 = mymap_lock_ele0( join, lock_idx );
777 : ulong ele1 = mymap_lock_ele1( join, lock_idx );
778 :
779 : ... process element store elements [ele0,ele1) here
780 :
781 : mymap_unlock_range( join, lock_idx, 1UL, version );
782 : }
783 :
   Note that mymap_lock_range in this example might block the caller
785 : for a long time if the map is under heavy concurrent modification.
786 : To prioritize the snapshotting over these operations, the same API
   can be used to prioritize the snapshot over ongoing concurrent
788 : modifications:
789 :
790 : ulong version[ mymap_lock_max() ];
791 :
792 : ulong lock_cnt = mymap_lock_cnt( join );
793 :
794 : for( ulong lock_idx=0UL; lock_idx<lock_cnt; lock_idx++ )
795 : mymap_lock_range( join, lock_idx, 1UL, FD_MAP_FLAGS_BLOCKING | FD_MAP_FLAGS_RDONLY, version );
796 :
797 : for( ulong lock_idx=0UL; lock_idx<lock_cnt; lock_idx++ ) { ... parallelize this loop over snapshot threads as desired
798 : ulong ele0 = mymap_lock_ele0( join, lock_idx );
799 : ulong ele1 = mymap_lock_ele1( join, lock_idx );
800 :
801 : ... process element store elements [ele0,ele1) here
802 :
803 : mymap_unlock_range( join, lock_idx, 1UL, version );
804 : }
805 :
806 : Implementation overview:
807 :
   A map is basically a persistent shared array of version numbers named
809 : lock. lock[ lock_idx ] contains a version number that covers map
810 : slots [lock_idx (ele_max/lock_cnt),(lock_idx+1)(ele_max/lock_cnt))
811 :
812 : When trying an operation that could impact probe sequences passing
813 : through a lock's range of slots, the version number is atomically
814 : incremented. It is incremented again at completion. It may also
815 : be decremented if the operation didn't end up modifying any of the
816 : covered slots.
817 :
818 : Thus, an {odd,even} version number indicates that there is {a
819 : potential,not any} operation in progress that could impact probe
820 : sequences passing through the corresponding slots. The most
821 : significant bits of the version number can be used for lockfree
822 : style operations to detect changes to any probe sequences they use.
823 :
824 : When the map is not overloaded, key probe sequences are typically
825 : O(1) long and, in general, at most a (user configured) probe_max
826 : long. Since a version number covers many slots typically, this
827 : implies that the typical "read" operation (e.g.
828 : query_try/query_test) looks like:
829 :
830 : - try: observe lock version numbers covering all slots in
831 : key's probe sequence, fail if any locked (typically 1
832 : normal read that hits L1/L2 cache, especially in the
833 : common case of reads more frequent than writes)
834 : - spec: speculatively process the element containing key
835 : - test: check version numbers haven't changed (typically 1
836 : normal read that even more likely L1/L2 cache hit),
837 : fail if any changed
838 :
839 : And the typical "write" operation (e.g. prepare/publish) looks
840 : like:
841 :
842 : - prepare: increment lock version numbers covering all slots in
843 : key's probe sequence, fail if any locked (typically 1
844 : atomic fetch-and-or done test-and-test-and-set style to
845 : minimize hardware NOC contention)
846 : - exec: (non-speculatively) process the element containing key
847 : - publish: increment version numbers (typically 1 normal read/write
848 : that hits L1/L2 cache)
849 :
850 : Readers never block concurrent readers or writers. Writers can
851 : block other readers and other writers. If there are many more
852 : version locks than concurrent writers though, writers are unlikely
853 : to interfere with concurrent readers or writers. In all cases, all
854 : map operations are serializable.
855 :
856 : For maps that are loaded to their capacity, probe sequences could
857 : be up to probe_max long and probe_max might be quite large. This
   implies that more than one version lock might be needed.  Since
859 : this range is cyclic contiguous in memory, the locking operations
860 : are nice compact streaming access patterns. And similarly for the
861 : element store access patterns. */
862 :
863 : /* MAP_NAME gives the API prefix to use for map */
864 :
865 : #ifndef MAP_NAME
866 : #error "Define MAP_NAME"
867 : #endif
868 :
869 : /* MAP_ELE_T is the map element type */
870 :
871 : #ifndef MAP_ELE_T
872 : #error "Define MAP_ELE_T"
873 : #endif
874 :
875 : /* MAP_KEY_T is the map key type */
876 :
877 : #ifndef MAP_KEY_T
878 : #define MAP_KEY_T ulong
879 : #endif
880 :
881 : /* MAP_KEY is the MAP_ELE_T key field */
882 :
883 : #ifndef MAP_KEY
884 5516805 : #define MAP_KEY key
885 : #endif
886 :
/* MAP_KEY_EQ returns 0/1 if *k0 is different from / the same as *k1 */
888 :
889 : #ifndef MAP_KEY_EQ
890 0 : #define MAP_KEY_EQ(k0,k1) ((*(k0))==(*(k1)))
891 : #endif
892 :
893 : /* MAP_KEY_HASH returns a random mapping of *key into ulong. The
894 : mapping is parameterized by the 64-bit ulong seed. */
895 :
896 : #ifndef MAP_KEY_HASH
897 : #define MAP_KEY_HASH(key,seed) fd_ulong_hash( (*(key)) ^ (seed) )
898 : #endif
899 :
900 : /* If MAP_MEMOIZE is defined to non-zero, elements have a field that
901 : can be used while in the map to hold the MAP_KEY_HASH for an
902 : element's key. This is useful for accelerating user code that might
903 : need a hash and accelerating various operations. */
904 :
905 : #ifndef MAP_MEMOIZE
906 : #define MAP_MEMOIZE 0
907 : #endif
908 :
909 : /* If MAP_MEMOIZE is non-zero, MAP_MEMO is the memo element field.
910 : Should be a ulong. Like MAP_KEY and MAP_NEXT, when an element is in
911 : the map, this value is managed by the map and will contain the
912 : MAP_KEY_HASH of the element's key and the map's seed. */
913 :
914 : #ifndef MAP_MEMO
915 : #define MAP_MEMO memo
916 : #endif
917 :
918 : /* If MAP_MEMOIZE is defined to non-zero, a non-zero MAP_KEY_EQ_IS_SLOW
919 : indicates the MAP_MEMO field should be used to accelerate MAP_KEY_EQ
920 : operations. This is useful when MAP_KEY_EQ is non-trivial (e.g.
921 : variable length string compare, large buffer compares, etc). */
922 :
923 : #ifndef MAP_KEY_EQ_IS_SLOW
924 : #define MAP_KEY_EQ_IS_SLOW 0
925 : #endif
926 :
927 : /* MAP_ELE_IS_FREE returns 0/1 if the slot pointed to by ele in the
928 : caller's address space contains / does not contain a key-value pair.
929 : The implementation can assume ele is valid and that it is safe to
930 : speculate on ele. The default implementation tests if key is not 0.
931 : If using a different key sentinel or not using a key sentinel, update
932 : this appropriately. */
933 :
934 : #ifndef MAP_ELE_IS_FREE
935 0 : #define MAP_ELE_IS_FREE(ctx,ele) (!((ele)->MAP_KEY))
936 : #endif
937 :
938 : /* MAP_ELE_FREE frees the key-value pair in the slot pointed to by ele
939 : in the caller's address space. The implementation can assume ele is
   valid, ele contains a key-value pair on entry and there will be no
941 : concurrent operations on ele during the free. The default
942 : implementation sets key to 0. If using a different key sentinel or
943 : not using a key sentinel, update this appropriately. Likewise, if
944 : not using plain-old-data keys and values, this should do the
945 : appropriate resource management. The join ctx is provided to
   facilitate this. */
947 :
948 : #ifndef MAP_ELE_FREE
949 0 : #define MAP_ELE_FREE(ctx,ele) do (ele)->MAP_KEY = (MAP_KEY_T)0; while(0)
950 : #endif
951 :
952 : /* MAP_ELE_MOVE moves the key-value pair in slot src to slot dst.
953 : src and dst are in the caller's address space. The implementation
954 : can assume src and dst are valid, dst does not contain a key-value
   pair on entry, src contains a key-value pair on entry, and there
956 : will be no concurrent operations on src and dst during the move. The
957 : default implementation shallow copies src to dst and sets src key to
958 : 0. If using a different key sentinel or not using a key sentinel,
959 : update this appropriately. Likewise, if elements do not use
960 : plain-old-data keys and/or values, this should do the appropriate key
961 : and/or value resource management. The join ctx is provided to
962 : facilitate this. */
963 :
964 : #ifndef MAP_ELE_MOVE
965 0 : #define MAP_ELE_MOVE(ctx,dst,src) do { MAP_ELE_T * _src = (src); (*(dst)) = *_src; _src->MAP_KEY = (MAP_KEY_T)0; } while(0)
966 : #endif
967 :
968 : /* MAP_CTX_MAX specifies the maximum number of bytes of user context
969 : for use in MAP_ELE above (e.g. custom allocators / workspaces / local
970 : pointers to additional value arrays / etc). This context will be
971 : ulong aligned. Default is up to 72 bytes. */
972 :
973 : #ifndef MAP_CTX_MAX
974 0 : #define MAP_CTX_MAX (72UL)
975 : #endif
976 :
977 : /* MAP_VERSION_T gives the map version index type. Should be a
978 : primitive unsigned integer type. The least significant bit is used
   to indicate whether or not a slot could be impacted by an in
980 : progress map operation. The remaining bits indicate the version
981 : number. The default is ulong, yielding effectively infinite ABA
   protection (e.g. a lockfree query operation would need to be
983 : stalled for over ~2^63 concurrent insert/modify/remove map operations
984 : before risk of getting confused by version number reuse ... which
   would take millennia for modern hardware practically).  Narrow types
   yield less metadata footprint overhead for the map and lower ABA
   protection.  (For human hard real-time applications, uint is probably
   fine and, for computer hard real-time applications, ushort and/or
   uchar could be fine). */
990 :
991 : #ifndef MAP_VERSION_T
992 104570463 : #define MAP_VERSION_T ulong
993 : #endif
994 :
995 : /* MAP_LOCK_MAX gives the maximum number of version locks the map can
996 : support. This should be positive and an integer power-of-two. This
997 : essentially is limit on the maximum number of concurrent operations
998 : and thus should be much greater than the number of concurrent
999 : insert/modify/remove operations in expected map usage. Default is
1000 : 1024.
1001 :
1002 : Note that this is not theoretically required for the below
1003 : implementation. This exists to compile time bound stack utilization
1004 : of prepare/remove/query_try. That is,
1005 : sizeof(MAP_VERSION_T)*MAP_LOCK_MAX should be a L1D cache / L2D cache
1006 : / stack allocation friendly footprint (defaults yield 8 KiB).
   MAP_LOCK_MAX could be removed by using a dynamic stack allocation
1008 : but that would limit this to targets with FD_HAS_ALLOCA. Could also
1009 : be eliminated by using a dynamic footprint lock cache in the query
1010 : structure, join structures and/or combining the query and join
1011 : structures. These are cumbersome for the user and the last two add
1012 : restrictions to intra-process multithreaded usage not seen in other
1013 : FD persistent inter-process datastructures. (Consider using a
1014 : massive/reasonable MAP_LOCK_MAX when FD_HAS_ALLOCA is set/clear and
1015 : then using alloca in prepare/remove/query_try when FD_HAS_ALLOCA is
1016 : set? Almost the best of both worlds but does imply some subtle
1017 : restrictions if trying to interoperate between targets compiled with
1018 : different features ... avoiding for now.) */
1019 :
1020 : #ifndef MAP_LOCK_MAX
1021 3000042 : #define MAP_LOCK_MAX (1024)
1022 : #endif
1023 :
1024 : /* MAP_ALIGN gives the alignment required for the map shared memory.
1025 : Default is 128 for double cache line alignment. Should be at least
1026 : ulong alignment. */
1027 :
1028 : #ifndef MAP_ALIGN
1029 : #define MAP_ALIGN (128UL)
1030 : #endif
1031 :
1032 : /* MAP_MAGIC gives the shared memory magic number to aid in persistent
1033 : and/or interprocess usage. */
1034 :
1035 : #ifndef MAP_MAGIC
1036 3 : #define MAP_MAGIC (0xf17eda2c37c5107UL) /* firedancer cslot version 0 */
1037 : #endif
1038 :
1039 : /* MAP_IMPL_STYLE controls what to generate:
1040 : 0 - header only library
1041 : 1 - library header declaration
1042 : 2 - library implementation */
1043 :
1044 : #ifndef MAP_IMPL_STYLE
1045 : #define MAP_IMPL_STYLE 0
1046 : #endif
1047 :
1048 : /* Commom map error codes (FIXME: probably should get around to making
1049 : unified error codes, error strings and/or flags across util at least
1050 : so we don't have to do this in the generator itself) */
1051 :
1052 37479369 : #define FD_MAP_SUCCESS (0)
1053 3 : #define FD_MAP_ERR_INVAL (-1)
1054 7491231 : #define FD_MAP_ERR_AGAIN (-2)
1055 : //#define FD_MAP_ERR_CORRUPT (-3)
1056 : //#define FD_MAP_ERR_EMPTY (-4)
1057 3 : #define FD_MAP_ERR_FULL (-5)
1058 7491231 : #define FD_MAP_ERR_KEY (-6)
1059 :
1060 97481148 : #define FD_MAP_FLAG_BLOCKING (1<<0)
1061 89992926 : #define FD_MAP_FLAG_USE_HINT (1<<2)
1062 : #define FD_MAP_FLAG_PREFETCH_NONE (0<<3)
1063 : #define FD_MAP_FLAG_PREFETCH_META (1<<3)
1064 : #define FD_MAP_FLAG_PREFETCH_DATA (2<<3)
1065 : #define FD_MAP_FLAG_PREFETCH (3<<3)
1066 67488222 : #define FD_MAP_FLAG_RDONLY (1<<5)
1067 :
1068 : /* Implementation *****************************************************/
1069 :
1070 : #if MAP_IMPL_STYLE==0 /* local use only */
1071 : #define MAP_STATIC FD_FN_UNUSED static
1072 : #else /* library header and/or implementation */
1073 : #define MAP_STATIC
1074 : #endif
1075 :
1076 128790720 : #define MAP_(n) FD_EXPAND_THEN_CONCAT3(MAP_NAME,_,n)
1077 :
1078 : #if MAP_IMPL_STYLE!=2 /* need header */
1079 :
1080 : #include "../bits/fd_bits.h"
1081 :
/* MAP_(shmem_private) is the persistent shared memory header for a map.
   The cyclic array of version locks immediately follows this header in
   the same shared memory region; the element store lives in a separate
   region joined at runtime. */

struct __attribute__((aligned(MAP_ALIGN))) MAP_(shmem_private) {

  /* This point is MAP_ALIGN aligned */

  ulong magic;      /* ==MAP_MAGIC */
  ulong ele_max;    /* Element store capacity, positive and an integer power-of-two */
  ulong lock_cnt;   /* Number of locks, positive and an integer power-of-two <= min( ele_max, MAP_LOCK_MAX ) */
  ulong probe_max;  /* Maximum length probe sequence, in [1,ele_max] */
  ulong seed;       /* Key hash seed, arbitrary */
  int   lock_shift; /* log2( ele_max / lock_cnt ), non-negative (each lock covers 2^lock_shift contiguous slots) */

  /* Padding to MAP_ALIGN alignment here */

  /* MAP_VERSION_T lock[ lock_cnt ] here (obligatory sigh about lagging
     C++ support for 0 sized structure array footers). */

  /* Padding to MAP_ALIGN alignment here */
};

typedef struct MAP_(shmem_private) MAP_(shmem_t);
1102 :
/* MAP_(private) is a local join handle.  It caches the shared header
   fields and the local address space locations of the lock array and
   element store so hot-path operations avoid an extra indirection. */

struct MAP_(private) {
  MAP_ELE_T *     ele;        /* Location of the element store in the local address space, indexed [0,ele_max) */
  MAP_VERSION_T * lock;       /* Location of the lock versions in the local address space, indexed [0,lock_cnt) */
  ulong           ele_max;    /* ==shmem->ele_max */
  ulong           lock_cnt;   /* ==shmem->lock_cnt */
  ulong           probe_max;  /* ==shmem->probe_max */
  ulong           seed;       /* ==shmem->seed */
  int             lock_shift; /* ==shmem->lock_shift */
  int             _pad;       /* padding to ulong alignment */
  uchar           ctx[ MAP_CTX_MAX ]; /* User context for MAP_ELE_IS_FREE/MAP_ELE_FREE/MAP_ELE_MOVE */
};

typedef struct MAP_(private) MAP_(t);
1116 :
/* MAP_(query_private) holds the state of an in-progress prepare /
   query-try: the memoized key hash, the element touched, and the single
   lock (with its observed version) needed to publish / cancel / test. */

struct MAP_(query_private) {
  ulong           memo; /* Query key memo */
  MAP_ELE_T *     ele;  /* Query element in the local address space */
  MAP_VERSION_T * l;    /* Lock needed for this query in the local address space */
  MAP_VERSION_T   v;    /* Version of lock at query start */
};

typedef struct MAP_(query_private) MAP_(query_t);
1125 :
/* MAP_(iter_private) holds the state of a memo-matching iteration,
   including the direct mapped cache of lock versions that iter_fini
   uses to unlock the locks this iteration acquired. */

struct MAP_(iter_private) {
  MAP_ELE_T *     ele;           /* Location of the element store in the local address space, indexed [0,ele_max) */
  MAP_VERSION_T * lock;          /* Location of the lock versions in the local address space, indexed [0,lock_cnt) */
  ulong           ele_max;       /* ==shmem->ele_max */
  ulong           lock_cnt;      /* ==shmem->lock_cnt */
  ulong           seed;          /* ==shmem->seed */
  ulong           memo;          /* matching memo for iteration */
  ulong           ele_idx;       /* If ele_rem>0, current matching element, ignored otherwise */
  ulong           ele_rem;       /* Number of elements remaining to probe, in [0,probe_max] */
  ulong           version_lock0; /* Index of first lock used by this iter, in [0,lock_cnt] */
  ulong           version_cnt;   /* Number of locks used by this iter, in [0,lock_cnt] (typically 1) */
  MAP_VERSION_T   version[ MAP_LOCK_MAX ]; /* Direct mapped cache of version numbers for unlock */
};

typedef struct MAP_(iter_private) MAP_(iter_t);
1141 :
1142 : FD_PROTOTYPES_BEGIN
1143 :
1144 : /* map_private_try returns the version of the lock observed at some
1145 : point during the call. Assumes lock is valid. If the least
1146 : significant bit of the returned value is set (i.e. is odd), an
1147 : operation was in progress on a key whose probe sequence includes a
1148 : map slot covered by this lock (i.e. it is not a good time to try the
1149 : operation). If the LSB is clear (i.e. is even), no operation was in
1150 : progress (i.e. it is a good time to try). This is a compiler memory
1151 : fence. */
1152 :
static inline MAP_VERSION_T
MAP_(private_try)( MAP_VERSION_T volatile const * l ) {
  MAP_VERSION_T v;
  FD_COMPILER_MFENCE();
  v = *l; /* plain volatile read; LSB of v set -> an operation was in progress */
  FD_COMPILER_MFENCE();
  return v;
}
1161 :
1162 : /* map_private_test tests a range of lock versions matched their locally
1163 : cached versions at some point during the call. Specifically, tests
1164 : lock[lock_idx]==version[lock_idx] for all lock_idx in
1165 : [version_lock0,version_lock0+version_cnt) (cyclic). lock_cnt is the
1166 : number of locks and assumed to be positive and an integer
1167 : power-of-two. Returns SUCCESS (zero) if all match (i.e. no probe
1168 : sequences through slots covered by the locks between when the last
1169 : lock in the range was observed and this was called changed) and AGAIN
1170 : (negative) otherwise. This is a compiler memory fence. */
1171 :
static inline int
MAP_(private_test)( MAP_VERSION_T volatile const * lock,
                    ulong                          lock_cnt,
                    MAP_VERSION_T const *          version,
                    ulong                          lock_idx,    /* version_lock0 */
                    ulong                          version_cnt ) {
  FD_COMPILER_MFENCE();
  for( ; version_cnt; version_cnt-- ) {
    if( FD_UNLIKELY( lock[ lock_idx ]!=version[ lock_idx ] ) ) break; /* opt for low contention */
    lock_idx = (lock_idx+1UL) & (lock_cnt-1UL); /* cyclic advance (lock_cnt is a power of two) */
  }
  FD_COMPILER_MFENCE();
  return version_cnt ? FD_MAP_ERR_AGAIN : FD_MAP_SUCCESS; /* cmov; version_cnt>0 iff a mismatch was found */
}
1186 :
1187 : /* map_private_lock returns the version of the lock observed at some
1188 : point during the call. Assumes lock is valid. If the least
1189 : significant bit of the returned version is set (i.e. is odd), the
1190 : caller did not get the lock (i.e. an operation was already in
1191 : progress on a key whose probe sequence includes a map slot covered by
1192 : this lock). If the LSB is clear (i.e. is even), the caller got the
1193 : lock (i.e. is free to do an operation involving probe sequences that
1194 : pass through the range covered by the lock) and *lock LSB was set.
1195 : This is a compiler memory fence. When the target does not have
1196 : FD_HAS_ATOMIC, this operation is emulated. When emulated, the map
1197 : will not be safe to use concurrently but will still work with
1198 : comparable performance to a serial implementation. */
1199 :
static inline MAP_VERSION_T
MAP_(private_lock)( MAP_VERSION_T volatile * l ) {
  MAP_VERSION_T v;
  FD_COMPILER_MFENCE();
# if FD_HAS_ATOMIC /* test-and-test-and-set style */
  v = *l;                                                                                       /* cheap pre-check avoids hammering the NOC when already locked */
  if( FD_LIKELY( !((ulong)v & 1UL) ) ) v = FD_ATOMIC_FETCH_AND_OR( l, (MAP_VERSION_T)1 );       /* opt for low contention */
# else
  v = *l;                               /* emulated (single threaded use only, per note above) */
  *l = (MAP_VERSION_T)((ulong)v | 1UL);
# endif
  FD_COMPILER_MFENCE();
  return v; /* LSB clear -> caller got the lock, LSB set -> somebody else holds it */
}
1214 :
1215 : /* map_private_unlock unlocks lock[lock_idx] for lock_idx in
1216 : [version_lock0,version_lock0+version_cnt) (cyclic). Assumes
1217 : version[lock_idx] is the version the caller wants post unlock (which
1218 : implies that, on entry, version[lock_idx] = lock[lock_idx] + delta
1219 : where delta is odd and >=-1 (the -1 case corresponds "unlock with no
1220 : changes made to the covered elements"). This cannot fail. This is a
1221 : compiler memory fence. */
1222 :
static inline void
MAP_(private_unlock)( MAP_VERSION_T volatile * lock,
                      ulong                    lock_cnt,
                      MAP_VERSION_T const *    version,
                      ulong                    lock_idx,    /* version_lock0 */
                      ulong                    version_cnt ) {
  FD_COMPILER_MFENCE();
  for( ; version_cnt; version_cnt-- ) {
    lock[ lock_idx ] = version[ lock_idx ];     /* plain write of the caller's post-unlock version (clears LSB) */
    lock_idx = (lock_idx+1UL) & (lock_cnt-1UL); /* cyclic advance (lock_cnt is a power of two) */
  }
  FD_COMPILER_MFENCE();
}
1236 :
1237 : /* map_private_ele_{is_free,free,move} expose the
1238 : MAP_ELE_{IS_FREE,FREE,MOVE} macros as inlines with strict semantics.
1239 :
1240 : map_private_ele_is_free returns 1 if ele does not contain an key-val
1241 : pair and 0 otherwise. ctx will be the join's user context, ele will
1242 : be a valid pointer to an element store element in the caller's
1243 : address space that is safe to speculate on. Retains no interest in
1244 : ele.
1245 :
1246 : map_private_ele_free frees any key and/or val resources used by ele
1247 : and marks ele as free. ctx will be the join's user context, ele will
1248 : be a valid pointer to an element store element in the caller's
1249 : address space that is marked as used. Retains no interest in ele.
1250 :
1251 : map_private_ele_move moves the key-val pair from element src to
1252 : element to element dst and marks src as free. ctx will be the join's
1253 : user context, src/dst will be a valid pointers to an element store
1254 : element in the caller's address space. dst/src will be marked as
1255 : free/used on entry and should be marked as used/free on return.
1256 : Retains no interest in dst or src. */
1257 :
FD_FN_PURE static inline int
MAP_(private_ele_is_free)( void const *      ctx,
                           MAP_ELE_T const * ele ) {
  (void)ctx;
  return !!(MAP_ELE_IS_FREE( (ctx), (ele) )); /* !! forces a strict 0/1 result from the user macro */
}

static inline void
MAP_(private_ele_free)( void *      ctx,
                        MAP_ELE_T * ele ) {
  (void)ctx;
  MAP_ELE_FREE( (ctx), (ele) );
}

static inline void
MAP_(private_ele_move)( void *      ctx,
                        MAP_ELE_T * dst,
                        MAP_ELE_T * src ) {
  (void)ctx;
  MAP_ELE_MOVE( (ctx), (dst), (src) );
}
1279 :
/* lock_max returns the compile-time lock capacity.  lock_cnt_est /
   probe_max_est give reasonable defaults for new given ele_max (as many
   locks as allowed, probe_max at theoretical capacity). */

FD_FN_CONST static inline ulong MAP_(lock_max)( void ) { return MAP_LOCK_MAX; }

FD_FN_CONST static inline ulong MAP_(lock_cnt_est) ( ulong ele_max ) { return fd_ulong_min( ele_max, MAP_LOCK_MAX ); }
FD_FN_CONST static inline ulong MAP_(probe_max_est)( ulong ele_max ) { return ele_max; }

FD_FN_CONST static inline ulong MAP_(align)( void ) { return alignof(MAP_(shmem_t)); }
1286 :
1287 : FD_FN_CONST static inline ulong
1288 : MAP_(footprint)( ulong ele_max,
1289 : ulong lock_cnt,
1290 45 : ulong probe_max ) {
1291 45 : if( !( fd_ulong_is_pow2( ele_max ) &
1292 45 : fd_ulong_is_pow2( lock_cnt ) & (lock_cnt<=fd_ulong_min( ele_max, MAP_LOCK_MAX )) &
1293 45 : (1UL<=probe_max) & (probe_max<=ele_max) ) ) return 0UL;
1294 15 : return fd_ulong_align_up( sizeof(MAP_(shmem_t)) + lock_cnt*sizeof(MAP_VERSION_T), alignof(MAP_(shmem_t)) ); /* no overflow */
1295 45 : }
1296 :
/* Join accessors.  shmap/shele recover the underlying shared memory
   regions from a join: the lock array immediately follows the shmem
   header, so the header is one shmem_t before join->lock.  ele_lock
   maps an element index to the lock covering it; lock_ele0/lock_ele1
   give the [ele0,ele1) element range covered by a lock. */

FD_FN_PURE static inline ulong MAP_(ele_max)  ( MAP_(t) const * join ) { return join->ele_max;   }
FD_FN_PURE static inline ulong MAP_(lock_cnt) ( MAP_(t) const * join ) { return join->lock_cnt;  }
FD_FN_PURE static inline ulong MAP_(probe_max)( MAP_(t) const * join ) { return join->probe_max; }
FD_FN_PURE static inline ulong MAP_(seed)     ( MAP_(t) const * join ) { return join->seed;      }

FD_FN_PURE static inline void const * MAP_(shmap_const)( MAP_(t) const * join ) { return ((MAP_(shmem_t) const *)join->lock)-1; }
FD_FN_PURE static inline void const * MAP_(shele_const)( MAP_(t) const * join ) { return join->ele; }

FD_FN_CONST static inline void *       MAP_(ctx)      ( MAP_(t) *       join ) { return join->ctx; }
FD_FN_CONST static inline void const * MAP_(ctx_const)( MAP_(t) const * join ) { return join->ctx; }
FD_FN_CONST static inline ulong        MAP_(ctx_max)  ( MAP_(t) const * join ) { (void)join; return MAP_CTX_MAX; }

FD_FN_PURE static inline void * MAP_(shmap)( MAP_(t) * join ) { return ((MAP_(shmem_t) *)join->lock)-1; }
FD_FN_PURE static inline void * MAP_(shele)( MAP_(t) * join ) { return join->ele; }

FD_FN_PURE static inline ulong MAP_(ele_lock) ( MAP_(t) const * join, ulong ele_idx  ) { return ele_idx >> join->lock_shift;        }
FD_FN_PURE static inline ulong MAP_(lock_ele0)( MAP_(t) const * join, ulong lock_idx ) { return lock_idx << join->lock_shift;       }
FD_FN_PURE static inline ulong MAP_(lock_ele1)( MAP_(t) const * join, ulong lock_idx ) { return (lock_idx+1UL) << join->lock_shift; }
1315 :
/* key_eq / key_hash expose the MAP_KEY_EQ / MAP_KEY_HASH macros as
   inlines with strict semantics (key_eq returns exactly 0 or 1). */

FD_FN_PURE static inline int
MAP_(key_eq)( MAP_KEY_T const * k0,
              MAP_KEY_T const * k1 ) {
  return !!(MAP_KEY_EQ( (k0), (k1) )); /* !! forces a strict 0/1 result from the user macro */
}

FD_FN_PURE static inline ulong
MAP_(key_hash)( MAP_KEY_T const * key,
                ulong             seed ) {
  return (MAP_KEY_HASH( (key), (seed) ));
}
1327 :
/* backoff spin-waits a pseudo random number of pauses, scaled by scale
   and randomized via seed and the current tickcount, for use after an
   AGAIN result to decorrelate retries across threads. */

static inline void
MAP_(backoff)( ulong scale,
               ulong seed ) {
  ulong r = (ulong)(uint)fd_ulong_hash( seed ^ (((ulong)fd_tickcount())<<32) ); /* r uniform-ish in [0,2^32) */
  for( ulong rem=(scale*r)>>48; rem; rem-- ) FD_SPIN_PAUSE();                   /* ~scale*r/2^48 pause iterations */
}
1334 :
1335 37486944 : FD_FN_PURE static inline ulong MAP_(query_memo )( MAP_(query_t) const * query ) { return query->memo; }
1336 7489380 : FD_FN_PURE static inline MAP_ELE_T const * MAP_(query_ele_const)( MAP_(query_t) const * query ) { return query->ele; }
1337 15005874 : FD_FN_PURE static inline MAP_ELE_T * MAP_(query_ele )( MAP_(query_t) * query ) { return query->ele; }
1338 :
/* publish completes a prepare: stores the cached lock version + 2 into
   the lock covering the prepared element.  Versions are even when
   unlocked / odd when locked (see the parity tests in prepare et al),
   so +2 both releases the lock and advances the version, signalling to
   speculative observers that the covered region may have changed.  The
   compiler fences keep the caller's element writes ordered before the
   release store (relies on the usual target memory subsystem
   assumptions described at the top of this file). */

static inline void
MAP_(publish)( MAP_(query_t) * query ) {
  MAP_VERSION_T volatile * l = query->l;
  MAP_VERSION_T v = (MAP_VERSION_T)((ulong)query->v + 2UL);
  FD_COMPILER_MFENCE();
  *l = v;
  FD_COMPILER_MFENCE();
}
1347 :
/* cancel completes a prepare without advancing the version: restores
   the exact version observed at lock acquisition.  This releases the
   lock while telling speculative observers nothing changed (lower risk
   of false contention than publish, but only safe if the caller really
   left the element unmodified — see the caller contract in prepare). */

static inline void
MAP_(cancel)( MAP_(query_t) * query ) {
  MAP_VERSION_T volatile * l = query->l;
  MAP_VERSION_T v = query->v;
  FD_COMPILER_MFENCE();
  *l = v;
  FD_COMPILER_MFENCE();
}
1356 :
/* query_test validates a speculative query: re-reads the lock covering
   the queried element and compares against the version observed by
   query_try.  Returns FD_MAP_SUCCESS if unchanged (speculative reads
   were consistent) or FD_MAP_ERR_AGAIN if the version moved (element
   may have been modified / moved / removed; retry).  Fences prevent
   the compiler from hoisting the re-read above the caller's
   speculative loads. */

static inline int
MAP_(query_test)( MAP_(query_t) const * query ) {
  MAP_VERSION_T volatile const * l = query->l;
  ulong v = query->v;
  FD_COMPILER_MFENCE();
  ulong _v = *l;
  FD_COMPILER_MFENCE();
  return _v==v ? FD_MAP_SUCCESS : FD_MAP_ERR_AGAIN;
}
1366 :
/* unlock_range releases the range_cnt locks starting at lock index
   range_start (wrapping mod lock_cnt), restoring the (possibly
   modified) versions the caller cached via lock_range.  Thin wrapper
   around private_unlock. */

static inline void
MAP_(unlock_range)( MAP_(t) * join,
                    ulong range_start,
                    ulong range_cnt,
                    MAP_VERSION_T const * version ) {
  MAP_(private_unlock)( join->lock, join->lock_cnt, version, range_start, range_cnt );
}
1374 :
/* iter_done returns 1 when no elements remain in the iteration
   (iter->ele_rem is zero), 0 otherwise.  iter_ele returns a pointer to
   the current iteration element in the element store. */
FD_FN_PURE static inline int MAP_(iter_done)( MAP_(iter_t) * iter ) { return !iter->ele_rem; }
FD_FN_PURE static inline MAP_ELE_T * MAP_(iter_ele) ( MAP_(iter_t) * iter ) { return iter->ele + iter->ele_idx; }
1377 :
/* iter_fini finishes an iteration, releasing the version_cnt locks
   (starting at version_lock0, versions as cached/updated in
   iter->version) that iter_init acquired.  Returns iter. */

static inline MAP_(iter_t) *
MAP_(iter_fini)( MAP_(iter_t) * iter ) {
  MAP_(private_unlock)( iter->lock, iter->lock_cnt, iter->version, iter->version_lock0, iter->version_cnt );
  return iter;
}
1383 :
1384 : MAP_STATIC void * MAP_(new) ( void * shmem, ulong ele_max, ulong lock_cnt, ulong probe_max, ulong seed );
1385 : MAP_STATIC MAP_(t) * MAP_(join) ( void * ljoin, void * shmap, void * shele );
1386 : MAP_STATIC void * MAP_(leave) ( MAP_(t) * join );
1387 : MAP_STATIC void * MAP_(delete)( void * shmap );
1388 :
1389 : MAP_STATIC void
1390 : MAP_(hint)( MAP_(t) const * join,
1391 : MAP_KEY_T const * key,
1392 : MAP_(query_t) * query,
1393 : int flags );
1394 :
1395 : MAP_STATIC int
1396 : MAP_(prepare)( MAP_(t) * join,
1397 : MAP_KEY_T const * key,
1398 : MAP_ELE_T * sentinel,
1399 : MAP_(query_t) * query,
1400 : int flags );
1401 :
1402 : MAP_STATIC int
1403 : MAP_(remove)( MAP_(t) * join,
1404 : MAP_KEY_T const * key,
1405 : MAP_(query_t) const * query,
1406 : int flags );
1407 :
1408 : MAP_STATIC int
1409 : MAP_(query_try)( MAP_(t) const * join,
1410 : MAP_KEY_T const * key,
1411 : MAP_ELE_T const * sentinel,
1412 : MAP_(query_t) * query,
1413 : int flags );
1414 :
1415 : /* FIXME: Consider adding txn API too? Would work recording the start
1416 : of probe sequences for keys in the transaction and then the txn_try
1417 : would use a bitfield to lock all contiguous regions covered by the
1418 : set of probe sequences. */
1419 :
1420 : MAP_STATIC int
1421 : MAP_(lock_range)( MAP_(t) * join,
1422 : ulong range_start,
1423 : ulong range_cnt,
1424 : int flags,
1425 : MAP_VERSION_T * version );
1426 :
1427 : MAP_STATIC int
1428 : MAP_(iter_init)( MAP_(t) * join,
1429 : ulong memo,
1430 : int flags,
1431 : MAP_(iter_t) * iter );
1432 :
1433 : MAP_STATIC MAP_(iter_t) *
1434 : MAP_(iter_next)( MAP_(iter_t) * iter );
1435 :
1436 : MAP_STATIC int MAP_(verify)( MAP_(t) const * join );
1437 :
1438 : MAP_STATIC FD_FN_CONST char const * MAP_(strerror)( int err );
1439 :
1440 : FD_PROTOTYPES_END
1441 :
1442 : #endif
1443 :
1444 : #if MAP_IMPL_STYLE!=1 /* need implementations (assumes header already included) */
1445 :
1446 : #include "../log/fd_log.h" /* Used by constructors and verify (FIXME: Consider making a compile time option) */
1447 :
1448 : MAP_STATIC void *
1449 : MAP_(new)( void * shmem,
1450 : ulong ele_max,
1451 : ulong lock_cnt,
1452 : ulong probe_max,
1453 27 : ulong seed ) {
1454 :
1455 27 : if( FD_UNLIKELY( !shmem ) ) {
1456 3 : FD_LOG_WARNING(( "NULL shmem" ));
1457 3 : return NULL;
1458 3 : }
1459 :
1460 24 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, MAP_(align)() ) ) ) {
1461 3 : FD_LOG_WARNING(( "misaligned shmem" ));
1462 3 : return NULL;
1463 3 : }
1464 :
1465 21 : ulong footprint = MAP_(footprint)( ele_max, lock_cnt, probe_max );
1466 21 : if( FD_UNLIKELY( !footprint ) ) {
1467 15 : FD_LOG_WARNING(( "ele_max, lock_cnt and/or probe_max" ));
1468 15 : return NULL;
1469 15 : }
1470 :
1471 : /* seed arbitrary */
1472 :
1473 : /* Init the metadata */
1474 :
1475 6 : MAP_(shmem_t) * map = (MAP_(shmem_t) *)shmem;
1476 :
1477 6 : memset( map, 0, footprint );
1478 :
1479 6 : map->ele_max = ele_max;
1480 6 : map->lock_cnt = lock_cnt;
1481 6 : map->probe_max = probe_max;
1482 6 : map->seed = seed;
1483 6 : map->lock_shift = fd_ulong_find_msb( ele_max ) - fd_ulong_find_msb( lock_cnt );
1484 :
1485 : /* Note: memset set all the locks to version 0/unlocked */
1486 :
1487 : /* Note: caller set all elements in underlying element store set to
1488 : free (or, more pedantically, to a key-val pair configuration
1489 : consistent with ele_max and probe_max). */
1490 :
1491 6 : FD_COMPILER_MFENCE();
1492 6 : map->magic = MAP_MAGIC;
1493 6 : FD_COMPILER_MFENCE();
1494 :
1495 6 : return shmem;
1496 21 : }
1497 :
1498 : MAP_STATIC MAP_(t) *
1499 : MAP_(join)( void * ljoin,
1500 : void * shmap,
1501 27 : void * shele ) {
1502 27 : MAP_(t) * join = (MAP_(t) *)ljoin;
1503 27 : MAP_(shmem_t) * map = (MAP_(shmem_t) *)shmap;
1504 27 : MAP_ELE_T * ele = (MAP_ELE_T *)shele;
1505 :
1506 27 : if( FD_UNLIKELY( !join ) ) {
1507 3 : FD_LOG_WARNING(( "NULL ljoin" ));
1508 3 : return NULL;
1509 3 : }
1510 :
1511 24 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)join, alignof(MAP_(t)) ) ) ) {
1512 3 : FD_LOG_WARNING(( "misaligned ljoin" ));
1513 3 : return NULL;
1514 3 : }
1515 :
1516 21 : if( FD_UNLIKELY( !map ) ) {
1517 3 : FD_LOG_WARNING(( "NULL shmap" ));
1518 3 : return NULL;
1519 3 : }
1520 :
1521 18 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)map, MAP_(align)() ) ) ) {
1522 3 : FD_LOG_WARNING(( "misaligned shmap" ));
1523 3 : return NULL;
1524 3 : }
1525 :
1526 15 : if( FD_UNLIKELY( map->magic!=MAP_MAGIC ) ) {
1527 3 : FD_LOG_WARNING(( "bad magic" ));
1528 3 : return NULL;
1529 3 : }
1530 :
1531 12 : if( FD_UNLIKELY( !ele ) ) {
1532 3 : FD_LOG_WARNING(( "NULL shele" ));
1533 3 : return NULL;
1534 3 : }
1535 :
1536 9 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)ele, alignof(MAP_ELE_T) ) ) ) {
1537 3 : FD_LOG_WARNING(( "misaligned shele" ));
1538 3 : return NULL;
1539 3 : }
1540 :
1541 6 : join->lock = (MAP_VERSION_T *)(map+1);
1542 6 : join->ele = ele;
1543 6 : join->ele_max = map->ele_max;
1544 6 : join->lock_cnt = map->lock_cnt;
1545 6 : join->probe_max = map->probe_max;
1546 6 : join->seed = map->seed;
1547 6 : join->lock_shift = map->lock_shift;
1548 :
1549 6 : return join;
1550 9 : }
1551 :
1552 : MAP_STATIC void *
1553 9 : MAP_(leave)( MAP_(t) * join ) {
1554 :
1555 9 : if( FD_UNLIKELY( !join ) ) {
1556 3 : FD_LOG_WARNING(( "NULL join" ));
1557 3 : return NULL;
1558 3 : }
1559 :
1560 6 : return (void *)join;
1561 9 : }
1562 :
1563 : MAP_STATIC void *
1564 15 : MAP_(delete)( void * shmap ) {
1565 15 : MAP_(shmem_t) * map = (MAP_(shmem_t) *)shmap;
1566 :
1567 15 : if( FD_UNLIKELY( !map ) ) {
1568 3 : FD_LOG_WARNING(( "NULL shmap" ));
1569 3 : return NULL;
1570 3 : }
1571 :
1572 12 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)map, MAP_(align)() ) ) ) {
1573 3 : FD_LOG_WARNING(( "misaligned shmap" ));
1574 3 : return NULL;
1575 3 : }
1576 :
1577 9 : if( FD_UNLIKELY( map->magic!=MAP_MAGIC ) ) {
1578 3 : FD_LOG_WARNING(( "bad magic" ));
1579 3 : return NULL;
1580 3 : }
1581 :
1582 6 : FD_COMPILER_MFENCE();
1583 6 : map->magic = 0UL;
1584 6 : FD_COMPILER_MFENCE();
1585 :
1586 6 : return (void *)map;
1587 9 : }
1588 :
/* hint primes for an upcoming operation on key: per flags, touches the
   lock word covering key's first probe slot (FD_MAP_FLAG_PREFETCH_META)
   and/or that slot itself (FD_MAP_FLAG_PREFETCH_DATA) via volatile
   reads, and caches key's hash in query->memo so the operation can be
   invoked with FD_MAP_FLAG_USE_HINT to skip rehashing. */

void
MAP_(hint)( MAP_(t) const * join,
            MAP_KEY_T const * key,
            MAP_(query_t) * query,
            int flags ) {
  MAP_ELE_T const * ele0 = join->ele;
  MAP_VERSION_T const * lock = join->lock;
  ulong ele_max = join->ele_max;
  ulong seed = join->seed;
  int lock_shift = join->lock_shift;

  ulong memo = MAP_(key_hash)( key, seed );
  ulong ele_idx = memo & (ele_max-1UL); /* ele_max is a power of two */
  ulong lock_idx = ele_idx >> lock_shift;

  /* TODO: target specific prefetch hints */
  if( FD_LIKELY( flags & FD_MAP_FLAG_PREFETCH_META ) ) FD_VOLATILE_CONST( lock[ lock_idx ] );
  if( FD_LIKELY( flags & FD_MAP_FLAG_PREFETCH_DATA ) ) FD_VOLATILE_CONST( ele0[ ele_idx ] );

  query->memo = memo;
}
1610 :
/* prepare starts an insert / in-place-modify for key.  On success,
   query->ele points to key's slot (marked used if key was present,
   marked free if the caller may insert key there), query holds the
   lock covering that slot, and the caller must finish with publish or
   cancel (see the caller contract in the probe loop comment below).
   Returns FD_MAP_SUCCESS, FD_MAP_ERR_AGAIN (contention, non-blocking
   mode only) or FD_MAP_ERR_FULL (probe_max slots probed without
   finding key or a free slot).  In blocking mode, AGAIN triggers a
   randomized exponential backoff and retry. */

int
MAP_(prepare)( MAP_(t) * join,
               MAP_KEY_T const * key,
               MAP_ELE_T * sentinel,
               MAP_(query_t) * query,
               int flags ) {
  MAP_ELE_T * ele0 = join->ele;
  MAP_VERSION_T * lock = join->lock;
  ulong ele_max = join->ele_max;
  ulong lock_cnt = join->lock_cnt;
  ulong probe_max = join->probe_max;
  ulong seed = join->seed;
  int lock_shift = join->lock_shift;
  void * ctx = join->ctx;

  ulong memo = (flags & FD_MAP_FLAG_USE_HINT) ? query->memo : MAP_(key_hash)( key, seed );
  ulong start_idx = memo & (ele_max-1UL);
  ulong version_lock0 = start_idx >> lock_shift;

  int non_blocking = !(flags & FD_MAP_FLAG_BLOCKING);
  ulong backoff_max = (1UL<<32); /* in [2^32,2^48) */
  ulong backoff_seed = ((ulong)(uint)flags)>>6; /* 0 usually fine */

  for(;;) { /* Fresh try */

    int err;

    MAP_VERSION_T version[ MAP_LOCK_MAX ];
    ulong version_cnt = 0UL;
    ulong lock_idx = version_lock0;

    /* At this point, finding any key in the map requires testing at
       most probe_max contiguous slots. */

    MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
    if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail; } /* opt for low contention */
    version[ lock_idx ] = v;
    version_cnt++;

    ulong ele_idx = start_idx;

    for( ulong probe_rem=probe_max; probe_rem; probe_rem-- ) {

      /* At this point, we've acquired the locks from the start of key's
         probe sequence to ele_idx inclusive and have tested fewer than
         probe_max slots for key.

         If slot ele_idx is empty, we know that the key is currently not
         in the map and we can insert it here without creating a probe
         sequence longer than probe_max.  This does not lengthen the
         probe sequence for any currently mapped keys, preserving the
         maximum probe sequence length invariant.  Further, this is at
         the end of all keys that map to the same probe sequence start.
         So, we have preserved the key group ordering invariant.

         On return, ele will be marked as free.  To insert key into the
         map, the caller should initialize the slot's key (and memo if
         necessary), mark the slot as used, and publish to complete the
         insert.

         If the caller doesn't want to insert anything (e.g. caller only
         wants to modify an existing value), the caller should keep the
         slot marked as free (doesn't matter how the caller modified any
         other fields) and return the slot as free, and cancel to
         complete the failed insert (publish would also work ... cancel
         has theoretically lower risk of false contention).

         Likewise, if slot ele_idx contains key, we return that slot to
         the caller.  The caller can tell the difference between the
         previous case because the slot will be marked as used.

         On return, the caller can modify the slot's value arbitrarily.
         IMPORTANT SAFETY TIP!  THE CALLER MUST NOT MODIFY THE SLOT'S KEY
         OR MARK THE SLOT AS FREE.  USE REMOVE BELOW TO REMOVE KEYS.
         When done modifying the slot's value, the caller should either
         publish or cancel depending on what the caller did to the
         slot's value and how the application manages access to values
         (publish is always safe but cancel when appropriate has
         theoretically lower risk of false contention).  Note that
         cancel is not appropriate for temporary modifications to value
         (because it can confuse query ABA protection).

         In both cases, since we have the lock that covers slot ele_idx,
         we can unlock any other locks (typically the leading
         version_cnt-1 but possibly the trailing version_cnt-1 in cases
         with maps near capacity) locks already acquired to reduce
         contention with other unrelated operations.  That is, at this
         point, lock lock_idx is sufficient to prevent any operation for
         any key breaking key's probe sequence (because it would need to
         acquire the lock covering ele_idx first). */

      MAP_ELE_T * ele = ele0 + ele_idx;

      if( FD_LIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) || /* opt for low collision */
          (
# if MAP_MEMOIZE && MAP_KEY_EQ_IS_SLOW
            FD_LIKELY( ele->MAP_MEMO==memo ) &&
# endif
            FD_LIKELY( MAP_(key_eq)( &ele->MAP_KEY, key ) ) /* opt for already in map */
          ) ) {

        lock_idx = ele_idx >> lock_shift;
        version_lock0 = (version_lock0 + (ulong)(version_lock0==lock_idx)) & (lock_cnt-1UL);
        MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt-1UL );

        query->memo = memo;
        query->ele = ele;
        query->l = lock + lock_idx;
        query->v = version[ lock_idx ];
        return FD_MAP_SUCCESS;
      }

      /* At this point, slot ele_idx is used by something other than
         key.  If we still have probes remaining, continue probing for
         key, locking as necessary.  If we can't acquire a lock, we
         fail. */

      ele_idx = (ele_idx+1UL) & (ele_max-1UL);

      ulong lock_next = ele_idx >> lock_shift;
      if( FD_UNLIKELY( (lock_next!=lock_idx) & (lock_next!=version_lock0) ) ) { /* opt for locks that cover many contiguous slots */
        lock_idx = lock_next;

        MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
        if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail; } /* opt for low contention */
        version[ lock_idx ] = v;
        version_cnt++;
      }
    }

    /* At this point, we've done probe_max probes without encountering
       key and we have all the locks.  So we know key is not in the map
       and that, even if we have space, inserting this key will create a
       probe sequence longer than probe_max.  That is, map is loaded
       enough that we consider it full.

       If probe_max==ele_max, this meaning of full is the traditional
       non-concurrent meaning of full (literally every slot is known to
       be used).  Even if probe_max << ele_max, it is possible to fill
       every slot (e.g. at probe_max==1, a perfect hash of ele_max keys
       to slot would fill every slot). */

    err = FD_MAP_ERR_FULL;

  fail:

    MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt );

    if( FD_UNLIKELY( non_blocking | (err!=FD_MAP_ERR_AGAIN) ) ) {
      query->memo = memo;
      query->ele = sentinel;
      query->l = NULL;
      query->v = (MAP_VERSION_T)0;
      return err;
    }

    /* At this point, we hit contention and are blocking (need to try
       again).  We do a random exponential backoff (with saturation on
       wrapping) to minimize contention with other threads.  Normalizing
       out fixed point scalings baked into the below, we spin pause a
       uniform IID random number of times in [0,backoff_max) where
       backoff_max is 1 on the first hit and increases by ~30% each time
       to a maximum of 2^16 (i.e. hundreds microseconds per remaining
       lock for typical CPU speeds and spin pause delays at maximum
       backoff). */

    ulong scale = backoff_max >> 16; /* in [2^16,2^32) */
    backoff_max = fd_ulong_min( backoff_max + (backoff_max>>2) + (backoff_max>>4), (1UL<<48)-1UL ); /* in [2^32,2^48) */
    MAP_(backoff)( scale, backoff_seed );

  }

  /* never get here */

}
1786 :
1787 : int
1788 : MAP_(remove)( MAP_(t) * join,
1789 : MAP_KEY_T const * key,
1790 : MAP_(query_t) const * query,
1791 7497672 : int flags ) {
1792 :
1793 7497672 : MAP_VERSION_T * lock = join->lock;
1794 7497672 : ulong lock_cnt = join->lock_cnt;
1795 7497672 : ulong seed = join->seed;
1796 7497672 : ulong probe_max = join->probe_max;
1797 7497672 : MAP_ELE_T * ele0 = join->ele;
1798 7497672 : ulong ele_max = join->ele_max;
1799 7497672 : int lock_shift = join->lock_shift;
1800 7497672 : void * ctx = join->ctx;
1801 :
1802 7497672 : ulong memo = (flags & FD_MAP_FLAG_USE_HINT) ? query->memo : MAP_(key_hash)( key, seed );
1803 7497672 : ulong start_idx = memo & (ele_max-1UL);
1804 7497672 : ulong version_lock0 = start_idx >> lock_shift;
1805 :
1806 7497672 : int non_blocking = !(flags & FD_MAP_FLAG_BLOCKING);
1807 7497672 : ulong backoff_max = (1UL<<32); /* in [2^32,2^48) */
1808 7497672 : ulong backoff_seed = ((ulong)(uint)flags)>>6; /* 0 usually fine */
1809 :
1810 7497672 : for(;;) { /* Fresh try */
1811 :
1812 7497672 : int err;
1813 :
1814 7497672 : MAP_VERSION_T version[ MAP_LOCK_MAX ];
1815 7497672 : ulong version_cnt = 0UL;
1816 7497672 : ulong lock_idx = version_lock0;
1817 :
1818 : /* At this point, we need to acquire locks covering the start of the
1819 : probe sequence through up to all contiguously used slots (and, if
1820 : the map is not completely full, the trailing empty slot). */
1821 :
1822 7497672 : MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
1823 7497672 : if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail; } /* opt for low contention */
1824 7497672 : version[ lock_idx ] = v;
1825 7497672 : version_cnt++;
1826 :
1827 7497672 : ulong ele_idx = start_idx;
1828 7497672 : ulong hole_idx = start_idx;
1829 7497672 : int found = 0;
1830 :
1831 7497672 : ulong contig_cnt;
1832 13317837 : for( contig_cnt=0UL; contig_cnt<ele_max; contig_cnt++ ) {
1833 :
1834 : /* At this point, we've acquired the locks covering slots
1835 : [start_idx,ele_idx] (cyclic) and have confirmed that the
1836 : contig_cnt slots [start_idx,ele_idx) (cyclic) are used.
1837 :
1838 : If slot ele_idx is empty, we are done probing.
1839 :
1840 : Otherwise, if we haven't found key yet, we test if slot ele_idx
1841 : contains key.
1842 :
1843 : We can optimize this further by noting that the key can only be
1844 : in the first probe_max probes and that when we don't find the
1845 : key, remove has nothing to do (such that we don't have to keep
1846 : probing for contiguous slots). */
1847 :
1848 13317837 : MAP_ELE_T const * ele = ele0 + ele_idx;
1849 :
1850 13317837 : if( FD_UNLIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) break; /* opt for first pass low collision */
1851 :
1852 5820165 : if( FD_LIKELY( !found ) ) { /* opt for first pass low collision */
1853 4786326 : if( FD_UNLIKELY( contig_cnt>=probe_max ) ) break; /* opt for first pass low collision */
1854 4786326 : found =
1855 : # if MAP_MEMOMIZE && MAP_KEY_EQ_IS_SLOW
1856 : FD_LIKELY( ele->MAP_MEMO==memo ) &&
1857 : # endif
1858 4786326 : MAP_(key_eq)( &ele->MAP_KEY, key );
1859 4786326 : if( found ) hole_idx = ele_idx; /* cmov */
1860 4786326 : }
1861 :
1862 : /* Continue probing, locking as necessary. If we can't acquire a
1863 : lock, fail. */
1864 :
1865 5820165 : ele_idx = (ele_idx+1UL) & (ele_max-1UL);
1866 :
1867 5820165 : ulong lock_next = ele_idx >> lock_shift;
1868 5820165 : if( FD_UNLIKELY( (lock_next!=lock_idx) & (lock_next!=version_lock0) ) ) { /* opt for locks covering many contiguous slots */
1869 3638673 : lock_idx = lock_next;
1870 :
1871 3638673 : MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
1872 3638673 : if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail; } /* opt for low contention */
1873 3638673 : version[ lock_idx ] = v;
1874 3638673 : version_cnt++;
1875 3638673 : }
1876 5820165 : }
1877 :
1878 : /* At this point, if we haven't found the key, key did not exist in
1879 : the map at some point during the call. Release the locks and
1880 : tell the user the key was already removed. */
1881 :
1882 7497672 : if( FD_UNLIKELY( !found ) ) { err = FD_MAP_ERR_KEY; goto fail; }
1883 :
1884 : /* At this point, we have locks covering the contig_cnt used slots
1885 : starting from start_idx cyclic (and, if contig_cnt<ele_max, any
1886 : trailing empty slot). The key to remove is in this range at
1887 : hole_idx. Further, all probe sequences are intact. Make a hole
1888 : at hole_idx by freeing the key. Also update the cached lock
1889 : version to indicate "modified" when we unlock below. */
1890 :
1891 3750408 : MAP_(private_ele_free)( ctx, ele0 + hole_idx );
1892 :
1893 3750408 : lock_idx = hole_idx >> lock_shift;
1894 3750408 : version[ lock_idx ] = (MAP_VERSION_T)((ulong)version[ lock_idx ] + 2UL);
1895 :
1896 : /* When contig_cnt<ele_max, the trailing empty slot guarantees that
1897 : the just made hole didn't break any probe sequences for keys not
1898 : in the contig_cnt slots and that it didn't break any probe
1899 : sequences in [start_idx,hole_idx). Probe sequences for keys in
1900 : (hole_idx,start_idx+contig_cnt) (cyclic) might have been broken
1901 : though.
1902 :
1903 : We fix the first key with a broken probe sequence by moving it to
1904 : the hole just made. This fills the hole but makes a new hole
1905 : (and one closer to the empty trailing slot) in the process. As
1906 : this shortens the probe sequence for that key, this doesn't break
1907 : any probe length invariants. We repeating this process until
1908 : we've fixed all the contiguous slots after hole_idx. (As an
1909 : additional optimization to reduce remove costs when map is nearly
1910 : full but probe_max << ele_max, we could exploit that only the
1911 : leading probe_max-1 slots after any created hole might have
1912 : broken probe sequences.)
1913 :
1914 : Unfortunately, when contig_cnt==ele_max, we no longer have this
1915 : guarantee. But we do have the entire map locked at this point.
1916 : And we know that probe sequences are intact starting from the
1917 : most recently created hole. If we verify enough to eventually
1918 : wrap back to most recently created hole, we know all probe
1919 : sequences are intact. Since fixing broken probe sequences in
1920 : this fashion always shortens them and there always will be one
1921 : hole in this process, verifying until we hit the most recently
1922 : made hole is guaranteed to terminate. Since there is only one
1923 : hole, it is sufficient to just test if the next slot to verify is
1924 : a hole.
1925 :
1926 : This test works just as well for the more common
1927 : contig_cnt<ele_max case (it will terminate at the preexisting
1928 : trailing empty slot instead of the most recently created hole).
1929 : So, for code simplicity, we just do that.
1930 :
1931 : A nice side effect is this removal process is that implicitly
1932 : improves probing for remaining keys in the map and does not
1933 : require tombstones.
1934 :
1935 : TL;DR It's a bad idea on many levels to fill up linearly probed
1936 : maps to their absolute limits ... but this will still work if you
1937 : do.
1938 :
1939 : Note also that this process preserves the ordering of keys that
1940 : hash to the same slot (such that key group ordering is
1941 : preserved). */
1942 :
1943 3750408 : ele_idx = hole_idx;
1944 4784247 : for(;;) {
1945 4784247 : ele_idx = (ele_idx+1UL) & (ele_max-1UL);
1946 :
1947 : /* At this point, slots (hole_idx,ele_idx) (cyclic) are used with
1948 : verified probe sequences. As per the above, we are guaranteed
1949 : to eventually hit an empty slot (typically very quickly in
1950 : practice) and hitting an empty slot guarantees all probe
1951 : sequences are intact (such that we are done). */
1952 :
1953 4784247 : MAP_ELE_T * ele = ele0 + ele_idx;
1954 4784247 : if( FD_LIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) break;
1955 :
1956 : /* Otherwise, if ele_idx's key probe sequence doesn't start in
1957 : (hole_idx,ele_idx] (cyclic), its probe sequence is currently
1958 : broken by the hole at hole_idx. We fix it by moving ele_idx to
1959 : hole_idx. This shortens that key's probe sequence (preserving
1960 : the invariant) and makes a new hole at ele_idx. We mark the
1961 : lock version covering the new hole idx as modified for the
1962 : unlock below. Note that the version for the existing hole was
1963 : already marked as modified when the hole was created so we only
1964 : bump if ele_idx is covered by a different lock than hole_idx to
1965 : reduce version churn to near theoretical minimum. */
1966 :
1967 : # if MAP_MEMOIZE
1968 : memo = ele->MAP_MEMO;
1969 : # else
1970 1033839 : memo = MAP_(key_hash)( &ele->MAP_KEY, seed );
1971 1033839 : # endif
1972 1033839 : start_idx = memo & (ele_max-1UL);
1973 :
1974 1033839 : if( !( ((hole_idx<start_idx) & (start_idx<=ele_idx) ) |
1975 1033839 : ((hole_idx>ele_idx) & ((hole_idx<start_idx) | (start_idx<=ele_idx))) ) ) {
1976 :
1977 341448 : MAP_(private_ele_move)( ctx, ele0 + hole_idx, ele );
1978 :
1979 341448 : ulong lock_next = ele_idx >> lock_shift;
1980 341448 : version[ lock_next ] = (MAP_VERSION_T)((ulong)version[ lock_next ] + ((lock_next!=lock_idx) ? 2UL : 0UL) /* cmov */);
1981 341448 : lock_idx = lock_next;
1982 :
1983 341448 : hole_idx = ele_idx;
1984 341448 : }
1985 :
1986 1033839 : }
1987 :
1988 : /* At this point, key is removed and all remaining keys have intact
1989 : and ordered probe sequences and we have updated the necessary
1990 : version cache entries. Unlock and return success. */
1991 :
1992 3750408 : MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt );
1993 3750408 : return FD_MAP_SUCCESS;
1994 :
1995 3747264 : fail:
1996 :
1997 3747264 : MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt );
1998 :
1999 3747264 : if( FD_UNLIKELY( non_blocking | (err!=FD_MAP_ERR_AGAIN) ) ) return err;
2000 :
2001 : /* At this point, we are blocking and hit contention. Backoff. See
2002 : note in prepare for how this works */
2003 :
2004 0 : ulong scale = backoff_max >> 16; /* in [2^16,2^32) */
2005 0 : backoff_max = fd_ulong_min( backoff_max + (backoff_max>>2) + (backoff_max>>4), (1UL<<48)-1UL ); /* in [2^32,2^48) */
2006 0 : MAP_(backoff)( scale, backoff_seed );
2007 0 : }
2008 :
2009 : /* never get here */
2010 7497672 : }
2011 :
/* query_try speculatively finds key without taking any locks: it
   observes lock versions along key's probe sequence (via private_try)
   and, on success, returns the found element in query along with the
   version observed for the covering lock.  The caller speculatively
   reads the element and then confirms with query_test (ABA protection:
   any modify / move / remove of the element bumps that version).
   Returns FD_MAP_SUCCESS, FD_MAP_ERR_KEY (key likely not in map during
   the call) or FD_MAP_ERR_AGAIN (contention, non-blocking mode only).
   In blocking mode, AGAIN triggers a randomized exponential backoff
   and retry. */

int
MAP_(query_try)( MAP_(t) const * join,
                 MAP_KEY_T const * key,
                 MAP_ELE_T const * sentinel,
                 MAP_(query_t) * query,
                 int flags ) {

  MAP_ELE_T * ele0 = join->ele;
  MAP_VERSION_T * lock = join->lock;
  ulong ele_max = join->ele_max;
  ulong lock_cnt = join->lock_cnt;
  ulong probe_max = join->probe_max;
  ulong seed = join->seed;
  int lock_shift = join->lock_shift;
  void const * ctx = join->ctx;

  ulong memo = (flags & FD_MAP_FLAG_USE_HINT) ? query->memo : MAP_(key_hash)( key, seed );
  ulong start_idx = memo & (ele_max-1UL);
  ulong version_lock0 = start_idx >> lock_shift;

  int non_blocking = !(flags & FD_MAP_FLAG_BLOCKING);
  ulong backoff_max = (1UL<<32); /* in [2^32,2^48) */
  ulong backoff_seed = ((ulong)(uint)flags)>>6; /* 0 usually fine */

  for(;;) { /* fresh try */

    int err;

    MAP_VERSION_T version[ MAP_LOCK_MAX ];
    ulong version_cnt = 0UL;
    ulong lock_idx = version_lock0;

    /* At this point, finding any key in the map requires probing at
       most probe_max contiguous slots. */

    MAP_VERSION_T v = MAP_(private_try)( lock + lock_idx );
    if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail_fast; } /* opt for low contention */
    version[ lock_idx ] = v;
    version_cnt++;

    ulong ele_idx = start_idx;

    for( ulong probe_rem=probe_max; probe_rem; probe_rem-- ) {

      /* At this point, we've observed the locks covering the start of
         key's probe sequence to ele_idx inclusive, they were unlocked
         when observed and we have done fewer than probe_max probes.

         If slot ele_idx is empty, we speculate that key was not in the
         map at some point during the call.

         If slot ele_idx holds key, we let the caller continue speculating
         about key's value.  We only need to observe the lock covering key
         after we've found it (if key gets moved or removed, the version
         of the lock covering it will change). */

      MAP_ELE_T const * ele = ele0 + ele_idx;

      if( FD_UNLIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) { err = FD_MAP_ERR_KEY; goto fail; } /* opt for low collision */

      if(
# if MAP_MEMOIZE && MAP_KEY_EQ_IS_SLOW
          FD_LIKELY( ele->MAP_MEMO==memo ) &&
# endif
          FD_LIKELY( MAP_(key_eq)( &ele->MAP_KEY, key ) ) /* opt for found */
        ) {

        lock_idx = ele_idx >> lock_shift;

        query->memo = memo;
        query->ele = (MAP_ELE_T *)ele;
        query->l = lock + lock_idx;
        query->v = version[ lock_idx ];
        return FD_MAP_SUCCESS;
      }

      /* At this point, we speculate slot ele_idx was used by something
         other than key when observed.  Continue probing slot for key,
         observing locks as necessary. */

      ele_idx = (ele_idx+1UL) & (ele_max-1UL);

      ulong lock_next = ele_idx >> lock_shift;
      if( FD_UNLIKELY( (lock_next!=lock_idx) & (lock_next!=version_lock0) ) ) { /* opt for locks cover many contiguous slots */
        lock_idx = lock_next;

        v = MAP_(private_try)( lock + lock_idx );
        if( FD_UNLIKELY( (ulong)v & 1UL ) ) { err = FD_MAP_ERR_AGAIN; goto fail_fast; } /* opt for low contention */
        version[ lock_idx ] = v;
        version_cnt++;
      }
    }

    /* At this point, we did probe_max probes without finding key.  We
       speculate key was not in the map at some point during the call. */

    err = FD_MAP_ERR_KEY;

  fail:

    /* If we didn't encounter any contention (i.e. no version numbers
       changed), we can trust our speculated error.  Otherwise, we tell
       the user to try again. */

    err = MAP_(private_test)( lock, lock_cnt, version, version_lock0, version_cnt ) ? FD_MAP_ERR_AGAIN : err; /* cmov */

  fail_fast: /* Used when the err is already AGAIN */

    if( FD_UNLIKELY( non_blocking | (err!=FD_MAP_ERR_AGAIN) ) ) {
      query->memo = memo;
      query->ele = (MAP_ELE_T *)sentinel;
      query->l = NULL;
      query->v = (MAP_VERSION_T)0;
      return err;
    }

    /* At this point, we are blocking and hit contention.  Backoff.  See
       note in prepare for how this works */

    ulong scale = backoff_max >> 16; /* in [2^16,2^32) */
    backoff_max = fd_ulong_min( backoff_max + (backoff_max>>2) + (backoff_max>>4), (1UL<<48)-1UL ); /* in [2^32,2^48) */
    MAP_(backoff)( scale, backoff_seed );
  }
  /* never get here */
}
2137 :
int
MAP_(lock_range)( MAP_(t) *       join,
                  ulong           range_start,
                  ulong           range_cnt,
                  int             flags,
                  MAP_VERSION_T * version ) {

  /* Acquire the range_cnt locks [range_start,range_start+range_cnt)
     (cyclic over the lock_cnt locks) for the caller.  On success, the
     pre-acquisition version of lock i (plus version_delta, see below)
     is stored in version[ i ] for the caller to pass to the matching
     unlock.  Returns FD_MAP_SUCCESS on success and FD_MAP_ERR_AGAIN if
     contention was hit and FD_MAP_FLAG_BLOCKING was not set.  In
     blocking mode, retries forever with randomized exponential
     backoff. */

  MAP_VERSION_T * lock     = join->lock;
  ulong           lock_cnt = join->lock_cnt;

  int   non_blocking  = !(flags & FD_MAP_FLAG_BLOCKING);
  ulong backoff_max   = (1UL<<32);               /* in [2^32,2^48) */
  ulong backoff_seed  = ((ulong)(uint)flags)>>6; /* 0 usually fine */

  /* Bit 0 of a version is the lock bit (tested below), so adding 2
     bumps the version count without touching the lock bit.  RDONLY
     callers use a delta of 0, presumably so the unlock restores the
     version unchanged (readers don't invalidate speculation by other
     readers) -- NOTE(review): confirm against private_unlock. */

  ulong version_delta = (flags & FD_MAP_FLAG_RDONLY) ? 0UL : 2UL;

  for(;;) { /* fresh try */

    /* Acquire the locks in range order, recording the version to
       restore for each.  locked_cnt tracks how many we hold so the
       fail path can release exactly those. */

    ulong lock_idx   = range_start;
    ulong locked_cnt = 0UL;
    for( ; locked_cnt<range_cnt; locked_cnt++ ) {
      MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
      if( FD_UNLIKELY( (ulong)v & 1UL ) ) goto fail; /* opt for low contention */
      version[ lock_idx ] = (MAP_VERSION_T)((ulong)v + version_delta);
      lock_idx = (lock_idx+1UL) & (lock_cnt-1UL); /* lock_cnt is a power of two (see verify) */
    }

    return FD_MAP_SUCCESS;

  fail:

    /* Another thread held a lock in the range.  Release the locks we
       did manage to acquire before retrying or giving up. */

    MAP_(private_unlock)( lock, lock_cnt, version, range_start, locked_cnt );

    if( FD_UNLIKELY( non_blocking ) ) return FD_MAP_ERR_AGAIN;

    /* At this point, we are blocking and hit contention. Backoff. See
       note in prepare for how this works */

    ulong scale = backoff_max >> 16; /* in [2^16,2^32) */
    backoff_max = fd_ulong_min( backoff_max + (backoff_max>>2) + (backoff_max>>4), (1UL<<48)-1UL ); /* in [2^32,2^48) */
    MAP_(backoff)( scale, backoff_seed );
  }
  /* never get here */
}
2180 :
int
MAP_(iter_init)( MAP_(t) *      join,
                 ulong          memo,
                 int            flags,
                 MAP_(iter_t) * iter ) {

  /* Start an iteration over all keys in the map whose memo (hash)
     equals the given memo.  Acquires the locks covering the probe
     sequence that starts at slot memo & (ele_max-1) (at most probe_max
     used slots plus, if shorter, the trailing empty slot) and records
     where matching elements live so iter_next can walk them without
     re-hashing the range.  Returns FD_MAP_SUCCESS with iter
     initialized (locks held) or FD_MAP_ERR_AGAIN on contention in
     non-blocking mode (iter set so iteration and fini are no-ops).
     In blocking mode, retries forever with randomized backoff. */

  MAP_ELE_T *     ele0      = join->ele;
  MAP_VERSION_T * lock      = join->lock;
  ulong           ele_max   = join->ele_max;
  ulong           lock_cnt  = join->lock_cnt;
  ulong           probe_max = join->probe_max;
  ulong           seed      = join->seed;
  int             lock_shift = join->lock_shift;
  void *          ctx       = join->ctx;

  MAP_VERSION_T * version = iter->version;

  ulong start_idx     = memo & (ele_max-1UL); /* first slot of the probe sequence for memo */
  ulong version_lock0 = start_idx >> lock_shift; /* lock covering start_idx */
  ulong version_delta = (flags & FD_MAP_FLAG_RDONLY) ? 0UL : 2UL; /* bit 0 is the lock bit; +2 bumps the version */

  int   non_blocking  = !(flags & FD_MAP_FLAG_BLOCKING);
  ulong backoff_max   = (1UL<<32);               /* in [2^32,2^48) */
  ulong backoff_seed  = ((ulong)(uint)flags)>>6; /* 0 usually fine */

  for(;;) { /* fresh try */

    ulong version_cnt = 0UL; /* number of locks acquired so far this try */
    ulong lock_idx    = version_lock0;

    /* At this point, finding any key-val pair that matches memo in the
       map requires probing at most probe_max contiguous slots. */

    MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
    if( FD_UNLIKELY( (ulong)v & 1UL ) ) goto fail; /* opt for low contention */
    version[ lock_idx ] = (MAP_VERSION_T)((ulong)v + version_delta);
    version_cnt++;

    ulong ele_idx = start_idx;
    ulong ele_rem = 0UL;

    ulong iter_cnt   = 0UL;       /* number of slots in the probed range whose memo matches */
    ulong iter_start = start_idx; /* index of the first matching slot (start_idx if none yet) */

    for( ; ele_rem<probe_max; ele_rem++ ) {

      /* At this point, we've acquired the locks covering slots
         [start_idx,ele_idx] (cyclic) and have confirmed that the
         ele_rem slots [start_idx,ele_idx) (cyclic) are used.  If slot
         ele_idx is empty, we are done probing. */

      MAP_ELE_T const * ele = ele0 + ele_idx;

      if( FD_UNLIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) break; /* opt for first pass low collision */

      /* Latch the first match's index; count every match.  When
         memoization is off, recompute the hash to compare. */

      iter_start = fd_ulong_if( iter_cnt==0UL, ele_idx, iter_start );
#     if MAP_MEMOIZE
      iter_cnt += (ulong)(ele->MAP_MEMO==memo);
#     else
      iter_cnt += (ulong)(MAP_(key_hash)( &ele->MAP_KEY, seed )==memo);
#     endif

      /* Continue probing, locking as necessary.  If we can't acquire a
         lock, fail. */

      ele_idx = (ele_idx+1UL) & (ele_max-1UL);

      ulong lock_next = ele_idx >> lock_shift;
      if( FD_UNLIKELY( (lock_next!=lock_idx) & (lock_next!=version_lock0) ) ) { /* opt for locks covering many contiguous slots */
        lock_idx = lock_next;

        MAP_VERSION_T v = MAP_(private_lock)( lock + lock_idx );
        if( FD_UNLIKELY( (ulong)v & 1UL ) ) goto fail; /* opt for low contention */
        version[ lock_idx ] = (MAP_VERSION_T)((ulong)v + version_delta);
        version_cnt++;
      }
    }

    /* At this point, we've acquired the locks covering used slots
       [start_idx,start_idx+ele_rem) (cyclic) where ele_rem<=probe_max
       (and, if ele_rem<probe_max, any trailing empty slot).  iter_cnt
       is the number of slots that matched in this range and iter_start
       is the index of the first element in this range that matched
       (start_idx if no matches). */

    iter->ele           = ele0;
    iter->lock          = lock;
    iter->ele_max       = ele_max;
    iter->lock_cnt      = lock_cnt;
    iter->seed          = seed;
    iter->memo          = memo;
    iter->ele_rem       = iter_cnt;   /* matches remaining to visit */
    iter->ele_idx       = iter_start; /* first match to visit */
    iter->version_lock0 = version_lock0;
    iter->version_cnt   = version_cnt;
    /* iter->version initialized above */

    return FD_MAP_SUCCESS;

  fail:

    /* At this point, we hit contention acquiring the locks for
       iteration.  If we not blocking, tell caller to try again later.
       Otherwise, backoff.  See note in prepare for how this works. */

    MAP_(private_unlock)( lock, lock_cnt, version, version_lock0, version_cnt );

    if( FD_UNLIKELY( non_blocking ) ) {
      iter->ele_rem     = 0UL; /* make sure can't iterate */
      iter->version_cnt = 0UL; /* make sure fini is a no-op */
      return FD_MAP_ERR_AGAIN;
    }

    ulong scale = backoff_max >> 16; /* in [2^16,2^32) */
    backoff_max = fd_ulong_min( backoff_max + (backoff_max>>2) + (backoff_max>>4), (1UL<<48)-1UL ); /* in [2^32,2^48) */
    MAP_(backoff)( scale, backoff_seed );
  }
  /* never get here */
}
2300 :
2301 : MAP_(iter_t) *
2302 3743982 : MAP_(iter_next)( MAP_(iter_t) * iter ) {
2303 3743982 : ulong ele_idx = iter->ele_idx;
2304 3743982 : ulong ele_rem = iter->ele_rem - 1UL;
2305 :
2306 : /* We just finished processing pair ele_idx and we have ele_rem
2307 : more pairs to process. If there is at least 1, scan for it. */
2308 :
2309 3743982 : if( ele_rem ) {
2310 0 : MAP_ELE_T * ele0 = iter->ele;
2311 0 : ulong ele_max = iter->ele_max;
2312 0 : ulong seed = iter->seed; (void)seed;
2313 0 : ulong memo = iter->memo;
2314 :
2315 0 : for(;;) {
2316 0 : ele_idx = (ele_idx+1UL) & (ele_max-1UL);
2317 0 : MAP_ELE_T * ele = ele0 + ele_idx;
2318 : # if MAP_MEMOIZE
2319 : if( FD_LIKELY( ele->MAP_MEMO==memo ) ) break;
2320 : # else
2321 0 : if( FD_LIKELY( MAP_(key_hash)( &ele->MAP_KEY, seed )==memo ) ) break;
2322 0 : # endif
2323 0 : }
2324 0 : }
2325 :
2326 3743982 : iter->ele_idx = ele_idx;
2327 3743982 : iter->ele_rem = ele_rem;
2328 3743982 : return iter;
2329 3743982 : }
2330 :
MAP_STATIC int
MAP_(verify)( MAP_(t) const * join ) {

  /* Exhaustively check the join handle, the shared map header and
     every element-store invariant.  Returns FD_MAP_SUCCESS if all
     checks pass and FD_MAP_ERR_INVAL (with a warning naming the first
     failed condition) otherwise.  Not run concurrently with other map
     operations -- it reads the element store without taking locks. */

# define MAP_TEST(c) do {                                                                        \
    if( FD_UNLIKELY( !(c) ) ) { FD_LOG_WARNING(( "FAIL: %s", #c )); return FD_MAP_ERR_INVAL; } \
  } while(0)

  /* Validate join */

  MAP_TEST( join );
  MAP_TEST( fd_ulong_is_aligned( (ulong)join, alignof(MAP_(t)) ) );

  MAP_ELE_T const *     ele0      = join->ele;
  MAP_VERSION_T const * lock      = join->lock;
  ulong                 ele_max   = join->ele_max;
  ulong                 lock_cnt  = join->lock_cnt;
  ulong                 probe_max = join->probe_max;
  ulong                 seed      = join->seed;
  int                   lock_shift = join->lock_shift;
  void const *          ctx       = join->ctx;

  MAP_TEST( ele0 );
  MAP_TEST( fd_ulong_is_aligned( (ulong)ele0, alignof(MAP_ELE_T) ) );
  MAP_TEST( lock );
  MAP_TEST( fd_ulong_is_aligned( (ulong)lock, MAP_(align)() ) );
  MAP_TEST( fd_ulong_is_pow2( ele_max ) );
  MAP_TEST( fd_ulong_is_pow2( lock_cnt ) );
  MAP_TEST( lock_cnt <= fd_ulong_min( ele_max, MAP_LOCK_MAX ) );
  MAP_TEST( (1UL<=probe_max) & (probe_max<=ele_max) );
  /* seed is arbitrary */
  MAP_TEST( (1UL<<lock_shift) == (ele_max/lock_cnt) ); /* each lock covers ele_max/lock_cnt contiguous slots */

  /* Validate map metadata (the shared header immediately precedes the
     lock array in the shared memory region). */

  MAP_(shmem_t) const * map = ((MAP_(shmem_t) const *)lock)-1;

  MAP_TEST( map );
  MAP_TEST( fd_ulong_is_aligned( (ulong)map, MAP_(align)() ) );
  MAP_TEST( map->magic      == MAP_MAGIC  );
  MAP_TEST( map->ele_max    == ele_max    );
  MAP_TEST( map->lock_cnt   == lock_cnt   );
  MAP_TEST( map->probe_max  == probe_max  );
  MAP_TEST( map->seed       == seed       );
  MAP_TEST( map->lock_shift == lock_shift );

  /* Validate map elements */

  for( ulong ele_idx=0UL; ele_idx<ele_max; ele_idx++ ) {
    MAP_ELE_T const * ele = ele0 + ele_idx;
    if( FD_LIKELY( MAP_(private_ele_is_free)( ctx, ele ) ) ) continue; /* opt for sparse */

    ulong memo = MAP_(key_hash)( &ele->MAP_KEY, seed );

#   if MAP_MEMOIZE
    MAP_TEST( ele->MAP_MEMO==memo ); /* cached memo matches the key's actual hash */
#   endif

    /* The probe sequence for this key starts at memo & (ele_max-1)
       and reaches ele_idx after probe_cnt cyclic probes.  It must be
       within the configured probe budget. */

    ulong probe_idx = memo & (ele_max-1UL);
    ulong probe_cnt = fd_ulong_if( ele_idx>=probe_idx, ele_idx - probe_idx, ele_max + ele_idx - probe_idx ) + 1UL;
    MAP_TEST( probe_cnt<=probe_max );

    /* Walk the probe sequence: every slot before ele_idx must be used
       (no holes, or probing would stop early and miss the key) and
       must NOT hold this key (keys are unique); the final slot must
       hold it. */

    for( ulong probe_rem=probe_cnt; probe_rem; probe_rem-- ) {
      MAP_ELE_T const * probe = ele0 + probe_idx;
      MAP_TEST( !MAP_(private_ele_is_free)( ctx, probe ) );

      int found =
#     if MAP_MEMOIZE && MAP_KEY_EQ_IS_SLOW
        FD_LIKELY( probe->MAP_MEMO == ele->MAP_MEMO ) && /* cheap prefilter before the slow key compare */
#     endif
        MAP_(key_eq)( &probe->MAP_KEY, &ele->MAP_KEY );

      MAP_TEST( (probe_rem==1UL) ? found : !found );

      probe_idx = (probe_idx+1UL) & (ele_max-1UL);
    }
  }

  /* At this point, every key in the map is reachable via its probe
     sequence and every probe sequence is at most probe_max probes long.
     By extension, if a key is in the map, it will be found in at most
     probe_max probes. */

# undef MAP_TEST

  return FD_MAP_SUCCESS;
}
2417 :
2418 : MAP_STATIC char const *
2419 18 : MAP_(strerror)( int err ) {
2420 18 : switch( err ) {
2421 3 : case FD_MAP_SUCCESS: return "success";
2422 3 : case FD_MAP_ERR_INVAL: return "bad input";
2423 3 : case FD_MAP_ERR_AGAIN: return "try again later";
2424 3 : case FD_MAP_ERR_FULL: return "map too full";
2425 3 : case FD_MAP_ERR_KEY: return "key not found";
2426 3 : default: break;
2427 18 : }
2428 3 : return "unknown";
2429 18 : }
2430 :
2431 : #endif
2432 :
2433 : #undef MAP_
2434 : #undef MAP_STATIC
2435 :
2436 : #undef MAP_IMPL_STYLE
2437 : #undef MAP_MAGIC
2438 : #undef MAP_ALIGN
2439 : #undef MAP_LOCK_MAX
2440 : #undef MAP_VERSION_T
2441 : #undef MAP_CTX_MAX
2442 : #undef MAP_ELE_MOVE
2443 : #undef MAP_ELE_FREE
2444 : #undef MAP_ELE_IS_FREE
2445 : #undef MAP_KEY_EQ_IS_SLOW
2446 : #undef MAP_MEMO
2447 : #undef MAP_MEMOIZE
2448 : #undef MAP_KEY_HASH
2449 : #undef MAP_KEY_EQ
2450 : #undef MAP_KEY
2451 : #undef MAP_KEY_T
2452 : #undef MAP_ELE_T
2453 : #undef MAP_NAME
|